diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb03aeabe17c3..b55c625065f6a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -375,6 +375,12 @@ def FeatureGFX10A16 : SubtargetFeature<"a16", "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" >; +def FeatureG16 : SubtargetFeature<"g16", + "HasG16", + "true", + "Support G16 for 16-bit gradient image operands" +>; + def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "HasNSAEncoding", "true", @@ -666,7 +672,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16, FeatureFastDenormalF32 + FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16 ] >; @@ -1087,6 +1093,9 @@ def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">, AssemblerPredicate<(all_of FeatureGFX10A16)>; +def HasG16 : Predicate<"Subtarget->hasG16()">, + AssemblerPredicate<(all_of FeatureG16)>; + def HasDPP16 : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5afec2188d66b..6e156ffce9b97 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1293,15 +1293,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10; - const LLT S16 = LLT::scalar(16); const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode, MI.getNumExplicitDefs()); int NumVAddr, NumGradients; std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode); - const LLT AddrTy = MRI->getType(MI.getOperand(VAddrIdx).getReg()); - const bool IsA16 = AddrTy.getScalarType() == S16; - Register VDataIn, VDataOut; LLT VDataTy; int NumVDataDwords = -1; @@ -1324,6 +1320,14 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail)) return false; + const int Flags = MI.getOperand(CtrlIdx + 2).getImm(); + const bool IsA16 = (Flags & 1) != 0; + const bool IsG16 = (Flags & 2) != 0; + + // A16 implies 16 bit gradients + if (IsA16 && !IsG16) + return false; + unsigned DMask = 0; unsigned DMaskLanes = 0; @@ -1396,6 +1400,14 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } } + // Set G16 opcode + if (IsG16 && !IsA16) { + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + assert(G16MappingInfo); + IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 + } + // TODO: Check this in verifier. assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 7706b085a0be4..dff315ca6f518 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3589,12 +3589,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, /// vector with s16 typed elements. 
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl<Register> &PackedAddrs, - int AddrIdx, int DimIdx, int NumVAddrs, + int AddrIdx, int DimIdx, int EndIdx, int NumGradients) { const LLT S16 = LLT::scalar(16); const LLT V2S16 = LLT::vector(2, 16); - for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { + for (int I = AddrIdx; I < EndIdx; ++I) { MachineOperand &SrcOp = MI.getOperand(I); if (!SrcOp.isReg()) continue; // _L to _LZ may have eliminated this. @@ -3607,7 +3607,7 @@ static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. - if (((I + 1) >= (AddrIdx + NumVAddrs)) || + if (((I + 1) >= EndIdx) || ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || I == DimIdx + NumGradients - 1)) || @@ -3698,17 +3698,19 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Index of first address argument const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); - // Check for 16 bit addresses and pack if true. - int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; - LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg()); - const bool IsA16 = AddrTy == S16; - int NumVAddrs, NumGradients; std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); const int DMaskIdx = BaseOpcode->Atomic ? -1 : getDMaskIdx(BaseOpcode, NumDefs); unsigned DMask = 0; + // Check for 16 bit addresses and pack if true. + int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); + LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); + const bool IsG16 = GradTy == S16; + const bool IsA16 = AddrTy == S16; + int DMaskLanes = 0; if (!BaseOpcode->Atomic) { DMask = MI.getOperand(DMaskIdx).getImm(); @@ -3799,30 +3801,34 @@ } } - // If the register allocator cannot place the address registers contiguously - // without introducing moves, then using the non-sequential address encoding - // is always preferable, since it saves VALU instructions and is usually a - // wash in terms of code size or even better. - // - // However, we currently have no way of hinting to the register allocator - // that MIMG addresses should be placed contiguously when it is possible to - // do so, so force non-NSA for the common 2-address case as a heuristic. - // - // SIShrinkInstructions will convert NSA encodings to non-NSA after register - // allocation when possible. - const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); - // Rewrite the addressing register layout before doing anything else. - if (IsA16) { - // FIXME: this feature is missing from gfx10. When that is fixed, this check - // should be introduced. - if (!ST.hasR128A16() && !ST.hasGFX10A16()) + if (IsA16 || IsG16) { + if (IsA16) { + // Target must support the feature and gradients need to be 16 bit too + if (!ST.hasA16() || !IsG16) + return false; + } else if (!ST.hasG16()) return false; if (NumVAddrs > 1) { SmallVector<Register, 4> PackedRegs; - packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, - NumGradients); + // Don't compress addresses for G16 + const int PackEndIdx = + IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); + packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { + Register AddrReg = MI.getOperand(I).getReg(); + assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); + PackedRegs.push_back(AddrReg); + } + } + + // See also below in the non-a16 branch + const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); if (!UseNSA && PackedRegs.size() > 1) { LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); @@ -3847,10 +3853,30 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( SrcOp.setReg(AMDGPU::NoRegister); } } - } else if (!UseNSA && NumVAddrs > 1) { - convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); + } else { + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator + // that MIMG addresses should be placed contiguously when it is possible to + // do so, so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); + + if (!UseNSA && NumVAddrs > 1) + convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); } + int Flags = 0; + if (IsA16) + Flags |= 1; + if (IsG16) + Flags |= 2; + MI.addOperand(MachineOperand::CreateImm(Flags));
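
The trailing immediate added here is the contract between this legalizer code and the instruction selector above: bit 0 records A16 (16-bit coordinates) and bit 1 records G16 (16-bit derivatives), which is why the updated MIR checks later in this patch end in a new trailing operand (3, i.e. A16|G16, in the a16 tests). A minimal sketch of that encoding follows; the enum and helper names are the editor's and do not appear in the patch:

// Sketch only: mirrors the bit layout agreed on by legalizer and selector.
enum ImageIntrinsicFlags : int {
  ImageFlagA16 = 1, // bit 0: coordinates are 16 bit
  ImageFlagG16 = 2  // bit 1: derivatives are 16 bit
};

static int encodeImageFlags(bool IsA16, bool IsG16) {
  return (IsA16 ? ImageFlagA16 : 0) | (IsG16 ? ImageFlagG16 : 0);
}

// The selector bails out on Flags == 1: A16 without G16 is not selectable,
// because A16 implies 16 bit gradients on these targets.
static bool decodeImageFlags(int Flags, bool &IsA16, bool &IsG16) {
  IsA16 = (Flags & ImageFlagA16) != 0;
  IsG16 = (Flags & ImageFlagG16) != 0;
  return !(IsA16 && !IsG16);
}
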
if (BaseOpcode->Store) { // No TFE for stores? // TODO: Handle dmask trim diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index bd9cb9202a100..ffe7f952432c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -223,6 +223,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasDPP8(false), HasR128A16(false), HasGFX10A16(false), + HasG16(false), HasNSAEncoding(false), HasDLInsts(false), HasDot1Insts(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 604a7668108cb..af688e64501cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -343,6 +343,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo, bool HasDPP8; bool HasR128A16; bool HasGFX10A16; + bool HasG16; bool HasNSAEncoding; bool HasDLInsts; bool HasDot1Insts; @@ -959,6 +960,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo, return HasGFX10A16; } + bool hasA16() const { return hasR128A16() || hasGFX10A16(); } + + bool hasG16() const { return HasG16; } + bool hasOffset3fBug() const { return HasOffset3fBug; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index a46ca9b34be39..1689fb5c27018 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -35,6 +35,7 @@ class MIMGBaseOpcode : PredicateControl { bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; + bit G16 = 0; bit Coordinates = 1; bit LodOrClampOrMip = 0; bit HasD16 = 0; @@ -47,9 +48,9 @@ def MIMGBaseOpcode : GenericEnum { def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", - "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", - "HasD16"]; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", + "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; let PrimaryKey = ["BaseOpcode"]; @@ -117,6 +118,22 @@ def MIMGMIPMappingTable : GenericTable { let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGG16Mapping <MIMGBaseOpcode g, MIMGBaseOpcode g16> { + MIMGBaseOpcode G = g; + MIMGBaseOpcode G16 = g16; +} + +def MIMGG16MappingTable : GenericTable { + let FilterClass = "MIMGG16Mapping"; + let CppTypeName = "MIMGG16MappingInfo"; + let Fields = ["G", "G16"]; + GenericEnum TypeOf_G = MIMGBaseOpcode; + GenericEnum TypeOf_G16 = MIMGBaseOpcode; + + let PrimaryKey = ["G"]; + let PrimaryKeyName = "getMIMGG16MappingInfo"; +} + class MIMG_Base <dag outs, string dns = ""> : InstSI <outs, (ins), "", []> { @@ -645,10 +662,11 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> } multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, - bit isGetLod = 0, - string asm = "image_sample"#sample.LowerCaseMod> { + bit isG16 = 0, bit isGetLod = 0, + string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = !if(isGetLod, 0, 1); + let G16 = isG16; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, @@ -725,73 +743,89 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI //} // End let FPAtomic = 1 -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; -defm IMAGE_SAMPLE_CL :
MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, 
AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; - -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; 
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : 
MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; @@ -839,3 +873,21 @@ def : MIMGLZMapping; // MIP to NONMIP Optimization Mapping def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; + +// G to G16 Optimization Mapping +def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D, IMAGE_SAMPLE_C_D_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL, IMAGE_SAMPLE_C_D_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD, IMAGE_SAMPLE_CD_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL, IMAGE_SAMPLE_CD_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD, IMAGE_SAMPLE_C_CD_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL, IMAGE_SAMPLE_C_CD_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL_O_G16>;
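
The table defined above is consumed through the TableGen-generated getMIMGG16MappingInfo lookup declared in AMDGPUBaseInfo.h later in this patch; both the GlobalISel selector and the SelectionDAG lowering below use it to swap the base opcode when only the derivatives are 16 bit. A sketch of that rewrite; the wrapper function is the editor's, only the lookup itself comes from the patch:

// Sketch only: map e.g. IMAGE_SAMPLE_D to IMAGE_SAMPLE_D_G16. The selectors
// assert that a mapping exists; here the opcode is returned unchanged when
// there is no _g16 variant.
static unsigned toG16Opcode(unsigned BaseOpcode) {
  if (const AMDGPU::MIMGG16MappingInfo *Info =
          AMDGPU::getMIMGG16MappingInfo(BaseOpcode))
    return Info->G16;
  return BaseOpcode;
}
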
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9ca10edc40fcc..f1aba26e7925a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5744,6 +5744,35 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } +static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { + SDLoc DL(Op); + for (unsigned I = DimIdx; I < EndIdx; I++) { + SDValue Addr = Op.getOperand(I); + + // Gradients are packed with undef for each coordinate. + // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: + // 1D: undef,dx/dh; undef,dx/dv + // 2D: dy/dh,dx/dh; dy/dv,dx/dv + // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv + if (((I + 1) >= EndIdx) || + ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || + I == DimIdx + NumGradients - 1))) { + if (Addr.getValueType() != MVT::i16) + Addr = DAG.getBitcast(MVT::i16, Addr); + Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); + } else { + Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); + I++; + } + Addr = DAG.getBitcast(MVT::f32, Addr); + PackedAddrs.push_back(Addr); + } +} + SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const { @@ -5763,6 +5792,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsG16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; @@ -5869,45 +5899,67 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Check for 16 bit addresses and pack if true. + // Push back extra arguments. + for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++) + VAddrs.push_back(Op.getOperand(AddrIdx + I)); + + // Check for 16 bit addresses or derivatives and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + unsigned CoordIdx = DimIdx + NumGradients; + unsigned CoordsEnd = AddrIdx + NumMIVAddrs; + MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); - const MVT VAddrScalarVT = VAddrVT.getScalarType(); - if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16))) { - // Illegal to use a16 images - if (!ST->hasFeature(AMDGPU::FeatureR128A16) && !ST->hasFeature(AMDGPU::FeatureGFX10A16)) + MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + + VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrScalarVT = VAddrVT.getScalarType(); + IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + if (IsA16 || IsG16) { + if (IsA16) { + if (!ST->hasA16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit addresses\n"); + return Op; + } + if (!IsG16) { + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "need 16 bit derivatives but got 32 bit derivatives\n"); + return Op; + } + } else if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); return Op; + } - IsA16 = true; - const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; - for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { - SDValue AddrLo; - // Push back extra arguments. - if (i < DimIdx) { - AddrLo = Op.getOperand(i); - } else { - // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, - // in 1D, derivatives dx/dh and dx/dv are packed with undef. - if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || - ((NumGradients / 2) % 2 == 1 && - (i == DimIdx + (NumGradients / 2) - 1 || - i == DimIdx + NumGradients - 1))) { - AddrLo = Op.getOperand(i); - if (AddrLo.getValueType() != MVT::i16) - AddrLo = DAG.getBitcast(MVT::i16, Op.getOperand(i)); - AddrLo = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, AddrLo); - } else { - AddrLo = DAG.getBuildVector(VectorVT, DL, - {Op.getOperand(i), Op.getOperand(i + 1)}); - i++; - } - AddrLo = DAG.getBitcast(MVT::f32, AddrLo); + if (BaseOpcode->Gradients && !IsA16) { + if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); return Op; } - VAddrs.push_back(AddrLo); + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } + + // Don't compress addresses for G16 + const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx; + packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (unsigned I = CoordIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); } } else { - for (unsigned i = 0; i < NumMIVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); + for (unsigned I = DimIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); }
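
Once the address vector is assembled this way, the operand count becomes checkable, and the SIInstrInfo hunk that follows teaches the machine verifier about it: derivatives now count as packed whenever either A16 or the new G16 bit on the base opcode is set. Below is a back-of-the-envelope model of the expected vaddr dword count, written by the editor under the assumption that the two derivative groups (d/dh and d/dv) are rounded up to whole dwords independently, as the packing comment above describes; it is not code from the patch:

// Sketch only: expected address dwords for a sample instruction.
// ExtraArgs (bias, z-compare, offsets) always stay 32 bit.
static unsigned expectedAddrWords(unsigned ExtraArgs, unsigned NumGradients,
                                  unsigned NumCoords, bool IsA16, bool IsG16) {
  unsigned Words = ExtraArgs;
  if (IsA16 || IsG16)
    Words += (NumGradients / 2 + 1) & ~1u; // two packed groups, each padded
  else
    Words += NumGradients;
  Words += IsA16 ? (NumCoords + 1) / 2 : NumCoords;
  return Words; // e.g. 3D sample_d_g16: 4 gradient dwords + 3 coords = 7
}
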
// If the register allocator cannot place the address registers contiguously diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 01ffcfaa640da..b72b7eb69f1ea 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" @@ -3958,7 +3960,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, IsA16 = A16->getImm() != 0; } - bool PackDerivatives = IsA16; // Either A16 or G16 + bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; unsigned AddrWords = BaseOpcode->NumExtraArgs; @@ -3995,6 +3997,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (VAddrWords != AddrWords) { + LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords + << " but got " << VAddrWords << "\n"); ErrInfo = "bad vaddr size"; return false; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index cba9857e4d158..ebe736f74f6ff 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -108,6 +108,7 @@ namespace AMDGPU { #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -943,6 +944,10 @@ bool hasGFX10A16(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16]; } +bool hasG16(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureG16]; +} + bool hasPackedD16(const MCSubtargetInfo &STI) { return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 224f797b3ef84..4df6ae1f4490c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -197,6 +197,7 @@ struct MIMGBaseOpcodeInfo { uint8_t NumExtraArgs; bool Gradients; + bool G16; bool Coordinates; bool LodOrClampOrMip; bool HasD16; @@ -233,11 +234,19 @@ struct MIMGMIPMappingInfo { MIMGBaseOpcode NONMIP; }; +struct MIMGG16MappingInfo { + MIMGBaseOpcode G; + MIMGBaseOpcode G16; +}; + LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); LLVM_READONLY -const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L); +const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); + +LLVM_READONLY +const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -541,6 +550,7 @@ bool hasXNACK(const MCSubtargetInfo &STI); bool hasSRAMECC(const MCSubtargetInfo &STI); bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasGFX10A16(const MCSubtargetInfo &STI); +bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI);
bool isSI(const MCSubtargetInfo &STI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll index 1475ccf83d2a1..f25baa383f59a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll @@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i16 %s) { ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[DEF]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -42,7 +42,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i16 %s) { ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -72,7 +72,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i16 %s, i16 %t) ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY 
[[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -96,7 +96,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i16 %s, i16 %t) ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i16 %s, i16 %t, ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -159,7 +159,8 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i16 %s, i16 %t, ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: 
$vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -189,7 +190,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i16 %s, i1 ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -213,7 +214,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i16 %s, i1 ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -248,7 +249,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i16 %s, i1 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -276,7 +277,8 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x 
i32> inreg %rsrc, i16 %s, i1 ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -311,7 +313,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i16 %s, i16 % ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -339,7 +341,8 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i16 %s, i16 % ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10: $vgpr0 = COPY [[UV]](s32) ; GFX10: $vgpr1 = COPY [[UV1]](s32) @@ -373,7 +376,7 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: store_mip_1d ; GFX10: bb.1.main_body: @@ -396,7 +399,7 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10: S_ENDPGM 0 main_body: call void @llvm.amdgcn.image.store.mip.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -426,7 +429,7 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: store_mip_2d ; GFX10: bb.1.main_body: @@ -450,7 +453,7 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE 
intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10: S_ENDPGM 0 main_body: call void @llvm.amdgcn.image.store.mip.2d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -485,7 +488,7 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: store_mip_3d ; GFX10: bb.1.main_body: @@ -513,7 +516,8 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10: S_ENDPGM 0 main_body: call void @llvm.amdgcn.image.store.mip.3d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %u, i16 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -543,7 +547,7 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: store_mip_1darray ; GFX10: bb.1.main_body: @@ -567,7 +571,7 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> 
inreg %rsrc, <4 x float> %vda
 ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
 ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX10: S_ENDPGM 0
 main_body:
 call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 0, <8 x i32> %rsrc, i32 0, i32 0)
@@ -602,7 +606,7 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX9: S_ENDPGM 0
 ; GFX10-LABEL: name: store_mip_2darray
 ; GFX10: bb.1.main_body:
@@ -630,7 +634,8 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
 ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX10: S_ENDPGM 0
 main_body:
 call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %u, i16 0, <8 x i32> %rsrc, i32 0, i32 0)
@@ -665,7 +670,7 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX9: S_ENDPGM 0
 ; GFX10-LABEL: name: store_mip_cube
 ; GFX10: bb.1.main_body:
@@ -693,7 +698,8 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
 ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, 0, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX10: S_ENDPGM 0
 main_body:
 call void @llvm.amdgcn.image.store.mip.cube.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %u, i16 0, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index a0a1366f7408c..060a538d17cd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -18,7 +18,7 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_swap_1d
@@ -36,7 +36,7 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -61,7 +61,7 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_1d
@@ -79,7 +79,7 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -104,7 +104,7 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_sub_1d
@@ -122,7 +122,7 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -147,7 +147,7 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_smin_1d
@@ -165,7 +165,7 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -191,7 +191,7 @@ define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_umin_1d
@@ -209,7 +209,7 @@ define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -234,7 +234,7 @@ define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_smax_1d
@@ -252,7 +252,7 @@ define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -277,7 +277,7 @@ define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_umax_1d
@@ -295,7 +295,7 @@ define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -320,7 +320,7 @@ define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_and_1d
@@ -338,7 +338,7 @@ define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -363,7 +363,7 @@ define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_or_1d
@@ -381,7 +381,7 @@ define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -406,7 +406,7 @@ define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_xor_1d
@@ -424,7 +424,7 @@ define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -449,7 +449,7 @@ define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_inc_1d
@@ -467,7 +467,7 @@ define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -492,7 +492,7 @@ define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_dec_1d
@@ -510,7 +510,7 @@ define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -537,7 +537,7 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_1d
@@ -557,7 +557,7 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -585,7 +585,7 @@ define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
 ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_2d
@@ -606,7 +606,7 @@ define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
 ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -639,7 +639,7 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_3d
@@ -664,7 +664,8 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -697,7 +698,7 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_cube
@@ -722,7 +723,8 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -750,7 +752,7 @@ define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_1darray
@@ -771,7 +773,7 @@ define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -804,7 +806,7 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_2darray
@@ -829,7 +831,8 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -862,7 +865,7 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_2dmsaa
@@ -887,7 +890,8 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -921,7 +925,7 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_2darraymsaa
@@ -947,7 +951,8 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -972,7 +977,7 @@ define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_add_1d_slc
@@ -990,7 +995,7 @@ define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1020,7 +1025,7 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_2d
@@ -1043,7 +1048,7 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1078,7 +1083,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_3d
@@ -1105,7 +1110,8 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1141,7 +1147,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_2darraymsaa
@@ -1169,7 +1175,8 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index aee2404071937..86d7a2f4e4dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+r128-a16 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10NSA %s

 define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 ; GFX9-LABEL: name: load_1d
@@ -18,7 +18,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -40,7 +40,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -74,7 +74,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -101,7 +101,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -142,7 +142,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -174,7 +174,8 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -216,7 +217,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -248,7 +249,8 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -284,7 +286,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -311,7 +313,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -352,7 +354,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
 ; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -384,7 +386,8 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -426,7 +429,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) =
G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -458,7 +461,8 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -502,7 +506,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -536,7 +540,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), 
[[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -573,7 +578,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -600,7 +605,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -641,7 +646,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = 
G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -673,7 +678,8 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -717,7 +723,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -751,7 +757,8 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: 
[[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -796,7 +803,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -830,7 +837,8 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) 
= G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -873,7 +881,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -905,7 +913,8 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -949,7 +958,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: 
(dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -983,7 +992,8 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -1020,7 +1030,7 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d ; GFX10NSA: bb.1.main_body: @@ -1042,7 +1052,7 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1076,7 +1086,7 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY 
[[BITCAST]](s32) ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2d ; GFX10NSA: bb.1.main_body: @@ -1103,7 +1113,7 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1144,7 +1154,7 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_3d ; GFX10NSA: bb.1.main_body: @@ -1176,7 +1186,8 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x 
s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1218,7 +1229,7 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_cube ; GFX10NSA: bb.1.main_body: @@ -1250,7 +1261,8 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1286,7 +1298,7 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1darray ; GFX10NSA: bb.1.main_body: @@ -1313,7 +1325,7 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10NSA: 
G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1354,7 +1366,7 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2darray ; GFX10NSA: bb.1.main_body: @@ -1386,7 +1398,8 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1428,7 +1441,7 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 
16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2dmsaa ; GFX10NSA: bb.1.main_body: @@ -1460,7 +1473,8 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1504,7 +1518,7 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2darraymsaa ; GFX10NSA: bb.1.main_body: @@ -1538,7 +1552,8 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1575,7 +1590,7 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY 
[[BITCAST]](s32) ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_1d ; GFX10NSA: bb.1.main_body: @@ -1602,7 +1617,7 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1643,7 +1658,7 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_2d ; GFX10NSA: bb.1.main_body: @@ -1675,7 +1690,8 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), 
[[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1719,7 +1735,7 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_3d ; GFX10NSA: bb.1.main_body: @@ -1753,7 +1769,8 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1798,7 +1815,7 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_cube ; GFX10NSA: bb.1.main_body: @@ -1832,7 +1849,8 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: 
[[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1875,7 +1893,7 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_1darray ; GFX10NSA: bb.1.main_body: @@ -1907,7 +1925,8 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -1951,7 +1970,7 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; 
GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_mip_2darray ; GFX10NSA: bb.1.main_body: @@ -1985,7 +2004,8 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 16 into custom "TargetCustom8") + ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -2012,7 +2032,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) ; GFX9: $vgpr1 = COPY [[UV1]](s32) @@ -2034,7 +2054,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2063,7 +2083,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2085,7 +2105,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2114,7 +2134,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2136,7 +2156,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2165,7 +2185,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2187,7 +2207,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2216,7 +2236,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2238,7 +2258,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2267,7 +2287,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2289,7 +2309,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2318,7 +2338,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2340,7 +2360,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2369,7 +2389,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2391,7 +2411,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2420,7 +2440,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX10NSA-LABEL: name: load_1d_V1
@@ -2438,7 +2458,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -2463,7 +2483,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2483,7 +2503,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2511,7 +2531,7 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10NSA-LABEL: name: store_1d_V1
; GFX10NSA: bb.1.main_body:
@@ -2529,7 +2549,7 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+ ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX10NSA: S_ENDPGM 0
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -2556,7 +2576,7 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10NSA-LABEL: name: store_1d_V2
; GFX10NSA: bb.1.main_body:
@@ -2576,7 +2596,7 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+ ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX10NSA: S_ENDPGM 0
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -2600,7 +2620,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2622,7 +2642,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2651,7 +2671,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2673,7 +2693,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2702,7 +2722,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
@@ -2724,7 +2744,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
@@ -2758,7 +2778,7 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10NSA-LABEL: name: store_1d_glc
; GFX10NSA: bb.1.main_body:
@@ -2780,7 +2800,7 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX10NSA: S_ENDPGM 0
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -2809,7 +2829,7 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10NSA-LABEL: name: store_1d_slc
; GFX10NSA: bb.1.main_body:
@@ -2831,7 +2851,7 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX10NSA: S_ENDPGM 0
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -2860,7 +2880,7 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10NSA-LABEL: name: store_1d_glc_slc
; GFX10NSA: bb.1.main_body:
@@ -2882,7 +2902,7 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
+ ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
; GFX10NSA: S_ENDPGM 0
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@@ -2952,7 +2972,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2976,7 +2996,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -3015,7 +3035,7 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -3044,7 +3064,7 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -3090,7 +3110,7 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -3124,7 +3144,8 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
; GFX10NSA: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -3173,7 +3194,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -3209,7 +3230,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
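The trailing immediate these tests gain after the unorm/cachepolicy operands is the packed coordinate-width flag word that this change threads through to selection: bit 0 is A16 and bit 1 is G16, matching the Flags decoding in AMDGPUInstructionSelector::selectImageIntrinsic. A minimal sketch of the two values seen in these files (virtual register numbers illustrative, not taken from any test):

; flags = 3 (A16 | G16): 16-bit coordinates, and hence 16-bit gradients
%8:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, %7:_(s16), %6:_(<8 x s32>), 0, 0, 3
; flags = 0: 32-bit coordinates, as in the d16 load tests below
%8:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, %7:_(<2 x s32>), $noreg, %6:_(<8 x s32>), 0, 0, 0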
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
index cd5a5638211d1..a8ab9c7582751 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
@@ -18,7 +18,7 @@ define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; UNPACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16)
; UNPACKED: $vgpr0 = COPY [[ANYEXT]](s32)
; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -37,7 +37,7 @@ define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; PACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16)
; PACKED: $vgpr0 = COPY [[ANYEXT]](s32)
; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -61,7 +61,7 @@ define amdgpu_ps <2 x half> @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -89,7 +89,7 @@ define amdgpu_ps <2 x half> @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>)
; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
%tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -112,7 +112,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -153,7 +153,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>)
; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[DEF]](<2 x s16>)
@@ -185,7 +185,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -221,7 +221,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>)
; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>)
; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>)
@@ -247,7 +247,7 @@ define amdgpu_ps half @image_load_tfe_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t)
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -269,7 +269,7 @@ define amdgpu_ps half @image_load_tfe_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t)
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -299,7 +299,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16(<8 x i32> inreg %rsrc, i32 %s,
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -329,7 +329,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16(<8 x i32> inreg %rsrc, i32 %s,
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
@@ -359,7 +359,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
@@ -402,7 +402,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV1]](s32)
@@ -441,7 +441,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16(<8 x i32> inreg %rsrc, i32 %s,
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
; UNPACKED: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
@@ -479,7 +479,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16(<8 x i32> inreg %rsrc, i32 %s,
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV1]](s32)
@@ -547,7 +547,7 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_1000(<8 x i32> inreg %rsrc,
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
@@ -573,7 +573,7 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_1000(<8 x i32> inreg %rsrc,
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>)
; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0
%tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -633,7
+633,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -672,7 +672,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) @@ -703,7 +703,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] @@ -738,7 +738,7 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) @@ -818,7 +818,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1110(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -853,7 +853,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1110(<8 x i32> inreg %rsrc, ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: 
(dereferenceable load 6 from custom "TargetCustom8", align 8) ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>) ; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>) ; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) @@ -878,7 +878,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1100(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -911,7 +911,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1100(<8 x i32> inreg %rsrc, ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) @@ -936,7 +936,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom 
"TargetCustom8") ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] @@ -965,7 +965,7 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) @@ -1032,7 +1032,7 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1054,7 +1054,7 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; 
PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1084,7 +1084,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_1000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1113,7 +1113,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_1000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -1143,7 +1143,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1172,7 +1172,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -1202,7 +1202,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1243,7 +1243,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 
from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1281,7 +1281,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1319,7 +1319,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1357,7 +1357,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), 
$noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1395,7 +1395,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1433,7 +1433,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1110(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; UNPACKED: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 @@ -1470,7 +1470,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1110(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) 
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV1]](s32) @@ -1502,7 +1502,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1100(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; UNPACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 @@ -1537,7 +1537,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1100(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1569,7 +1569,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), 
[[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 @@ -1601,7 +1601,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1633,7 +1633,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 @@ -1665,7 +1665,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> 
inreg %rs
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
+ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32)
; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
index 4e904e7faf590..335f84f3e7bb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
@@ -17,7 +17,7 @@ define amdgpu_ps float @image_load_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -40,7 +40,7 @@ define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
@@ -65,7 +65,7 @@ define amdgpu_ps <3 x float> @image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
+ ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
@@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i3
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
@@ -119,7 +119,7 @@ define amdgpu_ps float @image_load_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
+ ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) =
G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -148,7 +148,7 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -178,7 +178,7 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GCN: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -209,7 +209,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 
0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GCN: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -261,7 +261,7 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GCN: $vgpr1 = COPY [[DEF]](s32) @@ -309,7 +309,7 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -336,7 +336,7 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[DEF:%[0-9]+]]:_(s32) 
= G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GCN: $vgpr1 = COPY [[DEF]](s32) @@ -386,7 +386,7 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -414,7 +414,7 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -442,7 +442,7 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc, ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GCN: $vgpr1 = COPY [[DEF]](s32) @@ -495,7 +495,7 @@ define amdgpu_ps float 
@image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -524,7 +524,7 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -555,7 +555,7 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, 
addrspace 1) @@ -586,7 +586,7 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -618,7 +618,7 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -650,7 +650,7 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = 
G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -682,7 +682,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -715,7 +715,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -748,7 +748,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom 
"TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -781,7 +781,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll index c2cc48d056cf2..b893207b8c0c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll @@ -20,7 +20,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX6: $vgpr0 = COPY [[UV]](s32) ; GFX6: $vgpr1 = COPY [[UV1]](s32) @@ -43,7 +43,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; 
GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) @@ -75,7 +75,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX6: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX6: G_STORE [[UV4]](s32), [[MV]](p1) :: (store 4 into %ir.out, addrspace 1) ; GFX6: $vgpr0 = COPY [[UV]](s32) @@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA: G_STORE [[UV4]](s32), [[MV]](p1) :: (store 4 into %ir.out, addrspace 1) ; GFX10NSA: $vgpr0 = COPY 
[[UV]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll index 519dec8e7e540..ba0fea1263ac2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll @@ -19,7 +19,7 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX6: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX10NSA-LABEL: name: image_load_3d_f32 @@ -37,7 +37,7 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0 %tex = call float @llvm.amdgcn.image.load.3d.f32.i32(i32 1, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -62,7 +62,7 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX6: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GFX6: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32) - ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX6: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, 
[[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GFX6: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GFX6: $vgpr0 = COPY [[UV]](s32) @@ -83,7 +83,7 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GFX10NSA: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll new file mode 100644 index 0000000000000..8b72ce08ecac9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll @@ -0,0 +1,3677 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { + ; GFX9-LABEL: name: sample_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { + ; GFX9-LABEL: name: sample_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY 
$sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; 
GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { + ; GFX9-LABEL: name: sample_3d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_3d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY 
$sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { + ; GFX9-LABEL: name: sample_cube + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), 
[[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_cube + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { + ; GFX9-LABEL: name: sample_1darray + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_1darray + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) 
= COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { + ; GFX9-LABEL: name: sample_2darray + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_2darray + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, 
half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { + ; GFX9-LABEL: name: sample_c_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), 
[[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+  ; GFX9-LABEL: name: sample_c_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) {
+  ; GFX9-LABEL: name: sample_cl_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_cl_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) {
+  ; GFX9-LABEL: name: sample_cl_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_cl_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) {
+  ; GFX9-LABEL: name: sample_c_cl_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_cl_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) {
+  ; GFX9-LABEL: name: sample_c_cl_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_cl_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
+  ; GFX9-LABEL: name: sample_b_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_b_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+  ; GFX9-LABEL: name: sample_b_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_b_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
+  ; GFX9-LABEL: name: sample_c_b_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_b_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+  ; GFX9-LABEL: name: sample_c_b_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_b_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
+  ; GFX9-LABEL: name: sample_b_cl_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_b_cl_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+  ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+  ; GFX9-LABEL: name: sample_b_cl_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_b_cl_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
+  ; GFX9-LABEL: name: sample_c_b_cl_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_b_cl_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+  ; GFX9-LABEL: name: sample_c_b_cl_2d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
+  ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_c_b_cl_2d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+  ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+  ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+  ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
+  ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+  ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32)
+  ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX10: $vgpr0 = COPY [[UV]](s32)
+  ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
+  ; GFX9-LABEL: name: sample_d_1d
+  ; GFX9: bb.1.main_body:
+  ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+  ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+  ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+  ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+  ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+  ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
+  ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
+  ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+  ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
+  ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+  ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+  ; GFX9: $vgpr0 = COPY [[UV]](s32)
+  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX10-LABEL: name: sample_d_1d
+  ; GFX10: bb.1.main_body:
+  ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+  ; GFX10: [[COPY4:%[0-9]+]]:_(s32) =
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
+ ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+ ; GFX9-LABEL: name: sample_d_2d
+ ; GFX9: bb.1.main_body:
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+ ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
+ ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX9: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GFX10-LABEL: name: sample_d_2d
+ ; GFX10: bb.1.main_body:
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+ ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) {
+ ; GFX9-LABEL: name: sample_d_3d
+ ; GFX9: bb.1.main_body:
+ ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY
[[COPY12]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[DEF]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[COPY28]](s32) + ; GFX9: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY29]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_d_3d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY 
$vgpr8 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[DEF]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX10: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[COPY28]](s32) + ; GFX10: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY29]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { + ; GFX9-LABEL: name: sample_c_d_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = 
COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_d_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), 
[[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { + ; GFX9-LABEL: name: sample_c_d_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY 
[[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_d_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { + ; GFX9-LABEL: name: sample_d_cl_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x 
s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_d_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { + ; GFX9-LABEL: name: sample_d_cl_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY 
[[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_d_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { + ; GFX9-LABEL: name: sample_c_d_cl_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_d_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, 
$sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { + ; GFX9-LABEL: name: sample_c_d_cl_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: 
[[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_d_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: 
[[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { + ; GFX9-LABEL: name: sample_cd_1d + ; GFX9: bb.1.main_body: + ; GFX9: 
liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_cd_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + 
; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { + ; GFX9-LABEL: name: sample_cd_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) 
= COPY [[COPY12]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_cd_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) 
+ ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { + ; GFX9-LABEL: name: sample_c_cd_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_cd_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { + ; GFX9-LABEL: name: sample_c_cd_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit 
$vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_cd_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { + ; 
GFX9-LABEL: name: sample_cd_cl_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_cd_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY 
$sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { + ; GFX9-LABEL: name: sample_cd_cl_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: 
[[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_cd_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { + ; GFX9-LABEL: name: sample_c_cd_cl_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = 
G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_cd_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { + ; GFX9-LABEL: name: sample_c_cd_cl_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), 
[[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_cd_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32) + ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { + ; GFX9-LABEL: name: sample_l_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, 
[[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_l_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { + ; GFX9-LABEL: name: sample_l_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY 
$sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_l_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { + ; GFX9-LABEL: name: sample_c_l_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_l_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { + ; GFX9-LABEL: name: sample_c_l_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: 
[[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_l_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { + ; GFX9-LABEL: name: sample_lz_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_lz_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { + ; GFX9-LABEL: name: sample_lz_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY 
[[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_lz_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { + ; GFX9-LABEL: name: sample_c_lz_1d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, 
$sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_lz_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { + ; GFX9-LABEL: name: sample_c_lz_2d + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY 
[[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; GFX10-LABEL: name: sample_c_lz_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { + ; GFX9-LABEL: name: sample_c_d_o_2darray_V1 + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; 
GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[COPY26]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX10-LABEL: name: sample_c_d_o_2darray_V1 + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + 
; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[COPY26]](s32) + ; GFX10: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX10: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { + ; GFX9-LABEL: name: sample_c_d_o_2darray_V2 + ; GFX9: bb.1.main_body: + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32) + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[COPY26]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; GFX10-LABEL: name: sample_c_d_o_2darray_V2 + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; 
GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
+ ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
+ ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
+ ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+ ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32)
+ ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY18]](s32)
+ ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY19]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY25]](s32), [[COPY26]](s32)
+ ; GFX10: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY20]](s32)
+ ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY27]](s32), [[DEF]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+main_body:
+ %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <2 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32, i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32, i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
new file mode 100644
index 0000000000000..1620e2df4e722
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -0,0 +1,830 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s
+
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+ ; GFX10-LABEL: name: sample_d_1d
+ ; GFX10: bb.1.main_body:
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
+ ; GFX10-LABEL: name: sample_d_2d
+ ; GFX10: bb.1.main_body:
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
+ ; GFX10-LABEL: name: sample_d_3d
+ ; GFX10: bb.1.main_body:
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
+ ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
+ ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[DEF]](s32)
+ ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32)
+ ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY24]](s32), [[COPY25]](s32)
+ ; GFX10: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32)
+ ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[DEF]](s32)
+ ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8")
+ ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
+ ; GFX10: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
+ ; GFX10-LABEL: name: sample_c_d_1d
+ ; GFX10: bb.1.main_body:
+ ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+ ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+ ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+ ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+ ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
+ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+ ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
+ ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+ ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10: 
[[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { + ; GFX10-LABEL: name: sample_c_d_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), 
[[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { + ; GFX10-LABEL: name: sample_d_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { + ; GFX10-LABEL: name: sample_d_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom 
"TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { + ; GFX10-LABEL: name: sample_c_d_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x 
float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { + ; GFX10-LABEL: name: sample_c_d_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, 
float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { + ; GFX10-LABEL: name: sample_cd_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { + ; GFX10-LABEL: name: sample_cd_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 
+ ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { + ; GFX10-LABEL: name: sample_c_cd_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 
+ ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { + ; GFX10-LABEL: name: sample_c_cd_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) 
+ ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { + ; GFX10-LABEL: name: sample_cd_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x 
s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY14]](s32), [[COPY15]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { + ; GFX10-LABEL: name: sample_cd_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { + ; GFX10-LABEL: name: sample_c_cd_cl_1d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY15]](s32), [[COPY16]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { + ; GFX10-LABEL: name: sample_c_cd_cl_2d + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY22]](s32), [[COPY23]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: $vgpr2 = COPY [[UV2]](s32) + ; GFX10: $vgpr3 = COPY [[UV3]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float 
%clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { + ; GFX10-LABEL: name: sample_c_d_o_2darray_V1 + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GFX10: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half 
%dsdv, half %dtdv, float %s, float %t, float %slice) { + ; GFX10-LABEL: name: sample_c_d_o_2darray_V2 + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12 + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32) + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32) + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY16]](s32) + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY17]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY23]](s32), [[COPY24]](s32) + ; GFX10: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) + ; GFX10: $vgpr0 = COPY [[UV]](s32) + ; GFX10: $vgpr1 = COPY [[UV1]](s32) + ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, 
float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index fac839d8ea50d..e9b61ff2065cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -20,7 +20,7 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
   ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
+  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: image_store_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -39,7 +39,7 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; PACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
   ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
+  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -68,7 +68,7 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
   ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
   ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY11]](s32), [[COPY12]](s32)
-  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: image_store_v2f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -86,7 +86,7 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -125,7 +125,7 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
   ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
   ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
-  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: image_store_v3f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -147,7 +147,7 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
   ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
   ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+  ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -181,7 +181,7 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
   ; UNPACKED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
   ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
-  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+  ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: image_store_v4f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -201,7 +201,7 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
   ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; PACKED: 
G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll new file mode 100644 index 0000000000000..caba19977a498 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -0,0 +1,346 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v11, s12 +; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4 +; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9 +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9 +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: 
+ %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: ; implicit-def: 
$vcc_hi +; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } 
+attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 8d7d356533d78..a5c12510ede44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -1,27 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s ; Natural mapping define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 %s) { - ; CHECK-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc + ; FAST: bb.1 (%ir-block.0): + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 + ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; FAST: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; FAST: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; FAST: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; FAST: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; GREEDY: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; GREEDY: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -29,25 +47,44 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; Copy needed for VGPR argument define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 inreg %s) { - ; CHECK-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - 
; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc + ; FAST: bb.1 (%ir-block.0): + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10 + ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; FAST: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; FAST: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; FAST: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; FAST: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; GREEDY: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; GREEDY: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; GREEDY: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -55,60 +92,114 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; Waterfall loop needed for rsrc define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { - ; CHECK-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: 
[[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %19, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec - ; CHECK: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec - ; CHECK: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec - ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def 
$scc - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc + ; FAST: bb.1 (%ir-block.0): + ; FAST: successors: %bb.2(0x80000000) + ; FAST: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; FAST: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; FAST: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; FAST: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; FAST: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; FAST: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; FAST: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; FAST: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; FAST: bb.2: + ; FAST: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %19, %bb.2 + ; FAST: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 + ; FAST: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; FAST: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; FAST: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), 
[[V_READFIRSTLANE_B32_3]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; FAST: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; FAST: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; FAST: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; FAST: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; FAST: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; FAST: bb.3: + ; FAST: successors: %bb.4(0x80000000) + ; FAST: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; FAST: bb.4: + ; FAST: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: successors: %bb.2(0x80000000) + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %19, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; GREEDY: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; GREEDY: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom 
"TargetCustom8") + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: successors: %bb.4(0x80000000) + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -116,61 +207,116 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; Waterfall loop needed for rsrc, copy needed for vaddr define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg %s) { - ; CHECK-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %20, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec - ; CHECK: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec - ; CHECK: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec - ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc + ; FAST: bb.1 (%ir-block.0): + ; FAST: successors: %bb.2(0x80000000) + ; FAST: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; FAST: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; FAST: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; FAST: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; FAST: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = 
G_IMPLICIT_DEF + ; FAST: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) + ; FAST: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; FAST: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; FAST: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; FAST: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; FAST: bb.2: + ; FAST: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %20, %bb.2 + ; FAST: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 + ; FAST: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; FAST: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; FAST: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; FAST: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; FAST: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; FAST: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; FAST: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; FAST: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; FAST: bb.3: + ; FAST: successors: %bb.4(0x80000000) + ; FAST: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; FAST: bb.4: + ; FAST: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: successors: %bb.2(0x80000000) + ; GREEDY: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; GREEDY: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) + ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %20, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = 
V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; GREEDY: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; GREEDY: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: successors: %bb.4(0x80000000) + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index 384e74a2fbbd5..5e5f1154366f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -1,31 +1,53 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s ; Natural mapping define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float 
%s) { - ; CHECK-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 - ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp + ; FAST: bb.1 (%ir-block.0): + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; FAST: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; FAST: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; FAST: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; FAST: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; FAST: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 + ; FAST: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 + ; FAST: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 + ; FAST: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp + ; GREEDY: bb.1 
(%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; GREEDY: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; GREEDY: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; GREEDY: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; GREEDY: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 + ; GREEDY: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 + ; GREEDY: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -33,29 +55,52 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; Copy required for VGPR input define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float inreg %s) { - ; CHECK-LABEL: name: sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 - ; CHECK: [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp + ; FAST: bb.1 (%ir-block.0): + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14 + ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; FAST: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; FAST: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; FAST: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; FAST: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; FAST: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 + ; FAST: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 + ; FAST: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 + ; FAST: [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; GREEDY: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; GREEDY: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; GREEDY: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10 + ; GREEDY: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11 + ; GREEDY: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12 + ; GREEDY: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13 + ; GREEDY: [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: 
[[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -63,64 +108,122 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; Waterfall loop for rsrc define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s) { - ; CHECK-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) 
= V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec - ; CHECK: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec - ; CHECK: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec - ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp + ; FAST: bb.1 (%ir-block.0): + ; FAST: successors: %bb.2(0x80000000) + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; FAST: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; FAST: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; FAST: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; FAST: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: 
[[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; FAST: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; FAST: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; FAST: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; FAST: bb.2: + ; FAST: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; FAST: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 + ; FAST: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; FAST: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; FAST: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; FAST: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; FAST: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; FAST: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; FAST: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; FAST: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; FAST: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), 
[[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; FAST: bb.3: + ; FAST: successors: %bb.4(0x80000000) + ; FAST: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; FAST: bb.4: + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: successors: %bb.2(0x80000000) + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; GREEDY: [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; GREEDY: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; GREEDY: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: successors: %bb.4(0x80000000) + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -128,54 +231,102 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; Waterfall loop for sampler define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s) { - ; CHECK-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp - ; CHECK: bb.1 (%ir-block.0): - ; 
CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp + ; FAST: bb.1 (%ir-block.0): + ; FAST: successors: %bb.2(0x80000000) + ; FAST: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; FAST: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; FAST: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; FAST: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; FAST: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; FAST: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; FAST: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; FAST: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; FAST: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; FAST: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; FAST: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; FAST: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; FAST: bb.2: + ; FAST: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; FAST: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 + ; FAST: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; FAST: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; FAST: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; FAST: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; FAST: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; FAST: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; FAST: bb.3: + ; FAST: successors: %bb.4(0x80000000) + ; FAST: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; FAST: bb.4: + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: successors: %bb.2(0x80000000) + ; GREEDY: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 + ; GREEDY: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 + ; GREEDY: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9 + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES 
[[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: successors: %bb.4(0x80000000) + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void @@ -183,76 +334,146 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; Waterfall loop for rsrc and sampler define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsrc, <4 x i32> %samp, float %s) { - ; CHECK-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 - ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr9 - ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr10 - ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr11 - ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) - ; CHECK: [[UV4:%[0-9]+]]:vreg_64(s64), [[UV5:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES 
[[BUILD_VECTOR1]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec - ; CHECK: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec - ; CHECK: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec - ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub1(s64), implicit $exec - ; CHECK: [[MV4:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV4]](s64), [[UV4]](s64), implicit $exec - ; CHECK: [[S_AND_B64_3:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_4]], [[S_AND_B64_2]], implicit-def 
$scc - ; CHECK: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]].sub1(s64), implicit $exec - ; CHECK: [[MV5:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec - ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: S_ENDPGM 0 + ; FAST-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp + ; FAST: bb.1 (%ir-block.0): + ; FAST: successors: %bb.2(0x80000000) + ; FAST: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; FAST: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; FAST: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; FAST: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; FAST: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; FAST: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr9 + ; FAST: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr10 + ; FAST: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr11 + ; FAST: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12 + ; FAST: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; FAST: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; FAST: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; FAST: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; FAST: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; FAST: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; FAST: [[UV4:%[0-9]+]]:vreg_64(s64), [[UV5:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; FAST: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; FAST: bb.2: + ; FAST: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; FAST: 
[[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; FAST: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 + ; FAST: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; FAST: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; FAST: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; FAST: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; FAST: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; FAST: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; FAST: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; FAST: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; FAST: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; FAST: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub1(s64), implicit $exec + ; FAST: [[MV4:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV4]](s64), [[UV4]](s64), implicit $exec + ; FAST: [[S_AND_B64_3:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_4]], [[S_AND_B64_2]], implicit-def $scc + ; FAST: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]].sub0(s64), implicit $exec + ; FAST: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 
[[UV5]].sub1(s64), implicit $exec + ; FAST: [[MV5:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) + ; FAST: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec + ; FAST: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc + ; FAST: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) + ; FAST: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; FAST: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; FAST: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; FAST: bb.3: + ; FAST: successors: %bb.4(0x80000000) + ; FAST: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; FAST: bb.4: + ; FAST: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; FAST: S_ENDPGM 0 + ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp + ; GREEDY: bb.1 (%ir-block.0): + ; GREEDY: successors: %bb.2(0x80000000) + ; GREEDY: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6 + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7 + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8 + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr9 + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr10 + ; GREEDY: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr11 + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12 + ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GREEDY: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64), [[UV2:%[0-9]+]]:vreg_64(s64), [[UV3:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) + ; GREEDY: [[UV4:%[0-9]+]]:vreg_64(s64), [[UV5:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GREEDY: bb.2: + ; GREEDY: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), 
%bb.2 + ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; GREEDY: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]].sub1(s64), implicit $exec + ; GREEDY: [[MV2:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV2]](s64), [[UV2]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_2]], [[S_AND_B64_]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]].sub1(s64), implicit $exec + ; GREEDY: [[MV3:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) + ; GREEDY: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]].sub1(s64), implicit $exec + ; GREEDY: [[MV4:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV4]](s64), [[UV4]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_3:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_4]], [[S_AND_B64_2]], implicit-def $scc + ; GREEDY: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]].sub0(s64), implicit $exec + ; GREEDY: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]].sub1(s64), implicit $exec + ; GREEDY: [[MV5:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES 
[[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) + ; GREEDY: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec + ; GREEDY: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc + ; GREEDY: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) + ; GREEDY: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GREEDY: bb.3: + ; GREEDY: successors: %bb.4(0x80000000) + ; GREEDY: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GREEDY: bb.4: + ; GREEDY: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; GREEDY: S_ENDPGM 0 %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) store <4 x float> %v, <4 x float> addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll new file mode 100644 index 0000000000000..49891aebe7014 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; 
encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 ; encoding: [0x09,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 ; encoding: [0x09,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: 
v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, 
half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: 
[0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: 
$vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float 
@llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll new file mode 100644 index 0000000000000..e4214ef54cf8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; 
%bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, 
float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi 
+; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) 
+; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 
x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll new file mode 100644 index 0000000000000..02c8d67c525f9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll @@ -0,0 +1,17 @@ +; RUN: not --crash llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s + +; Make sure this doesn't assert on targets without the g16 feature, and instead +; generates a selection error. + +; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0 + +attributes #0 = { nounwind readonly }
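
For reference, the intrinsic names in the tests above follow the `<ret>.<grad>.<coord>` overload-mangling convention: the `.v4f32.f16.f32` forms pair half-typed derivative operands with float coordinates, which is the operand shape that selects the `_g16` instruction variants on gfx1010. A minimal standalone sketch of the supported and unsupported paths follows, assuming an llc built with this patch; the function mirrors the `sample_d_1d` tests above and is illustrative only, not part of the change:

; Half derivatives (%dsdh, %dsdv) with a float coordinate (%s): this is the
; G16 operand shape, mangled .v4f32.f16.f32.
define amdgpu_ps <4 x float> @g16_sketch(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  ret <4 x float> %v
}

declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32)

; Per the RUN lines above: llc -march=amdgcn -mcpu=gfx1010 selects
; image_sample_d_g16, while -mcpu=gfx900 or -mcpu=fiji fails with
; "LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d".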