diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c5e998b4122265..2fa7511dd0d119 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -1500,8 +1501,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   const bool IsA16 = (Flags & 1) != 0;
   const bool IsG16 = (Flags & 2) != 0;
 
-  // A16 implies 16 bit gradients
-  if (IsA16 && !IsG16)
+  // A16 implies 16 bit gradients if the subtarget doesn't support G16
+  if (IsA16 && !STI.hasG16() && !IsG16)
     return false;
 
   unsigned DMask = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 7084728f6105b2..3c8a721c124652 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4008,14 +4008,16 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
   return true;
 }
 
-/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
+/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
 /// vector with s16 typed elements.
-static void packImageA16AddressToDwords(
-    MachineIRBuilder &B, MachineInstr &MI,
-    SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
-    const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
+static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
+                                      SmallVectorImpl<Register> &PackedAddrs,
+                                      unsigned ArgOffset,
+                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
+                                      bool IsA16, bool IsG16) {
   const LLT S16 = LLT::scalar(16);
   const LLT V2S16 = LLT::vector(2, 16);
+  auto EndIdx = Intr->VAddrEnd;
 
   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
@@ -4027,6 +4029,10 @@ static void packImageA16AddressToDwords(
     if (I < Intr->GradientStart) {
       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
       PackedAddrs.push_back(AddrReg);
+    } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
+               (I >= Intr->CoordStart && !IsA16)) {
+      // Handle any gradient or coordinate operands that should not be packed
+      PackedAddrs.push_back(AddrReg);
     } else {
       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
       // derivatives dx/dh and dx/dv are packed with undef.
@@ -4222,29 +4228,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
   }
 
   // Rewrite the addressing register layout before doing anything else.
-  if (IsA16 || IsG16) {
-    if (IsA16) {
-      // Target must support the feature and gradients need to be 16 bit too
-      if (!ST.hasA16() || !IsG16)
-        return false;
-    } else if (!ST.hasG16())
-      return false;
+  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
+    // 16 bit gradients are supported, but they are tied to the A16 control,
+    // so both gradients and addresses must be 16 bit.
+    return false;
+  }
+
+  if (IsA16 && !ST.hasA16()) {
+    // A16 not supported
+    return false;
+  }
 
+  if (IsA16 || IsG16) {
     if (Intr->NumVAddrs > 1) {
       SmallVector<Register, 12> PackedRegs;
 
-      // Don't compress addresses for G16
-      const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
-      packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
-                                  PackEndIdx);
-
-      if (!IsA16) {
-        // Add uncompressed address
-        for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
-          int AddrReg = MI.getOperand(ArgOffset + I).getReg();
-          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
-          PackedRegs.push_back(AddrReg);
-        }
-      }
+
+      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
+                                IsG16);
 
       // See also below in the non-a16 branch
       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 408648686feb58..3a9ebb8b74c6d6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5926,11 +5926,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
   return Value == 0;
 }
 
-static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
-                                        MVT PackVectorVT,
-                                        SmallVectorImpl<SDValue> &PackedAddrs,
-                                        unsigned DimIdx, unsigned EndIdx,
-                                        unsigned NumGradients) {
+static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
+                                      MVT PackVectorVT,
+                                      SmallVectorImpl<SDValue> &PackedAddrs,
+                                      unsigned DimIdx, unsigned EndIdx,
+                                      unsigned NumGradients) {
   SDLoc DL(Op);
   for (unsigned I = DimIdx; I < EndIdx; I++) {
     SDValue Addr = Op.getOperand(I);
@@ -6085,56 +6085,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   MVT VAddrVT =
       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
   MVT VAddrScalarVT = VAddrVT.getScalarType();
-  MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
 
   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
   VAddrScalarVT = VAddrVT.getScalarType();
+  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
-  if (IsA16 || IsG16) {
-    if (IsA16) {
-      if (!ST->hasA16()) {
-        LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
-                             "support 16 bit addresses\n");
-        return Op;
-      }
-      if (!IsG16) {
-        LLVM_DEBUG(
-            dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
-                      "need 16 bit derivatives but got 32 bit derivatives\n");
-        return Op;
-      }
-    } else if (!ST->hasG16()) {
+
+  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
+    // 16 bit gradients are supported, but they are tied to the A16 control,
+    // so both gradients and addresses must be 16 bit.
+    LLVM_DEBUG(
+        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+                  "require 16 bit args for both gradients and addresses\n");
+    return Op;
+  }
+
+  if (IsA16) {
+    if (!ST->hasA16()) {
       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
-                           "support 16 bit derivatives\n");
+                           "support 16 bit addresses\n");
       return Op;
     }
+  }
 
-    if (BaseOpcode->Gradients && !IsA16) {
-      if (!ST->hasG16()) {
-        LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
-                             "support 16 bit derivatives\n");
-        return Op;
-      }
-      // Activate g16
-      const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
-          AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
-      IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
-    }
+  // We've dealt with incorrect input, so if IsA16 or IsG16 is set we know we
+  // have to compress/pack operands (addresses, gradients, or both).
+  // In the case where A16 and gradients are tied (no G16 support), we have
+  // already verified that IsA16 and IsG16 are both true.
+  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
+    // Activate g16
+    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+  }
 
-    // Don't compress addresses for G16
-    const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
-    packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
-                                ArgOffset + Intr->GradientStart, PackEndIdx,
-                                Intr->NumGradients);
+  // Add gradients (packed or unpacked)
+  if (IsG16) {
+    // Pack the gradients
+    // const int PackEndIdx = IsA16 ?
VAddrEnd : (ArgOffset + Intr->CoordStart); + packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, + ArgOffset + Intr->GradientStart, + ArgOffset + Intr->CoordStart, Intr->NumGradients); + } else { + for (unsigned I = ArgOffset + Intr->GradientStart; + I < ArgOffset + Intr->CoordStart; I++) + VAddrs.push_back(Op.getOperand(I)); + } - if (!IsA16) { - // Add uncompressed address - for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) - VAddrs.push_back(Op.getOperand(I)); - } + // Add addresses (packed or unpacked) + if (IsA16) { + packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, + ArgOffset + Intr->CoordStart, VAddrEnd, + 0 /* No gradients */); } else { - for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++) + // Add uncompressed address + for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index 1fd9a2f1e8d83f..03c5356417c286 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; This test is expected to fail until the new a16/g16 codegen changes are in place -; XFAIL: * define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: sample_1d: @@ -544,7 +542,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -575,7 +573,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX10-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -609,7 +607,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_d v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -626,7 +624,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], 
s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -659,7 +657,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -680,7 +678,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -711,7 +709,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -732,7 +730,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -765,7 +763,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -782,7 +780,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -813,7 +811,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX10-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -830,7 +828,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -863,7 +861,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -884,7 +882,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -915,7 +913,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -936,7 +934,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -969,7 +967,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1165,7 +1163,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: 
image_sample_c_d_o v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1199,7 +1197,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll new file mode 100644 index 00000000000000..fda482e41ce819 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -0,0 +1,933 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL %s +; TODO: global-isel produces more code - there will need to be some more combines in the postregbankselectcombine phase +; Depends on some other changes to pass this test - those are in review separately + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_d_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 +; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f16(i32 15, float %dsdh, float %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_d_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.sample.d.2d.v4f32.f32.f16(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, half %s, half %t, half %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; GFX10-NEXT: image_sample_d v[0:3], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_d_3d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f16(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12 +; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, <8 x 
i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_d_cl_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f16(i32 15, float %dsdh, float %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_d_cl_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f16(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_cl_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, v4 +; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x 
float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_cl_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_cd_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 +; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f16(i32 15, float %dsdh, float %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_cd_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f16(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; 
GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_cd_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12 +; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_cd_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_cd_cl_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f16(i32 15, float %dsdh, float %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v1, v2, v3, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_cd_cl_2d: +; 
GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f16(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_cd_cl_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, v4 +; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v2, v3, v4, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_cd_cl_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; 
GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f16(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v3, v4, v5, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f16(i32, float, float, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f16(i32, float, float, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32, float, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32, float, float, float, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f16(i32, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32, float, float, float, half, half, <8 x 
i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32, float, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f16(i32, float, float, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, float, float, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f16(i32, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f16(i32, i32, float, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32, i32, float, float, float, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +define amdgpu_ps <4 x float> @sample_g16_noa16_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_g16_noa16_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_g16_noa16_d_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_g16_noa16_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 
0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_g16_noa16_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_g16_noa16_d_3d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v9, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, v5, v9, s12 +; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_g16_noa16_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d: +; GFX10GISEL: ; %bb.0: ; %main_body +; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10GISEL-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; 
GFX10-LABEL: sample_g16_noa16_c_d_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4
+; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_d_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
+; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_d_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3
+; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_c_d_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12
+; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_c_d_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v8, v2
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v8, v4
+; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+; GFX10-LABEL: sample_g16_noa16_cd_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12
+; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
+; GFX10-LABEL: sample_g16_noa16_cd_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3
+; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
+; GFX10-LABEL: sample_g16_noa16_c_cd_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12
+; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
+; GFX10-LABEL: sample_g16_noa16_c_cd_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4
+; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_cd_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
+; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_cd_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3
+; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_c_cd_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12
+; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_g16_noa16_c_cd_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v8, v2
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v8, v4
+; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v9, v3
+; GFX10GISEL-NEXT: v_and_or_b32 v3, v4, v9, v5
+; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret float %v
+}
+
+define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
+; GFX10GISEL: ; %bb.0: ; %main_body
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v9, v3
+; GFX10GISEL-NEXT: v_and_or_b32 v3, v4, v9, v5
+; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <2 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }