diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 62239ca705b9e..645591dc4c776 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -561,6 +561,7 @@ def llvm_v3i32_ty : LLVMType; // 3 x i32 def llvm_v4i32_ty : LLVMType; // 4 x i32 def llvm_v6i32_ty : LLVMType; // 6 x i32 def llvm_v8i32_ty : LLVMType; // 8 x i32 +def llvm_v10i32_ty : LLVMType; // 10 x i32 def llvm_v16i32_ty : LLVMType; // 16 x i32 def llvm_v32i32_ty : LLVMType; // 32 x i32 def llvm_v64i32_ty : LLVMType; // 64 x i32 @@ -591,6 +592,7 @@ def llvm_v2f32_ty : LLVMType; // 2 x float def llvm_v3f32_ty : LLVMType; // 3 x float def llvm_v4f32_ty : LLVMType; // 4 x float def llvm_v8f32_ty : LLVMType; // 8 x float +def llvm_v10f32_ty : LLVMType; // 10 x float def llvm_v16f32_ty : LLVMType; // 16 x float def llvm_v32f32_ty : LLVMType; // 32 x float def llvm_v1f64_ty : LLVMType; // 1 x double diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a0c38c303e638..307e4b8a01e5c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2813,6 +2813,17 @@ def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn; +// <vdata>, <ray_origin>, <ray_dir> +// llvm.amdgcn.image.bvh.dual.intersect.ray <node_ptr>, <ray_extent>, +// <instance_mask>, <ray_origin>, +// <ray_dir>, <offsets>, +// <texture_descr> +def int_amdgcn_image_bvh_dual_intersect_ray : + Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty], + [llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty, + llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty], + [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + // llvm.amdgcn.permlane16.var def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, Intrinsic<[llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index fdbabcb62c0bf..4532975612b1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1094,6 +1094,12 @@ def FeaturePrngInst : 
SubtargetFeature<"prng-inst", "Has v_prng_b32 instruction" >; +def FeatureBVHDualInst : SubtargetFeature<"bvh-dual-inst", + "HasBVHDualInst", + "true", + "Has image_bvh_dual_intersect_ray instruction" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1844,7 +1850,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, - FeatureMemoryAtomicFAddF32DenormalSupport + FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureBVHDualInst ]>; def FeatureISAVersion12_Generic: FeatureSet< @@ -2500,6 +2507,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">, AssemblerPredicate<(all_of FeaturePrngInst)>; +def HasBVHDualInst : Predicate<"Subtarget->hasBVHDualInst()">, + AssemblerPredicate<(all_of FeatureBVHDualInst)>; + def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">, AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a222de6a61247..745621fc1e089 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3597,10 +3597,12 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( MachineInstr &MI) const { - MI.setDesc(TII.get(MI.getOperand(1).getImm())); - MI.removeOperand(1); + unsigned OpcodeOpIdx = + MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 
1 : 3; + MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); + MI.removeOperand(OpcodeOpIdx); MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); - return true; + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); } // FIXME: This should be removed and let the patterns select. We just need the @@ -4114,6 +4116,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: return selectBVHIntersectRayIntrinsic(I); case AMDGPU::G_SBFX: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8d1243cee300f..efa042b033628 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7198,6 +7198,57 @@ bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic( return true; } +bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI, + MachineIRBuilder &B) const { + const LLT S32 = LLT::scalar(32); + const LLT V2S32 = LLT::fixed_vector(2, 32); + + Register DstReg = MI.getOperand(0).getReg(); + Register DstOrigin = MI.getOperand(1).getReg(); + Register DstDir = MI.getOperand(2).getReg(); + Register NodePtr = MI.getOperand(4).getReg(); + Register RayExtent = MI.getOperand(5).getReg(); + Register InstanceMask = MI.getOperand(6).getReg(); + Register RayOrigin = MI.getOperand(7).getReg(); + Register RayDir = MI.getOperand(8).getReg(); + Register Offsets = MI.getOperand(9).getReg(); + Register TDescr = MI.getOperand(10).getReg(); + + if (!ST.hasBVHDualInst()) { + DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), + "intrinsic not supported on subtarget", + MI.getDebugLoc()); + B.getMF().getFunction().getContext().diagnose(BadIntrin); + return false; + } + + const unsigned NumVDataDwords = 10; + const unsigned NumVAddrDwords = 12; + int 
Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, + NumVAddrDwords); + assert(Opcode != -1); + + auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr( + V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)}); + + B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) + .addDef(DstReg) + .addDef(DstOrigin) + .addDef(DstDir) + .addImm(Opcode) + .addUse(NodePtr) + .addUse(RayExtentInstanceMaskVec.getReg(0)) + .addUse(RayOrigin) + .addUse(RayDir) + .addUse(Offsets) + .addUse(TDescr) + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const { const SITargetLowering *TLI = ST.getTargetLowering(); @@ -7546,6 +7597,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeRsqClampIntrinsic(MI, MRI, B); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: + return legalizeBVHDualIntrinsic(MI, B); case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index a98e8ba7aaaf1..aba1f55330913 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -208,6 +208,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeBVHDualIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b46fc7d9c752a..acdf6a932a0bf 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3239,10 +3239,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); return; } - case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: { - unsigned N = MI.getNumExplicitOperands() - 2; + case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { + bool IsDual = MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY; + unsigned NumMods = IsDual ? 0 : 1; // Has A16 modifier + unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(B, MI, {N}); + executeInWaterfallLoop(B, MI, {LastRegOpIdx}); return; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: @@ -5032,11 +5035,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { assert(RSrcIntrin->IsImage); return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); } - case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: { - unsigned N = MI.getNumExplicitOperands() - 2; - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); - OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); - if (N == 3) { + case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: + case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: { + bool IsDual = MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY; + unsigned NumMods = IsDual ? 
0 : 1; // Has A16 modifier + unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods; + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + if (IsDual) { + OpdsMapping[1] = AMDGPU::getValueMapping( + AMDGPU::VGPRRegBankID, + MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + OpdsMapping[2] = AMDGPU::getValueMapping( + AMDGPU::VGPRRegBankID, + MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + } + OpdsMapping[LastRegOpIdx] = + getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI); + if (LastRegOpIdx == 3) { // Sequential form: all operands combined into VGPR256/VGPR512 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); if (Size > 256) @@ -5044,7 +5060,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } else { // NSA form - for (unsigned I = 2; I < N; ++I) { + unsigned FirstSrcOpIdx = IsDual ? 
4 : 2; + for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) { unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 728ce125eba2d..847121f251361 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -268,6 +268,7 @@ DECODE_OPERAND_REG_8(VReg_128) DECODE_OPERAND_REG_8(VReg_192) DECODE_OPERAND_REG_8(VReg_256) DECODE_OPERAND_REG_8(VReg_288) +DECODE_OPERAND_REG_8(VReg_320) DECODE_OPERAND_REG_8(VReg_352) DECODE_OPERAND_REG_8(VReg_384) DECODE_OPERAND_REG_8(VReg_512) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 651dbad8244cb..99892d9a60423 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -228,6 +228,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasRestrictedSOffset = false; bool HasBitOp3Insts = false; bool HasPrngInst = false; + bool HasBVHDualInst = false; bool HasPermlane16Swap = false; bool HasPermlane32Swap = false; bool HasVcmpxPermlaneHazard = false; @@ -1364,6 +1365,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPrngInst() const { return HasPrngInst; } + bool hasBVHDualInst() const { return HasBVHDualInst; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 1b94d6c43392d..63af4b2e351fb 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1509,7 +1509,7 @@ multiclass MIMG_Gather : MIMG_Gather; -class MIMG_IntersectRay_Helper { +class 
MIMG_IntersectRay_Helper { int num_addrs = !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11)); RegisterClass RegClass = MIMGAddrSize.RegClass; int VAddrDwords = !srl(RegClass.Size, 5); @@ -1517,9 +1517,10 @@ class MIMG_IntersectRay_Helper { int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); list GFX11PlusAddrTypes = - !if(IsA16, + !if(isDual, [VReg_64, VReg_64, VReg_96, VReg_96, VReg_64], + !if(IsA16, [node_ptr_type, VGPR_32, VReg_96, VReg_96], - [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); + [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96])); } class MIMG_IntersectRay_gfx10 @@ -1553,15 +1554,26 @@ class MIMG_IntersectRay_nsa_gfx11 addr_types> - : VIMAGE_gfx12 { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16)); - let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16"; -} - -multiclass MIMG_IntersectRay { - defvar info = MIMG_IntersectRay_Helper; + let Constraints = !if(isDual, + "$ray_origin_out = $vaddr2, $ray_dir_out = $vaddr3", ""); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc), + !if(isDual, (ins), (ins A16:$a16))); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc"# + !if(isDual, "", "$a16"); + let SchedRW = !if(isDual, + [WriteVMEM, WriteVMEM, WriteVMEM], [WriteVMEM]); +} + +multiclass MIMG_IntersectRay { + defvar info = MIMG_IntersectRay_Helper; def "" : MIMGBaseOpcode { let BVH = 1; let A16 = IsA16; @@ -1599,7 +1611,8 @@ multiclass MIMG_IntersectRay { } } def _gfx12 : VIMAGE_IntersectRay_gfx12 { + isDual, info.GFX11PlusAddrTypes> { + let VDataDwords = !if(isDual, 10, 4); let VAddrDwords = info.num_addrs; } } @@ -1771,15 +1784,18 @@ defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler , "ima let OtherPredicates = [HasImageInsts, HasGFX10_AEncoding] in { defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load , "image_msaa_load">; -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : 
MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1, 0>; } // End OtherPredicates = [HasImageInsts, HasGFX10_AEncoding] +defm IMAGE_BVH_DUAL_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_dual_intersect_ray", 1, 0, 1>; + let SubtargetPredicate = isGFX12Plus in { def : AMDGPUMnemonicAlias<"bvh_intersect_ray", "image_bvh_intersect_ray">; def : AMDGPUMnemonicAlias<"bvh64_intersect_ray", "image_bvh64_intersect_ray">; + def : AMDGPUMnemonicAlias<"bvh_dual_intersect_ray", "image_bvh_dual_intersect_ray">; } } // End let OtherPredicates = [HasImageInsts] diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 250963b3019a0..09f011410e0a4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1386,9 +1386,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: { Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? + Info.memVT = + MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray + ? CI.getType() + : cast(CI.getType()) + ->getElementType(0)); // XXX: what is correct VT? 
Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; Info.align.reset(); @@ -9438,6 +9443,48 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: { + MemSDNode *M = cast(Op); + SDValue NodePtr = M->getOperand(2); + SDValue RayExtent = M->getOperand(3); + SDValue InstanceMask = M->getOperand(4); + SDValue RayOrigin = M->getOperand(5); + SDValue RayDir = M->getOperand(6); + SDValue Offsets = M->getOperand(7); + SDValue TDescr = M->getOperand(8); + + assert(NodePtr.getValueType() == MVT::i64); + assert(RayDir.getValueType() == MVT::v3f32); + + if (!Subtarget->hasBVHDualInst()) { + emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); + return SDValue(); + } + + const unsigned NumVDataDwords = 10; + const unsigned NumVAddrDwords = 12; + int Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, + AMDGPU::MIMGEncGfx12, NumVDataDwords, + NumVAddrDwords); + assert(Opcode != -1); + + SmallVector Ops; + Ops.push_back(NodePtr); + Ops.push_back(DAG.getBuildVector( + MVT::v2i32, DL, + {DAG.getBitcast(MVT::i32, RayExtent), + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)})); + Ops.push_back(RayOrigin); + Ops.push_back(RayDir); + Ops.push_back(Offsets); + Ops.push_back(TDescr); + Ops.push_back(M->getChain()); + + auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); + MachineMemOperand *MemRef = M->getMemOperand(); + DAG.setNodeMemRefs(NewNode, {MemRef}); + return SDValue(NewNode, 0); + } case Intrinsic::amdgcn_image_bvh_intersect_ray: { MemSDNode *M = cast(Op); SDValue NodePtr = M->getOperand(2); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index de77401eb0137..b2315bc80f0a4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -4362,7 +4362,15 @@ def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { def 
G_AMDGPU_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); - let InOperandList = (ins unknown:$intrin, variable_ops); + let InOperandList = (ins unknown:$opcode, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; +} + +def G_AMDGPU_BVH_DUAL_INTERSECT_RAY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst, type1:$ray_origin, type1:$ray_dir); + let InOperandList = (ins unknown:$opcode, variable_ops); let hasSideEffects = 0; let mayLoad = 1; let mayStore = 0; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll new file mode 100644 index 0000000000000..7e22d60cd710f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64, float, i8, <3 x float>, <3 x float>, <2 x i32>, <4 x i32>) + +; ERR: in function image_bvh_dual_intersect_ray{{.*}}intrinsic not supported on subtarget +define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray: +; 
GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off +; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: v_dual_mov_b32 v16, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off +; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr) + %a = extractvalue {<10 x 
i32>, <3 x float>, <3 x float>} %v, 0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} + +define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) { +; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray_1: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off +; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray_1: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: v_dual_mov_b32 v16, v4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1 +; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off +; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off +; GFX12-GISEL-NEXT: ; return to shader part epilog +main_body: + 
%ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 1, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %r = bitcast <10 x i32> %a to <10 x float> + %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + store <3 x float> %o, ptr addrspace(1) %origin + %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + store <3 x float> %d, ptr addrspace(1) %dir + ret <10 x float> %r +} diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s index 8bf9b92e8d1d8..3ca8f4308a0ee 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s @@ -1066,6 +1066,9 @@ image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16 // GFX12: encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f] +image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] +// GFX12: encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] + image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D // GFX12: encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s index b9999b671f7e7..0148ff6cabc93 
100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s @@ -41,3 +41,6 @@ bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7] bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7] // GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7] ; encoding: [0x10,0x80,0xc6,0xd3,0x04,0x08,0x00,0x12,0x09,0x0b,0x0c,0x0f] + +bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] +// GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt index 233c2e1b9d083..afb7c3c24db17 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt @@ -1066,6 +1066,9 @@ # GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16 ; encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f] 0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f +# GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06] +0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06 + # GFX12: image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00] 0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00