diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 255f5106e543f..93b2e0b9450be 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1075,6 +1075,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
   [llvm_any_ty],
   [llvm_v4i32_ty,  // rsrc(SGPR)
    llvm_i32_ty,    // byte offset
    llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
+  // Note: volatile bit is **not** permitted here.
   [IntrNoMem, ImmArg<ArgIndex<2>>]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1102,6 +1103,10 @@ def int_amdgcn_buffer_store : AMDGPUBufferStore;
 // The versions of these intrinsics that take <4 x i32> arguments are deprecated
 // in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
 // which allow for improved reasoning about memory accesses.
+//
+// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
+// through to final assembly selection and is used to signal that the buffer
+// operation is volatile.
 class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [llvm_v4i32_ty, // rsrc(SGPR)
@@ -1110,7 +1115,8 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
@@ -1124,7 +1130,9 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
    ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1140,7 +1148,8 @@ class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
@@ -1155,7 +1164,8 @@ class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
    ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1171,7 +1181,8 @@ class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
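[Editor's note] From the IR side, a caller opts in by setting bit 31 of the final auxiliary/cachepolicy immediate. A minimal usage sketch, not part of the patch (the function name is illustrative; -2147483648 is 0x80000000, the volatile bit alone), mirroring the tests added further down:

```llvm
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32 immarg)

define amdgpu_ps float @volatile_load_sketch(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
  ; aux = 0x80000000: volatile only. Lowering strips bit 31 and instead marks
  ; the MachineMemOperand volatile; the bit is never encoded in the instruction.
  %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
  ret float %v
}
```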
@@ -1186,7 +1197,8 @@ class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
@@ -1203,7 +1215,8 @@ class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
@@ -1219,7 +1232,8 @@ class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
@@ -1232,7 +1246,7 @@ class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_v4i32_ty, // rsrc(SGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
@@ -1256,7 +1270,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
    llvm_v4i32_ty, // rsrc(SGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
@@ -1266,7 +1280,7 @@ class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    AMDGPUBufferRsrcTy, // rsrc(SGPR)
    llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
@@ -1292,7 +1306,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
    AMDGPUBufferRsrcTy, // rsrc(SGPR)
    llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
    ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
@@ -1308,7 +1322,7 @@ class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,   // vindex(VGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
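[Editor's note] The same bit applies to the atomics; a hedged sketch (illustrative name). As the selector changes below implement, an atomic whose result is used also gets glc forced on so the old value is returned, while a result-less call does not:

```llvm
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32 immarg)

define amdgpu_ps float @volatile_atomic_add_sketch(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
  ; aux = 0x80000000 (volatile). The result is used, so the selected
  ; buffer_atomic_add carries glc in addition to the volatile MachineMemOperand.
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
  %f = bitcast i32 %old to float
  ret float %f
}
```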
@@ -1331,7 +1345,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,   // vindex(VGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
@@ -1342,7 +1356,7 @@ class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,   // vindex(VGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
@@ -1366,7 +1380,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,   // vindex(VGPR)
    llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
   [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
    ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
@@ -1443,7 +1457,8 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
    ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1458,7 +1473,8 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
@@ -1473,7 +1489,8 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
@@ -1488,7 +1505,8 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrReadMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "",
   [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
@@ -1503,7 +1521,8 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
-                  //                 swizzled buffer (bit 3 = swz))
+                  //                 swizzled buffer (bit 3 = swz),
+                  //                 volatile op (bit 31, stripped at lowering))
   [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
    ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>],
"", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1519,7 +1538,8 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1535,7 +1555,8 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+), - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1596,7 +1617,8 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; @@ -1612,7 +1634,8 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, WriteOnly>, NoCapture>, @@ -1632,7 +1655,8 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; @@ -1649,7 +1673,8 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, // bit 2 = dlc on gfx10+)) - // swizzled buffer (bit 3 = swz)) + // swizzled buffer (bit 3 = swz), + // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, WriteOnly>, NoCapture>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 92ada7e84ed5f..f59e147f585b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">, def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, GISDNodeXFormEquiv; -def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">, - GISDNodeXFormEquiv; +def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">, + GISDNodeXFormEquiv; def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 16642a76288c0..1d31c6b8fde93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 92ada7e84ed5f..f59e147f585b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
 def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
     GISDNodeXFormEquiv<extract_swz>;
 
-def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
-    GISDNodeXFormEquiv<set_glc>;
+def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">,
+    GISDNodeXFormEquiv<extract_cpol_set_glc>;
 
 def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
     GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 16642a76288c0..1d31c6b8fde93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
   if (BaseOpcode->Atomic)
     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
-  if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
+  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
+               AMDGPU::CPol::VOLATILE))
     return false;
 
   int NumVAddrRegs = 0;
@@ -5496,11 +5497,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
   MIB.addImm(Swizzle);
 }
 
-void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
-                                             const MachineInstr &MI,
-                                             int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
+    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
   assert(OpIdx >= 0 && "expected to match an immediate operand");
-  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
+  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
+                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
+                                                  : AMDGPU::CPol::ALL_pregfx12);
+  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
 }
 
 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 9b39ebdf37717..12ea46c2895b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -331,8 +331,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
                         int OpIdx) const;
   void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx) const;
-  void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
-                    int OpIdx) const;
+  void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                               int OpIdx) const;
   void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx) const;
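[Editor's note] The effect of renderExtractCpolSetGLC can be read off a return-value cmpswap: the renderer masks the immediate to the architecturally valid cachepolicy bits (so bit 31 never reaches the encoding) and ORs in GLC. A hedged sketch, with illustrative names; -2147483646 is 0x80000002, volatile plus slc:

```llvm
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspace(8), i32, i32, i32 immarg)

define amdgpu_ps float @volatile_cmpswap_sketch(ptr addrspace(8) inreg %rsrc, i32 %swap, i32 %cmp, i32 %voffset) {
  ; Selection keeps slc, drops bit 31 (MachineMemOperand becomes volatile
  ; instead), and adds glc because the old value is consumed below.
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %swap, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483646)
  %f = bitcast i32 %old to float
  ret float %f
}
```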
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 9e99d382ed9b3..d2769992b3c1a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1628,12 +1628,12 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
     def : GCNPat<
-      (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
-              timm:$offset, timm:$cachepolicy, 0)),
+      (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
+              timm:$offset, timm:$auxiliary, 0)),
       (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
         getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
         timm:$offset, CachePolicy)
     >;
@@ -1641,7 +1641,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
     def : GCNPat<
-      (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
-              timm:$offset, timm:$cachepolicy, timm)),
+      (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
+              timm:$offset, timm:$auxiliary, timm)),
       (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
         getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
         SCSrc_b32:$soffset, timm:$offset, CachePolicy)
@@ -1649,7 +1649,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
     def : GCNPat<
-      (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset),
-              timm:$offset, timm:$cachepolicy, 0)),
+      (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset),
+              timm:$offset, timm:$auxiliary, 0)),
      (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
         getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
         SCSrc_b32:$soffset, timm:$offset, CachePolicy)
@@ -1657,7 +1657,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
     def : GCNPat<
-      (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
-              (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
+      (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+              (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)),
       (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
         getVregSrcForVT<vt>.ret:$vdata_in,
         (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1726,35 +1726,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType
   def : GCNPat<
     (NoUseBufferAtomic<Op, vt> vt:$vdata_in, v4i32:$rsrc, 0, 0,
                        (BUFSOffset i32:$soffset), timm:$offset,
-                       timm:$cachepolicy, 0),
+                       timm:$auxiliary, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in,
                                            SReg_128:$rsrc, SCSrc_b32:$soffset,
-                                           timm:$offset, timm:$cachepolicy)
+                                           timm:$offset, (extract_cpol $auxiliary))
   >;
 
   def : GCNPat<
     (NoUseBufferAtomic<Op, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0,
                        (BUFSOffset i32:$soffset), timm:$offset,
-                       timm:$cachepolicy, timm),
+                       timm:$auxiliary, timm),
     (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in,
                                           VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
-                                          timm:$offset, timm:$cachepolicy)
+                                          timm:$offset, (extract_cpol $auxiliary))
   >;
 
   def : GCNPat<
     (NoUseBufferAtomic<Op, vt> vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
                        (BUFSOffset i32:$soffset), timm:$offset,
-                       timm:$cachepolicy, 0),
+                       timm:$auxiliary, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in,
                                           VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
-                                          timm:$offset, timm:$cachepolicy)
+                                          timm:$offset, (extract_cpol $auxiliary))
   >;
 
   def : GCNPat<
     (NoUseBufferAtomic<Op, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
                        (BUFSOffset i32:$soffset), timm:$offset,
-                       timm:$cachepolicy, timm),
+                       timm:$auxiliary, timm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
       getVregSrcForVT<vt>.ret:$vdata_in,
       (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
-      SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
+      SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary))
   >;
 }
@@ -1791,8 +1792,9 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
   defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
                                        # !if(!eq(RtnMode, "ret"), "", "_noret"));
   defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
-  defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
-                           (timm:$cachepolicy));
+  defvar CachePolicy = !if(!eq(RtnMode, "ret"),
+                           (extract_cpol_set_glc $auxiliary),
+                           (extract_cpol $auxiliary));
   defvar SrcRC = getVregSrcForVT<vt>.ret;
   defvar DataRC = getVregSrcForVT<data_vt>.ret;
   defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
@@ -1804,7 +1805,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
     const ConstantInt *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+      Info.flags |= MachineMemOperand::MOVolatile;
     Info.flags |= MachineMemOperand::MODereferenceable;
     if (ME.onlyReadsMemory()) {
       unsigned MaxNumLanes = 4;
@@ -7639,7 +7642,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
   if (BaseOpcode->Atomic)
     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
-  if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
+  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
+               AMDGPU::CPol::VOLATILE))
     return Op;
 
   SmallVector<SDValue, 26> Ops;
@@ -8005,6 +8009,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                            SDLoc(Op), MVT::i32);
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned CPol = Op.getConstantOperandVal(3);
+    // s_buffer_load, because of how it's optimized, can't be volatile
+    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
                      ? AMDGPU::CPol::ALL
                      : AMDGPU::CPol::ALL_pregfx12))
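[Editor's note] For contrast, a hedged sketch of what stays legal for s_buffer_load (illustrative name): only the documented cachepolicy bits may be set, and bit 31 is rejected by the early-out above rather than stripped:

```llvm
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)

define amdgpu_ps <4 x i32> @sbuffer_load_glc_sketch(<4 x i32> inreg %rsrc) {
  ; cachepolicy = 1 (glc). Passing -2147483648 here would fail to lower.
  %v = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 1)
  ret <4 x i32> %v
}
```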
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4dc0033e03859..53b9d6d260648 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -892,8 +892,11 @@ def extract_swz : SDNodeXForm<timm, [{
   return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
 }]>;
 
-def set_glc : SDNodeXForm<timm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
+def extract_cpol_set_glc : SDNodeXForm<timm, [{
+  const uint32_t cpol = N->getZExtValue() &
+      (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
+           ? AMDGPU::CPol::ALL
+           : AMDGPU::CPol::ALL_pregfx12);
+  return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
 }]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
index cea9a13221579..7b8b028128dd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -270,6 +270,25 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s
   ret float %val
 }
 
+define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  ; CHECK-LABEL: name: raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+  ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
+  ret float %val
+}
+
 ; Natural mapping
 define amdgpu_ps <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
   ; CHECK-LABEL: name: raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
index 6f35e3bad3eaf..2c99ce8694bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -327,6 +327,25 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__
   ret void
 }
 
+define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
+  ; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK-NEXT:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (volatile dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
+  ret void
+}
+
 define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) {
   ; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
   ; CHECK: bb.1 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
index 2b7ef147cae0f..e40e6f8410ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll
@@ -129,6 +129,26 @@ main_body:
   ret float %out
 }
 
+;CHECK-LABEL: {{^}}test_volatile:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
+;CHECK-DAG: s_waitcnt vmcnt(0)
+define amdgpu_ps float @test_volatile(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
+main_body:
+  %t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
+  %out = bitcast i32 %t1 to float
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test_volatile_noret:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen{{$}}
+define amdgpu_ps void @test_volatile_noret(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
+main_body:
+  %t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
+  ret void
+}
+
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0
 declare float @llvm.amdgcn.raw.ptr.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 6a04a0cfed355..1670f41638d50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -76,6 +76,42 @@ main_body:
   ret {<4 x float>, <4 x float>, <4 x float>} %r2
 }
 
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) {
+; PREGFX10-LABEL: buffer_load_volatile:
+; PREGFX10:       ; %bb.0: ; %main_body
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: buffer_load_volatile:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: buffer_load_volatile:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648)
+  %data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647)
+  %data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
 define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
 ; PREGFX10-LABEL: buffer_load_immoffs:
 ; PREGFX10:       ; %bb.0: ; %main_body