-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Introduce ordering parameter to atomic intrinsics and introduce new llvm.amdgcn.image.atomic.load intrinsic. #73613
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu Author: None (sstipanovic) ChangesThis intrinsic should behave mostly identically to an llvm.amdgcn.image.load, except that: MachineMemOperand now has appropriate ordering for all amdgpu image atomics. Patch is 159.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73613.diff 15 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 2c629f3f96a0c3d..03e844b263bcd2c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -36,6 +36,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
@@ -1297,7 +1298,8 @@ class SelectionDAG {
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic);
inline SDValue getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1305,11 +1307,12 @@ class SelectionDAG {
MaybeAlign Alignment = std::nullopt,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic) {
// Ensure that codegen never sees alignment 0
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
Alignment.value_or(getEVTAlign(MemVT)), Flags,
- Size, AAInfo);
+ Size, AAInfo, Ordering);
}
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 187e000d0272d2e..1dfb97d23f300a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1131,6 +1131,8 @@ class TargetLoweringBase {
MaybeAlign align = Align(1); // alignment
MachineMemOperand::Flags flags = MachineMemOperand::MONone;
+
+ AtomicOrdering ordering = AtomicOrdering::NotAtomic;
IntrinsicInfo() = default;
};
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..2dd45cf0b388851 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -675,6 +675,7 @@ class AMDGPUDimProfile<string opmod,
bit ZCompare = false;
bit Gradients = false;
string LodClampMip = "";
+ bit IsAtomicLoad = false;
int NumRetAndDataAnyTypes =
!foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
@@ -731,10 +732,12 @@ class AMDGPUDimNoSampleProfile<string opmod,
AMDGPUDimProps dim,
list<LLVMType> retty,
list<AMDGPUArg> dataargs,
- bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
+ bit Mip = false,
+ bit AtomicLoad = false> : AMDGPUDimProfile<opmod, dim> {
let RetTypes = retty;
let DataArgs = dataargs;
let LodClampMip = !if(Mip, "mip", "");
+ let IsAtomicLoad = AtomicLoad;
}
class AMDGPUDimAtomicProfile<string opmod,
@@ -786,6 +789,7 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int UnormArgIndex = !add(SampArgIndex, 1);
int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
+ int AtomicOrderingIndex = !add(CachePolicyArgIndex, !if(!or(P_.IsAtomic, P_.IsAtomicLoad), 1, 0));
}
// All dimension-aware intrinsics are derived from this class.
@@ -801,13 +805,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
!if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
llvm_i1_ty], []), // unorm(imm)
[llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
- llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [llvm_i32_ty], [])), // atomic ordering
!listconcat(props,
!if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
!if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
[ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
- ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),
+ ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.AtomicOrderingIndex>>], [])),
"", sdnodeprops>,
@@ -852,7 +858,7 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
foreach dim = AMDGPUDims.All in {
def !strconcat(NAME, "_", dim.Name)
: AMDGPUImageDimIntrinsic<
- AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
+ AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip, !eq(NAME, "int_amdgcn_image_atomic_load")>,
props, sdnodeprops>;
}
}
@@ -861,6 +867,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
+ defm int_amdgcn_image_atomic_load
+ : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [],
+ [SDNPMemOperand]>,
+ AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 764567ac7baada6..0b210d1baaaa441 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2588,8 +2588,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- MIB.addMemOperand(
- MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
+ MIB.addMemOperand(MF->getMachineMemOperand(
+ MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
+ /*Ranges*/ nullptr, /*SSID*/ SyncScope::System, Info.ordering,
+ Info.ordering));
}
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a61920b7c079ba..c955f36ca4cb786 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -54,8 +54,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -8308,15 +8310,17 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
- MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo,
+ AtomicOrdering Ordering) {
if (!Size && MemVT.isScalableVector())
Size = MemoryLocation::UnknownSize;
else if (!Size)
Size = MemVT.getStoreSize();
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, Flags, Size, Alignment, AAInfo, /*Ranges*/ nullptr,
+ /*SSID*/ SyncScope::System, Ordering, Ordering);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c5fd56795a5201a..d23962255b9b39b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4959,9 +4959,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
- Info.memVT, MPI, Info.align, Info.flags,
- Info.size, I.getAAMetadata());
+ Result = DAG.getMemIntrinsicNode(
+ Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MPI, Info.align,
+ Info.flags, Info.size, I.getAAMetadata(), Info.ordering);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d924f733624a9ad..909c5f74a02c5b5 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1434,7 +1434,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
bits<8> NumVAddrs = DimEval.NumVAddrArgs;
- bits<8> NumArgs = !add(DimEval.CachePolicyArgIndex, 1);
+ bits<8> NumArgs = !add(DimEval.AtomicOrderingIndex, 1);
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
@@ -1451,6 +1451,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> UnormIndex = DimEval.UnormArgIndex;
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> AtomicOrderingIndex = DimEval.AtomicOrderingIndex;
bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
!if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a5..4c309793b7bcc17 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -1111,6 +1113,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+ if (RsrcIntr->IsImage) {
+ auto Idx = CI.arg_size() - 1;
+ unsigned OrderingArg =
+ cast<ConstantInt>(CI.getArgOperand(Idx))->getZExtValue();
+ Info.ordering = static_cast<AtomicOrdering>(OrderingArg);
+ }
+
switch (IntrID) {
default:
break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 7c1b7bc86706311..a41444a63ffb5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -22,7 +22,7 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -44,11 +44,11 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -72,7 +72,7 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -94,11 +94,11 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -122,7 +122,7 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -144,11 +144,11 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -172,7 +172,7 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECT...
[truncated]
|
@llvm/pr-subscribers-llvm-selectiondag Author: None (sstipanovic) ChangesThis intrinsic should behave mostly identically to an llvm.amdgcn.image.load, except that: MachineMemOperand now has appropriate ordering for all amdgpu image atomics. Patch is 159.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73613.diff 15 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 2c629f3f96a0c3d..03e844b263bcd2c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -36,6 +36,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
@@ -1297,7 +1298,8 @@ class SelectionDAG {
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic);
inline SDValue getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1305,11 +1307,12 @@ class SelectionDAG {
MaybeAlign Alignment = std::nullopt,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic) {
// Ensure that codegen never sees alignment 0
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
Alignment.value_or(getEVTAlign(MemVT)), Flags,
- Size, AAInfo);
+ Size, AAInfo, Ordering);
}
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 187e000d0272d2e..1dfb97d23f300a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1131,6 +1131,8 @@ class TargetLoweringBase {
MaybeAlign align = Align(1); // alignment
MachineMemOperand::Flags flags = MachineMemOperand::MONone;
+
+ AtomicOrdering ordering = AtomicOrdering::NotAtomic;
IntrinsicInfo() = default;
};
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..2dd45cf0b388851 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -675,6 +675,7 @@ class AMDGPUDimProfile<string opmod,
bit ZCompare = false;
bit Gradients = false;
string LodClampMip = "";
+ bit IsAtomicLoad = false;
int NumRetAndDataAnyTypes =
!foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
@@ -731,10 +732,12 @@ class AMDGPUDimNoSampleProfile<string opmod,
AMDGPUDimProps dim,
list<LLVMType> retty,
list<AMDGPUArg> dataargs,
- bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
+ bit Mip = false,
+ bit AtomicLoad = false> : AMDGPUDimProfile<opmod, dim> {
let RetTypes = retty;
let DataArgs = dataargs;
let LodClampMip = !if(Mip, "mip", "");
+ let IsAtomicLoad = AtomicLoad;
}
class AMDGPUDimAtomicProfile<string opmod,
@@ -786,6 +789,7 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int UnormArgIndex = !add(SampArgIndex, 1);
int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
+ int AtomicOrderingIndex = !add(CachePolicyArgIndex, !if(!or(P_.IsAtomic, P_.IsAtomicLoad), 1, 0));
}
// All dimension-aware intrinsics are derived from this class.
@@ -801,13 +805,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
!if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
llvm_i1_ty], []), // unorm(imm)
[llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
- llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [llvm_i32_ty], [])), // atomic ordering
!listconcat(props,
!if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
!if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
[ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
- ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),
+ ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.AtomicOrderingIndex>>], [])),
"", sdnodeprops>,
@@ -852,7 +858,7 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
foreach dim = AMDGPUDims.All in {
def !strconcat(NAME, "_", dim.Name)
: AMDGPUImageDimIntrinsic<
- AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
+ AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip, !eq(NAME, "int_amdgcn_image_atomic_load")>,
props, sdnodeprops>;
}
}
@@ -861,6 +867,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
+ defm int_amdgcn_image_atomic_load
+ : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [],
+ [SDNPMemOperand]>,
+ AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 764567ac7baada6..0b210d1baaaa441 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2588,8 +2588,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- MIB.addMemOperand(
- MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
+ MIB.addMemOperand(MF->getMachineMemOperand(
+ MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
+ /*Ranges*/ nullptr, /*SSID*/ SyncScope::System, Info.ordering,
+ Info.ordering));
}
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a61920b7c079ba..c955f36ca4cb786 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -54,8 +54,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -8308,15 +8310,17 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
- MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo,
+ AtomicOrdering Ordering) {
if (!Size && MemVT.isScalableVector())
Size = MemoryLocation::UnknownSize;
else if (!Size)
Size = MemVT.getStoreSize();
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, Flags, Size, Alignment, AAInfo, /*Ranges*/ nullptr,
+ /*SSID*/ SyncScope::System, Ordering, Ordering);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c5fd56795a5201a..d23962255b9b39b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4959,9 +4959,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
- Info.memVT, MPI, Info.align, Info.flags,
- Info.size, I.getAAMetadata());
+ Result = DAG.getMemIntrinsicNode(
+ Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MPI, Info.align,
+ Info.flags, Info.size, I.getAAMetadata(), Info.ordering);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d924f733624a9ad..909c5f74a02c5b5 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1434,7 +1434,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
bits<8> NumVAddrs = DimEval.NumVAddrArgs;
- bits<8> NumArgs = !add(DimEval.CachePolicyArgIndex, 1);
+ bits<8> NumArgs = !add(DimEval.AtomicOrderingIndex, 1);
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
@@ -1451,6 +1451,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> UnormIndex = DimEval.UnormArgIndex;
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> AtomicOrderingIndex = DimEval.AtomicOrderingIndex;
bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
!if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a5..4c309793b7bcc17 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -1111,6 +1113,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+ if (RsrcIntr->IsImage) {
+ auto Idx = CI.arg_size() - 1;
+ unsigned OrderingArg =
+ cast<ConstantInt>(CI.getArgOperand(Idx))->getZExtValue();
+ Info.ordering = static_cast<AtomicOrdering>(OrderingArg);
+ }
+
switch (IntrID) {
default:
break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 7c1b7bc86706311..a41444a63ffb5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -22,7 +22,7 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -44,11 +44,11 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -72,7 +72,7 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -94,11 +94,11 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -122,7 +122,7 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -144,11 +144,11 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -172,7 +172,7 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECT...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel Author: None (sstipanovic) ChangesThis intrinsic should behave mostly identically to an llvm.amdgcn.image.load, except that: MachineMemOperand now has appropriate ordering for all amdgpu image atomics. Patch is 159.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73613.diff 15 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 2c629f3f96a0c3d..03e844b263bcd2c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -36,6 +36,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
@@ -1297,7 +1298,8 @@ class SelectionDAG {
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic);
inline SDValue getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1305,11 +1307,12 @@ class SelectionDAG {
MaybeAlign Alignment = std::nullopt,
MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore,
- uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+ uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes(),
+ AtomicOrdering Ordering = AtomicOrdering::NotAtomic) {
// Ensure that codegen never sees alignment 0
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
Alignment.value_or(getEVTAlign(MemVT)), Flags,
- Size, AAInfo);
+ Size, AAInfo, Ordering);
}
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 187e000d0272d2e..1dfb97d23f300a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1131,6 +1131,8 @@ class TargetLoweringBase {
MaybeAlign align = Align(1); // alignment
MachineMemOperand::Flags flags = MachineMemOperand::MONone;
+
+ AtomicOrdering ordering = AtomicOrdering::NotAtomic;
IntrinsicInfo() = default;
};
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..2dd45cf0b388851 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -675,6 +675,7 @@ class AMDGPUDimProfile<string opmod,
bit ZCompare = false;
bit Gradients = false;
string LodClampMip = "";
+ bit IsAtomicLoad = false;
int NumRetAndDataAnyTypes =
!foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
@@ -731,10 +732,12 @@ class AMDGPUDimNoSampleProfile<string opmod,
AMDGPUDimProps dim,
list<LLVMType> retty,
list<AMDGPUArg> dataargs,
- bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
+ bit Mip = false,
+ bit AtomicLoad = false> : AMDGPUDimProfile<opmod, dim> {
let RetTypes = retty;
let DataArgs = dataargs;
let LodClampMip = !if(Mip, "mip", "");
+ let IsAtomicLoad = AtomicLoad;
}
class AMDGPUDimAtomicProfile<string opmod,
@@ -786,6 +789,7 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int UnormArgIndex = !add(SampArgIndex, 1);
int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
+ int AtomicOrderingIndex = !add(CachePolicyArgIndex, !if(!or(P_.IsAtomic, P_.IsAtomicLoad), 1, 0));
}
// All dimension-aware intrinsics are derived from this class.
@@ -801,13 +805,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
!if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
llvm_i1_ty], []), // unorm(imm)
[llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
- llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [llvm_i32_ty], [])), // atomic ordering
!listconcat(props,
!if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
!if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
[ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
- ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),
+ ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
+ !if(!or(P_.IsAtomic, P_.IsAtomicLoad), [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.AtomicOrderingIndex>>], [])),
"", sdnodeprops>,
@@ -852,7 +858,7 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
foreach dim = AMDGPUDims.All in {
def !strconcat(NAME, "_", dim.Name)
: AMDGPUImageDimIntrinsic<
- AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
+ AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip, !eq(NAME, "int_amdgcn_image_atomic_load")>,
props, sdnodeprops>;
}
}
@@ -861,6 +867,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
+ defm int_amdgcn_image_atomic_load
+ : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [],
+ [SDNPMemOperand]>,
+ AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 764567ac7baada6..0b210d1baaaa441 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2588,8 +2588,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- MIB.addMemOperand(
- MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
+ MIB.addMemOperand(MF->getMachineMemOperand(
+ MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
+ /*Ranges*/ nullptr, /*SSID*/ SyncScope::System, Info.ordering,
+ Info.ordering));
}
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a61920b7c079ba..c955f36ca4cb786 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -54,8 +54,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -8308,15 +8310,17 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
- MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo,
+ AtomicOrdering Ordering) {
if (!Size && MemVT.isScalableVector())
Size = MemoryLocation::UnknownSize;
else if (!Size)
Size = MemVT.getStoreSize();
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, Flags, Size, Alignment, AAInfo, /*Ranges*/ nullptr,
+ /*SSID*/ SyncScope::System, Ordering, Ordering);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c5fd56795a5201a..d23962255b9b39b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4959,9 +4959,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
else if (Info.fallbackAddressSpace)
MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
- Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
- Info.memVT, MPI, Info.align, Info.flags,
- Info.size, I.getAAMetadata());
+ Result = DAG.getMemIntrinsicNode(
+ Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT, MPI, Info.align,
+ Info.flags, Info.size, I.getAAMetadata(), Info.ordering);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d924f733624a9ad..909c5f74a02c5b5 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1434,7 +1434,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
bits<8> NumVAddrs = DimEval.NumVAddrArgs;
- bits<8> NumArgs = !add(DimEval.CachePolicyArgIndex, 1);
+ bits<8> NumArgs = !add(DimEval.AtomicOrderingIndex, 1);
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
@@ -1451,6 +1451,7 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> UnormIndex = DimEval.UnormArgIndex;
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> AtomicOrderingIndex = DimEval.AtomicOrderingIndex;
bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
!if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f170428b38c49a5..4c309793b7bcc17 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -1111,6 +1113,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+ if (RsrcIntr->IsImage) {
+ auto Idx = CI.arg_size() - 1;
+ unsigned OrderingArg =
+ cast<ConstantInt>(CI.getArgOperand(Idx))->getZExtValue();
+ Info.ordering = static_cast<AtomicOrdering>(OrderingArg);
+ }
+
switch (IntrID) {
default:
break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 7c1b7bc86706311..a41444a63ffb5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -22,7 +22,7 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -44,11 +44,11 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -72,7 +72,7 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -94,11 +94,11 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -122,7 +122,7 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -144,11 +144,11 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
- %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %v to float
ret float %out
}
@@ -172,7 +172,7 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[BUILD_VECT...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
I think the PR description needs to be clearer:
If ordering parameter is to be added to all intrinsics then I suspect there needs to be more test coverage that the full range of these values make their way to MIR and manifest the expected effects. |
Initially I was about to add ordering parameter only for the new intrinsic, but I noticed that the ordering is not set for other atomics either, even though llpc sets the ordering for atomic intrinsics. Maybe, with this change, llpc won't have to add fences anymore. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem with doing this is generic passes that should really know this is an atomic operation do not know it. It sort of works for codegen, but it's also a different representation from a normal scope. In the past I've thought about introducing a call atomic flag to allow generically carrying the ordering and scope
Just based off of the function attributes, generic passes should assume that there may be an atomic operation in there based on the If there are passes that make assumptions because this is an intrinsic, perhaps those passes should be fixed? [0] Actually... |
Fixed. Atomics don't have |
@nhaehnle ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for making the nosync
adjustment. Though now, it looks like nosync
has to be added back to most of the other image intrinsics.
|
33960a4
to
08a5f1c
Compare
We don't have all the fields though. For example, MMOs have both a "success ordering" and a "fail ordering" (for cmpxchg). As for the complexity of the change, I'd worry that it's quite difficult to change all users of If creating the MMO is too cumbersome, then perhaps that side of it could be improved? |
Remove `nosync` as discussed in #73613
3637677
to
5430898
Compare
I think this needs to be split into separate patches: at least one adding |
Fixing |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering if we should encode the ordering with metadata or something better than an integer
Sorry for the late response.
Is visibility the only reason for this suggestion? @nhaehnle what do you think? |
Partially. Previously I was thinking of introducing an IR change to allow arbitrary calls to carry this kind of information. i.e. either a control bit to add the atomicrmw operands, or to encode it as an operand bundle. Really deviating from how atomicrmw represents this is unfortunate |
9fa38ab
to
96369e7
Compare
This intrinsic should behave mostly identically to an llvm.amdgcn.image.load, except that: - It is not marked as IntrReadMem. This is to ensure that the implied memory semantics are preserved. - When lowering, it's MachineMemOperand is to get the "acquire" memory semantics.
Are you planning on looking into alternative ways of encoding it attached to the call, or do you need this in the intrinsic? |
Sorry, I was on paternity leave. I will look into it. |
This intrinsic should behave mostly identically to an llvm.amdgcn.image.load, except that:
- It is not marked as IntrReadMem. This is to ensure that the implied memory semantics are preserved.
- When lowering, it's MachineMemOperand is to get the "acquire" memory semantics. New intrinsic atomic.load does not implicitly have acquire semantics
This adds ordering value to the cachepolicy parameter to all existing atomic intrinsics. MachineMemOperand now has appropriate ordering for all amdgpu image atomics.