Skip to content

Commit 354a43c

Browse files
committed
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Summary: Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+. 32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.

Patch by: Vedran Miletić

Reviewers: arsenm, tstellarAMD, nhaehnle

Subscribers: jvesely, scchan, kanarayan, arsenm

Differential Revision: http://reviews.llvm.org/D17280

llvm-svn: 265170
1 parent 1b14082 commit 354a43c

9 files changed

Lines changed: 206 additions & 3 deletions

File tree

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2812,6 +2812,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
28122812
NODE_NAME_CASE(INTERP_P2)
28132813
NODE_NAME_CASE(STORE_MSKOR)
28142814
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
2815+
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
28152816
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
28162817
}
28172818
return nullptr;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -313,6 +313,7 @@ enum NodeType : unsigned {
313313
STORE_MSKOR,
314314
LOAD_CONSTANT,
315315
TBUFFER_STORE_FORMAT,
316+
ATOMIC_CMP_SWAP,
316317
LAST_AMDGPU_ISD_NUMBER
317318
};
318319

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
183183
SDTypeProfile<0, 2, []>,
184184
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
185185

186+
def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
187+
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
188+
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
189+
SDNPMemOperand]>;
190+
186191
def AMDGPUround : SDNode<"ISD::FROUND",
187192
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
188193

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -400,6 +400,13 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
400400
def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
401401
def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
402402

403+
def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
404+
def atomic_cmp_swap_global_nortn : PatFrag<
405+
(ops node:$ptr, node:$value),
406+
(atomic_cmp_swap_global node:$ptr, node:$value),
407+
[{ return SDValue(N, 0).use_empty(); }]
408+
>;
409+
403410
//===----------------------------------------------------------------------===//
404411
// Misc Pattern Fragments
405412
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/CIInstructions.td

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -308,8 +308,9 @@ def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
308308
def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
309309
def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
310310

311-
class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
312-
(vt (node i64:$addr, vt:$data)),
311+
class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
312+
ValueType data_vt = vt> : Pat <
313+
(vt (node i64:$addr, data_vt:$data)),
313314
(inst $addr, $data, 0, 0)
314315
>;
315316

@@ -322,6 +323,9 @@ def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
322323
def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
323324
def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
324325
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
326+
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
325327
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
326328

329+
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
330+
327331
} // End Predicates = [isCIVI]

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -257,6 +257,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
257257
setOperationAction(ISD::FDIV, MVT::f32, Custom);
258258
setOperationAction(ISD::FDIV, MVT::f64, Custom);
259259

260+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
261+
// and output demarshalling
262+
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
263+
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
264+
265+
// We can't return success/failure, only the old value,
266+
// let LLVM add the comparison
267+
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
268+
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
269+
260270
setTargetDAGCombine(ISD::FADD);
261271
setTargetDAGCombine(ISD::FSUB);
262272
setTargetDAGCombine(ISD::FMINNUM);
@@ -1156,6 +1166,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
11561166
return LowerTrig(Op, DAG);
11571167
case ISD::SELECT: return LowerSELECT(Op, DAG);
11581168
case ISD::FDIV: return LowerFDIV(Op, DAG);
1169+
case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
11591170
case ISD::STORE: return LowerSTORE(Op, DAG);
11601171
case ISD::GlobalAddress: {
11611172
MachineFunction &MF = DAG.getMachineFunction();
@@ -2003,6 +2014,34 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
20032014
}
20042015
}
20052016

2017+
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2018+
AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2019+
assert(AtomicNode->isCompareAndSwap());
2020+
unsigned AS = AtomicNode->getAddressSpace();
2021+
2022+
// No custom lowering required for local address space
2023+
if (!isFlatGlobalAddrSpace(AS))
2024+
return Op;
2025+
2026+
// Non-local address space requires custom lowering for atomic compare
2027+
// and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
2028+
SDLoc DL(Op);
2029+
SDValue ChainIn = Op.getOperand(0);
2030+
SDValue Addr = Op.getOperand(1);
2031+
SDValue Old = Op.getOperand(2);
2032+
SDValue New = Op.getOperand(3);
2033+
EVT VT = Op.getValueType();
2034+
MVT SimpleVT = VT.getSimpleVT();
2035+
MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2036+
2037+
SDValue NewOld = DAG.getNode(ISD::BUILD_VECTOR, DL, VecType,
2038+
New, Old);
2039+
SDValue Ops[] = { ChainIn, Addr, NewOld };
2040+
SDVTList VTList = DAG.getVTList(VT, MVT::Other);
2041+
return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
2042+
VTList, Ops, VT, AtomicNode->getMemOperand());
2043+
}
2044+
20062045
//===----------------------------------------------------------------------===//
20072046
// Custom DAG optimizations
20082047
//===----------------------------------------------------------------------===//
@@ -2849,8 +2888,31 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
28492888
if (!Node->hasAnyUseOfValue(0)) {
28502889
MI->setDesc(TII->get(NoRetAtomicOp));
28512890
MI->RemoveOperand(0);
2891+
return;
28522892
}
28532893

2894+
// For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
2895+
// instruction, because the return type of these instructions is a vec2 of
2896+
// the memory type, so it can be tied to the input operand.
2897+
// This means these instructions always have a use, so we need to add a
2898+
// special case to check if the atomic has only one extract_subreg use,
2899+
// which itself has no uses.
2900+
if ((Node->hasNUsesOfValue(1, 0) &&
2901+
Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
2902+
!Node->use_begin()->hasAnyUseOfValue(0))) {
2903+
unsigned Def = MI->getOperand(0).getReg();
2904+
2905+
// Change this into a noret atomic.
2906+
MI->setDesc(TII->get(NoRetAtomicOp));
2907+
MI->RemoveOperand(0);
2908+
2909+
// If we only remove the def operand from the atomic instruction, the
2910+
// extract_subreg will be left with a use of a vreg without a def.
2911+
// So we need to insert an implicit_def to avoid machine verifier
2912+
// errors.
2913+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2914+
TII->get(AMDGPU::IMPLICIT_DEF), Def);
2915+
}
28542916
return;
28552917
}
28562918
}

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
4141
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
4242
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
4343
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
44+
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
4445
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
4546

4647
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1048,7 +1048,9 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
10481048
//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
10491049
//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
10501050
//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
1051-
//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
1051+
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
1052+
mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
1053+
>;
10521054
//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
10531055
//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
10541056
//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
@@ -3186,6 +3188,37 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
31863188
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
31873189
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
31883190

3191+
3192+
multiclass MUBUFCmpSwapPat <Instruction inst_addr64, Instruction inst_offset,
3193+
SDPatternOperator node, ValueType data_vt,
3194+
ValueType node_vt> {
3195+
3196+
let Predicates = [isSI] in {
3197+
def : Pat <
3198+
(node_vt (node (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
3199+
i16:$offset, i1:$slc), data_vt:$vdata_in)),
3200+
(EXTRACT_SUBREG
3201+
(inst_addr64 $vdata_in, $vaddr, $srsrc, $soffset, $offset, $slc), sub0)
3202+
>;
3203+
3204+
}
3205+
3206+
def : Pat <
3207+
(node_vt (node (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
3208+
i1:$slc), data_vt:$vdata_in)),
3209+
(EXTRACT_SUBREG
3210+
(inst_offset $vdata_in, $srsrc, $soffset, $offset, $slc), sub0)
3211+
>;
3212+
}
3213+
3214+
defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64,
3215+
BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET,
3216+
atomic_cmp_swap_global, v2i32, i32>;
3217+
3218+
defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64,
3219+
BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET,
3220+
atomic_cmp_swap_global, v2i64, i64>;
3221+
31893222
//===----------------------------------------------------------------------===//
31903223
// MTBUF Patterns
31913224
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/AMDGPU/global_atomics.ll

Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -758,6 +758,95 @@ entry:
758758
ret void
759759
}
760760

761+
; CMP_SWAP
762+
763+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
764+
; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
765+
define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
766+
entry:
767+
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
768+
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
769+
ret void
770+
}
771+
772+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
773+
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
774+
; GCN: buffer_store_dword v[[RET]]
775+
define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
776+
entry:
777+
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
778+
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
779+
%1 = extractvalue { i32, i1 } %0, 0
780+
store i32 %1, i32 addrspace(1)* %out2
781+
ret void
782+
}
783+
784+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
785+
; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
786+
define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
787+
entry:
788+
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
789+
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
790+
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
791+
ret void
792+
}
793+
794+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
795+
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
796+
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
797+
; GCN: buffer_store_dword v[[RET]]
798+
define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
799+
entry:
800+
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
801+
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
802+
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
803+
%1 = extractvalue { i32, i1 } %0, 0
804+
store i32 %1, i32 addrspace(1)* %out2
805+
ret void
806+
}
807+
808+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
809+
; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
810+
define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
811+
entry:
812+
%0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
813+
ret void
814+
}
815+
816+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
817+
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
818+
; GCN: buffer_store_dword v[[RET]]
819+
define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
820+
entry:
821+
%0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
822+
%1 = extractvalue { i32, i1 } %0, 0
823+
store i32 %1, i32 addrspace(1)* %out2
824+
ret void
825+
}
826+
827+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
828+
; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
829+
; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
830+
define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
831+
entry:
832+
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
833+
%0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
834+
ret void
835+
}
836+
837+
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
838+
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
839+
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
840+
; GCN: buffer_store_dword v[[RET]]
841+
define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
842+
entry:
843+
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
844+
%0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
845+
%1 = extractvalue { i32, i1 } %0, 0
846+
store i32 %1, i32 addrspace(1)* %out2
847+
ret void
848+
}
849+
761850
; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
762851
; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
763852
define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {

0 commit comments

Comments (0)