Skip to content

Commit

Permalink
AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
Browse files Browse the repository at this point in the history
Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.

32-bit instruction variants tested manually on Kabini and Bonaire. Tests and parts of code provided by Jan Veselý.

Patch by: Vedran Miletić

Reviewers: arsenm, tstellarAMD, nhaehnle

Subscribers: jvesely, scchan, kanarayan, arsenm

Differential Revision: http://reviews.llvm.org/D17280

llvm-svn: 265170
  • Loading branch information
tstellarAMD committed Apr 1, 2016
1 parent 1b14082 commit 354a43c
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 3 deletions.
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2812,6 +2812,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ enum NodeType : unsigned {
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
ATOMIC_CMP_SWAP,
LAST_AMDGPU_ISD_NUMBER
};

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
SDTypeProfile<0, 2, []>,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

// Target-specific atomic compare-and-swap node (AMDGPUISD::ATOMIC_CMP_SWAP).
// The type profile declares one result and two operands: the first operand
// must be a pointer and the second must be a vector. The node is chained, is
// flagged as both loading and storing, and carries a memory operand.
def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;

def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;

Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,13 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;

// Compare-and-swap fragment built from the target-specific
// AMDGPUatomic_cmp_swap node through global_binary_atomic_op, the same
// wrapper used by the other *_global atomic fragments in this file.
// NOTE(review): global_binary_atomic_op is defined outside this hunk;
// presumably it restricts the match to the global address space -- confirm.
def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
// Variant of atomic_cmp_swap_global that only matches when result 0 of the
// matched node has no uses, i.e. the value returned by the atomic is ignored.
def atomic_cmp_swap_global_nortn : PatFrag<
(ops node:$ptr, node:$value),
(atomic_cmp_swap_global node:$ptr, node:$value),
[{ return SDValue(N, 0).use_empty(); }]
>;

//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/CIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,9 @@ def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;

class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr, vt:$data)),
// Selection pattern mapping `node`, applied to a 64-bit address and a data
// operand, onto the FLAT instruction `inst`.
//   vt      - type of the value produced by `node`.
//   data_vt - type of the data operand; defaults to `vt`, so it only has to be
//             spelled out when the operand type differs from the result type.
class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
(vt (node i64:$addr, data_vt:$data)),
(inst $addr, $data, 0, 0)
>;

Expand All @@ -322,6 +323,9 @@ def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
// 32-bit compare-and-swap: the result is i32 while the data operand is a
// v2i32, so data_vt is given explicitly instead of defaulting to the result type.
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;

// 64-bit compare-and-swap: the result is i64 while the data operand is a
// v2i64, so data_vt is given explicitly instead of defaulting to the result type.
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;

} // End Predicates = [isCIVI]
62 changes: 62 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);

// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

// We can't return success/failure, only the old value,
// let LLVM add the comparison
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
Expand Down Expand Up @@ -1156,6 +1166,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerTrig(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::GlobalAddress: {
MachineFunction &MF = DAG.getMachineFunction();
Expand Down Expand Up @@ -2003,6 +2014,34 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
}
}

/// Custom lowering for ISD::ATOMIC_CMP_SWAP.
///
/// When the access is to an address space accepted by
/// isFlatGlobalAddrSpace(), the generic node is replaced by
/// AMDGPUISD::ATOMIC_CMP_SWAP, whose single data operand is a two-element
/// vector holding the new value in element 0 and the expected (old) value in
/// element 1. For any other address space \p Op is returned unchanged.
///
/// \param Op  the ISD::ATOMIC_CMP_SWAP node to lower.
/// \param DAG the DAG in which replacement nodes are created.
/// \returns \p Op itself, or the new target memory-intrinsic node, which
///          produces the same value type as \p Op plus a chain.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *CmpSwap = cast<AtomicSDNode>(Op);
  assert(CmpSwap->isCompareAndSwap());

  // Only flat/global accesses need the packed-operand form; everything else
  // keeps the generic node.
  if (!isFlatGlobalAddrSpace(CmpSwap->getAddressSpace()))
    return Op;

  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();

  // Generic operand layout: chain, pointer, expected value, replacement value.
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue Expected = Op.getOperand(2);
  SDValue Replacement = Op.getOperand(3);

  // Pack the two data values into a v2i32 (or v2i64 for the 64-bit variant),
  // replacement first.
  MVT PackedVT = MVT::getVectorVT(ResultVT.getSimpleVT(), 2);
  SDValue Packed = DAG.getNode(ISD::BUILD_VECTOR, SL, PackedVT,
                               Replacement, Expected);

  // Reuse the original memory operand so the memory information of the
  // access is carried over to the target node.
  SDValue NodeOps[] = { Chain, Ptr, Packed };
  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, SL,
                                 DAG.getVTList(ResultVT, MVT::Other), NodeOps,
                                 ResultVT, CmpSwap->getMemOperand());
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -2849,8 +2888,31 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
if (!Node->hasAnyUseOfValue(0)) {
MI->setDesc(TII->get(NoRetAtomicOp));
MI->RemoveOperand(0);
return;
}

// For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
// instruction, because the return type of these instructions is a vec2 of
// the memory type, so it can be tied to the input operand.
// This means these instructions always have a use, so we need to add a
// special case to check if the atomic has only one extract_subreg use,
// which itself has no uses.
//
// This check is reached for every returning atomic whose result is used, so
// the first user is not guaranteed to be a selected machine node.
// getMachineOpcode() asserts on non-machine nodes, hence the explicit
// isMachineOpcode() guard before it is queried.
if (Node->hasNUsesOfValue(1, 0) &&
    Node->use_begin()->isMachineOpcode() &&
    Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
    !Node->use_begin()->hasAnyUseOfValue(0)) {
  unsigned Def = MI->getOperand(0).getReg();

  // Change this into a noret atomic.
  MI->setDesc(TII->get(NoRetAtomicOp));
  MI->RemoveOperand(0);

  // If we only remove the def operand from the atomic instruction, the
  // extract_subreg will be left with a use of a vreg without a def.
  // So we need to insert an implicit_def to avoid machine verifier
  // errors.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::IMPLICIT_DEF), Def);
}
return;
}
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;

void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Expand Down
35 changes: 34 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,9 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
// 64-bit buffer compare-and-swap. The data operand uses the VReg_128 register
// class with type v2i64. null_frag is passed as the pattern operator, so no
// selection pattern is attached by this definition itself.
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
>;
//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
Expand Down Expand Up @@ -3186,6 +3188,37 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;


// Selection patterns for buffer compare-and-swap atomics.
//   inst_addr64 - instruction used for the MUBUFAddr64Atomic addressing form
//                 (pattern only enabled under the isSI predicate).
//   inst_offset - instruction used for the MUBUFOffsetAtomic addressing form.
//   node        - operator being matched.
//   data_vt     - type of the packed data operand ($vdata_in).
//   node_vt     - type of the value produced by `node`.
// The selected instruction is wrapped in EXTRACT_SUBREG so that only part of
// its result register is used as the value of the matched node.
// NOTE(review): the sub-register index is hard-coded to sub0 for every
// instantiation, including the v2i64/i64 one in this file -- confirm that
// sub0 is wide enough for an i64 result.
multiclass MUBUFCmpSwapPat <Instruction inst_addr64, Instruction inst_offset,
SDPatternOperator node, ValueType data_vt,
ValueType node_vt> {

let Predicates = [isSI] in {
def : Pat <
(node_vt (node (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), data_vt:$vdata_in)),
(EXTRACT_SUBREG
(inst_addr64 $vdata_in, $vaddr, $srsrc, $soffset, $offset, $slc), sub0)
>;

}

def : Pat <
(node_vt (node (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
i1:$slc), data_vt:$vdata_in)),
(EXTRACT_SUBREG
(inst_offset $vdata_in, $srsrc, $soffset, $offset, $slc), sub0)
>;
}

// 32-bit instantiation: v2i32 packed data operand, i32 result, selected to the
// returning (_RTN) forms of BUFFER_ATOMIC_CMPSWAP.
defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64,
BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET,
atomic_cmp_swap_global, v2i32, i32>;

// 64-bit instantiation: v2i64 packed data operand, i64 result, selected to the
// returning (_RTN) forms of BUFFER_ATOMIC_CMPSWAP_X2.
// NOTE(review): MUBUFCmpSwapPat extracts sub0 from the instruction result for
// all instantiations -- confirm that is the intended index for an i64 result.
defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64,
BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET,
atomic_cmp_swap_global, v2i64, i64>;

//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
Expand Down
89 changes: 89 additions & 0 deletions llvm/test/CodeGen/AMDGPU/global_atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,95 @@ entry:
ret void
}

; CMP_SWAP

; cmpxchg on a global (addrspace(1)) i32 located 4 elements past %out, with the
; result unused. The check expects a buffer_atomic_cmpswap with a 16-byte
; immediate offset and nothing following it on the line.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
ret void
}

; Same access as the constant-offset case, but the loaded value (field 0 of the
; cmpxchg result) is stored to %out2. The checks expect the glc form and that
; the first register of the instruction's data register pair is the one stored.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword v[[RET]]
define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
%1 = extractvalue { i32, i1 } %0, 0
store i32 %1, i32 addrspace(1)* %out2
ret void
}

; cmpxchg at a variable i64 index plus a constant 4-element offset, result
; unused. Only the SI prefix has a check here; it expects the addr64 buffer
; form with a 16-byte immediate offset and nothing following it on the line.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
ret void
}

; Variable i64 index plus constant 4-element offset, with the loaded value
; stored to %out2. The SI check expects the addr64 buffer form with glc, the
; VI check expects flat_atomic_cmpswap with glc, and both expect the captured
; result register to be the one stored.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword v[[RET]]
define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
%1 = extractvalue { i32, i1 } %0, 0
store i32 %1, i32 addrspace(1)* %out2
ret void
}

; cmpxchg directly on %out (no offset), result unused. The check expects a
; buffer_atomic_cmpswap whose line ends right after the zero soffset operand.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
ret void
}

; cmpxchg directly on %out with the loaded value stored to %out2. The checks
; expect the glc form and that the first register of the data pair is stored.
; NOTE(review): unlike the sibling checks, the pattern below is not anchored
; with an end-of-line match after glc -- confirm whether that is intentional.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword v[[RET]]
define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
%1 = extractvalue { i32, i1 } %0, 0
store i32 %1, i32 addrspace(1)* %out2
ret void
}

; cmpxchg at a variable i64 index with no constant offset, result unused. The
; SI check expects the addr64 buffer form and the VI check expects
; flat_atomic_cmpswap; both lines must end without a glc suffix.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
ret void
}

; cmpxchg at a variable i64 index with the loaded value stored to %out2. The
; SI check expects the addr64 buffer form with glc, the VI check expects
; flat_atomic_cmpswap with glc, and both expect the captured result register
; to be the one stored.
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword v[[RET]]
define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
%1 = extractvalue { i32, i1 } %0, 0
store i32 %1, i32 addrspace(1)* %out2
ret void
}

; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
Expand Down

0 comments on commit 354a43c

Please sign in to comment.