From c7f7a2f30bfce94f07d7cb81d9d571f5573b5982 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Fri, 28 Mar 2025 17:33:18 +0000 Subject: [PATCH 1/2] [NVPTX] Cleanup and refactor atomic lowering --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 744 ++++---------------- llvm/test/CodeGen/NVPTX/atomics.ll | 22 +- 4 files changed, 161 insertions(+), 623 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 8a4b83365ae84..b566cdd4b6bfc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -994,6 +994,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand); // No FPOW or FREM in PTX. // Now deduce the information based on the above mentioned diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index fe9bb621b481c..7d0c47fa464c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -216,16 +216,25 @@ class fpimm_pos_inf // Utility class to wrap up information about a register and DAG type for more // convenient iteration and parameterization -class RegTyInfo { +class RegTyInfo { ValueType Ty = ty; NVPTXRegClass RC = rc; Operand Imm = imm; + SDNode ImmNode = imm_node; + bit SupportsImm = supports_imm; int Size = ty.Size; } -def I16RT : RegTyInfo; -def I32RT : RegTyInfo; -def I64RT : RegTyInfo; +def I16RT : RegTyInfo; +def I32RT : RegTyInfo; +def I64RT : RegTyInfo; + +def F32RT : RegTyInfo; +def F64RT : RegTyInfo; +def F16RT : RegTyInfo; +def BF16RT : RegTyInfo; + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "" (e.g. "add.s64"). 
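Note on the RegTyInfo hunk above: the angle-bracket template argument lists do not survive in this rendering of the patch, so the following is only a sketch of the shape of the new record, with parameter names inferred from the field initializers that are visible. Treat the exact signatures, register classes, and immediate operands as assumptions rather than the literal patch contents.

    // Sketch only: parameter names inferred from the visible field initializers.
    class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
                    bit supports_imm = true> {
      ValueType Ty = ty;
      NVPTXRegClass RC = rc;
      Operand Imm = imm;          // immediate operand kind, e.g. i32imm / f32imm
      SDNode ImmNode = imm_node;  // matching DAG immediate node, e.g. imm / fpimm
      bit SupportsImm = supports_imm;
      int Size = ty.Size;
    }

    // Integer types use plain 'imm' immediates, FP types use 'fpimm', and the
    // f16/bf16 variants opt out of immediate instruction forms entirely.
    def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
    def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;
    def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>;
    def F16RT : RegTyInfo<f16, Int16Regs, ?, ?, /*supports_imm*/ false>;

This is what lets the atomic multiclasses below key immediate-form generation off a single SupportsImm bit instead of string-matching on the type suffix.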
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b2e05a567b4fe..a0a6cdcaafc2a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1975,529 +1975,135 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$ // Atomic Functions //----------------------------------- -class ATOMIC_GLOBAL_CHK - : PatFrag; -class ATOMIC_SHARED_CHK - : PatFrag; -class ATOMIC_GENERIC_CHK - : PatFrag; - -multiclass F_ATOMIC_2< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, SDNode IMM, list Pred = []> { - let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def r : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires; - if !not(!or(!eq(TypeStr, ".f16"), !eq(TypeStr, ".bf16"))) then - def i : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, IMMType:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, IMM:$b))]>, - Requires; - } -} +class ATOMIC_GLOBAL_CHK + : PatFrag; +class ATOMIC_SHARED_CHK + : PatFrag; +class ATOMIC_GENERIC_CHK + : PatFrag; + -// has 2 operands, neg the second one -multiclass F_ATOMIC_2_NEG< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - list Pred = []> { +multiclass F_ATOMIC_2 preds> { + defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def reg : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - !strconcat( - "{{ \n\t", - ".reg \t.s", TypeStr, " temp; \n\t", - "neg.s", TypeStr, " \ttemp, $b; \n\t", - "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t", - "}}"), - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires; + def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>, + Requires; + if t.SupportsImm then + def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>, + Requires; } } // has 3 operands -multiclass F_ATOMIC_3< - ValueType regT, NVPTXRegClass regclass, string SemStr, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, list Pred = []> { +multiclass F_ATOMIC_3 preds> { + defvar asm_str = "atom" # sem_str # as_str # "." 
# op_str # " \t$dst, [$addr], $b, $c;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, regT:$c))]>, - Requires; - - def ir : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, regT:$c))]>, - Requires; - - def ri : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, imm:$c))]>, - Requires; - - def ii : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, imm:$c))]>, - Requires; + def rr : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, + Requires; + + def ir : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, + Requires; + + def ri : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, + Requires; + + def ii : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, + Requires; } } +multiclass F_ATOMIC_2_AS preds = []> { + defvar frag_pat = (frag node:$a, node:$b); + defm _G : F_ATOMIC_2, preds>; + defm _S : F_ATOMIC_2, preds>; + defm _GEN : F_ATOMIC_2, preds>; +} + +multiclass F_ATOMIC_3_AS preds = []> { + defvar frag_pat = (frag node:$a, node:$b, node:$c); + defm _G : F_ATOMIC_3, preds>; + defm _S : F_ATOMIC_3, preds>; + defm _GEN : F_ATOMIC_3, preds>; +} + // atom_add +defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; -def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; - -defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2; -defm 
INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2, hasPTX<63>]>; - -defm INT_PTX_ATOM_ADD_G_BF16 : F_ATOMIC_2, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_S_BF16 : F_ATOMIC_2, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_GEN_BF16 : F_ATOMIC_2, hasPTX<78>]>; - -defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2; - -// atom_sub - -def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG; +defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS, hasPTX<63>]>; +defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2_AS, hasPTX<78>]>; +defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS; // atom_swap - -def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_SWAP_64 : F_ATOMIC_2_AS; // atom_max - -def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , 
(atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOMIC_MAX_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_MAX_64 : F_ATOMIC_2_AS]>; +defm INT_PTX_ATOMIC_UMAX_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_UMAX_64 : F_ATOMIC_2_AS]>; // atom_min - -def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : 
F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOMIC_MIN_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_MIN_64 : F_ATOMIC_2_AS]>; +defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS]>; // atom_inc atom_dec - -def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; - -defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS; // atom_and - -def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_AND_64 : F_ATOMIC_2_AS]>; // atom_or - -def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i64_g: 
ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2]>; +defm INT_PTX_ATOM_OR_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_OR_64 : F_ATOMIC_2_AS]>; // atom_xor +defm INT_PTX_ATOM_XOR_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS]>; -def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2]>; - -multiclass ternary_atomic_op_as { - // one record per address space - def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.generic>; - - def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.global>; - - def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.shared>; -} - -// generate pattern fragments for size x memory order -// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs -// using larger-bitwidth cas -foreach size = ["i16", "i32", "i64"] in { - foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in { - defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as; - } -} - -// eg. with type = 32, order = ".acquire", addrspace = ".global", -// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. -// preds = [hasSM<70>, hasPTX<63>] -// F_ATOMIC_3, hasPTX<63>]> -multiclass INT_PTX_ATOM_CAS preds> - : F_ATOMIC_3("i"#type), - !cast("Int"#type#"Regs"), - order, - addrspace, - ".b"#type, - ".cas", - !cast(atomic_cmp_swap_pat), - !cast("i"#type#"imm"), - preds>; // Define atom.cas for all combinations of size x addrspace x memory order // supported in PTX *and* on the hardware. 
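Note on the deletions in this hunk: all of the hand-written _g/_s/_gen PatFrag triples removed here are now produced by the F_ATOMIC_2_AS / F_ATOMIC_3_AS wrappers introduced earlier in the file. Their argument lists are likewise not visible in this rendering, so the sketch below only illustrates the intent; the parameter order of F_ATOMIC_2 and the exact identifiers are assumptions based on the multiclass bodies shown above.

    // Sketch: instantiate the underlying multiclass once per address space,
    // wrapping the node fragment in the matching addrspace predicate.
    multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str,
                             list<Predicate> preds = []> {
      defvar frag_pat = (frag node:$a, node:$b);
      defm _G   : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
      defm _S   : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
      defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
    }

    // Example use (identifiers as used elsewhere in this file), replacing
    // several separate per-address-space defm lines per operation:
    defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">;

The three generated suffixes (_G, _S, _GEN) stand in for the per-space defm lines deleted throughout this section.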
-foreach size = ["32", "64"] in { - foreach addrspace = ["generic", "global", "shared"] in { - defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); - foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { - defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#size#_#order#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - cas_order_string, cas_addrspace_string, - [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - "", cas_addrspace_string, []>; - } +foreach t = [I32RT, I64RT] in { + foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { + defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size#_#order); + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- + // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. + defm INT_PTX_ATOM_CAS_#t.Size#_#order + : F_ATOMIC_3_AS, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old + : F_ATOMIC_3_AS; } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -2505,173 +2111,101 @@ defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3 Preds, - dag ins, dag Operands> - : NVPTXInst<(outs regclass:$result), ins, - AsmStr, - [(set regT:$result, Operands)]>, - Requires; // Define instruction variants for all addressing modes. -multiclass ATOM2P_impl Preds> { - let AddedComplexity = 1 in { - def : ATOM23_impl; - } - // tablegen can't infer argument types from Intrinsic (though it can - // from Instruction) so we have to enforce specific type on - // immediates via explicit cast to ImmTy. - def : ATOM23_impl; -} - -multiclass ATOM3P_impl Preds> { - // Variants for register/immediate permutations of $b and $c - let AddedComplexity = 2 in { - def : ATOM23_impl; - } - let AddedComplexity = 1 in { - def : ATOM23_impl; - def : ATOM23_impl; - } - def : ATOM23_impl; -} // Constructs intrinsic name and instruction asm strings. multiclass ATOM2N_impl Preds> { - defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." # TypeStr - # " \t$result, [$src], $b;", + RegTyInfo t, list Preds> { + defm "" : F_ATOMIC_2( "int_nvvm_atomic_" # OpStr # "_" # SpaceStr # "_" # IntTypeStr # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; + Preds>; } multiclass ATOM3N_impl Preds> { - defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." 
# TypeStr - # " \t$result, [$src], $b, $c;", + RegTyInfo t, list Preds> { + defm "" : F_ATOMIC_3( "int_nvvm_atomic_" # OpStr # "_" # SpaceStr # "_" # IntTypeStr # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; -} - -// Constructs variants for different address spaces. -// For now we only need variants for generic space pointers. -multiclass ATOM2A_impl Preds> { - defm _gen_ : ATOM2N_impl; -} -multiclass ATOM3A_impl Preds> { - defm _gen_ : ATOM3N_impl; + Preds>; } // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl Preds> { + RegTyInfo t, list Preds> { // .gpu scope is default and is currently covered by existing // atomics w/o explicitly specified scope. - defm _cta : ATOM2A_impl; - defm _sys : ATOM2A_impl; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. + foreach space = ["gen"] in { + defm _#scope#space : ATOM2N_impl; + } + } } multiclass ATOM3S_impl Preds> { + RegTyInfo t, list Preds> { // No need to define ".gpu"-scoped atomics. They do the same thing // as the regular, non-scoped atomics defined elsewhere. - defm _cta : ATOM3A_impl; - defm _sys : ATOM3A_impl; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. + foreach space = ["gen"] in { + defm _#scope#space : ATOM3N_impl; + } + } } // atom.add multiclass ATOM2_add_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; - defm _bf16 : ATOM2S_impl, hasPTX<78>]>; - defm _f16 : ATOM2S_impl, hasPTX<63>]>; - defm _f32 : ATOM2S_impl; - defm _f64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; + defm _bf16 : ATOM2S_impl, hasPTX<78>]>; + defm _f16 : ATOM2S_impl; + defm _f32 : ATOM2S_impl; + defm _f64 : ATOM2S_impl; } // atom.{and,or,xor} multiclass ATOM2_bitwise_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.exch multiclass ATOM2_exch_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.{min,max} multiclass ATOM2_minmax_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _s64 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _s64 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; } // atom.{inc,dec} multiclass ATOM2_incdec_impl { - defm _u32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; } // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; + defm _b16 : ATOM3S_impl; + defm _b32 : ATOM3S_impl; + defm _b64 : ATOM3S_impl; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index e1fbb53891902..e1d9aaf7cfb20 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -40,18 +40,15 @@ define i64 @atom1(ptr %addr, i64 %val) { define i32 @atom2(ptr %subr, i32 %val) { ; CHECK-LABEL: atom2( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; ; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s32 temp; -; CHECK-NEXT: neg.s32 temp, %r1; -; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 
[func_retval0], %r2; +; CHECK-NEXT: neg.s32 %r2, %r1; +; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i32 %val seq_cst ret i32 %ret @@ -61,17 +58,14 @@ define i32 @atom2(ptr %subr, i32 %val) { define i64 @atom3(ptr %subr, i64 %val) { ; CHECK-LABEL: atom3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; ; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s64 temp; -; CHECK-NEXT: neg.s64 temp, %rd2; -; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: neg.s64 %rd3, %rd2; +; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i64 %val seq_cst ret i64 %ret From d04b4e8273b97795dfbb95d220c6b4992454d052 Mon Sep 17 00:00:00 2001 From: Alex Maclean Date: Mon, 31 Mar 2025 23:51:52 +0000 Subject: [PATCH 2/2] address comments --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 101 ++++++++++++----------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a0a6cdcaafc2a..34cb63e44ca71 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2048,10 +2048,10 @@ multiclass F_ATOMIC_3_AS; defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; -defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS, hasPTX<63>]>; +defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS, hasPTX<63>]>; defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2_AS, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS; -defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS; // atom_swap defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2_AS; @@ -2118,25 +2118,28 @@ defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS Preds> { - defm "" : F_ATOMIC_2( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - Preds>; + defm "" : F_ATOMIC_2( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } multiclass ATOM3N_impl Preds> { - defm "" : F_ATOMIC_3( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - Preds>; + defm "" : F_ATOMIC_3( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } // Constructs variants for different scopes of atomic op. 
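Note on the `preds = Preds` style in the two forwarding multiclasses above: the second patch leans on TableGen named template arguments so that only the non-default operands need to be spelled out when forwarding into F_ATOMIC_2 / F_ATOMIC_3. A minimal, self-contained illustration of that syntax (toy names, not taken from the patch):

    // Toy example of named template arguments; unmentioned defaults are kept.
    multiclass Pair<string name, int a = 0, int b = 0> {
      def _rec {
        string Name = name;
        int Sum = !add(a, b);
      }
    }
    defm P0 : Pair<"p0", 1, 2>;   // positional: Sum = 3
    defm P1 : Pair<"p1", b = 5>;  // named: a stays 0, Sum = 5

The same mechanism pairs with the `supports_imm` default sketched earlier for RegTyInfo, which is overridden only for the f16/bf16 records.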
@@ -2167,57 +2170,57 @@ multiclass ATOM3S_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; - defm _bf16 : ATOM2S_impl, hasPTX<78>]>; - defm _f16 : ATOM2S_impl; - defm _f32 : ATOM2S_impl; - defm _f64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; + defm _bf16 : ATOM2S_impl, hasPTX<78>]>; + defm _f16 : ATOM2S_impl; + defm _f32 : ATOM2S_impl; + defm _f64 : ATOM2S_impl; } // atom.{and,or,xor} multiclass ATOM2_bitwise_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.exch multiclass ATOM2_exch_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.{min,max} multiclass ATOM2_minmax_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _s64 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _s64 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; } // atom.{inc,dec} multiclass ATOM2_incdec_impl { - defm _u32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; } // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; -} - -defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; -defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; -defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; -defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; -defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">; -defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; -defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; -defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; -defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; -defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; + defm _b16 : ATOM3S_impl; + defm _b32 : ATOM3S_impl; + defm _b64 : ATOM3S_impl; +} + +defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; +defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; +defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; +defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; +defm INT_PTX_SATOM_EXCH : ATOM2_exch_impl<"exch">; +defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; +defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; +defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; +defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; +defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; //----------------------------------- // Support for ldu on sm_20 or later