[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

gonzalobg · 2024-07-11T21:10:12Z

This PR Builds on #98022 .
In this PR, the fence.sc.sys is inserted as part of the load/store instruction.
The next PR will refactor a few things, add fence instruction support, and split this out.

llvmbot · 2024-07-11T21:10:40Z

@llvm/pr-subscribers-backend-nvptx

Author: None (gonzalobg)

Changes

This PR Builds on #98022 .
In this PR, the fence.sc.sys is inserted as part of the load/store instruction.
The next PR will refactor a few things, add fence instruction support, and split this out.

Patch is 119.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98551.diff

7 Files Affected:

(modified) llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (+40-2)
(modified) llvm/lib/Target/NVPTX/NVPTX.h (+47-1)
(modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+268-76)
(modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+72-72)
(modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+33-20)
(added) llvm/test/CodeGen/NVPTX/load-store-sm-70.ll (+1006)
(modified) llvm/test/CodeGen/NVPTX/load-store.ll (+577-6)

diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b7a20c351f5ff..9f1e56cd84e51 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -224,9 +224,47 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
   if (Modifier) {
     const MCOperand &MO = MI->getOperand(OpNum);
     int Imm = (int) MO.getImm();
-    if (!strcmp(Modifier, "volatile")) {
-      if (Imm)
+    if (!strcmp(Modifier, "sem")) {
+      auto Ordering =
+          NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+      switch (Ordering) {
+      case NVPTX::Ordering::NotAtomic:
+        break;
+      case NVPTX::Ordering::Volatile:
         O << ".volatile";
+        break;
+      case NVPTX::Ordering::Relaxed:
+        O << ".relaxed.sys";
+        break;
+      case NVPTX::Ordering::Acquire:
+        O << ".acquire.sys";
+        break;
+      case NVPTX::Ordering::Release:
+        O << ".release.sys";
+        break;
+      case NVPTX::Ordering::RelaxedMMIO:
+        O << ".mmio.relaxed.sys";
+        break;
+      default:
+        SmallString<256> Msg;
+        raw_svector_ostream OS(Msg);
+        OS << "NVPTX LdStCode Printer does not support \"" << Ordering
+           << "\" sem modifier.";
+        report_fatal_error(OS.str());
+        break;
+      }
+    } else if (!strcmp(Modifier, "sc")) {
+      auto Ordering =
+          NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+      switch (Ordering) {
+        // TODO: refactor fence insertion in ISelDagToDag instead of here
+        // as part of implementing atomicrmw seq_cst.
+      case NVPTX::Ordering::SequentiallyConsistent:
+        O << "fence.sc.sys;\n\t";
+        break;
+      default:
+        break;
+      }
     } else if (!strcmp(Modifier, "addsp")) {
       switch (Imm) {
       case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b0cb24c63c3ce..c9cce23788ca4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -106,6 +106,52 @@ enum LoadStore {
   isStoreShift = 6
 };
 
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum class Ordering : OrderingUnderlyingType {
+  NotAtomic = 0, // PTX calls these: "Weak"
+  // Unordered = 1, // TODO: NVPTX should map this to "Relaxed"
+  Relaxed = 2,
+  // Consume = 3,   // Unimplemented in LLVM; NVPTX would map to "Acquire"
+  Acquire = 4,
+  Release = 5,
+  // AcquireRelease = 6, // TODO
+  SequentiallyConsistent = 7,
+  Volatile = 8,
+  RelaxedMMIO = 9,
+  LAST = RelaxedMMIO
+};
+
+template <typename OStream> OStream &operator<<(OStream &O, Ordering Order) {
+  switch (Order) {
+  case Ordering::NotAtomic:
+    O << "NotAtomic";
+    return O;
+  case Ordering::Relaxed:
+    O << "Relaxed";
+    return O;
+  case Ordering::Acquire:
+    O << "Acquire";
+    return O;
+  case Ordering::Release:
+    O << "Release";
+    return O;
+  // case Ordering::AcquireRelease:
+  //   O << "AcquireRelease";
+  //   return O;
+  case Ordering::SequentiallyConsistent:
+    O << "SequentiallyConsistent";
+    return O;
+  case Ordering::Volatile:
+    O << "Volatile";
+    return O;
+  case Ordering::RelaxedMMIO:
+    O << "RelaxedMMIO";
+    return O;
+  }
+  report_fatal_error("unknown ordering");
+}
+
 namespace PTXLdStInstCode {
 enum AddressSpace {
   GENERIC = 0,
@@ -126,7 +172,7 @@ enum VecType {
   V2 = 2,
   V4 = 4
 };
-}
+} // namespace PTXLdStInstCode
 
 /// PTXCvtMode - Conversion code enumeration
 namespace PTXCvtMode {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..41a3c2102427f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,6 +700,183 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
   return NVPTX::PTXLdStInstCode::GENERIC;
 }
 
+struct OperationOrderings {
+  NVPTX::OrderingUnderlyingType InstrOrdering;
+  NVPTX::OrderingUnderlyingType FenceOrdering;
+  OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
+                     NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
+      : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+        FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
+};
+
+static OperationOrderings
+getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
+  AtomicOrdering Ordering = N->getSuccessOrdering();
+  auto CodeAddrSpace = getCodeAddrSpace(N);
+
+  bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
+  bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
+
+  // clang-format off
+
+  // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
+  //
+  // | Atomic  | Volatile | Statespace                    | Lowering sm_60- | Lowering sm_70+                                      |
+  // |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
+  // | No      | No       | All                           | plain           | .weak                                                |
+  // | No      | Yes      | Generic / Shared / Global [0] | .volatile       | .volatile                                            |
+  // | No      | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | No       | Generic / Shared / Global [0] | .volatile       | <atomic sem>                                         |
+  // | Other   | No       | Generic / Shared / Global [0] | Error [2]       | <atomic sem>                                         |
+  // | Yes     | No       | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Relaxed | Yes      | Generic / Shared [0]          | .volatile       | .volatile                                            |
+  // | Relaxed | Yes      | Global [0]                    | .volatile       | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
+  // | Relaxed | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
+  // | Other   | Yes      | Generic / Shared / Global [0] | Error [2]       | <atomic sem> [3]                                     |
+
+  // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
+  // by following the ABI proven sound in:
+  //   Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
+  //   https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
+  //
+  // | CUDA C++ Atomic Operation or Atomic Fence                                   | PTX Atomic Operation or Fence           |
+  // |-----------------------------------------------------------------------------|-----------------------------------------|
+  // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>;                       |
+  // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>)         | fence.sc.<scope>; ld.acquire.<scope>;   |
+  // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>)        | fence.sc.<scope>; st.release.<scope>;   |  
+  // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>)   | fence.sc.<scope>; atom.acq_rel.<scope>; |
+
+  // clang-format on
+
+  // [0]: volatile and atomics are only supported on generic addressing to
+  //      shared or global, or shared, or global.
+  //      MMIO requires generic addressing to global or global, but
+  // (TODO) we only implement it for global.
+
+  // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it
+  //      fails to preserve the side-effects of atomics and volatile
+  //      accesses in LLVM IR to local / const / param, causing
+  //      well-formed LLVM-IR & CUDA C++ programs to be miscompiled
+  //      in sm_70+.
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+    return NVPTX::Ordering::NotAtomic;
+  }
+
+  // [2]: Atomics with Ordering different than Relaxed are not supported on
+  //      sm_60 and older; this includes volatile atomics.
+  if (!(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Monotonic) &&
+      !HasMemoryOrdering) {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX does not support \"atomic\" for orderings different than"
+          "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+       << toIRString(Ordering) << "\".";
+    report_fatal_error(OS.str());
+  }
+
+  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
+  // the volatile semantics and preserve the atomic ones.
+
+  // PTX volatile and PTX atomics are not available for statespace that differ
+  // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
+  // atomics is undefined if the generic address does not refer to a .global or
+  // .shared memory location.
+  bool AddrGenericOrGlobalOrShared =
+      (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+  bool UseRelaxedMMIO =
+      HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+
+  switch (Ordering) {
+  case AtomicOrdering::NotAtomic:
+    return N->isVolatile() && AddrGenericOrGlobalOrShared
+               ? NVPTX::Ordering::Volatile
+               : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::Monotonic:
+    if (N->isVolatile())
+      return UseRelaxedMMIO                ? NVPTX::Ordering::RelaxedMMIO
+             : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+                                           : NVPTX::Ordering::NotAtomic;
+    else
+      return HasMemoryOrdering             ? NVPTX::Ordering::Relaxed
+             : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+                                           : NVPTX::Ordering::NotAtomic;
+  // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
+  // Acquire.
+  case AtomicOrdering::Acquire:
+    if (!N->readMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Acquire Ordering on reads: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire
+                                       : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::Release:
+    if (!N->writeMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Release Ordering on writes: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release
+                                       : NVPTX::Ordering::NotAtomic;
+  case AtomicOrdering::AcquireRelease: {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "NVPTX does not support AcquireRelease Ordering on read-modify-write "
+          "yet and PTX does not support it on loads or stores: "
+       << N->getOperationName();
+    N->print(OS);
+    report_fatal_error(OS.str());
+  }
+  case AtomicOrdering::SequentiallyConsistent: {
+    // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
+    // sequence including a "fence.sc.sco" and the memory instruction with an
+    // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
+    // whether the memory operation is a read, write, or read-modify-write.
+    //
+    // This sets the ordering of the fence to SequentiallyConsistent, and
+    // sets the corresponding ordering for the instruction.
+    NVPTX::Ordering InstrOrder;
+    if (N->readMem()) {
+      InstrOrder = NVPTX::Ordering::Acquire;
+    } else if (N->writeMem()) {
+      InstrOrder = NVPTX::Ordering::Release;
+    } else {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "NVPTX does not support SequentiallyConsistent Ordering on "
+            "read-modify-writes yet: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared
+               ? OperationOrderings(InstrOrder,
+                                    NVPTX::Ordering::SequentiallyConsistent)
+               : OperationOrderings(NVPTX::Ordering::NotAtomic);
+  }
+  case AtomicOrdering::Unordered:
+    break;
+  }
+
+  SmallString<256> Msg;
+  raw_svector_ostream OS(Msg);
+  OS << "NVPTX backend does not support AtomicOrdering \""
+     << toIRString(Ordering) << "\" yet.";
+  report_fatal_error(OS.str());
+}
+
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                           unsigned CodeAddrSpace, MachineFunction *F) {
   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -902,32 +1079,19 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!LoadedVT.isSimple())
     return false;
 
-  AtomicOrdering Ordering = LD->getSuccessOrdering();
-  // In order to lower atomic loads with stronger guarantees we would need to
-  // use load.acquire or insert fences. However these features were only added
-  // with PTX ISA 6.0 / sm_70.
-  // TODO: Check if we can actually use the new instructions and implement them.
-  if (isStrongerThanMonotonic(Ordering))
-    return false;
-
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
     return tryLDGLDU(N);
   }
 
+  // Memory Semantic Setting
+  auto [InstructionOrdering, FenceOrdering] =
+      getOperationOrderings(LD, Subtarget);
+
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  // - .volatile has the same memory synchronization semantics as .relaxed.sys
-  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    isVolatile = false;
-
   // Type Setting: fromType + fromTypeWidth
   //
   // Sign   : ISD::SEXTLOAD
@@ -968,9 +1132,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Addr,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -979,9 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -996,9 +1171,15 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
@@ -1012,9 +1193,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), N1, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+                     getI32Imm(InstructionOrdering, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     N1,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   }
 
@@ -1051,13 +1237,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only availalble for .global and .shared
-  bool IsVolatile = MemSD->isVolatile();
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    IsVolatile = false;
+  // Memory Semantic Setting
+  auto [InstructionOrdering, FenceOrdering] =
+      getOperationOrderings(MemSD, Subtarget);
 
   // Vector Setting
   MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1124,9 +1306,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+                     getI32Imm(InstructionOrdering, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Addr,
+                     Chain};
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   } else if (PointerSize == 64
                  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1149,9 +1336,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),...
[truncated]

gonzalobg · 2024-07-11T21:22:59Z

@AlexMaclean @Artem-B @jlebar hello all three, I'm looking for some reviewers for this series of PRs to improve the support of atomics in LLVM NVPTX backend. It's my first time touching these parts of the LLVM project, but everyone in the LLVM Discord has been super nice helping me get onboarded.

This PR includes all the changes of #96436 and #98022, which haven't really gotten any replies yet.. I just kept building on them. Not sure whether I should close them and continue building on them, or whether it is easier to review them piece meal.

I'm happy to help review other people's PR and help with the NVPTX backend in general. I'm "gonzalob" in the LLVM Discord in case anyone wants to chat :)

Artem-B · 2024-07-11T22:36:09Z

Sorry, I was out of office for a while. I'm catching up on the pending reviews now. Stay tuned.

llvmbot added the backend:NVPTX label Jul 11, 2024

Artem-B self-requested a review July 11, 2024 21:51

gonzalobg force-pushed the nvptx_volta_seq_cst branch 2 times, most recently from 266e75a to 31ac2d8 Compare July 15, 2024 21:14

gonzalobg added 2 commits July 15, 2024 14:58

[NVPTX] Volta SequentiallyConsistent Load/Store Ops

0f1f96b

[NVPTX] Align Memory Ordering enum with LLVM

748d598

gonzalobg force-pushed the nvptx_volta_seq_cst branch 2 times, most recently from f451b51 to 78e3b5e Compare July 15, 2024 22:11

[NVPTX]: Fix typos

2ecbb77

gonzalobg force-pushed the nvptx_volta_seq_cst branch from 78e3b5e to 2ecbb77 Compare July 15, 2024 22:12

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

gonzalobg commented Jul 11, 2024

llvmbot commented Jul 11, 2024

gonzalobg commented Jul 11, 2024

Artem-B commented Jul 11, 2024

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

Are you sure you want to change the base?

[NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations #98551

Conversation

gonzalobg commented Jul 11, 2024

llvmbot commented Jul 11, 2024

gonzalobg commented Jul 11, 2024

Artem-B commented Jul 11, 2024