From bbf17fdd1569106509f62db2002be3979529f4c5 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Fri, 14 Nov 2025 14:38:39 +0100
Subject: [PATCH 1/3] [NVVM] Make nanosleep op duration SSA value

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 38 +++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 1cc5b74a3cb67..a7d0d05dd8192 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -79,6 +79,44 @@ def NVVM_Dialect : Dialect {
     sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op
     to embed PTX inline as a last-resort escape hatch, with explicit operands
     and results.
+
+
+    **Memory Spaces:** The NVVM dialect introduces the following memory spaces,
+    each with distinct scopes and lifetimes:
+
+    | Memory Space      | Scope                | Lifetime          |
+    |-------------------|----------------------|-------------------|
+    | `generic`         | All threads          | Context-dependent |
+    | `global`          | All threads (device) | Application       |
+    | `shared`          | Thread block (CTA)   | Kernel execution  |
+    | `constant`        | All threads (RO)     | Application       |
+    | `local`           | Single thread        | Kernel execution  |
+    | `tensor`          | Thread block (CTA)   | Kernel execution  |
+    | `shared_cluster`  | Thread block cluster | Kernel execution  |
+
+    **Memory Space Details:**
+    - **generic**: Can point to any memory space; requires runtime resolution of
+      actual address space. Use when pointer origin is unknown at compile time.
+      Performance varies based on the underlying memory space.
+    - **global**: Accessible by all threads across all blocks; persists across
+      kernel launches. Highest latency but largest capacity (device memory). Best
+      for large data and inter-kernel communication.
+    - **shared**: Shared within a thread block (CTA); very fast on-chip memory for
+      cooperation between threads in the same block. Limited capacity. Ideal for
+      block-level collaboration, caching, and reducing global memory traffic.
+    - **constant**: Read-only memory cached per SM; optimized for broadcast
+      patterns where all threads access the same location. Fast access when cached.
+      Size typically limited to 64KB. Best for read-only data and uniform values
+      accessed by all threads.
+    - **local**: Private to each thread; used for stack frames and register spills.
+      Actually resides in global memory but cached in L1. Use for per-thread
+      private data and automatic variables that don't fit in registers.
+    - **tensor**: Special memory space for Tensor Memory Accelerator (TMA)
+      operations on SM 80+ architectures; used with async tensor operations and
+      wgmma instructions. Provides very fast access for matrix operations.
+    - **shared_cluster**: Shared across thread blocks within a cluster (SM 90+);
+      enables collaboration beyond single-block scope with distributed shared
+      memory. Fast access across cluster threads.
   }];
 
   let name = "nvvm";

From 891547d701dc50de08d77a5fe673bf0e8bc1dc0f Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Mon, 17 Nov 2025 15:07:06 +0100
Subject: [PATCH 2/3] fx

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 40 ++++++++++-----------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index a7d0d05dd8192..119c8caa32c31 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -84,15 +84,15 @@ def NVVM_Dialect : Dialect {
     **Memory Spaces:** The NVVM dialect introduces the following memory spaces,
     each with distinct scopes and lifetimes:
 
-    | Memory Space      | Scope                | Lifetime          |
-    |-------------------|----------------------|-------------------|
-    | `generic`         | All threads          | Context-dependent |
-    | `global`          | All threads (device) | Application       |
-    | `shared`          | Thread block (CTA)   | Kernel execution  |
-    | `constant`        | All threads (RO)     | Application       |
-    | `local`           | Single thread        | Kernel execution  |
-    | `tensor`          | Thread block (CTA)   | Kernel execution  |
-    | `shared_cluster`  | Thread block cluster | Kernel execution  |
+    | Memory Space      | Address Space | Scope                | Lifetime          |
+    |-------------------|---------------|----------------------|-------------------|
+    | `generic`         | 0             | All threads          | Context-dependent |
+    | `global`          | 1             | All threads (device) | Application       |
+    | `shared`          | 3             | Thread block (CTA)   | Kernel execution  |
+    | `constant`        | 4             | All threads (RO)     | Application       |
+    | `local`           | 5             | Single thread        | Kernel execution  |
+    | `tensor`          | 6             | Thread block (CTA)   | Kernel execution  |
+    | `shared_cluster`  | 7             | Thread block cluster | Kernel execution  |
 
     **Memory Space Details:**
     - **generic**: Can point to any memory space; requires runtime resolution of
@@ -104,19 +104,15 @@ def NVVM_Dialect : Dialect {
     - **shared**: Shared within a thread block (CTA); very fast on-chip memory for
       cooperation between threads in the same block. Limited capacity. Ideal for
       block-level collaboration, caching, and reducing global memory traffic.
-    - **constant**: Read-only memory cached per SM; optimized for broadcast
-      patterns where all threads access the same location. Fast access when cached.
-      Size typically limited to 64KB. Best for read-only data and uniform values
-      accessed by all threads.
-    - **local**: Private to each thread; used for stack frames and register spills.
-      Actually resides in global memory but cached in L1. Use for per-thread
-      private data and automatic variables that don't fit in registers.
-    - **tensor**: Special memory space for Tensor Memory Accelerator (TMA)
-      operations on SM 80+ architectures; used with async tensor operations and
-      wgmma instructions. Provides very fast access for matrix operations.
-    - **shared_cluster**: Shared across thread blocks within a cluster (SM 90+);
-      enables collaboration beyond single-block scope with distributed shared
-      memory. Fast access across cluster threads.
+    - **constant**: Read-only memory cached per SM. Size typically limited to
+      64KB. Best for read-only data and uniform values accessed by all threads.
+    - **local**: Private to each thread. Use for per-thread private data and
+      automatic variables that don't fit in registers.
+    - **tensor**: Special memory space for tensor core operations. Used by
+      `tcgen05` instructions on SM 100+ for tensor input/output operations.
+    - **shared_cluster**: Distributed shared memory across thread blocks within
+      a cluster (SM 90+). Enables collaboration beyond single-block scope with
+      fast access across cluster threads.
   }];
 
   let name = "nvvm";

From 262145e03e1e22c45d59edf1475c2c05829ecd4b Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Mon, 17 Nov 2025 15:18:28 +0100
Subject: [PATCH 3/3] fx

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 119c8caa32c31..d25e51e96ed8e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -83,7 +83,7 @@ def NVVM_Dialect : Dialect {
 
     **Memory Spaces:** The NVVM dialect introduces the following memory spaces,
     each with distinct scopes and lifetimes:
-
+```
     | Memory Space      | Address Space | Scope                | Lifetime          |
     |-------------------|---------------|----------------------|-------------------|
     | `generic`         | 0             | All threads          | Context-dependent |
@@ -93,7 +93,7 @@ def NVVM_Dialect : Dialect {
     | `local`           | 5             | Single thread        | Kernel execution  |
     | `tensor`          | 6             | Thread block (CTA)   | Kernel execution  |
     | `shared_cluster`  | 7             | Thread block cluster | Kernel execution  |
-
+```
     **Memory Space Details:**
     - **generic**: Can point to any memory space; requires runtime resolution of
       actual address space. Use when pointer origin is unknown at compile time.
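
As a usage sketch for the memory spaces documented above (illustrative only, not part of the diffs): the MLIR below assumes the address-space numbering from the table (0 = generic, 1 = global, 3 = shared) and uses existing LLVM/NVVM dialect ops; the symbol `@smem_buffer` and function `@memspace_example` are made-up names.

```mlir
// Hypothetical sketch: stage one f32 per thread from global memory (addrspace 1)
// into a CTA-shared buffer (addrspace 3), then expose it as a generic pointer.
llvm.mlir.global internal @smem_buffer() {addr_space = 3 : i32} : !llvm.array<128 x f32>

llvm.func @memspace_example(%gmem: !llvm.ptr<1>) {
  // Thread index from the NVVM special register.
  %tid = nvvm.read.ptx.sreg.tid.x : i32
  // Pointer into the shared (CTA-scoped) buffer, address space 3.
  %smem = llvm.mlir.addressof @smem_buffer : !llvm.ptr<3>
  %slot = llvm.getelementptr %smem[0, %tid]
      : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, !llvm.array<128 x f32>
  // Load from global memory (address space 1) and store into shared memory.
  %src = llvm.getelementptr %gmem[%tid] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, f32
  %v = llvm.load %src : !llvm.ptr<1> -> f32
  llvm.store %v, %slot : f32, !llvm.ptr<3>
  // When the consumer cannot know the concrete space, cast to generic (addrspace 0).
  %gen = llvm.addrspacecast %slot : !llvm.ptr<3> to !llvm.ptr
  llvm.return
}
```

The final cast is what the `generic` row in the table refers to: accesses through `%gen` remain valid, but the hardware resolves the underlying memory space at run time, which is why generic pointers trade flexibility for performance.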