From 4f5d7d665f21f150ff65a4d842ed722b58e803ee Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Wed, 5 Feb 2025 15:31:28 +0100
Subject: [PATCH] [MLIR][NVVM] Fix links in OP definition

Some of the links are broken in
https://mlir.llvm.org/docs/Dialects/NVVMDialect/
This PR fixes the links.
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 82 ++++++++++-----------
 1 file changed, 38 insertions(+), 44 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 23db9375fbffe..2613879043552 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -476,8 +476,7 @@ def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
     The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
     `barrierId` is not present, the default barrier id is used.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];

   let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
@@ -503,8 +502,7 @@ def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
     The `aligned` attribute, when provided, generates the .aligned version of
     the PTX instruction.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
   }];

   string llvmBuilder = [{
@@ -530,8 +528,7 @@ def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
     ordering and visibility guarantees provided for the memory accesses
     performed prior to `cluster.arrive`.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
   }];

   string llvmBuilder = [{
@@ -552,8 +549,7 @@ def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
     of the cluster to perform `cluster.arrive`. The `aligned` attribute, when
     provided, generates the .aligned version of the PTX instruction.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
   }];

   string llvmBuilder = [{
@@ -605,8 +601,8 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
   let description = [{
     Fence operation with proxy to establish an ordering between memory accesses
     that may happen through different proxies.
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];

   let assemblyFormat = "attr-dict";
@@ -656,8 +652,8 @@ def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
     value for the `size` operand is 128 and must be an immediate. Generic Addressing
     is used unconditionally, and the address specified by the operand `addr` must fall
     within the `.global` state space. Otherwise, the behavior is undefined
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];

   let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
@@ -684,8 +680,8 @@ def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
     subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
     operation can form a release sequence that synchronizes with an acquire sequence
     that contains the fence.proxy.acquire proxy fence operation
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];

   let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
@@ -723,8 +719,8 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
   let arguments = (ins );
   let description = [{
     Fence operation that applies on the prior nvvm.mbarrier.init
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];

   let assemblyFormat = "attr-dict";
@@ -767,8 +763,8 @@ def NVVM_ShflOp :
     the source. The `mask_and_clamp` contains two packed values specifying a mask
     for logically splitting warps into sub-segments and an upper bound for clamping
     the source lane index.
-    [For more information, refer PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync)
   }];
   string llvmBuilder = [{
     auto intId = getShflIntrinsicId(
@@ -813,8 +809,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync">
     of this Op. The predicate result is set to `True` for the leader thread,
     and `False` for all other threads.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)
   }];

   let results = (outs I1:$pred);
@@ -898,8 +893,8 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
     The `addr` operand specifies the address of the mbarrier object
     in generic address space. The `noinc` attr impacts how the
     mbarrier's state is updated.
-    [For more information, refer PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
   }];

   let assemblyFormat = "$addr attr-dict `:` type(operands)";
@@ -922,8 +917,9 @@ def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.share
     track all prior cp.async operations initiated by the executing thread.
     The `addr` operand specifies the address of the mbarrier object in
     shared memory. The `noinc` attr impacts how the mbarrier's state
-    is updated. [For more information, refer PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
+    is updated.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
   }];

   let assemblyFormat = "$addr attr-dict `:` type(operands)";
@@ -981,8 +977,8 @@ def NVVM_CvtFloatToTF32Op : NVVM_Op<"cvt.float.to.tf32"> {
     The `relu` attribute, when set, lowers to the '.relu' variant of the cvt
     instruction. The `rnd` and `sat` attributes specify the the rounding and
     saturation modes respectively.
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt)
   }];

   let hasVerifier = 1;
@@ -1632,8 +1628,8 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
   let description = [{
     Collectively store one or more matrices across all threads in a warp to the
     location indicated by the address operand $ptr in shared memory.
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
   }];

   let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)";
@@ -1845,8 +1841,7 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
     This Op commits all prior initiated but uncommitted cp.async.bulk
     instructions into a cp.async.bulk-group.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group)
   }];

   string llvmBuilder = [{
@@ -1870,8 +1865,7 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
     async operations in the specified bulk async-group have completed reading
     from their source locations.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
   }];

   string llvmBuilder = [{
@@ -1916,8 +1910,7 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
     The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
   }];

   let assemblyFormat = [{
@@ -2033,8 +2026,7 @@ def NVVM_CpAsyncBulkTensorPrefetchOp :
     The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor)
   }];

   let assemblyFormat = [{
@@ -2133,8 +2125,7 @@ def NVVM_CpAsyncBulkTensorReduceOp :
     The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor)
   }];

   let assemblyFormat = [{
@@ -2193,8 +2184,8 @@ def NVVM_CpAsyncBulkGlobalToSharedClusterOp :

     The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
   }];

   let arguments = (ins
@@ -2251,8 +2242,7 @@ def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
     Initiates an asynchronous copy operation from Shared CTA memory to Shared
     cluster memory.

-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
   }];

   let arguments = (ins
@@ -2282,8 +2272,8 @@ def NVVM_CpAsyncBulkSharedCTAToGlobalOp :

     The `l2CacheHint` operand is optional, and it is used to specify cache
     eviction policy that may be used during the memory access.
-    [For more information, see PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
   }];

   let arguments = (ins
@@ -2523,6 +2513,8 @@ def NVVM_GriddepcontrolWaitOp : NVVM_IntrOp<"griddepcontrol.wait", [], 0> {
     Causes the executing thread to wait until all prerequisite grids in flight
     have completed and all the memory operations from the prerequisite grids
     are performed and made visible to the current grid.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol)
   }];
 }

@@ -2535,6 +2527,8 @@ def NVVM_GriddepcontrolLaunchDependentsOp
     Signals that specific dependents the runtime system designated to react to
     this instruction can be scheduled as soon as all other CTAs in the grid
     issue the same instruction or have completed.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol)
   }];
 }
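
Appended note (not part of the patch): the link-only edits above are easiest to sanity-check next to actual uses of the ops. The sketch below exercises two of them, following the assembly formats visible in the hunks (`nvvm.barrier.arrive` and the cluster barrier ops); the function name and the constant 128 are illustrative assumptions, not code from this commit.

```mlir
// Minimal usage sketch, assuming the assembly formats shown in the hunks
// above; @barrier_demo and the thread count are illustrative only.
llvm.func @barrier_demo() {
  // Arrive at the default barrier (id 0) with 128 participating threads;
  // the optional `id =` form shown in NVVM_BarrierArriveOp would select a
  // non-default barrier instead.
  %n = llvm.mlir.constant(128 : i32) : i32
  nvvm.barrier.arrive number_of_threads = %n
  // Cluster-wide arrive/wait pair; the unit attribute requests the
  // .aligned PTX variant described for nvvm.cluster.arrive/wait.
  nvvm.cluster.arrive {aligned}
  nvvm.cluster.wait {aligned}
  llvm.return
}
```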