[AMDGPU] add utils for common usage #75097

Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mlir-amdgpu @llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir-nvgpu

Author: None (erman-gurses)

Changes: This PR generalizes helper functions for common use in the swizzling implementations of the AMDGPU and NVGPU dialects.

Full diff: https://github.com/llvm/llvm-project/pull/75097.diff

8 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ffb302fcedd732..110bdcb571fdf3 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -29,6 +29,27 @@ def AMDGPU_Dialect : Dialect {
"gpu::GPUDialect"
];
let useDefaultAttributePrinterParser = 1;
+ let extraClassDeclaration = [{
+ /// Return true if the given MemRefType has an integer address
+ /// space that matches the ROCDL shared memory address space or
+ /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
+ static bool hasSharedMemoryAddressSpace(MemRefType type);
+
+ /// Return true if the given Attribute has an integer address
+ /// space that matches the ROCDL shared memory address space or
+ /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
+ static bool isSharedMemoryAddressSpace(Attribute type);
+
+ /// Defines the MemRef memory space attribute numeric value that indicates
+ /// a memref is located in global memory. This should correspond to the
+ /// value used in ROCDL.
+ static constexpr unsigned kGlobalMemoryAddressSpace = 1;
+
+ /// Defines the MemRef memory space attribute numeric value that indicates
+ /// a memref is located in shared memory. This should correspond to the
+ /// value used in ROCDL.
+ static constexpr unsigned kSharedMemoryAddressSpace = 3;
+ }];
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/NVGPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/NVGPU/Transforms/Utils.h
index 64bce441722af8..b845aef888ca6d 100644
--- a/mlir/include/mlir/Dialect/NVGPU/Transforms/Utils.h
+++ b/mlir/include/mlir/Dialect/NVGPU/Transforms/Utils.h
@@ -11,12 +11,6 @@
namespace mlir {
namespace nvgpu {
-/// Get the indices that the given load/store operation is operating on.
-Operation::operand_range getIndices(Operation *op);
-
-/// Set the indices that the given load/store operation is operating on.
-void setIndices(Operation *op, ArrayRef<Value> indices);
-
/// Get the value that is stored by the given store operation.
Value getValueStored(Operation *op);
diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
index f51a8b28b7548e..295add03127925 100644
--- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
@@ -14,6 +14,9 @@
#ifndef MLIR_DIALECT_UTILS_INDEXINGUTILS_H
#define MLIR_DIALECT_UTILS_INDEXINGUTILS_H
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/ArrayRef.h"
@@ -387,6 +390,16 @@ class StaticTileOffsetRange {
IteratorTy beginValue;
IteratorTy pastEndValue;
};
+
+
+//===----------------------------------------------------------------------===//
+// load/store utils.
+//===----------------------------------------------------------------------===//
+/// Get the indices that the given load/store operation is operating on.
+Operation::operand_range getIndices(Operation *op);
+
+/// Set the indices that the given load/store operation is operating on.
+void setIndices(Operation *op, ArrayRef<Value> indices);
} // namespace mlir
#endif // MLIR_DIALECT_UTILS_INDEXINGUTILS_H
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 2575ad4984814b..f54ec46c7476dd 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -9,7 +9,7 @@
// This file implements the AMDGPU dialect and its operations.
//
//===----------------------------------------------------------------------===//
-
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -43,6 +43,21 @@ void AMDGPUDialect::initialize() {
>();
}
+bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return false;
+ if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
+ return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace;
+ if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+ return gpuAttr.getValue() == gpu::AddressSpace::Workgroup;
+ return false;
+}
+
+bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
+ Attribute memorySpace = type.getMemorySpace();
+ return isSharedMemoryAddressSpace(memorySpace);
+}
+
//===----------------------------------------------------------------------===//
// 8-bit float ops
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
index f63825cdc8f617..235cc28abd8f06 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/NVGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -254,9 +255,9 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op,
// bypass_l1 only possible with 16 byte transfer.
Value token = rewriter.create<nvgpu::DeviceAsyncCopyOp>(
writeOp->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()),
- /*dst=*/storeBase, /*dstIndices=*/nvgpu::getIndices(writeOp),
+ /*dst=*/storeBase, /*dstIndices=*/getIndices(writeOp),
/*src=*/loadBase,
- /*srcIndices=*/nvgpu::getIndices(readOp),
+ /*srcIndices=*/getIndices(readOp),
/*dstElements=*/rewriter.getIndexAttr(numElements),
/*srcElements=*/numReadElements,
/*bypassL1=*/bypassL1 && sizeInBytes == 16 ? rewriter.getUnitAttr()
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp
index 693bb53cacff62..bfffeaa32fbe2c 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp
@@ -9,7 +9,7 @@
// This file implements transforms to optimize accesses to shared memory.
//
//===----------------------------------------------------------------------===//
-
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/NVGPU/Transforms/Passes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/Utils.cpp b/mlir/lib/Dialect/NVGPU/Transforms/Utils.cpp
index a782ed5ddd85e4..213b453d4bf9ff 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/Utils.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/Utils.cpp
@@ -15,46 +15,6 @@
using namespace mlir;
using namespace mlir::nvgpu;
-Operation::operand_range nvgpu::getIndices(Operation *op) {
- if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
- return ldmatrixOp.getIndices();
- if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
- return copyOp.getDstIndices();
- if (auto loadOp = dyn_cast<memref::LoadOp>(op))
- return loadOp.getIndices();
- if (auto storeOp = dyn_cast<memref::StoreOp>(op))
- return storeOp.getIndices();
- if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
- return vectorReadOp.getIndices();
- if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
- return vectorStoreOp.getIndices();
- if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
- return transferReadOp.getIndices();
- if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
- return transferWriteOp.getIndices();
- llvm_unreachable("unsupported op type");
-}
-
-void nvgpu::setIndices(Operation *op, ArrayRef<Value> indices) {
- if (auto ldmatrixOp = dyn_cast<LdMatrixOp>(op))
- return ldmatrixOp.getIndicesMutable().assign(indices);
- if (auto copyOp = dyn_cast<DeviceAsyncCopyOp>(op))
- return copyOp.getDstIndicesMutable().assign(indices);
- if (auto loadOp = dyn_cast<memref::LoadOp>(op))
- return loadOp.getIndicesMutable().assign(indices);
- if (auto storeOp = dyn_cast<memref::StoreOp>(op))
- return storeOp.getIndicesMutable().assign(indices);
- if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
- return vectorReadOp.getIndicesMutable().assign(indices);
- if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
- return vectorStoreOp.getIndicesMutable().assign(indices);
- if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
- return transferReadOp.getIndicesMutable().assign(indices);
- if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
- return transferWriteOp.getIndicesMutable().assign(indices);
- llvm_unreachable("unsupported op type");
-}
-
Value nvgpu::getValueStored(Operation *op) {
if (auto storeOp = dyn_cast<memref::StoreOp>(op))
return storeOp.getValueToStore();
diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index bb8a0d5912d7c1..8bbd84918eed9b 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -8,6 +8,8 @@
#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/NVGPU/Transforms/Utils.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
@@ -352,3 +354,43 @@ mlir::detail::TileOffsetRangeImpl::getDynamicTileOffsets(
return mlir::computeElementwiseMul(tileCoords,
getAffineConstantExprs(tileShape, ctx));
}
+
+Operation::operand_range mlir::getIndices(Operation *op) {
+ if (auto ldmatrixOp = dyn_cast<nvgpu::LdMatrixOp>(op))
+ return ldmatrixOp.getIndices();
+ if (auto copyOp = dyn_cast<nvgpu::DeviceAsyncCopyOp>(op))
+ return copyOp.getDstIndices();
+ if (auto loadOp = dyn_cast<memref::LoadOp>(op))
+ return loadOp.getIndices();
+ if (auto storeOp = dyn_cast<memref::StoreOp>(op))
+ return storeOp.getIndices();
+ if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
+ return vectorReadOp.getIndices();
+ if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
+ return vectorStoreOp.getIndices();
+ if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
+ return transferReadOp.getIndices();
+ if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
+ return transferWriteOp.getIndices();
+ llvm_unreachable("unsupported op type");
+}
+
+void mlir::setIndices(Operation *op, ArrayRef<Value> indices) {
+ if (auto ldmatrixOp = dyn_cast<nvgpu::LdMatrixOp>(op))
+ return ldmatrixOp.getIndicesMutable().assign(indices);
+ if (auto copyOp = dyn_cast<nvgpu::DeviceAsyncCopyOp>(op))
+ return copyOp.getDstIndicesMutable().assign(indices);
+ if (auto loadOp = dyn_cast<memref::LoadOp>(op))
+ return loadOp.getIndicesMutable().assign(indices);
+ if (auto storeOp = dyn_cast<memref::StoreOp>(op))
+ return storeOp.getIndicesMutable().assign(indices);
+ if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
+ return vectorReadOp.getIndicesMutable().assign(indices);
+ if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
+ return vectorStoreOp.getIndicesMutable().assign(indices);
+ if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
+ return transferReadOp.getIndicesMutable().assign(indices);
+ if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
+ return transferWriteOp.getIndicesMutable().assign(indices);
+ llvm_unreachable("unsupported op type");
+}
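
To make the intended use concrete, here is a minimal sketch of a rewrite pattern consuming the new helpers. Only hasSharedMemoryAddressSpace, getIndices, and setIndices come from this patch; the pattern name, the swizzle computation, and its placement are hypothetical.

// Hypothetical sketch: a pattern that swizzles the indices of loads from
// shared (workgroup/LDS) memory. Only the three helpers from this PR are
// real; everything else here is assumed for illustration.
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

struct SwizzleSharedMemLoad : OpRewritePattern<memref::LoadOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(memref::LoadOp load,
                                PatternRewriter &rewriter) const override {
    // Only rewrite accesses into shared (workgroup/LDS) memory.
    if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(
            load.getMemRefType()))
      return failure();

    // Read the access indices through the dialect-neutral helper ...
    auto indices = llvm::to_vector(getIndices(load));
    // ... compute the swizzled indices here (omitted) ...
    rewriter.updateRootInPlace(load, [&] { setIndices(load, indices); });
    return success();
  }
};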
Force-pushed from fc930fe to b58b5b5, then to ea8ebf3, then to 29604f2.
@@ -8,6 +8,8 @@
 #include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/Dialect/NVGPU/Transforms/Utils.h"
I don't think IndexingUtils.cpp can take a dependency on NVGPU or Vector. I assume the portion of the helper that you want to reuse isn't specific to any NVGPU ops; maybe split out the vector and memref portions and make them available in the vector dialect, or just duplicate the portion of the helper that is needed in AMDGPU?
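
One possible shape for that split, as a sketch: keep only the memref/vector cases in a dialect-neutral helper that both AMDGPU and NVGPU can reach, and let each GPU dialect layer its own ops on top. The function name and placement below are assumptions for illustration, not code from this PR.

// Hypothetical: the dialect-neutral portion of getIndices, restricted to
// memref and vector ops so it carries no NVGPU (or AMDGPU) dependency.
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "llvm/ADT/TypeSwitch.h"
#include <optional>

using namespace mlir;

static std::optional<Operation::operand_range>
getMemAccessIndices(Operation *op) {
  return llvm::TypeSwitch<Operation *, std::optional<Operation::operand_range>>(
             op)
      .Case([](memref::LoadOp op) { return op.getIndices(); })
      .Case([](memref::StoreOp op) { return op.getIndices(); })
      .Case([](vector::LoadOp op) { return op.getIndices(); })
      .Case([](vector::StoreOp op) { return op.getIndices(); })
      .Case([](vector::TransferReadOp op) { return op.getIndices(); })
      .Case([](vector::TransferWriteOp op) { return op.getIndices(); })
      // Callers (e.g. an NVGPU-level wrapper) handle their own ops first and
      // treat std::nullopt as "not a plain load/store".
      .Default([](Operation *) {
        return std::optional<Operation::operand_range>();
      });
}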
That is a good point. We can discuss it today, but I will probably create a separate .cpp util file for that. The main reason for this patch is to avoid code duplication as much as we can.
if (!memorySpace)
  return false;
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
  return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace;
My understanding is that the convention is to keep the shared memory address space general for as long as possible, which to me suggests ROCDL would be the expected translation point. Can you elaborate on the reason to add an earlier dialect-specific value for this?
(I'm not necessarily a direct stakeholder here; I'm mainly concerned about how magic memory space constants can be difficult to manage.)
Yeah, but this is, I think, designed not to break in the case where the shared memory has already been converted.
That being said, I'd argue for the value of 3 to live over in ROCDL, and for this "is it shared memory" check to be a GPUDialect utility.
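
A sketch of what that GPUDialect-level utility might look like; the name and placement are assumptions, and it deliberately knows nothing about target-specific integer spaces, which would be checked only after lowering to ROCDL/NVVM.

// Hypothetical GPU-dialect helper: only the dialect-neutral attribute check.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"

static bool isWorkgroupMemorySpace(mlir::Attribute memorySpace) {
  if (auto gpuAttr =
          llvm::dyn_cast_if_present<mlir::gpu::AddressSpaceAttr>(memorySpace))
    return gpuAttr.getValue() == mlir::gpu::AddressSpace::Workgroup;
  return false;
}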
+1, that's how I'm interpreting this as well. Using the GPUDialect utility for as long as possible seems consistent with other dialects and IMHO is more readable. It would be good to understand what cases we're hitting that have this early conversion of the shared memory space, and whether this is the right level to handle it.
Thanks for the comments. Based on that and some discussions, it will be better to bring a new PR that includes the swizzling implementation as well.
/// Defines the MemRef memory space attribute numeric value that indicates
/// a memref is located in shared memory. This should correspond to the
/// value used in ROCDL.
static constexpr unsigned kSharedMemoryAddressSpace = 3;
I'd add the incantations for the private address space as well if we're doing this.
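
For reference, the private (scratch) address space is numbered 5 in the LLVM AMDGPU/ROCDL convention, so the suggested addition would presumably mirror the existing constants (a sketch of the follow-up, not yet part of this PR):

/// Defines the MemRef memory space attribute numeric value that indicates
/// a memref is located in private (scratch) memory. This should correspond
/// to the value used in ROCDL.
static constexpr unsigned kPrivateMemoryAddressSpace = 5;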
Sure, will do that.
@@ -15,46 +15,6 @@
 using namespace mlir;
 using namespace mlir::nvgpu;

-Operation::operand_range nvgpu::getIndices(Operation *op) {
... this could be an interface or something, idk
Or an annotation on arguments
Or perhaps refactored out to GPU Common Indexing Utils or something
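
For illustration, an interface-based version would collapse the dyn_cast chain into a single query. IndexedMemoryAccessOpInterface is an invented name, sketched under the assumption that the load/store-like ops above would implement it.

// Hypothetical: dispatch through a common op interface instead of a chain
// of per-op casts; each participating op would implement the interface.
mlir::Operation::operand_range getIndices(mlir::Operation *op) {
  if (auto access = llvm::dyn_cast<IndexedMemoryAccessOpInterface>(op))
    return access.getIndices();
  llvm_unreachable("unsupported op type");
}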