diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td index 73d86283a5940..4c11725405ea5 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td @@ -30,6 +30,7 @@ include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td" class NVGPU_Op traits = []> : Op {} +// Promises IndexedAccessOpInterface. def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [ MemoryEffects<[MemRead]>, PredOpTrait<"srcMemref and res have same element type", @@ -183,6 +184,7 @@ def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> { let extraClassDeclaration = extraBaseClassDeclaration; } +// Promises IndexedMemCopyOpInterface. def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [ AttrSizedOperandSegments]> { let summary = "device-side asynchronous copy"; diff --git a/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h new file mode 100644 index 0000000000000..50d2223912a27 --- /dev/null +++ b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h @@ -0,0 +1,21 @@ +//===- MemoryAccessOpInterfacesImpl.h -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H +#define MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H + +namespace mlir { + +class DialectRegistry; + +namespace nvgpu { +void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry ®istry); +} // namespace nvgpu +} // namespace mlir + +#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp index dafbd613d0933..df42cfeefa1c1 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -18,7 +18,6 @@ #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" @@ -75,10 +74,6 @@ static Value getMemRefOperand(vector::TransferReadOp op) { return op.getBase(); } -static Value getMemRefOperand(nvgpu::LdMatrixOp op) { - return op.getSrcMemref(); -} - static Value getMemRefOperand(vector::LoadOp op) { return op.getBase(); } static Value getMemRefOperand(vector::StoreOp op) { return op.getBase(); } @@ -181,17 +176,6 @@ class SubViewOfSubViewFolder : public OpRewritePattern { } }; -/// Folds nvgpu.device_async_copy subviews into the copy itself. This pattern -/// is folds subview on src and dst memref of the copy. -class NVGPUAsyncCopyOpSubViewOpFolder final - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCopyOp copyOp, - PatternRewriter &rewriter) const override; -}; - /// Merges subview operations with load/store like operations unless such a /// merger would cause the strides between dimensions accessed by that operaton /// to change. @@ -345,11 +329,6 @@ LogicalResult LoadOpOfSubViewOpFolder::matchAndRewrite( subViewOp.getDroppedDims())), op.getPadding(), op.getMask(), op.getInBoundsAttr()); }) - .Case([&](nvgpu::LdMatrixOp op) { - rewriter.replaceOpWithNewOp( - op, op.getType(), subViewOp.getSource(), sourceIndices, - op.getTranspose(), op.getNumTiles()); - }) .DefaultUnreachable("unexpected operation"); return success(); } @@ -785,57 +764,6 @@ LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite( return success(); } -LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite( - nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const { - - LLVM_DEBUG(DBGS() << "copyOp : " << copyOp << "\n"); - - auto srcSubViewOp = - copyOp.getSrc().template getDefiningOp(); - auto dstSubViewOp = - copyOp.getDst().template getDefiningOp(); - - if (!(srcSubViewOp || dstSubViewOp)) - return rewriter.notifyMatchFailure(copyOp, "does not use subview ops for " - "source or destination"); - - // If the source is a subview, we need to resolve the indices. - SmallVector foldedSrcIndices(copyOp.getSrcIndices().begin(), - copyOp.getSrcIndices().end()); - - if (srcSubViewOp) { - LLVM_DEBUG(DBGS() << "srcSubViewOp : " << srcSubViewOp << "\n"); - affine::resolveIndicesIntoOpWithOffsetsAndStrides( - rewriter, copyOp.getLoc(), srcSubViewOp.getMixedOffsets(), - srcSubViewOp.getMixedStrides(), srcSubViewOp.getDroppedDims(), - copyOp.getSrcIndices(), foldedSrcIndices); - } - - // If the destination is a subview, we need to resolve the indices. - SmallVector foldedDstIndices(copyOp.getDstIndices().begin(), - copyOp.getDstIndices().end()); - - if (dstSubViewOp) { - LLVM_DEBUG(DBGS() << "dstSubViewOp : " << dstSubViewOp << "\n"); - affine::resolveIndicesIntoOpWithOffsetsAndStrides( - rewriter, copyOp.getLoc(), dstSubViewOp.getMixedOffsets(), - dstSubViewOp.getMixedStrides(), dstSubViewOp.getDroppedDims(), - copyOp.getDstIndices(), foldedDstIndices); - } - - // Replace the copy op with a new copy op that uses the source and destination - // of the subview. - rewriter.replaceOpWithNewOp( - copyOp, nvgpu::DeviceAsyncTokenType::get(copyOp.getContext()), - (dstSubViewOp ? dstSubViewOp.getSource() : copyOp.getDst()), - foldedDstIndices, - (srcSubViewOp ? srcSubViewOp.getSource() : copyOp.getSrc()), - foldedSrcIndices, copyOp.getDstElements(), copyOp.getSrcElements(), - copyOp.getBypassL1Attr()); - - return success(); -} - void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { patterns.add< // Interface-based patterns to which we will be migrating. @@ -844,7 +772,6 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { IndexedMemCopyOpOfExpandShapeOpFolder, IndexedMemCopyOpOfCollapseShapeOpFolder, // The old way of doing things. Don't add more of these. - LoadOpOfSubViewOpFolder, LoadOpOfSubViewOpFolder, LoadOpOfSubViewOpFolder, LoadOpOfSubViewOpFolder, @@ -860,8 +787,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { LoadOpOfCollapseShapeOpFolder, StoreOpOfCollapseShapeOpFolder, StoreOpOfCollapseShapeOpFolder, - SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>( - patterns.getContext()); + SubViewOfSubViewFolder>(patterns.getContext()); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp index 43e3b526c58b8..b4d8270177544 100644 --- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp +++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" @@ -40,6 +41,9 @@ void NVGPUDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc" >(); + declarePromisedInterfaces(); + declarePromisedInterfaces(); } bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) { diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt index 3f967d2b189be..8852ed7fb30a8 100644 --- a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRNVGPUTransforms CreateAsyncGroups.cpp + MemoryAccessOpInterfacesImpl.cpp OptimizeSharedMemory.cpp MmaSyncTF32Transform.cpp Utils.cpp diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp new file mode 100644 index 0000000000000..5dc0a5f30b347 --- /dev/null +++ b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp @@ -0,0 +1,96 @@ +//===- MemoryAccessOpInterfacesImpl.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Implement memref dialect interfaces that enable manipulating memref indexing +// in passes like FoldMemRefAliasOps. +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h" + +#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" + +using namespace mlir; +using namespace mlir::memref; +using namespace mlir::nvgpu; + +namespace { +struct LdMatrixOpInterface final + : IndexedAccessOpInterface::ExternalModel { + TypedValue getAccessedMemref(Operation *op) const { + return cast(op).getSrcMemref(); + } + + Operation::operand_range getIndices(Operation *op) const { + return cast(op).getIndices(); + } + + SmallVector getAccessedShape(Operation *op) const { + VectorType vecTy = cast(op).getRes().getType(); + // The 2-D nature of the result is an artifact of this operation returning + // a struct of vectors and doesn't reflect any strides that need to be + // preserved. + return SmallVector{vecTy.getNumElements()}; + } + + std::optional> + updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref, + ValueRange newIndices) const { + auto ldMatrixOp = cast(op); + rewriter.modifyOpInPlace(ldMatrixOp, [&]() { + ldMatrixOp.getSrcMemrefMutable().assign(newMemref); + ldMatrixOp.getIndicesMutable().assign(newIndices); + }); + return std::nullopt; + } + + bool hasInboundsIndices(Operation *) const { return true; } +}; + +struct DeviceAsyncCopyOpInterface final + : IndexedMemCopyOpInterface::ExternalModel { + TypedValue getSrc(Operation *op) const { + return cast(op).getSrc(); + } + + Operation::operand_range getSrcIndices(Operation *op) const { + return cast(op).getSrcIndices(); + } + + TypedValue getDst(Operation *op) const { + return cast(op).getDst(); + } + + Operation::operand_range getDstIndices(Operation *op) const { + return cast(op).getDstIndices(); + } + + void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc, + ValueRange newSrcIndices, Value newDst, + ValueRange newDstIndices) const { + auto copyOp = cast(op); + rewriter.modifyOpInPlace(copyOp, [&]() { + copyOp.getSrcMutable().assign(newSrc); + copyOp.getSrcIndicesMutable().assign(newSrcIndices); + copyOp.getDstMutable().assign(newDst); + copyOp.getDstIndicesMutable().assign(newDstIndices); + }); + } +}; +} // namespace + +void mlir::nvgpu::registerMemoryAccessOpInterfacesExternalModels( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, nvgpu::NVGPUDialect *dialect) { + LdMatrixOp::attachInterface(*ctx); + DeviceAsyncCopyOp::attachInterface(*ctx); + }); +} diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp index 7a79e3408f1b8..589730b785133 100644 --- a/mlir/lib/RegisterAllDialects.cpp +++ b/mlir/lib/RegisterAllDialects.cpp @@ -60,6 +60,7 @@ #include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h" #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/PDL/IR/PDL.h" @@ -181,6 +182,7 @@ void mlir::registerAllDialects(DialectRegistry ®istry) { memref::registerValueBoundsOpInterfaceExternalModels(registry); memref::registerMemorySlotExternalModels(registry); ml_program::registerBufferizableOpInterfaceExternalModels(registry); + nvgpu::registerMemoryAccessOpInterfacesExternalModels(registry); scf::registerBufferDeallocationOpInterfaceExternalModels(registry); scf::registerBufferizableOpInterfaceExternalModels(registry); scf::registerValueBoundsOpInterfaceExternalModels(registry); diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index fb8ac2e9858e7..2084dbc0e35a4 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -616,86 +616,6 @@ func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %ar // ----- - -func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) { - - %c0 = arith.constant 0 : index - %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> - %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> - %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space> - return -} - -// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx -// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index) -// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> -// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> - -// ----- - - -func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) { - %c0 = arith.constant 0 : index - %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> - %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> - %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space> - return -} - -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> -// CHECK: func.func @fold_src_nvgpu_device_async_copy -// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index) -// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]] -// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]] -// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> - -// ----- - - -func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) { - %c0 = arith.constant 0 : index - %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> - %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> - %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space> - %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space> - return -} - -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> -// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy -// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index) -// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]] -// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]] -// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]] -// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]] -// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> - -// ----- - -#map = affine_map<()[s0] -> (-s0 + 4)> -#map1 = affine_map<()[s0] -> (-s0 + 32)> - -func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> { - %c0 = arith.constant 0 : index - %0 = affine.apply #map()[%arg1] - %1 = affine.apply #map1()[%arg2] - %2 = affine.apply #map1()[%arg3] - %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref, 3> - %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref, 3> -> vector<4x2xf16> - return %3 : vector<4x2xf16> -} - -// CHECK: func @test_ldmatrix -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3> -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index -// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index -// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16> - -// ----- - func.func @fold_vector_load_subview(%src : memref<24x64xf32>, %off1 : index, %off2 : index, diff --git a/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir new file mode 100644 index 0000000000000..44dcffcc1f00d --- /dev/null +++ b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir @@ -0,0 +1,93 @@ +// RUN: mlir-opt -fold-memref-alias-ops -split-input-file %s | FileCheck %s + +func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) { + %c0 = arith.constant 0 : index + %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> + %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> + %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space> + return +} + +// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx +// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index) +// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> +// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> + +// ----- + + +func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) { + %c0 = arith.constant 0 : index + %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> + %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> + %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space> + return +} + +// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK: func.func @fold_src_nvgpu_device_async_copy +// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index) +// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]] +// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]] +// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> + +// ----- + + +func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) { + %c0 = arith.constant 0 : index + %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space> + %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>> + %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space> + %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space> + return +} + +// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy +// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index) +// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]] +// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]] +// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]] +// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]] +// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space> + +// ----- + +#map = affine_map<()[s0] -> (-s0 + 4)> +#map1 = affine_map<()[s0] -> (-s0 + 32)> + +func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> { + %c0 = arith.constant 0 : index + %0 = affine.apply #map()[%arg1] + %1 = affine.apply #map1()[%arg2] + %2 = affine.apply #map1()[%arg3] + %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref, 3> + %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref, 3> -> vector<4x2xf16> + return %3 : vector<4x2xf16> +} + +// CHECK: func @test_ldmatrix +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index +// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16> + +// ----- + +func.func @ldmatrix_expand(%arg0: memref<4096xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> { + %exp = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [4, 32, 32] : memref<4096xf16, 3> into memref<4x32x32xf16, 3> + %3 = nvgpu.ldmatrix %exp[%arg1, %arg2, %arg3] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16> + return %3 : vector<4x2xf16> +} + +// CHECK: func @ldmatrix_expand +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4096xf16, 3> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index +// CHECK: %[[LIN:[a-zA-Z0-9_]+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG2]], %[[ARG3]]] by (4, 32, 32) +// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[LIN]]] {numTiles = 4 : i32, transpose = false} : memref<4096xf16, 3> -> vector<4x2xf16>