Skip to content

Commit

Permalink
[AMDGPU][MLIR]Add shmem-optimization as an op using transform dialect (
Browse files Browse the repository at this point in the history
…#81550)

This PR adds the ability to run the shared memory optimization as an op
via the transform dialect.
  • Loading branch information
erman-gurses committed Feb 14, 2024
1 parent 7180c23 commit 29d1aca
Show file tree
Hide file tree
Showing 12 changed files with 356 additions and 19 deletions.
1 change: 1 addition & 0 deletions mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
add_subdirectory(IR)
add_subdirectory(TransformOps)
add_subdirectory(Transforms)
48 changes: 48 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
//===- AMDGPUTransformOps.h - AMDGPU transform ops ---------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
#define MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/RegionKindInterface.h"

namespace mlir {
class DialectRegistry;

namespace amdgpu {
/// Registers the AMDGPU transform dialect extension (and the transform ops it
/// defines) with the given registry.
void registerTransformDialectExtension(DialectRegistry &registry);
} // namespace amdgpu
} // namespace mlir

//===----------------------------------------------------------------------===//
// AMDGPU Transform Operations
//===----------------------------------------------------------------------===//

#define GET_OP_CLASSES
#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h.inc"

#endif // MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===- AMDGPUTransformOps.td - AMDGPU transform ops --------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef AMDGPU_TRANSFORM_OPS
#define AMDGPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"

//===----------------------------------------------------------------------===//
// ApplyOptimizeSharedMemoryReadsAndWritesOp
//===----------------------------------------------------------------------===//

def ApplyOptimizeSharedMemoryReadsAndWritesOp :
  Op<Transform_Dialect, "amdgpu.optimize_shared_memory_reads_and_writes",
    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     TransformOpInterface, TransformEachOpTrait]> {
  let summary = "Reduce shared memory bank conflicts";
  let description = [{
    This op attempts to optimize reads and writes to GPU shared memory with
    the goal of avoiding bank conflicts. The `target` handle must point to
    `func.func` payload ops; the rewrite is applied to each of them
    individually.

    The transform only reads the target handle (it is not consumed) and
    modifies the payload IR in place. It always succeeds.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target);
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::func::FuncOp funcOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // AMDGPU_TRANSFORM_OPS
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Generate the op declarations (.h.inc) and definitions (.cpp.inc) for the
# AMDGPU transform ops from their tablegen description.
set(LLVM_TARGET_DEFINITIONS AMDGPUTransformOps.td)
mlir_tablegen(AMDGPUTransformOps.h.inc -gen-op-decls)
mlir_tablegen(AMDGPUTransformOps.cpp.inc -gen-op-defs)
add_public_tablegen_target(MLIRAMDGPUTransformOpsIncGen)
3 changes: 3 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

Expand Down Expand Up @@ -48,6 +49,8 @@ namespace amdgpu {
mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
Value memrefValue);

void optimizeSharedMemoryReadsAndWritesOp(mlir::func::FuncOp funcOp);

} // namespace amdgpu
} // namespace mlir

Expand Down
2 changes: 2 additions & 0 deletions mlir/include/mlir/InitAllExtensions.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
#include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
Expand Down Expand Up @@ -66,6 +67,7 @@ inline void registerAllExtensions(DialectRegistry &registry) {
ub::registerConvertUBToLLVMInterface(registry);

// Register all transform dialect extensions.
amdgpu::registerTransformDialectExtension(registry);
affine::registerTransformDialectExtension(registry);
bufferization::registerTransformDialectExtension(registry);
func::registerTransformDialectExtension(registry);
Expand Down
3 changes: 2 additions & 1 deletion mlir/lib/Dialect/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
add_subdirectory(Utils)
add_subdirectory(TransformOps)
add_subdirectory(Transforms)
66 changes: 66 additions & 0 deletions mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//===- AMDGPUTransformOps.cpp - Implementation of AMDGPU transform ops-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"

using namespace mlir;
using namespace mlir::amdgpu;
using namespace mlir::transform;
using namespace mlir::func;

#define DEBUG_TYPE "amdgpu-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define DBGSNL() (llvm::dbgs() << "\n")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")

/// Applies the shared-memory read/write optimization to a single payload
/// `func.func`. The underlying rewrite is best-effort, so this transform
/// never reports a silenceable or definite failure.
DiagnosedSilenceableFailure
ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
    TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results,
    TransformState &state) {
  // Delegate to the AMDGPU transforms library; it walks the function itself.
  amdgpu::optimizeSharedMemoryReadsAndWritesOp(funcOp);
  return DiagnosedSilenceableFailure::success();
}

/// Declares the transform-dialect memory effects of this op: the target
/// handle is only read (not consumed) and the payload IR is modified in
/// place.
void ApplyOptimizeSharedMemoryReadsAndWritesOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  transform::onlyReadsHandle(getTarget(), effects);
  transform::modifiesPayload(effects);
}

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//

namespace {
/// Transform dialect extension carrying the AMDGPU transform ops. Registered
/// into a DialectRegistry via amdgpu::registerTransformDialectExtension().
class AMDGPUTransformDialectExtension
    : public TransformDialectExtension<AMDGPUTransformDialectExtension> {
public:
  AMDGPUTransformDialectExtension() {
    // Dialects whose ops the transforms may create in the payload IR.
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<affine::AffineDialect>();
    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
    declareGeneratedDialect<vector::VectorDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

// Op method definitions generated from AMDGPUTransformOps.td.
#define GET_OP_CLASSES
#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"

/// Public hook: attaches the AMDGPU transform extension to `registry`.
void amdgpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<AMDGPUTransformDialectExtension>();
}
25 changes: 25 additions & 0 deletions mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
add_mlir_dialect_library(MLIRAMDGPUTransformOps
  AMDGPUTransformOps.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/TransformOps

  DEPENDS
  MLIRAMDGPUTransformOpsIncGen

  # Keep this list alphabetically sorted, per MLIR convention.
  LINK_LIBS PUBLIC
  MLIRAMDGPUDialect
  MLIRAMDGPUTransforms
  MLIRAffineDialect
  MLIRArithDialect
  MLIRIR
  MLIRLinalgDialect
  MLIRParser
  MLIRSCFDialect
  MLIRSCFTransforms
  MLIRSideEffectInterfaces
  MLIRTransformDialect
  MLIRTransformDialectUtils
  MLIRVectorTransforms
)
48 changes: 30 additions & 18 deletions mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"

namespace mlir {
namespace amdgpu {
Expand All @@ -52,12 +50,12 @@ constexpr int64_t kDefaultVectorSizeBits = 64;
static Value permuteVectorOffset(OpBuilder &b, Location loc,
ArrayRef<Value> indices, MemRefType memrefTy,
int64_t srcDim, int64_t tgtDim) {
// Adjust the src index to change how often the permutation changes
// if necessary.
// Adjust the src index to change how often the permutation changes
// if necessary.
Value src = indices[srcDim];

// We only want to permute every N iterations of the target dim where N is
// ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
// We only want to permute every N iterations of the target dim where N is
// ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
const int64_t permuteEveryN = std::max<int64_t>(
1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
memrefTy.getElementTypeBitWidth()) /
Expand All @@ -83,8 +81,8 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
srcBits = b.create<arith::AndIOp>(loc, src, srcBits);

// Use the src bits to permute the target bits b[N:M] containing the
// vector offset.
// Use the src bits to permute the target bits b[N:M] containing the
// vector offset.
if (permuteEveryN > 1) {
int64_t shlBits = n - llvm::Log2_64(permuteEveryN);
if (shlBits > 0) {
Expand Down Expand Up @@ -133,8 +131,8 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
writeOps.push_back(op);
});

// Restrict to a supported set of ops. We also require at least 2D access,
// although this could be relaxed.
// Restrict to a supported set of ops. We also require at least 2D access,
// although this could be relaxed.
if (llvm::any_of(readOps, [](Operation *op) {
return !isa<memref::LoadOp, vector::LoadOp, vector::TransferReadOp>(
op) ||
Expand All @@ -159,15 +157,15 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
return failure();

// Abort if the given value has any sub-views; we do not do any alias
// analysis.
// Abort if the given value has any sub-views; we do not do any alias
// analysis.
bool hasSubView = false;
parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
if (hasSubView)
return failure();

// Check if this is necessary given the assumption of 128b accesses:
// If dim[rank-1] is small enough to fit 8 rows in a 128B line.
// Check if this is necessary given the assumption of 128b accesses:
// If dim[rank-1] is small enough to fit 8 rows in a 128B line.
const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
const int64_t rowsPerLine =
(8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
Expand All @@ -177,8 +175,8 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
if (rowsPerLine >= threadGroupSize)
return failure();

// Get sets of operations within the function that read/write to shared
// memory.
// Get sets of operations within the function that read/write to shared
// memory.
SmallVector<Operation *, 16> shmReadOps;
SmallVector<Operation *, 16> shmWriteOps;
if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
Expand All @@ -193,7 +191,7 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
int64_t tgtDim = memRefType.getRank() - 1;
int64_t srcDim = memRefType.getRank() - 2;

// Transform indices for the ops writing to shared memory.
// Transform indices for the ops writing to shared memory.
while (!shmWriteOps.empty()) {
Operation *shmWriteOp = shmWriteOps.pop_back_val();
builder.setInsertionPoint(shmWriteOp);
Expand All @@ -205,7 +203,7 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
amdgpu::setIndices(shmWriteOp, transformedIndices);
}

// Transform indices for the ops reading from shared memory.
// Transform indices for the ops reading from shared memory.
while (!shmReadOps.empty()) {
Operation *shmReadOp = shmReadOps.pop_back_val();
builder.setInsertionPoint(shmReadOp);
Expand All @@ -220,6 +218,20 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
return success();
}

/// Walks `funcOp` and applies the shared-memory optimization to every
/// shared-memory allocation found in it.
///
/// The rewrite is best-effort: an allocation for which
/// optimizeSharedMemoryReadsAndWrites() bails out (e.g. the memref has
/// sub-views or an unsupported access pattern) is skipped, and the remaining
/// allocations are still processed.
void amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
  // Collect the allocations first: the optimization rewrites the load/store
  // indices, and mutating the IR while walking it would invalidate the
  // traversal.
  SmallVector<memref::AllocOp> shmAllocOps;
  funcOp.walk([&](memref::AllocOp allocOp) {
    if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
      return;
    shmAllocOps.push_back(allocOp);
  });
  for (auto allocOp : shmAllocOps) {
    // A failure for one allocation must not abort the remaining ones, so
    // discard the per-allocation result instead of returning early.
    (void)amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
                                                     allocOp.getMemref());
  }
}

struct OptimizeSharedMemoryPass
: public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
public:
Expand Down
Loading

0 comments on commit 29d1aca

Please sign in to comment.