6 changes: 6 additions & 0 deletions mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -73,6 +73,12 @@ using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
/// gpu.address_space to integer values.
void populateGpuMemorySpaceAttributeConversions(
TypeConverter &typeConverter, const MemorySpaceMapping &mapping);

/// Collect patterns that insert a gpu.wait before GPU operations whose
/// lowering supports only a single async dependency (currently just
/// gpu.launch_func), collapsing all original dependencies into one token.
void populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns);
} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
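
For illustration, here is a minimal sketch (not part of the diff) of the rewrite these patterns perform; the kernel symbol `@kernels::@bar` is hypothetical, and the syntax follows the gpu dialect forms used in the test below:

```mlir
// Before: the launch carries two async dependencies, which the lowering of
// gpu.launch_func cannot express.
%r = gpu.launch_func async [%t0, %t1] @kernels::@bar
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)

// After: a new gpu.wait merges the dependencies into a single token, and the
// launch depends only on that token.
%w = gpu.wait async [%t0, %t1]
%r = gpu.launch_func async [%w] @kernels::@bar
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
```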
1 change: 1 addition & 0 deletions mlir/include/mlir/Conversion/Passes.td
@@ -552,6 +552,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
"runtime with the kernel bare pointer calling convention, to enable "
"dynamic binding of buffers as arguments without static type info."
>
// TODO: Should a pass option be added to opt in to this feature?
];

let dependentDialects = [
80 changes: 80 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -483,6 +483,14 @@ class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};

/// Rewrites an async gpu op with more than one async dependency by inserting
/// a gpu.wait that merges the dependencies into a single token.
template <typename Op>
struct ConvertMultipleAsyncDepsToGpuWaitPattern final : OpRewritePattern<Op> {
using OpRewritePattern<Op>::OpRewritePattern;

LogicalResult matchAndRewrite(Op op,
PatternRewriter &rewriter) const override;
};

/// Generic rewriting rule for operation on sparse matrices.
/// Currently supports CUDA (by means of cuSparse and cuSparseLt).
#define DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(op_name) \
@@ -538,6 +546,22 @@ void GpuToLLVMConversionPass::runOnOperation() {
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
}
// Insert gpu.wait operations before operations whose lowering does not
// support multiple async dependencies.
// TODO: Should this only be enabled via a pass option?
{
RewritePatternSet waitPatterns(&getContext());
populateGpuMultipleAsyncDepsConversionPatterns(waitPatterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(waitPatterns)))) {
return signalPassFailure();
}

LLVM_DEBUG(llvm::dbgs()
<< "--- IR After Adding Additional gpu.waits: ---\n");
LLVM_DEBUG(getOperation()->print(llvm::dbgs()));
LLVM_DEBUG(llvm::dbgs()
<< "---------------------------------------------\n");
}
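
If the TODO above is resolved in favor of a pass option, the guard would presumably look roughly like the following; `insertMultipleAsyncDepsWaits` is a hypothetical option name, not something this patch defines:

```cpp
// Hypothetical pass option guarding the extra rewrite (sketch only).
if (insertMultipleAsyncDepsWaits) {
  RewritePatternSet waitPatterns(&getContext());
  populateGpuMultipleAsyncDepsConversionPatterns(waitPatterns);
  if (failed(applyPatternsGreedily(getOperation(), std::move(waitPatterns))))
    return signalPassFailure();
}
```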

LowerToLLVMOptions options(context);
options.useBarePtrCallConv = hostBarePtrCallConv;
@@ -1787,6 +1811,29 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}

template <class Op>
LogicalResult ConvertMultipleAsyncDepsToGpuWaitPattern<Op>::matchAndRewrite(
Op op, PatternRewriter &rewriter) const {
if (op.getAsyncDependencies().size() <= 1)
return rewriter.notifyMatchFailure(
op, "Can only convert ops with multiple async dependencies.");

// Create a new gpu.wait with the original async deps.
Type tokenType = rewriter.getType<gpu::AsyncTokenType>();
Value waitToken = gpu::WaitOp::create(rewriter, op.getLoc(), tokenType,
op.getAsyncDependencies())
.getAsyncToken();

// TODO: Would it be safe to call getAsyncDependenciesMutable() on the
// original op and update it in place, instead of cloning?
Operation *newOp = rewriter.clone(*op.getOperation());
auto iface = dyn_cast<Op>(newOp);
assert(iface && "Expected cloned op to have same type as original op.");
iface.getAsyncDependenciesMutable().assign({waitToken});
rewriter.replaceOp(op, newOp);

return success();
}
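
Regarding the TODO above: one possible in-place variant, sketched under the assumption that mutating the matched op is acceptable to the greedy driver (this has not been verified, which is presumably why the patch clones instead):

```cpp
// Sketch: update the matched op's async dependencies in place rather than
// cloning it; modifyOpInPlace notifies the rewriter about the mutation.
rewriter.modifyOpInPlace(op, [&] {
  op.getAsyncDependenciesMutable().assign({waitToken});
});
return success();
```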

void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,
bool kernelBarePtrCallConv, bool kernelIntersperseSizeCallConv) {
@@ -1830,6 +1877,39 @@ void mlir::populateGpuToLLVMConversionPatterns(
kernelIntersperseSizeCallConv);
}

void mlir::populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns) {
// TODO: Other ops to consider handling:
// - gpu::AllocOp,
// - gpu::DeallocOp,
// - gpu::MemcpyOp,
// - gpu::MemsetOp,
// - gpu::CreateDnTensorOp,
// - gpu::DestroyDnTensorOp,
// - gpu::CreateCooOp,
// - gpu::CreateCooAoSOp,
// - gpu::CreateCsrOp,
// - gpu::Create2To4SpMatOp,
// - gpu::DestroySpMatOp,
// - gpu::SpMVBufferSizeOp,
// - gpu::SpMVOp,
// - gpu::SpMMBufferSizeOp,
// - gpu::SDDMMBufferSizeOp,
// - gpu::SpMMOp,
// - gpu::SDDMMOp,
// - gpu::SpGEMMCreateDescrOp,
// - gpu::SpGEMMDestroyDescrOp,
// - gpu::SpGEMMWorkEstimationOrComputeOp,
// - gpu::SpGEMMCopyOp,
// - gpu::SpMatGetSizeOp,
// - gpu::SetCsrPointersOp,
// - gpu::CreateCscOp,
// - gpu::CreateBsrOp,
// - gpu::LaunchFuncOp
patterns.add<ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>>(
patterns.getContext());
}
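
If any of the ops in the TODO list above are enabled later, the registration would presumably just extend the same call; a sketch (not part of this change, and assuming those ops expose the same async-dependency accessors):

```cpp
patterns.add<ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::AllocOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::MemcpyOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::MemsetOp>>(
    patterns.getContext());
```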

//===----------------------------------------------------------------------===//
// GPUModuleOp convert to LLVM op interface
//===----------------------------------------------------------------------===//
81 changes: 81 additions & 0 deletions mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
@@ -0,0 +1,81 @@
// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s

module attributes {gpu.container_module} {

gpu.module @foo {
gpu.func @bar() kernel {
gpu.return
}
}

// CHECK-LABEL: func @main
func.func @main() {
%c1 = arith.constant 1 : index

// Check that the pass does not modify launch_func ops with a single async dependency:

// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
%t0 = gpu.wait async
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%t1 = gpu.wait async [%t0]
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%0 = gpu.wait async [%t0, %t1]
// CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
%good_call = gpu.launch_func async [%0] @foo::@bar
blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
// CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
gpu.wait [%good_call]

// Check that launch_func ops with multiple dependencies are properly
// handled and do not result in a failure:

// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
%t2 = gpu.wait async
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%t3 = gpu.wait async [%t2]
// Inserted gpu.wait:
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// The rewritten gpu.launch_func now has only one async dependency:
// CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
%bad_call = gpu.launch_func async [%t2, %t3] @foo::@bar
blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
// CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
gpu.wait [%bad_call]
return
}

// TODO: Enable once gpu.alloc with multiple async dependencies is handled
// (see the TODO list in populateGpuMultipleAsyncDepsConversionPatterns):
// func.func @alloc(%size : index) -> memref<?xf32> {
//   %t0 = gpu.wait async
//   %t1 = gpu.wait async [%t0]
//   %memref, %t2 = gpu.alloc async [%t0, %t1] (%size) : memref<?xf32>
//   gpu.wait [%t2]
//   return %memref : memref<?xf32>
// }

}