TPP Runner Wrapper pass #905

Merged: 2 commits, Apr 29, 2024
50 changes: 50 additions & 0 deletions include/TPP/Passes.td
@@ -542,4 +542,54 @@ def ConvertAddInplacePass: Pass<"linalg-convert-add-in-place",
let dependentDialects = ["linalg::LinalgDialect"];
}

def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
let summary = "Create main function runner wrapper";
let description = [{
Creates a runner wrapper - maps the arguments and randomly initializes them.
Optionally, inserts a benchmark wrapper that calls the main kernel repeatedly
and takes measurements, or prints the result at the end.
}];
let dependentDialects = ["func::FuncDialect",
"tensor::TensorDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect",
"scf::SCFDialect",
"vector::VectorDialect",
"bufferization::BufferizationDialect",
"perf::PerfDialect"];
let options = [
Option<"kernelName", "kernel-name", "std::string",
/*default=*/"\"entry\"",
"The kernel function to be called.">,
Option<"kernelType", "kernel-type", "std::string",
/*default=*/"\"void\"",
"The type of the kernel function.">,
Option<"backend", "backend", "std::string",
/*default=*/"\"cpu\"",
"Kernel target device backend (cpu, cuda, vulkan).">,
Option<"offloadToDevice", "offload-on-device", "bool",
/*default=*/"true",
"Offload kernel arguments to the target device.">,
Option<"numBenchLoops", "bench-loops", "int64_t",
/*default=*/"1",
"Number of benchmarking loops.">,
Option<"benchWarmup", "bench-warmup", "bool",
/*default=*/"true",
"Add benchmark warmup loops.">,
Option<"printResult", "print", "bool",
/*default=*/"false",
"Print kernel results.">,
Option<"randomSplat", "random-splat", "bool",
/*default=*/"false",
"Replace splat dense tensors with random values.">,
Option<"seed", "seed", "int64_t",
/*default=*/"0",
"Initialization random seed.">,
Option<"initType", "init-type", "std::string",
/*default=*/"",
"Initializer type (const, simple, cont, rand, normal).">,
];
}

#endif // TPP_DIALECT_TPP_PASSES
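
As a hedged illustration of how the options above compose, the pass can be added to a pass manager through MLIR's textual pipeline syntax. This is a minimal sketch, not part of the diff; it assumes the pass has been registered with the host tool before the pipeline string is parsed.

// Minimal sketch (not part of this PR): add the wrapper pass via the textual
// option syntax derived from the TableGen definition above. Assumes the pass
// is registered and the pass manager is anchored on builtin.module.
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

mlir::LogicalResult addRunnerWrapper(mlir::PassManager &pm) {
  // Option names mirror Passes.td: kernel-name, backend, bench-loops, print.
  return mlir::parsePassPipeline(
      "tpp-runner-wrapper{kernel-name=entry backend=cuda bench-loops=100 "
      "print=true}",
      pm);
}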
44 changes: 26 additions & 18 deletions tools/tpp-run/MLIRBench.h → include/TPP/Runner/MLIRBench.h
@@ -1,14 +1,20 @@
#ifndef TPP_RUN_MLIRBENCH_H
#define TPP_RUN_MLIRBENCH_H

//===- MLIRBench.h - MLIR Benchmark Producer ------------------------------===//
//===- MLIRBench.h - MLIR Benchmark Producer ---------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#ifndef TPP_RUNNER_MLIRBENCH_H
#define TPP_RUNNER_MLIRBENCH_H

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
@@ -25,6 +31,7 @@ class ModuleOp;
class MemRefType;
class Operation;
class Value;

namespace func {
class FuncOp;
} // namespace func
@@ -33,11 +40,15 @@ class FuncOp;
// pipeline.
struct MLIRBenchConfig {
MLIRBenchConfig() = default;
MLIRBenchConfig(int seed, TensorInitType initType)
: seed(seed), initType(initType) {}
MLIRBenchConfig(int seed, TensorInitType initType, std::string backend,
bool offloadToDevice)
: seed(seed), initType(initType), backend(backend),
offloadToDevice(offloadToDevice) {}

int seed = 0;
TensorInitType initType = TensorInitType::Auto;
std::string backend = "cpu";
bool offloadToDevice = true;
};

/// MLIRBench - Creates wrapper for calling kernel methods.
@@ -47,15 +58,6 @@ struct MLIRBenchConfig {
/// interface is a bit weird, but it will get better once we clear the
/// API design, with time.
class MLIRBench {
/// Min number of warmup loops
static unsigned constexpr minIters = 1;

/// Max number of warmup loops
static unsigned constexpr maxIters = 100;

/// Target ratio of warmup loops: ( total iterations / warmupRatio )
static unsigned constexpr warmupRatio = 10;

/// MLIR OpBuilder
OpBuilder builder;

@@ -86,6 +88,12 @@ class MLIRBench {
/// Tensor init type
TensorInitType initType;

/// Target device backend
std::string backend;

/// Allocate arguments on target device
bool offloadToDevice;

/// Gets module's main block
Block &getModuleBlock();

@@ -143,8 +151,8 @@
/// Prints the result of a kernel call
LogicalResult printResult(Operation *kernelCall);

/// Terminates the function, issuing a return, lower to LLVM
LogicalResult finalize();
/// Terminates the function, issuing a return.
LogicalResult terminate();

/// Reports error on the current module's location
LogicalResult emitError(llvm::Twine);
@@ -155,4 +163,4 @@

} // namespace mlir

#endif
#endif // TPP_RUNNER_MLIRBENCH_H
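
A hedged usage sketch of the extended config follows. Only the members and methods visible in this header and in the rest of the diff (MLIRBenchConfig, findKernel, terminate, emitError) are taken from the source; the surrounding driver code and the moduleOp value are illustrative assumptions.

// Hedged sketch (not part of this PR): drive MLIRBench with the new
// backend/offload fields; moduleOp is an assumed mlir::Operation*.
MLIRBenchConfig config(/*seed=*/0, TensorInitType::Auto,
                       /*backend=*/"cuda", /*offloadToDevice=*/true);
MLIRBench bench(moduleOp, config);
if (failed(bench.findKernel("entry")))
  return bench.emitError("cannot find kernel 'entry'");
// ... map and initialize arguments, emit the optional benchmark loop ...
if (failed(bench.terminate()))
  return bench.emitError("failed to terminate the wrapper function");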
1 change: 1 addition & 0 deletions lib/TPP/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(Dialect)
add_subdirectory(Conversion)
add_subdirectory(IR)
add_subdirectory(GPU)
add_subdirectory(Runner)
add_subdirectory(Transforms)

get_property(mlir_dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
14 changes: 14 additions & 0 deletions lib/TPP/Runner/CMakeLists.txt
@@ -0,0 +1,14 @@
add_mlir_library(TPPRunner
MLIRBench.cpp
TppRunnerWrapper.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP

DEPENDS
${mlir_dialect_libs}
MLIRIR
MLIRPass
TPPPerfDialect
TPPTransformsUtils
)
55 changes: 15 additions & 40 deletions tools/tpp-run/MLIRBench.cpp → lib/TPP/Runner/MLIRBench.cpp
@@ -1,12 +1,18 @@
//===- MLIRBench.cpp - MLIR Benchmark Producer ----------------------------===//
//===- MLIRBench.cpp - MLIR Benchmark Producer -----------------*----C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#include "MLIRBench.h"
#include "TPP/Runner/MLIRBench.h"

#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
@@ -47,33 +53,15 @@

using namespace mlir;

// Select target GPU backend for the pipeline.
llvm::cl::opt<std::string>
defGpuBackend("gpu", llvm::cl::desc("Target GPU backend for lowering"),
llvm::cl::value_desc("cuda,vulkan"), llvm::cl::init(""));

// Kernel buffers - arguments and return values - are expected to be allocated
// on GPU.
llvm::cl::opt<bool>
defGpuArgs("gpu-args",
llvm::cl::desc("Kernel buffers are allocated on GPU"),
llvm::cl::init(true));

MLIRBench::MLIRBench(mlir::Operation *op, const MLIRBenchConfig &config)
: builder(op->getContext()), unkLoc(builder.getUnknownLoc()) {
seed = config.seed;
backend = config.backend;
initType = config.initType;
offloadToDevice = config.offloadToDevice;

module = dyn_cast<ModuleOp>(op);
assert(module && "expected a 'builtin.Module' op");
auto *ctx = module->getContext();
ctx->getOrLoadDialect<tensor::TensorDialect>();
ctx->getOrLoadDialect<vector::VectorDialect>();
ctx->getOrLoadDialect<scf::SCFDialect>();
ctx->getOrLoadDialect<math::MathDialect>();
ctx->getOrLoadDialect<bufferization::BufferizationDialect>();
ctx->getOrLoadDialect<perf::PerfDialect>();
ctx->getOrLoadDialect<gpu::GPUDialect>();
}

LogicalResult MLIRBench::findKernel(StringRef name) {
@@ -187,10 +175,10 @@ LogicalResult MLIRBench::renameKernel() {

Value MLIRBench::registerOnGpu(Value buf, MemRefType memRefTy) {
// Do nothing when not using GPU
if (defGpuBackend.empty() || !defGpuArgs)
if (!offloadToDevice || !(backend == "cuda" || backend == "vulkan"))
return buf;

if (defGpuBackend == "vulkan") {
if (backend == "vulkan") {
// Copy to heap as global memory is not shared between host and device
auto localBuf = builder.create<memref::AllocOp>(unkLoc, memRefTy);
auto copy = builder.create<memref::CopyOp>(unkLoc, buf, localBuf);
@@ -396,7 +384,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {

// Kernels must return a single result
Value result = kernelCall->getResult(0);
if (defGpuBackend == "cuda" && defGpuArgs) {
if (backend == "cuda" && offloadToDevice) {
auto resType = cast<ShapedType>(result.getType());
auto memrefType =
MemRefType::get(resType.getShape(), resType.getElementType());
@@ -424,7 +412,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {
return printShapedType(result);
}

LogicalResult MLIRBench::finalize() {
LogicalResult MLIRBench::terminate() {
// If we created a main at all...
// return void and add func to Module
if (main) {
@@ -433,19 +421,6 @@
builder.create<func::ReturnOp>(unkLoc);
}

// A set of default passes that lower any input IR to LLVM
PassManager passManager(module->getContext());

tpp::DefaultPipelineOptions options{defGpuBackend};
passManager.addPass(tpp::createDefaultPipeline(options));

auto result = passManager.run(module);
if (failed(result)) {
llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";
module->print(llvm::errs());
return result;
}

return success();
}

@@ -461,4 +436,4 @@ LogicalResult MLIRBench::emitError(llvm::Twine desc) {
return module.emitError(desc);
}

std::string MLIRBench::getGPUName() { return defGpuBackend; }
std::string MLIRBench::getGPUName() { return backend; }
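
Since terminate() no longer lowers the module, the calling driver is expected to run the lowering pipeline itself. The sketch below is reconstructed from the code removed above; the exact option plumbing in the tpp-run driver is an assumption.

// Hedged sketch (not part of this PR): lowering moved out of MLIRBench, so
// the caller runs the default pipeline after building the wrapper.
PassManager passManager(module->getContext());
tpp::DefaultPipelineOptions options{backend};
passManager.addPass(tpp::createDefaultPipeline(options));
if (failed(passManager.run(module))) {
  llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";
  return failure();
}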