11 changes: 11 additions & 0 deletions mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
encountered to the local workgroup. Within each mapping, the first three
dimensions are mapped to x/y/z hardware ids and all following dimensions are
mapped to sequential loops.

The order in which loops are assigned to hardware dimensions is controlled
by the `mapping-policy` option. Two policies are supported:
1. `outermost-first` (default): the outermost loop maps to X, the next one
to Y, and the one after that to Z.
2. `innermost-first`: the innermost loop maps to X, the next innermost to
Y, and the one after that to Z.
}];
let dependentDialects = ["mlir::gpu::GPUDialect"];
let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
/*default=*/"\"outermost-first\"",
"Policy outlining how to assign loops to GPU dimensions. "
"Supported values are `outermost-first` and `innermost-first`.">];
}
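
For illustration, the new option is exercised from mlir-opt exactly as in the updated RUN lines of the test below; `input.mlir` is a placeholder file name, not part of the patch:

  mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" input.mlir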

def GpuEliminateBarriers
65 changes: 55 additions & 10 deletions mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp,
namespace gpu {
namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
enum class MappingPolicy { OutermostFirst, InnermostFirst };
} // namespace

static constexpr int kNumHardwareIds = 3;
@@ -65,16 +66,30 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}

// Map the policy string to a typed mapping policy.
// TODO: Revisit this and possibly use a loop interchange pass instead.
static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
std::string policyCanonical = policy.trim().lower();

std::optional<MappingPolicy> option =
llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
[Review thread]
Contributor: Don't use auto.
Contributor (Author): Type is visible on the right-hand side, so I presume the use of auto here is fine?
Contributor: Policy says: "use auto with initializers like cast(...) or other places where the type is already obvious from the context". I argue the type in llvm::StringSwitch<std::optional<MappingPolicy>> is not immediately clear for anyone who is not familiar with llvm::StringSwitch, so it is better to have it explicit.

.Case("innermost-first", MappingPolicy::InnermostFirst)
.Case("outermost-first", MappingPolicy::OutermostFirst)
.Default(std::nullopt);

if (!option)
return failure();
return *option;
}
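
Since the review thread above notes that llvm::StringSwitch may be unfamiliar, here is a minimal plain-C++ sketch of what the parse does (illustrative only, not part of the patch; it assumes the same MappingPolicy, FailureOr, and StringRef as above):

  // Hypothetical equivalent of the StringSwitch chain: normalize, then compare.
  static FailureOr<MappingPolicy> parsePolicySketch(StringRef policy) {
    std::string s = policy.trim().lower(); // " Innermost-First " -> "innermost-first"
    if (s == "innermost-first")
      return MappingPolicy::InnermostFirst;
    if (s == "outermost-first")
      return MappingPolicy::OutermostFirst;
    return failure(); // any other value is rejected
  }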

/// Computes the hardware id to use for a given mapping level. Assigns x, y
/// and z hardware ids to the first three dimensions and sequential after
/// that.
/// TODO: Make this use x for the innermost distributed loop, y for the next
/// innermost, and z for the one after that.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {

if (dimension >= kNumHardwareIds || level == Sequential)
return Processor::Sequential;

switch (level) {
case MapGrid:
switch (dimension) {
@@ -107,20 +122,35 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
/// Add mapping information to the given parallel loop. Do not add
/// mapping information if the loop already has it. Also, don't
/// start a mapping at a nested loop.
static void mapParallelOp(ParallelOp parallelOp,
MappingLevel mappingLevel = MapGrid) {
static void
mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) {
// Do not try to add a mapping to already mapped loops or nested loops.
if (parallelOp->getAttr(getMappingAttrName()) ||
((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
return;

const int numLoops = static_cast<int>(parallelOp.getNumLoops());
const int loopsToMap = std::min(numLoops, kNumHardwareIds);

MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
attrs.reserve(parallelOp.getNumLoops());
for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
attrs.reserve(numLoops);

for (int i = 0; i < numLoops; ++i) {

// Determine the hardware id to use for this loop.
// If there are more loops than hardware ids, map the excess to sequential.
int hwMapping = kNumHardwareIds;
if (i < loopsToMap) {
hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
? i
: (loopsToMap - 1 - i);
}

attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(),
b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
@@ -129,16 +159,31 @@ static void mapParallelOp(ParallelOp parallelOp,
// walk but just iterate over the operations.
for (Operation &op : *parallelOp.getBody()) {
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
mapParallelOp(nested, mappingLevel);
mapParallelOp(nested, mappingLevel, mappingPolicy);
}
}
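
As a worked example of the index arithmetic above: for a 4-deep nest, loopsToMap = min(4, 3) = 3, so outermost-first assigns hardware indices 0, 1, 2, 3 (x, y, z, sequential) while innermost-first assigns 2, 1, 0, 3 (z, y, x, sequential), matching the updated 4-d test below. A standalone sketch reproducing just the arithmetic (not part of the patch):

  #include <algorithm>
  #include <cstdio>
  #include <initializer_list>

  int main() {
    const int kNumHardwareIds = 3, numLoops = 4;
    const int loopsToMap = std::min(numLoops, kNumHardwareIds);
    for (bool innermostFirst : {false, true}) {
      for (int i = 0; i < numLoops; ++i) {
        int hw = kNumHardwareIds; // beyond the hardware ids -> sequential
        if (i < loopsToMap)
          hw = innermostFirst ? (loopsToMap - 1 - i) : i;
        std::printf("%d ", hw);
      }
      std::printf("\n"); // prints "0 1 2 3" then "2 1 0 3"
    }
    return 0;
  }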

namespace {
struct GpuMapParallelLoopsPass
: public impl::GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
using Base::Base;

void runOnOperation() override {
// Parse the mapping policy.
FailureOr<MappingPolicy> policyOrFailure =
getMappingPolicyFromStr(mappingPolicyStr);
if (failed(policyOrFailure)) {
getOperation()->emitError() << "invalid mapping policy specified";
return signalPassFailure();
}

MappingPolicy policy = *policyOrFailure;
MappingLevel topLevel = MappingLevel::MapGrid;

for (Region &region : getOperation()->getRegions()) {
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
region.walk([&](ParallelOp parallelOp) {
mapParallelOp(parallelOp, topLevel, policy);
});
}
}
};
80 changes: 54 additions & 26 deletions mlir/test/Dialect/GPU/mapping.mlir
@@ -1,4 +1,5 @@
// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER
// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER

func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index) {
@@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
return
}

// CHECK-LABEL: func @parallel_loop(
// CHECK: scf.parallel
// CHECK: scf.parallel
// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
// OUTER-LABEL: func @parallel_loop(
// OUTER: scf.parallel
// OUTER: scf.parallel
// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// OUTER-NOT: mapping

// INNER-LABEL: func @parallel_loop(
// INNER: scf.parallel
// INNER: scf.parallel
// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// INNER: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// INNER-NOT: mapping

// -----

@@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
return
}

// CHECK-LABEL: func @parallel_loop_4d(
// CHECK: scf.parallel
// CHECK: scf.parallel
// CHECK: scf.parallel
// CHECK: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
// OUTER-LABEL: func @parallel_loop_4d(
// OUTER: scf.parallel
// OUTER: scf.parallel
// OUTER: scf.parallel
// OUTER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// OUTER-NOT: mapping

// INNER-LABEL: func @parallel_loop_4d(
// INNER: scf.parallel
// INNER: scf.parallel
// INNER: scf.parallel
// INNER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// INNER: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// INNER-NOT: mapping