Skip to content

Commit

Permalink
[MLIR] Simplify affine maps + operands exploiting IV info
Browse files Browse the repository at this point in the history
Simplify affine expressions and maps while exploiting simple range and
step info of any IVs that are operands. This simplification is local,
O(1) and practically useful in several scenarios. Accesses with
floordiv's and mod's where the LHS is non-negative and bounded or is a
known multiple of a constant can often be simplified. This is
implemented as a canonicalization that applies generically to all affine
ops: affine.load/store, affine.vector_load/store, affine.apply,
affine.min/max, etc.

E.g., for tiled loop nests that access buffers this way:

affine.for %i = 0 to 1024 step 32 {
  affine.for %ii = 0 to 32 {
    affine.load [(%i + %ii) floordiv 32, (%i + %ii) mod 32]
  }
}

// Note that %i is a multiple of 32 and %ii < 32, hence:

(%i + %ii) floordiv 32 is the same as %i floordiv 32
(%i + %ii) mod 32 is the same as %ii mod 32.

The simplification leads to simpler index/subscript arithmetic for
multi-dimensional arrays and, in turn, enables detection of spatial
locality (e.g., for vectorization), temporal locality, or loop
invariance for hoisting or scalar replacement.

Differential Revision: https://reviews.llvm.org/D135085
  • Loading branch information
bondhugula committed Oct 4, 2022
1 parent 82cac65 commit ddff376
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 4 deletions.
6 changes: 6 additions & 0 deletions mlir/include/mlir/IR/AffineMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,12 @@ class AffineMap {
/// Returns `*this` if `numResults` >= `this->getNumResults()`.
AffineMap getMinorSubMap(unsigned numResults) const;

/// Get the largest known divisor of all map expressions, i.e. the GCD of the
/// largest known divisors of the individual result expressions.
/// E.g., for (d0, d1) -> (8*d0 + 4, 4*d1 + 2), the result is 2: the first
/// result is a known multiple of 4, the second of 2.
/// In the case of maps with no expressions or all zero constant expressions,
/// the largest known divisor is trivially the max uint64_t value.
uint64_t getLargestKnownDivisorOfMapExprs();

friend ::llvm::hash_code hash_value(AffineMap arg);

/// Methods supporting C API.
Expand Down
165 changes: 165 additions & 0 deletions mlir/lib/Dialect/Affine/IR/AffineOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include <numeric>

using namespace mlir;

Expand Down Expand Up @@ -578,6 +579,169 @@ OpFoldResult AffineApplyOp::fold(ArrayRef<Attribute> operands) {
return result[0];
}

/// Computes the largest divisor of `e` that is known statically, consulting
/// `operands` when `e` is a bare dim expression.
///
/// For any expression other than an AffineDimExpr the operand-agnostic
/// AffineExpr::getLargestKnownDivisor() result is returned. For a dim
/// expression, the operand at the dim's position is inspected: if it is the
/// induction variable of an affine.for, the divisor is derived from the loop's
/// lower bound and step; otherwise the result is 1.
static int64_t getLargestKnownDivisor(AffineExpr e, ArrayRef<Value> operands) {
  // TODO: More powerful simplification would have to teach
  // AffineExpr::getLargestKnownDivisor about `operands` for dim/sym
  // expressions nested anywhere in `e`; that would move it out of the IR
  // library (which can only depend on simple O(1) checks) and into the
  // `Analysis` library.
  auto dim = e.dyn_cast<AffineDimExpr>();
  if (!dim)
    return e.getLargestKnownDivisor();

  // Only loop IVs are exploited here. mlir::getLargestKnownDivisorOfValue is
  // deliberately not used: it lives in the LoopAnalysis library, and the other
  // simplifications it enables are expected from other canonicalizations.
  // TODO: With the right accessors, this can be extended to
  // LoopLikeOpInterface.
  AffineForOp loop = getForInductionVarOwner(operands[dim.getPosition()]);
  if (!loop)
    return 1;

  // An IV that starts at zero only ever takes multiples of the step.
  if (loop.hasConstantLowerBound() && loop.getConstantLowerBound() == 0)
    return loop.getStep();

  // Otherwise the IV is (lower bound + multiple of step), so anything dividing
  // both the lower bound results and the step divides the IV.
  uint64_t lowerBoundDivisor =
      loop.getLowerBoundMap().getLargestKnownDivisorOfMapExprs();
  return std::gcd(lowerBoundDivisor, loop.getStep());
}

/// Returns true if `e` is known to satisfy 0 <= `e` < `k`. Only two simple
/// shapes of `e` are recognized:
///   * a constant expression, checked directly against the range;
///   * a dim expression whose operand is an affine.for induction variable with
///     a constant lower bound >= 0 and a constant upper bound <= `k`.
/// Everything else conservatively yields false.
static bool isNonNegativeBoundedBy(AffineExpr e, ArrayRef<Value> operands,
                                   int64_t k) {
  if (auto cst = e.dyn_cast<AffineConstantExpr>()) {
    int64_t value = cst.getValue();
    return 0 <= value && value < k;
  }

  if (auto dim = e.dyn_cast<AffineDimExpr>()) {
    // TODO: With the right accessors, this can be extended to
    // LoopLikeOpInterface.
    AffineForOp loop = getForInductionVarOwner(operands[dim.getPosition()]);
    // Operands defined by a constant or an affine.apply are intentionally not
    // looked through: other patterns already handle them, and propagation of
    // loop IVs or constants would happen there.
    return loop && loop.hasConstantLowerBound() &&
           loop.getConstantLowerBound() >= 0 &&
           loop.hasConstantUpperBound() && loop.getConstantUpperBound() <= k;
  }

  return false;
}

/// Checks whether `e` is an addition that can be read as (multiple of d) + r
/// with 0 <= r < d, where d is the largest known divisor of the first term.
/// Both operand orders of the addition are tried, LHS-as-multiple first.
///
/// On success, `div` is set to d, `quotientTimesDiv` to the term that is a
/// multiple of d, and `rem` to r. On failure, `quotientTimesDiv` and `rem` are
/// left untouched; `div` is overwritten whenever `e` is an addition, so it
/// must not be relied upon unless true is returned.
static bool isQTimesDPlusR(AffineExpr e, ArrayRef<Value> operands, int64_t &div,
                           AffineExpr &quotientTimesDiv, AffineExpr &rem) {
  auto sum = e.dyn_cast<AffineBinaryOpExpr>();
  if (!sum || sum.getKind() != AffineExprKind::Add)
    return false;

  // Tries to interpret `multiple` as the d * e_1 term and `remainder` as the
  // bounded e_2 term; records the decomposition when it holds.
  auto matches = [&](AffineExpr multiple, AffineExpr remainder) {
    div = getLargestKnownDivisor(multiple, operands);
    if (!isNonNegativeBoundedBy(remainder, operands, div))
      return false;
    quotientTimesDiv = multiple;
    rem = remainder;
    return true;
  };

  AffineExpr first = sum.getLHS();
  AffineExpr second = sum.getRHS();
  return matches(first, second) || matches(second, first);
}

/// Simplify `expr` in place while exploiting information from the values in
/// `operands` (through getLargestKnownDivisor and isNonNegativeBoundedBy).
///
/// Child expressions are simplified first and `expr` is rebuilt from them.
/// After that, only `floordiv` and `mod` expressions whose RHS is a positive
/// constant `c` are rewritten:
///   (e_1 + e_2) floordiv c -> e_1 floordiv c  if e_1 is a multiple of k,
///                                             0 <= e_2 < k and c % k == 0;
///   (e_1 + e_2) mod c      -> e_2 mod c       if e_1 is a multiple of k,
///                                             0 <= e_2 < k and k % c == 0;
///   lhs floordiv c         -> 0               if 0 <= lhs < c;
///   lhs mod c              -> 0               if lhs is a known multiple of c.
/// Anything else is left as rebuilt from its simplified children.
static void simplifyExprAndOperands(AffineExpr &expr,
                                    ArrayRef<Value> operands) {
  // We do this only for certain floordiv/mod expressions.
  auto binExpr = expr.dyn_cast<AffineBinaryOpExpr>();
  if (!binExpr)
    return;

  // Simplify the child expressions first.
  auto lhs = binExpr.getLHS();
  auto rhs = binExpr.getRHS();
  simplifyExprAndOperands(lhs, operands);
  simplifyExprAndOperands(rhs, operands);
  expr = getAffineBinaryOpExpr(binExpr.getKind(), lhs, rhs);

  binExpr = expr.dyn_cast<AffineBinaryOpExpr>();
  if (!binExpr || (binExpr.getKind() != AffineExprKind::FloorDiv &&
                   binExpr.getKind() != AffineExprKind::Mod)) {
    return;
  }

  // Constructing the expression above may itself have simplified it, so the
  // operands of the rebuilt expression can differ from the locals used to
  // build it. Re-read them so the rules below reason about `expr` as it is.
  lhs = binExpr.getLHS();
  rhs = binExpr.getRHS();

  auto rhsConst = rhs.dyn_cast<AffineConstantExpr>();
  if (!rhsConst)
    return;

  int64_t rhsConstVal = rhsConst.getValue();
  // A floordiv/mod by a non-positive constant is undefined; leave such
  // expressions untouched. This also keeps the `% rhsConstVal` computations
  // below from dividing by zero.
  if (rhsConstVal <= 0)
    return;

  AffineExpr quotientTimesDiv, rem;
  int64_t divisor;

  // Simplify expressions of the form e = (e_1 + e_2) floordiv c or (e_1 + e_2)
  // mod c, where e_1 is a multiple of `k` and 0 <= e_2 < k. In such cases, if
  // `c` % `k` == 0, (e_1 + e_2) floordiv c can be simplified to e_1 floordiv c.
  // And when k % c == 0, (e_1 + e_2) mod c can be simplified to e_2 mod c.
  if (isQTimesDPlusR(lhs, operands, divisor, quotientTimesDiv, rem)) {
    if (rhsConstVal % divisor == 0 &&
        binExpr.getKind() == AffineExprKind::FloorDiv) {
      expr = quotientTimesDiv.floorDiv(rhsConst);
    } else if (divisor % rhsConstVal == 0 &&
               binExpr.getKind() == AffineExprKind::Mod) {
      expr = rem % rhsConst;
    }
    return;
  }

  // Handle the simple case when the LHS expression can be either upper
  // bounded or is a known multiple of RHS constant.
  // lhs floordiv c -> 0 if 0 <= lhs < c,
  // lhs mod c -> 0 if lhs % c = 0.
  if ((isNonNegativeBoundedBy(lhs, operands, rhsConstVal) &&
       binExpr.getKind() == AffineExprKind::FloorDiv) ||
      (getLargestKnownDivisor(lhs, operands) % rhsConstVal == 0 &&
       binExpr.getKind() == AffineExprKind::Mod)) {
    expr = getAffineConstantExpr(0, expr.getContext());
  }
}

/// Rewrites `map` in place so that each of its result expressions has been
/// run through simplifyExprAndOperands against `operands`. The number of dims
/// and symbols of the map is preserved; `operands` must supply exactly one
/// value per map input.
// The "unused attribute" marker silences a warning stemming from the inability
// to see through the template expansion.
static void LLVM_ATTRIBUTE_UNUSED
simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
  assert(map.getNumInputs() == operands.size() && "invalid operands for map");

  // Start from a copy of the current results and simplify each one in place.
  ArrayRef<AffineExpr> currentResults = map.getResults();
  SmallVector<AffineExpr> simplified(currentResults.begin(),
                                     currentResults.end());
  for (AffineExpr &result : simplified)
    simplifyExprAndOperands(result, operands);

  map = AffineMap::get(map.getNumDims(), map.getNumSymbols(), simplified,
                       map.getContext());
}

/// Replace all occurrences of AffineExpr at position `pos` in `map` by the
/// defining AffineApplyOp expression and operands.
/// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
Expand Down Expand Up @@ -1095,6 +1259,7 @@ struct SimplifyAffineOp : public OpRewritePattern<AffineOpTy> {
SmallVector<Value, 8> resultOperands(oldOperands);
composeAffineMapAndOperands(&map, &resultOperands);
canonicalizeMapAndOperands(&map, &resultOperands);
simplifyMapWithOperands(map, resultOperands);
if (map == oldMap && std::equal(oldOperands.begin(), oldOperands.end(),
resultOperands.begin()))
return failure();
Expand Down
12 changes: 12 additions & 0 deletions mlir/lib/IR/AffineMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <limits>
#include <numeric>

using namespace mlir;

Expand Down Expand Up @@ -241,6 +242,17 @@ AffineMap::inferFromExprList(ArrayRef<SmallVector<AffineExpr, 4>> exprsList) {
return ::inferFromExprList(exprsList);
}

uint64_t AffineMap::getLargestKnownDivisorOfMapExprs() {
uint64_t gcd = 0;
for (AffineExpr resultExpr : getResults()) {
uint64_t thisGcd = resultExpr.getLargestKnownDivisor();
gcd = std::gcd(gcd, thisGcd);
}
if (gcd == 0)
gcd = std::numeric_limits<uint64_t>::max();
return gcd;
}

AffineMap AffineMap::getMultiDimIdentityMap(unsigned numDims,
MLIRContext *context) {
SmallVector<AffineExpr, 4> dimExprs;
Expand Down
46 changes: 42 additions & 4 deletions mlir/test/Dialect/Affine/canonicalize.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@ func.func @compose_affine_maps_2d_tile(%0: memref<16x32xf32>, %1: memref<16x32xf
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index

affine.for %i0 = 0 to 3 {
affine.for %i0 = 0 to 16 {
%x0 = affine.apply affine_map<(d0)[s0] -> (d0 ceildiv s0)> (%i0)[%c4]
affine.for %i1 = 0 to 3 {
affine.for %i1 = 0 to 16 {
%x1 = affine.apply affine_map<(d0)[s0] -> (d0 ceildiv s0)> (%i1)[%c8]
affine.for %i2 = 0 to 3 {
affine.for %i2 = 0 to 16 {
%x2 = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)> (%i2)[%c4]
affine.for %i3 = 0 to 3 {
affine.for %i3 = 0 to 16 {
%x3 = affine.apply affine_map<(d0)[s0] -> (d0 mod s0)> (%i3)[%c8]

%x40 = affine.apply affine_map<(d0, d1, d2, d3)[s0, s1] ->
Expand Down Expand Up @@ -1150,3 +1150,41 @@ module {
return %s: memref<32x64xf32>
}
}

// -----

// Simplification of maps exploiting operand info: floordiv/mod subscripts are
// simplified using the constant bounds and step of the enclosing affine.for
// loops whose IVs are operands.

// CHECK-LABEL: func @simplify_with_operands
func.func @simplify_with_operands(%N: index, %A: memref<?x32xf32>) {
// CHECK-NEXT: affine.for %[[I:.*]] = 0 to %{{.*}}
affine.for %i = 0 to %N step 32 {
// CHECK-NEXT: affine.for %[[II:.*]] = 0 to 32
affine.for %ii = 0 to 32 {
// %ii is less than 32 and %i is a multiple of 32, so both subscripts fold
// to zero.
// CHECK: affine.load %{{.*}}[0, 0]
%x = affine.load %A[%ii floordiv 32, %i mod 32] : memref<?x32xf32>
"test.foo"(%x) : (f32) -> ()

// %i is aligned at 32 boundary and %ii < 32.
// CHECK: affine.load %{{.*}}[%[[I]] floordiv 32, %[[II]] mod 32]
%a = affine.load %A[(%i + %ii) floordiv 32, (%i + %ii) mod 32] : memref<?x32xf32>
"test.foo"(%a) : (f32) -> ()
// 64 is a multiple of 32, so the floordiv drops %ii; 32 is not a multiple of
// 64, so the mod is left as is.
// CHECK: affine.load %{{.*}}[%[[I]] floordiv 64, (%[[I]] + %[[II]]) mod 64]
%b = affine.load %A[(%i + %ii) floordiv 64, (%i + %ii) mod 64] : memref<?x32xf32>
"test.foo"(%b) : (f32) -> ()
// 32 is a multiple of 16, so the mod keeps only %ii; 16 is not a multiple of
// 32, so the floordiv is left as is.
// CHECK: affine.load %{{.*}}[(%[[I]] + %[[II]]) floordiv 16, %[[II]] mod 16]
%c = affine.load %A[(%i + %ii) floordiv 16, (%i + %ii) mod 16] : memref<?x32xf32>
"test.foo"(%c) : (f32) -> ()
}
}

// Should not simplify: the lower bound is -1, so %i is neither known to lie in
// [0, 32) nor known to be a multiple of 32.
affine.for %i = -1 to 32 {
// CHECK: affine.load %{{.*}}[%{{.*}} floordiv {{.*}}, %{{.*}} mod {{.*}}] :
%x = affine.load %A[%i floordiv 32, %i mod 32] : memref<?x32xf32>
"test.foo"(%x) : (f32) -> ()
}

return
}

0 comments on commit ddff376

Please sign in to comment.