From 61115a86543732f898936c8c3d42519e3dec38be Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Thu, 25 Apr 2024 20:12:45 -0400 Subject: [PATCH 1/7] WIP: Raise to linalg --- include/polygeist/Passes/Passes.h | 1 + include/polygeist/Passes/Passes.td | 9 ++ lib/polygeist/Passes/CMakeLists.txt | 1 + lib/polygeist/Passes/RaiseToLinalg.cpp | 184 +++++++++++++++++++++++++ 4 files changed, 195 insertions(+) create mode 100644 lib/polygeist/Passes/RaiseToLinalg.cpp diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h index 5f3777441d1a..29f5a9b3536c 100644 --- a/include/polygeist/Passes/Passes.h +++ b/include/polygeist/Passes/Passes.h @@ -31,6 +31,7 @@ std::unique_ptr replaceAffineCFGPass(); std::unique_ptr createOpenMPOptPass(); std::unique_ptr createCanonicalizeForPass(); std::unique_ptr createRaiseSCFToAffinePass(); +std::unique_ptr createRaiseAffineToLinalgPass(); std::unique_ptr createCPUifyPass(StringRef method = ""); std::unique_ptr createBarrierRemovalContinuation(); std::unique_ptr detectReductionPass(); diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td index 05c3644c956e..5c17a9d6dc25 100644 --- a/include/polygeist/Passes/Passes.td +++ b/include/polygeist/Passes/Passes.td @@ -151,6 +151,15 @@ def SCFRaiseToAffine : Pass<"raise-scf-to-affine"> { ]; } +def AffineRaiseToLinalg : Pass<"raise-affine-to-linalg"> { + let summary = "Raise affine to linalg"; + let constructor = "mlir::polygeist::createRaiseAffineToLinalgPass()"; + let dependentDialects = [ + "affine::AffineDialect", + "linalg::LinalgDialect", + ]; +} + def SCFCanonicalizeFor : Pass<"canonicalize-scf-for"> { let summary = "Run some additional canonicalization for scf::for"; let constructor = "mlir::polygeist::createCanonicalizeForPass()"; diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt index 5d6164ef53d7..d6947a1931c5 100644 --- a/lib/polygeist/Passes/CMakeLists.txt +++ 
b/lib/polygeist/Passes/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms OpenMPOpt.cpp BarrierRemovalContinuation.cpp RaiseToAffine.cpp + RaiseToLinalg.cpp ParallelLower.cpp TrivialUse.cpp ConvertPolygeistToLLVM.cpp diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp new file mode 100644 index 000000000000..d14771c677fd --- /dev/null +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -0,0 +1,184 @@ +#include "PassDetails.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Passes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "polygeist/Passes/Passes.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "raise-to-affine" + +using namespace mlir; +using namespace mlir::arith; +using namespace polygeist; +using namespace affine; + +namespace { +struct RaiseAffineToLinalg : public SCFRaiseToAffineBase { + void runOnOperation() override; +}; +} // namespace + +// Also want to add support for affine.for ( ) { linalg.generic } -> bigger linalg.generic +// Also probably want to try to do { linalg.generc1(); linalg.generic2(); } -> bigger linalg.generic() + +/* + +affine.for() { + affine.for() { + } + affine.for() { + } +} + +*/ +struct Condition { + bool ifTrue; + AffineIfOp op; +}; +struct ForOpRaising : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineForOp loop, + PatternRewriter &rewriter) const final { + + // Don't handle accumulations in registers for the moment, we can have + // a separate pattern move them into memref's + if 
(loop.getNumReductions() != 0) { + return failure(); + } + + SmallVector, AffineLoadOp> loads; + SmallVector, AffineStoreOp> stores; + + // Check that the only operations within the region are either: + // affine.load, affine.store, affine.if, affine.yield + // Additionally, for each load/store, remember what conditions are + // required for that load or store to execute. + Walkloop result = loop->walk([&](Operation* op) { + // TODO extend this, any non-memory operation is also legal here. + // mul, add, etc (we can just check propety) + if (isa(op)) { + return WalkResult::advance(); + } + if (isa(op)) { + Operation *cur = op->getParentOp(); + std::vector conditions; + while (cur != loop) { + auto ifstmt = dyn_cast(cur); + if (!ifstmt) { + return WalkResult::interrupt(); + } + bool ifTrue = ifstmt.getTrueRegion()->isAncestor(cur); + conditions.push_back(ifTrue, ifstmt); + cur = ifstmt->getParent(); + } + if (auto load = dyn_cast(cur)) { + loads.emplace_back(condition, load); + } else { + auto store = cast(cur); + stores.emplace_back(condition, store); + } + return WalkResult::advance(); + } + return WalkResult::interrupt(); + }); + + if (result.wasInterrupted()) return failure(); + + // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result) + // TODO we can extend this and handle things like reductions, but we're going to start easy for now + for (auto &&[_, store] : stores) { + for (auto &&[_, load]: loads) { + if (mayAlias(load.getMemref(), stores.getMemref())) { + return failure(); + } + } + for (auto &&[_, store2]: stores) { + if (store == store2) continue; + if (mayAlias(store.getMemref(), stores2.getMemref())) { + return failure(); + } + } + } + + + + SmallVector inputs; + SmallVector affineMaps; + for (auto [conds, load]&& : loads) { + // Only support unconditional loads for the moment + if (conds.size() != 0) return failure(); + inputs.push_back(load.getMemref()); + affineMaps.push_back(load.getAffineMap()); 
+ } + + SmallVector outputs; + for (auto [conds, store]&& : stores) { + // Only support unconditional loads for the moment + if (conds.size() != 0) return failure(); + outputs.push_back(store.getMemref()); + affineMaps.push_back(store.getAffineMap()); + } + + ArrayRef iteratorTypes; + // TODO fill this with the for loop bounds + + auto genericOp = rewriter.create( + loc, TypeRange(), inputs, outputs, affineMaps, iteratorTypes, + StringAttr(), + StringAttr()); + + + auto &body = genericOp.getRegion(); + body.takeBody(loop.getRegion()); + + rewriter.setInsertionPointToStart(*body.begin()); + + // This index will replace the use of the affine index + auto idx = rewriter.create(rewriter.getIndexAttr(0)); + rewriter.replaceAllUsesWith(loop.getInductionVariable(), idx); + + // TODO + // replace all loads with the corresponding block arguments we will create in the linalg generic + // + + + SmallVector toreturn; + // push all sotred values into a linalg.yield + + // fixup the return + + + + // return success! + + + return failure(); + } +}; + +void RaiseAffineToLinalg::runOnOperation() { + RewritePatternSet patterns(&getContext()); + patterns.insert(&getContext()); + + GreedyRewriteConfig config; + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config); +} + +namespace mlir { +namespace polygeist { +std::unique_ptr createRaiseAffineToLinalgPass() { + return std::make_unique(); +} +} // namespace polygeist +} // namespace mlir From 82e228d27d9f9522ebf5e1c40ddbb289da34f2a2 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 7 May 2024 20:01:57 -0400 Subject: [PATCH 2/7] It builds --- include/polygeist/Ops.h | 2 + include/polygeist/Passes/Passes.h | 4 ++ lib/polygeist/Ops.cpp | 3 +- lib/polygeist/Passes/RaiseToLinalg.cpp | 71 +++++++++++++++----------- 4 files changed, 49 insertions(+), 31 deletions(-) diff --git a/include/polygeist/Ops.h b/include/polygeist/Ops.h index cae361d2f399..e3f0cc9db17d 100644 --- a/include/polygeist/Ops.h +++ b/include/polygeist/Ops.h @@ -57,6 +57,8 @@ bool mayAlias(mlir::MemoryEffects::EffectInstance a, bool mayAlias(mlir::MemoryEffects::EffectInstance a, mlir::Value b); +bool mayAlias(mlir::Value v, mlir::Value v2); + extern llvm::cl::opt BarrierOpt; template diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h index 29f5a9b3536c..92c5812e8c4c 100644 --- a/include/polygeist/Passes/Passes.h +++ b/include/polygeist/Passes/Passes.h @@ -124,6 +124,10 @@ namespace affine { class AffineDialect; } +namespace linalg { +class LinalgDialect; +} + namespace LLVM { class LLVMDialect; } diff --git a/lib/polygeist/Ops.cpp b/lib/polygeist/Ops.cpp index 926891b40611..d9a60fbcce45 100644 --- a/lib/polygeist/Ops.cpp +++ b/lib/polygeist/Ops.cpp @@ -784,7 +784,8 @@ bool isStackAlloca(Value v) { v.getDefiningOp() || v.getDefiningOp(); } -static bool mayAlias(Value v, Value v2) { + +bool mayAlias(Value v, Value v2) { v = getBase(v); v2 = getBase(v2); if (v == v2) diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index d14771c677fd..09c19f8df5be 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -14,7 +14,7 @@ #include "polygeist/Passes/Passes.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "raise-to-affine" +#define DEBUG_TYPE "raise-to-linalg" using namespace mlir; using namespace mlir::arith; @@ -22,7 +22,7 @@ using namespace polygeist; using namespace affine; namespace { -struct RaiseAffineToLinalg : public SCFRaiseToAffineBase { 
+struct RaiseAffineToLinalg : public AffineRaiseToLinalgBase { void runOnOperation() override; }; } // namespace @@ -43,27 +43,28 @@ affine.for() { struct Condition { bool ifTrue; AffineIfOp op; + Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {} }; -struct ForOpRaising : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct ForOpRaising : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(affine::AffineForOp loop, PatternRewriter &rewriter) const final { // Don't handle accumulations in registers for the moment, we can have // a separate pattern move them into memref's - if (loop.getNumReductions() != 0) { + if (loop.getNumResults() != 0) { return failure(); } - SmallVector, AffineLoadOp> loads; - SmallVector, AffineStoreOp> stores; + SmallVector, AffineLoadOp>> loads; + SmallVector, AffineStoreOp>> stores; // Check that the only operations within the region are either: // affine.load, affine.store, affine.if, affine.yield // Additionally, for each load/store, remember what conditions are // required for that load or store to execute. - Walkloop result = loop->walk([&](Operation* op) { + auto result = loop->walk([&](Operation* op) { // TODO extend this, any non-memory operation is also legal here. 
// mul, add, etc (we can just check propety) if (isa(op)) { @@ -77,18 +78,21 @@ struct ForOpRaising : public OpRewritePattern { if (!ifstmt) { return WalkResult::interrupt(); } - bool ifTrue = ifstmt.getTrueRegion()->isAncestor(cur); - conditions.push_back(ifTrue, ifstmt); - cur = ifstmt->getParent(); + bool ifTrue = ifstmt.getThenRegion().isAncestor(cur->getParentRegion()); + conditions.emplace_back(ifTrue, ifstmt); + cur = ifstmt->getParentOp(); } if (auto load = dyn_cast(cur)) { - loads.emplace_back(condition, load); + loads.emplace_back(conditions, load); } else { auto store = cast(cur); - stores.emplace_back(condition, store); + stores.emplace_back(conditions, store); } return WalkResult::advance(); } + if (isReadNone(op)) { + return WalkResult::advance(); + } return WalkResult::interrupt(); }); @@ -98,13 +102,13 @@ struct ForOpRaising : public OpRewritePattern { // TODO we can extend this and handle things like reductions, but we're going to start easy for now for (auto &&[_, store] : stores) { for (auto &&[_, load]: loads) { - if (mayAlias(load.getMemref(), stores.getMemref())) { + if (mayAlias(load.getMemref(), store.getMemref())) { return failure(); } } for (auto &&[_, store2]: stores) { if (store == store2) continue; - if (mayAlias(store.getMemref(), stores2.getMemref())) { + if (mayAlias(store.getMemref(), store2.getMemref())) { return failure(); } } @@ -114,7 +118,7 @@ struct ForOpRaising : public OpRewritePattern { SmallVector inputs; SmallVector affineMaps; - for (auto [conds, load]&& : loads) { + for (auto &&[conds, load] : loads) { // Only support unconditional loads for the moment if (conds.size() != 0) return failure(); inputs.push_back(load.getMemref()); @@ -122,18 +126,19 @@ struct ForOpRaising : public OpRewritePattern { } SmallVector outputs; - for (auto [conds, store]&& : stores) { + for (auto &&[conds, store] : stores) { // Only support unconditional loads for the moment if (conds.size() != 0) return failure(); 
outputs.push_back(store.getMemref()); affineMaps.push_back(store.getAffineMap()); } - ArrayRef iteratorTypes; - // TODO fill this with the for loop bounds + SmallVector iteratorTypes; + // TODO revisit this later + iteratorTypes.push_back(utils::IteratorType::parallel); auto genericOp = rewriter.create( - loc, TypeRange(), inputs, outputs, affineMaps, iteratorTypes, + loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes, StringAttr(), StringAttr()); @@ -141,27 +146,33 @@ struct ForOpRaising : public OpRewritePattern { auto &body = genericOp.getRegion(); body.takeBody(loop.getRegion()); - rewriter.setInsertionPointToStart(*body.begin()); + auto blk = &*body.begin(); + rewriter.setInsertionPointToStart(blk); // This index will replace the use of the affine index - auto idx = rewriter.create(rewriter.getIndexAttr(0)); - rewriter.replaceAllUsesWith(loop.getInductionVariable(), idx); + auto idx = rewriter.create(loop.getLoc(), rewriter.getIndexAttr(0)); + rewriter.replaceAllUsesWith(loop.getInductionVar(), idx); - // TODO - // replace all loads with the corresponding block arguments we will create in the linalg generic - // + blk->eraseArguments(0, blk->getNumArguments()); + for (auto &&[conds, load] : loads) { + auto arg = blk->addArgument(load.getType(), load.getLoc()); + rewriter.replaceOp(load, arg); + } SmallVector toreturn; - // push all sotred values into a linalg.yield - // fixup the return + for (auto &&[conds, store] : stores) { + toreturn.push_back(store.getValueToStore()); + rewriter.eraseOp(store); + } + rewriter.eraseOp(blk->getTerminator()); + rewriter.setInsertionPointToEnd(blk); + rewriter.create(loop.getLoc(), toreturn); // return success! - - return failure(); } }; From 17d5c262304f936112c72f055e2ed7ecb9236c68 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 7 May 2024 20:20:16 -0400 Subject: [PATCH 3/7] Fixup --- lib/polygeist/Passes/RaiseToLinalg.cpp | 11 +++++---- test/polygeist-opt/linalgraise.mlir | 32 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 test/polygeist-opt/linalgraise.mlir diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index 09c19f8df5be..31dd32205055 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -45,7 +45,7 @@ struct Condition { AffineIfOp op; Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {} }; -struct ForOpRaising : public OpRewritePattern { +struct AffineForOpRaising : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(affine::AffineForOp loop, @@ -65,6 +65,7 @@ struct ForOpRaising : public OpRewritePattern { // Additionally, for each load/store, remember what conditions are // required for that load or store to execute. auto result = loop->walk([&](Operation* op) { + if (op == loop) return WalkResult::advance(); // TODO extend this, any non-memory operation is also legal here. // mul, add, etc (we can just check propety) if (isa(op)) { @@ -82,10 +83,10 @@ struct ForOpRaising : public OpRewritePattern { conditions.emplace_back(ifTrue, ifstmt); cur = ifstmt->getParentOp(); } - if (auto load = dyn_cast(cur)) { + if (auto load = dyn_cast(op)) { loads.emplace_back(conditions, load); } else { - auto store = cast(cur); + auto store = cast(op); stores.emplace_back(conditions, store); } return WalkResult::advance(); @@ -173,13 +174,13 @@ struct ForOpRaising : public OpRewritePattern { // return success! 
- return failure(); + return success(); } }; void RaiseAffineToLinalg::runOnOperation() { RewritePatternSet patterns(&getContext()); - patterns.insert(&getContext()); + patterns.insert(&getContext()); GreedyRewriteConfig config; (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir new file mode 100644 index 000000000000..91f7d86a8fbd --- /dev/null +++ b/test/polygeist-opt/linalgraise.mlir @@ -0,0 +1,32 @@ +// RUN: polygeist-opt --raise-affine-to-linalg --split-input-file %s | FileCheck %s + +module { + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + scf.if %12 { + affine.for %arg4 = 0 to %17 { + %ld = affine.load %18[%arg4] : memref + affine.store %ld, %19[%arg4] : memref + } + } + return + } +} + +// CHECK: func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref, %[[arg3:.+]]: memref) { +// CHECK-NEXT: %[[c4:.+]] = arith.constant 4 : index +// CHECK-NEXT: %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index +// CHECK-NEXT: %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index +// CHECK-NEXT: %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index +// CHECK-NEXT: scf.if %[[arg0]] { +// CHECK-NEXT: affine.for %[[arg4:.+]] = 0 to %[[V2]] { +// CHECK-NEXT: %[[a:.+]] = memref.load %[[arg3]][%[[arg4]]] : memref +// CHECK-NEXT: memref.store %[[a]], %[[arg2]][%[[arg4]]] : memref +// CHECK-NEXT: } +// CHECK-NEXT: } From b4d199f39aa2903abe0ac92ce6d4ca8dcc58d467 Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 7 May 2024 20:42:07 -0400 Subject: [PATCH 4/7] minimal functional --- lib/polygeist/Passes/RaiseToLinalg.cpp | 20 +++++++++++++------- test/polygeist-opt/linalgraise.mlir | 11 +++++++---- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index 31dd32205055..75080be2168e 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -138,22 +138,24 @@ struct AffineForOpRaising : public OpRewritePattern { // TODO revisit this later iteratorTypes.push_back(utils::IteratorType::parallel); + StringAttr empty = StringAttr::get(loop.getContext()); auto genericOp = rewriter.create( loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes, - StringAttr(), - StringAttr()); + empty, + empty); - auto &body = genericOp.getRegion(); - body.takeBody(loop.getRegion()); - - auto blk = &*body.begin(); + auto blk = &*loop.getRegion().begin(); rewriter.setInsertionPointToStart(blk); // This index will replace the use of the affine index auto idx = rewriter.create(loop.getLoc(), rewriter.getIndexAttr(0)); rewriter.replaceAllUsesWith(loop.getInductionVar(), idx); + auto &body = genericOp.getRegion(); + body.takeBody(loop.getRegion()); + + blk->eraseArguments(0, blk->getNumArguments()); for (auto &&[conds, load] : loads) { @@ -161,6 +163,10 @@ struct AffineForOpRaising : public OpRewritePattern { rewriter.replaceOp(load, arg); } + for (auto &&[conds, store] : stores) { + blk->addArgument(store.getValueToStore().getType(), store.getLoc()); + } + SmallVector toreturn; for (auto &&[conds, store] : stores) { @@ -172,7 +178,7 @@ struct AffineForOpRaising : public OpRewritePattern { rewriter.setInsertionPointToEnd(blk); rewriter.create(loop.getLoc(), toreturn); - + rewriter.eraseOp(loop); // return success! 
return success(); } diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir index 91f7d86a8fbd..0470ba91dc74 100644 --- a/test/polygeist-opt/linalgraise.mlir +++ b/test/polygeist-opt/linalgraise.mlir @@ -19,14 +19,17 @@ module { } } +// CHECK: #map = affine_map<(d0) -> (d0)> // CHECK: func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref, %[[arg3:.+]]: memref) { // CHECK-NEXT: %[[c4:.+]] = arith.constant 4 : index // CHECK-NEXT: %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index // CHECK-NEXT: %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index // CHECK-NEXT: %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index // CHECK-NEXT: scf.if %[[arg0]] { -// CHECK-NEXT: affine.for %[[arg4:.+]] = 0 to %[[V2]] { -// CHECK-NEXT: %[[a:.+]] = memref.load %[[arg3]][%[[arg4]]] : memref -// CHECK-NEXT: memref.store %[[a]], %[[arg2]][%[[arg4]]] : memref -// CHECK-NEXT: } +// TODO note that presently we do not ensure that the memrefs are sliced to the right size as the space requires +// CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg2 : memref) outs(%alloca : memref) { +// CHECK-NEXT: ^bb0(%in: f32, %out: f32): +// CHECK-NEXT: linalg.yield %in : f32 +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: } From b8b4fe7c12eea8b1eaf7a665174777926536642c Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 14 May 2024 21:30:53 -0400 Subject: [PATCH 5/7] Now actually correct for indexing --- lib/polygeist/Passes/RaiseToLinalg.cpp | 183 ++++++++++++++++++++++++- test/polygeist-opt/linalgraise.mlir | 23 +++- 2 files changed, 199 insertions(+), 7 deletions(-) diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index 75080be2168e..78c1dbe27f9e 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -1,6 +1,7 @@ #include "PassDetails.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -13,6 +14,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "polygeist/Passes/Passes.h" #include "llvm/Support/Debug.h" +#include "mlir/IR/AffineExpr.h" #define DEBUG_TYPE "raise-to-linalg" @@ -45,6 +47,135 @@ struct Condition { AffineIfOp op; Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {} }; + +bool isLinearInIndex(AffineExpr expr, size_t idx) { + if (!expr.isFunctionOfDim(idx)) { + return true; + } + + if (expr.getKind() == AffineExprKind::DimId) { + return true; + } + + if (expr.getKind() == AffineExprKind::Add) { + auto binop = expr.cast(); + return isLinearInIndex(binop.getLHS(), idx) && isLinearInIndex(binop.getRHS(), idx); + } + if (expr.getKind() == AffineExprKind::Mul) { + auto binop = expr.cast(); + return (isLinearInIndex(binop.getLHS(), idx) && !binop.getRHS().isFunctionOfDim(idx)) || + (isLinearInIndex(binop.getRHS(), idx) && !binop.getLHS().isFunctionOfDim(idx)); + } + + return false; +} + +bool isLinearInIndex(AffineMap map, size_t idx) { + for (auto expr : map.getResults()) { + if (!isLinearInIndex(expr, idx)) + return false; + } + return true; +} + + AffineExpr shiftDimsDown1(AffineExpr expr, unsigned numDims, + unsigned offset) { + SmallVector dims; + for (unsigned idx 
= 0; idx < offset; ++idx) + dims.push_back(getAffineDimExpr(idx, expr.getContext())); + for (unsigned idx = offset; idx < numDims; ++idx) + dims.push_back(getAffineDimExpr(idx - 1, expr.getContext())); + return expr.replaceDimsAndSymbols(dims, {}); + } + + AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim, + unsigned offset) { + assert(offset <= expr.getNumDims()); + return AffineMap::get(expr.getNumDims() - 1, expr.getNumSymbols(), + llvm::map_to_vector<4>( + expr.getResults(), + [&](AffineExpr e) { + return shiftDimsDown1(e, expr.getNumDims(), offset); + }), + expr.getContext()); + } + +// Given an affine map `oldmap`, memref `val`, and corresponding input values (which are a list of indicies, then symbols), +// and a loop index `ind` produce the following: +// 1. A (potentially new) memref value `newval` which does not have any dependence on `ind` +// and +// 2. an affine map `newmap` which takes a single index (`ind`) and produces indices into `newval` such that +// indexing `newval[map(ind)]` produces the same result as indexing the original map. +std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, mlir::OperandRange vals) { + // First we need to remove any dependence on the loop index from the affine map + SmallVector vals_without_idx; + ssize_t dim_idx = -1; + for (auto &&[i, v] : llvm::enumerate(vals)) { + if (v == idx) { + // Offset we're replacing must be an index (not a symbol). + // If we guarantee to run AffineCFG first, this should always be true. + assert(i < oldmap.getNumDims()); + // There should only be one use of the index. 
+ assert(dim_idx == -1); + dim_idx = i; + continue; + } + vals_without_idx.push_back(v); + } + + if (dim_idx != -1 && !isLinearInIndex(oldmap, dim_idx)) { + legal = false; + return {val, oldmap}; + } + + + // Evaluate offsets as oldmap replacing idx with 0, and evaluating at the remaining variables + + AffineMap offsetMap = oldmap; + if (dim_idx != -1) { + offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(0),offsetMap.getNumDims(), offsetMap.getNumSymbols()); + offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx); + } + + AffineMap strideMap = oldmap; + if (dim_idx != -1) { + strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(1),offsetMap.getNumDims(), offsetMap.getNumSymbols()); + strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx); + } + + { + SmallVector subtracts; + for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) { + subtracts.push_back(lhs - rhs); + } + strideMap = AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), subtracts, builder.getContext()); + } + + // Expression to index into the generated subview given the loop index + SmallVector loop_idxs; + + // List of starting offsets into the subview + SmallVector offsets; + SmallVector sizes; + SmallVector strides; + + for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) { + offsets.push_back(builder.create(val.getLoc(), offset_expr, vals_without_idx)); + strides.push_back(builder.create(val.getLoc(), stride_expr, vals_without_idx)); + if (!expr.isFunctionOfDim(dim_idx)) { + loop_idxs.push_back(builder.getAffineConstantExpr(0)); + sizes.push_back(builder.create(val.getLoc(), 1)); + } else { + loop_idxs.push_back(builder.getAffineDimExpr(0)); + sizes.push_back(idx_size); + } + } + + auto newval = builder.create(val.getLoc(), val, offsets, sizes, strides); + legal = true; + 
return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())}; +} + struct AffineForOpRaising : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -115,23 +246,63 @@ struct AffineForOpRaising : public OpRewritePattern { } } - - SmallVector inputs; SmallVector affineMaps; + + if (loop.getStep() != 1) { + return failure(); + } + + // our remapper currently assumes 0 start to bound. + if (!loop.hasConstantLowerBound() || loop.getConstantLowerBound() != 0) { + return failure(); + } + + // compute this correctly later. + auto ub = loop.getSingleUpperBound(); + if (!ub) return failure(); + + auto lb = loop.getSingleLowerBound(); + if (!lb) return failure(); + + + if (!loop.hasConstantUpperBound()) { + return failure(); + } + + Value loopSize = rewriter.create(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create(loop.getLoc(), *ub, *lb); + + // current spec is going to be indexed off of the loop var in isolation for (auto &&[conds, load] : loads) { // Only support unconditional loads for the moment if (conds.size() != 0) return failure(); - inputs.push_back(load.getMemref()); - affineMaps.push_back(load.getAffineMap()); + + bool legal = true; + + auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(), + loopSize, load.getMapOperands()); + + if (!legal) return failure(); + + affineMaps.push_back(newAffineMap); + inputs.push_back(newMemref); } SmallVector outputs; + // Store we may need to reindex into a splat potentially later, but for now we'll be lazy for (auto &&[conds, store] : stores) { // Only support unconditional loads for the moment if (conds.size() != 0) return failure(); - outputs.push_back(store.getMemref()); - affineMaps.push_back(store.getAffineMap()); + + bool legal = true; + + auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(), + loopSize, 
store.getMapOperands()); + + if (!legal) return failure(); + + affineMaps.push_back(newAffineMap); + outputs.push_back(newMemref); } SmallVector iteratorTypes; diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir index 0470ba91dc74..0e23a7c28a0b 100644 --- a/test/polygeist-opt/linalgraise.mlir +++ b/test/polygeist-opt/linalgraise.mlir @@ -10,13 +10,34 @@ module { %17 = arith.divui %16, %c4 : index %19 = memref.alloca(%17) : memref scf.if %12 { - affine.for %arg4 = 0 to %17 { + affine.for %arg4 = 0 to 17 { %ld = affine.load %18[%arg4] : memref affine.store %ld, %19[%arg4] : memref } } return } + + + func.func @main2(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + scf.if %12 { + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[3 * %arg4] : memref + %ld2 = affine.load %18[0] : memref + %fadd = arith.addf %ld, %ld2 : f32 + affine.store %fadd, %19[%arg4 + 17] : memref + } + } + return + } + } // CHECK: #map = affine_map<(d0) -> (d0)> From e1dd3e414e1a63a78def0d49bcf5c33c2ef88c6c Mon Sep 17 00:00:00 2001 From: "William S. 
Moses" Date: Tue, 21 May 2024 20:07:41 -0400 Subject: [PATCH 6/7] Now featuring reductions --- lib/polygeist/Passes/RaiseToLinalg.cpp | 106 ++++++- test/polygeist-opt/linalgraise.mlir | 374 ++++++++++++++++++++++++- 2 files changed, 465 insertions(+), 15 deletions(-) diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index 78c1dbe27f9e..320243ba415a 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -10,6 +10,7 @@ #include "mlir/Dialect/SCF/Transforms/Passes.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Operation.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "polygeist/Passes/Passes.h" @@ -88,6 +89,7 @@ bool isLinearInIndex(AffineMap map, size_t idx) { return expr.replaceDimsAndSymbols(dims, {}); } +//This is reducing the number of input dims in expression by 1 AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim, unsigned offset) { assert(offset <= expr.getNumDims()); @@ -106,10 +108,11 @@ bool isLinearInIndex(AffineMap map, size_t idx) { // and // 2. an affine map `newmap` which takes a single index (`ind`) and produces indices into `newval` such that // indexing `newval[map(ind)]` produces the same result as indexing the original map. 
-std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, mlir::OperandRange vals) { +std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, int loopLowerBound, int loopStepSize, mlir::OperandRange vals) { // First we need to remove any dependence on the loop index from the affine map SmallVector vals_without_idx; ssize_t dim_idx = -1; + //To check if induction variable of for loop in an operand of this op (load/store) for (auto &&[i, v] : llvm::enumerate(vals)) { if (v == idx) { // Offset we're replacing must be an index (not a symbol). @@ -131,18 +134,21 @@ std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, // Evaluate offsets as oldmap replacing idx with 0, and evaluating at the remaining variables + //Instead of lower bound we are using 0 (assumption as the lower bound) AffineMap offsetMap = oldmap; if (dim_idx != -1) { - offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(0),offsetMap.getNumDims(), offsetMap.getNumSymbols()); + offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound),offsetMap.getNumDims(), offsetMap.getNumSymbols()); offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx); } + //Instead of using loop step we are using 1 (Assumption as the stride size) AffineMap strideMap = oldmap; if (dim_idx != -1) { - strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(1),offsetMap.getNumDims(), offsetMap.getNumSymbols()); + strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound + loopStepSize),strideMap.getNumDims(), strideMap.getNumSymbols()); strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx); } + //Subtracting maps of stride and offset, gives you the offset value in the result of the map { SmallVector 
subtracts; for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) { @@ -160,8 +166,8 @@ std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, SmallVector strides; for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) { - offsets.push_back(builder.create(val.getLoc(), offset_expr, vals_without_idx)); - strides.push_back(builder.create(val.getLoc(), stride_expr, vals_without_idx)); + offsets.push_back(builder.create(val.getLoc(),AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), offset_expr, builder.getContext()), vals_without_idx)); //What if there are symbols in the expression? + strides.push_back(builder.create(val.getLoc(),AffineMap::get(strideMap.getNumDims(), strideMap.getNumSymbols(), stride_expr, builder.getContext()), vals_without_idx)); //What if there are symbols in the expression? if (!expr.isFunctionOfDim(dim_idx)) { loop_idxs.push_back(builder.getAffineConstantExpr(0)); sizes.push_back(builder.create(val.getLoc(), 1)); @@ -173,9 +179,20 @@ std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, auto newval = builder.create(val.getLoc(), val, offsets, sizes, strides); legal = true; + //Does this need a fix? Here we are constraining dims to 1 and symbols to 0; should these instead match the original map? return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())}; } + +// store A[...] +// val = load A[...] 
+ +/* prevA : + store A + val is now prevA +*/ + + struct AffineForOpRaising : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -230,11 +247,21 @@ struct AffineForOpRaising : public OpRewritePattern { if (result.wasInterrupted()) return failure(); + DominanceInfo DI(loop); + // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result) // TODO we can extend this and handle things like reductions, but we're going to start easy for now + DenseMap stores_map; for (auto &&[_, store] : stores) { for (auto &&[_, load]: loads) { if (mayAlias(load.getMemref(), store.getMemref())) { + // We have one exception in this case -- if the load and store are from the exact same location, it is permitted. + if (load.getMemref() == store.getMemref() && + load.getAffineMap() == store.getAffineMap() && + load.getIndices() == store.getIndices() && DI.dominates((Operation*)load,(Operation*)store)) { + stores_map[load] = store; + continue; + } return failure(); } } @@ -249,16 +276,25 @@ struct AffineForOpRaising : public OpRewritePattern { SmallVector inputs; SmallVector affineMaps; - if (loop.getStep() != 1) { - return failure(); - } + //if (loop.getStep() != 1) { + // return failure(); + //} // our remapper currently assumes 0 start to bound. - if (!loop.hasConstantLowerBound() || loop.getConstantLowerBound() != 0) { + if (!loop.hasConstantLowerBound() /*|| loop.getConstantLowerBound() != 0*/) { return failure(); } // compute this correctly later. 
+ auto ubMap = loop.getUpperBoundMap(); + auto ubOperands = loop.getUpperBoundOperands(); + if (!ubMap || ubMap.getNumResults() != 1) return failure(); + + // Retrieve the lower bound + auto lbMap = loop.getLowerBoundMap(); + auto lbOperands = loop.getLowerBoundOperands(); + if (!lbMap || lbMap.getNumResults() != 1) return failure(); + auto ub = loop.getSingleUpperBound(); if (!ub) return failure(); @@ -270,17 +306,41 @@ struct AffineForOpRaising : public OpRewritePattern { return failure(); } - Value loopSize = rewriter.create(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create(loop.getLoc(), *ub, *lb); + // Retrieve the step size + int64_t step = loop.getStep(); + + // Get the single result expressions + AffineExpr ubExpr = ubMap.getResult(0); + auto ubValue = rewriter.create(loop.getLoc(), ubMap, ubOperands); + + AffineExpr lbExpr = lbMap.getResult(0); + auto lbValue = rewriter.create(loop.getLoc(), lbMap, lbOperands); + + //// Ensure the bounds are constant expressions + auto ubConst = ubExpr.dyn_cast(); + auto lbConst = lbExpr.dyn_cast(); + if (!ubConst || !lbConst) return failure(); + // Compute the loop size + //int64_t loopSize = ubConst.getValue() - lbConst.getValue(); + auto loopSize = rewriter.create(loop.getLoc(), ubValue, lbValue); + + //Value loopSize = rewriter.create(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create(loop.getLoc(), *ub, *lb); + // current spec is going to be indexed off of the loop var in isolation for (auto &&[conds, load] : loads) { // Only support unconditional loads for the moment if (conds.size() != 0) return failure(); + if (stores_map.find(load) != stores_map.end()) { + // We have a store that represents this load. 
+ continue; + } + bool legal = true; auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(), - loopSize, load.getMapOperands()); + loopSize, lbConst.getValue(), step, load.getMapOperands()); if (!legal) return failure(); @@ -297,7 +357,7 @@ struct AffineForOpRaising : public OpRewritePattern { bool legal = true; auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(), - loopSize, store.getMapOperands()); + loopSize, lbConst.getValue(), step, store.getMapOperands()); if (!legal) return failure(); @@ -307,7 +367,7 @@ struct AffineForOpRaising : public OpRewritePattern { SmallVector iteratorTypes; // TODO revisit this later - iteratorTypes.push_back(utils::IteratorType::parallel); + iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction); StringAttr empty = StringAttr::get(loop.getContext()); auto genericOp = rewriter.create( @@ -330,12 +390,30 @@ struct AffineForOpRaising : public OpRewritePattern { blk->eraseArguments(0, blk->getNumArguments()); for (auto &&[conds, load] : loads) { + if (stores_map.find(load) != stores_map.end()) { + // We have a store that represents this load. 
+ continue; + } auto arg = blk->addArgument(load.getType(), load.getLoc()); rewriter.replaceOp(load, arg); + } for (auto &&[conds, store] : stores) { - blk->addArgument(store.getValueToStore().getType(), store.getLoc()); + auto arg = blk->addArgument(store.getValueToStore().getType(), store.getLoc()); + + SmallVector inverted; + for (auto && [map_load, map_store] : stores_map) { + if (map_store == store) { + inverted.push_back(map_load); + } + } + for (size_t i=0; i toreturn; diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir index 0e23a7c28a0b..e0ceffa1849c 100644 --- a/test/polygeist-opt/linalgraise.mlir +++ b/test/polygeist-opt/linalgraise.mlir @@ -10,7 +10,7 @@ module { %17 = arith.divui %16, %c4 : index %19 = memref.alloca(%17) : memref scf.if %12 { - affine.for %arg4 = 0 to 17 { + affine.for %arg4 = 0 to %17 { %ld = affine.load %18[%arg4] : memref affine.store %ld, %19[%arg4] : memref } @@ -54,3 +54,375 @@ module { // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } + +//constant-access +module @constant_access{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %ci324 = arith.constant 4.0 : f32 + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ci324 : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//constant-mem-access +module @constant_mem_access{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 4 to 17 step 2 { + %ld = 
affine.load %18[3*%arg4] : memref + %ld2 = affine.load %18[%c4] : memref + %mul = arith.mulf %ld, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//without-if +module @no_if{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + affine.store %ld, %19[%arg4] : memref + } + return + } +} + +//arith.mul +module @arith_mul{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//arith.add +module @arith_add{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg4] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//Conditional arith +module @cond_arith{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index 
+ %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %if = scf.if %12 -> f32 { + %mul = arith.mulf %ld, %ld : f32 + scf.yield %mul : f32 + } else { + scf.yield %ld : f32 + } + affine.store %if, %19[%arg4] : memref + } + return + } +} + +//reduction +module @reduction{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + %sum_0 = arith.constant 0.0 : f32 + %red = affine.for %arg4 = 0 to 17 step 1 iter_args(%sum_iter = %sum_0) -> f32 { + %ld1 = affine.load %18[%arg4] : memref + %sum_next = arith.addf %sum_iter, %ld1 : f32 + affine.yield %sum_next : f32 + } + affine.store %red, %19[0] : memref + return + } +} + +//Conditional store-1 +module @cond_store_1 { + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + scf.if %12 { + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//Conditional store-2 +module @cond_store_2{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + scf.if %12 { 
+ %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } else { + affine.store %ld, %19[%arg4] : memref + } + } + return + } +} + +//Parallel for +module @parallel_for{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg4] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//Fors inside for +module @for_within_for{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %21 = arith.muli %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg3 = 0 to 21 { + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %mul = arith.mulf %ld1, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//Parallel fors inside for +module @parallel_fors_inside_for { + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg3 = 0 to 17 { + affine.for %arg4 = 
0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %mul = arith.mulf %ld1, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//matrix-mul iter arg +module @matmul_1 { + memref.global @out : memref<32x8xi32> = uninitialized + memref.global @im2 : memref<8x8xi32> = uninitialized + memref.global @im1 : memref<32x8xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im1 : memref<32x8xi32> + %1 = memref.get_global @im2 : memref<8x8xi32> + %2 = memref.get_global @out : memref<32x8xi32> + affine.for %arg0 = 0 to 32 { + affine.for %arg1 = 0 to 8 { + %3 = affine.for %arg2 = 0 to 8 iter_args(%arg3 = %c0_i32) -> (i32) { + %4 = affine.load %0[%arg0, %arg2] : memref<32x8xi32> + %5 = affine.load %1[%arg2, %arg1] : memref<8x8xi32> + %6 = arith.muli %4, %5 : i32 + %7 = arith.addi %arg3, %6 : i32 + affine.yield %7 : i32 + } + affine.store %3, %2[%arg0, %arg1] : memref<32x8xi32> + } + } + return %c0_i32 : i32 + } +} + +//matrix-mul alias issue +module @matmul_2 { + memref.global @out : memref<128x32xi32> = uninitialized + memref.global @im2 : memref<64x32xi32> = uninitialized + memref.global @im1 : memref<128x64xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im1 : memref<128x64xi32> + %1 = memref.get_global @im2 : memref<64x32xi32> + %2 = memref.get_global @out : memref<128x32xi32> + affine.for %arg0 = 0 to 128 { + affine.for %arg1 = 0 to 32 { + affine.for %arg2 = 0 to 64 { + %3 = affine.load %0[%arg0, %arg2] : memref<128x64xi32> + %4 = affine.load %1[%arg2, %arg1] : 
memref<64x32xi32> + %5 = arith.muli %3, %4 : i32 + %6 = affine.load %2[%arg0, %arg1] : memref<128x32xi32> + %7 = arith.addi %6, %5 : i32 + affine.store %7, %2[%arg0, %arg1] : memref<128x32xi32> + } + } + } + return %c0_i32 : i32 + } +} + +//conv (with inner loop accumulate) +//How to deal with IR in outer loops as well? +module @conv_1{ + memref.global @out : memref<512x64xi32> = uninitialized + memref.global @filter : memref<4x4xi32> = uninitialized + memref.global @im : memref<515x67xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im : memref<515x67xi32> + %1 = memref.get_global @filter : memref<4x4xi32> + %2 = memref.get_global @out : memref<512x64xi32> + affine.for %arg0 = 0 to 512 { + affine.for %arg1 = 0 to 64 { + %3 = affine.for %arg2 = 0 to 4 iter_args(%arg3 = %c0_i32) -> (i32) { + %4 = affine.for %arg4 = 0 to 4 iter_args(%arg5 = %arg3) -> (i32) { + %5 = affine.load %0[%arg0 + %arg2, %arg1 + %arg4] : memref<515x67xi32> + %6 = affine.load %1[%arg2, %arg4] : memref<4x4xi32> + %7 = arith.muli %5, %6 : i32 + %8 = arith.addi %arg5, %7 : i32 + affine.yield %8 : i32 + } + affine.yield %4 : i32 + } + affine.store %3, %2[%arg0, %arg1] : memref<512x64xi32> + } + } + return %c0_i32 : i32 + } +} + +//conv (direct store) +module @conv_2{ + memref.global @out : memref<512x64xi32> = uninitialized + memref.global @filter : memref<4x4xi32> = uninitialized + memref.global @im : memref<515x67xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im : memref<515x67xi32> + %1 = memref.get_global @out : memref<512x64xi32> + affine.for %arg0 = 0 to 512 { + affine.for %arg1 = 0 to 64 { + affine.for %arg2 = 0 to 4 { + affine.for %arg3 = 0 to 4 { + %2 = affine.load %0[%arg0 + %arg2, %arg1 + %arg3] : memref<515x67xi32> + %3 = affine.load %1[%arg0, %arg1] : memref<512x64xi32> + %4 = 
arith.addi %3, %2 : i32 + affine.store %4, %1[%arg0, %arg1] : memref<512x64xi32> + } + } + } + } + return %c0_i32 : i32 + } +} \ No newline at end of file From 1bbf3f69623fdeeb5774d387e212c3bb39e75ce0 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Tue, 21 May 2024 20:16:42 -0400 Subject: [PATCH 7/7] add comments on the raising fors --- lib/polygeist/Passes/RaiseToLinalg.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp index 320243ba415a..254d3a11881b 100644 --- a/lib/polygeist/Passes/RaiseToLinalg.cpp +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -207,6 +207,7 @@ struct AffineForOpRaising : public OpRewritePattern { SmallVector, AffineLoadOp>> loads; SmallVector, AffineStoreOp>> stores; + // TODO Also collect all the linalg generics! // Check that the only operations within the region are either: // affine.load, affine.store, affine.if, affine.yield @@ -251,6 +252,7 @@ struct AffineForOpRaising : public OpRewritePattern { // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result) // TODO we can extend this and handle things like reductions, but we're going to start easy for now + // TODO DenseMap stores_map; for (auto &&[_, store] : stores) { for (auto &&[_, load]: loads) { @@ -272,6 +274,8 @@ struct AffineForOpRaising : public OpRewritePattern { } } } + // Check that any other loads / stores do not alias with any linalg generics + // We're going to need to upgrade the defn of mayAlias for subviews (aka mayAlias(subview, x) -> mayAlias(operand(subview), x)) SmallVector inputs; SmallVector affineMaps; @@ -347,6 +351,7 @@ struct AffineForOpRaising : public OpRewritePattern { affineMaps.push_back(newAffineMap); inputs.push_back(newMemref); } + // TODO Push all of the inputs to the linalg generics (modifying maps as needed) SmallVector outputs; // Store we may need to reindex into a splat 
potentially later, but for now we'll be lazy @@ -364,18 +369,24 @@ struct AffineForOpRaising : public OpRewritePattern { affineMaps.push_back(newAffineMap); outputs.push_back(newMemref); } + // TODO Push all of the outputs to the linalg generics + // TODO presently if linalg generic exists, assert there are no load/stores + // TODO assert only zero or one linalg generic exists SmallVector iteratorTypes; - // TODO revisit this later + // TODO if linalg generic exists, make this iterator type prepend to the existing iterators iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction); + + StringAttr empty = StringAttr::get(loop.getContext()); auto genericOp = rewriter.create( loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes, empty, empty); - + // TODO if doing the linalg generic case, ignore a lot of the below and instead of injecting the old body of the affine.for, move the inner linalg.generic body + // and also add a new induction variable auto blk = &*loop.getRegion().begin(); rewriter.setInsertionPointToStart(blk); @@ -435,6 +446,8 @@ struct AffineForOpRaising : public OpRewritePattern { void RaiseAffineToLinalg::runOnOperation() { RewritePatternSet patterns(&getContext()); + // TODO add the existing canonicalization patterns + // + subview of an affine apply -> subview patterns.insert(&getContext()); GreedyRewriteConfig config;