diff --git a/include/polygeist/Ops.h b/include/polygeist/Ops.h index cae361d2f399..e3f0cc9db17d 100644 --- a/include/polygeist/Ops.h +++ b/include/polygeist/Ops.h @@ -57,6 +57,8 @@ bool mayAlias(mlir::MemoryEffects::EffectInstance a, bool mayAlias(mlir::MemoryEffects::EffectInstance a, mlir::Value b); +bool mayAlias(mlir::Value v, mlir::Value v2); + extern llvm::cl::opt BarrierOpt; template diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h index 5f3777441d1a..92c5812e8c4c 100644 --- a/include/polygeist/Passes/Passes.h +++ b/include/polygeist/Passes/Passes.h @@ -31,6 +31,7 @@ std::unique_ptr replaceAffineCFGPass(); std::unique_ptr createOpenMPOptPass(); std::unique_ptr createCanonicalizeForPass(); std::unique_ptr createRaiseSCFToAffinePass(); +std::unique_ptr createRaiseAffineToLinalgPass(); std::unique_ptr createCPUifyPass(StringRef method = ""); std::unique_ptr createBarrierRemovalContinuation(); std::unique_ptr detectReductionPass(); @@ -123,6 +124,10 @@ namespace affine { class AffineDialect; } +namespace linalg { +class LinalgDialect; +} + namespace LLVM { class LLVMDialect; } diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td index 05c3644c956e..5c17a9d6dc25 100644 --- a/include/polygeist/Passes/Passes.td +++ b/include/polygeist/Passes/Passes.td @@ -151,6 +151,15 @@ def SCFRaiseToAffine : Pass<"raise-scf-to-affine"> { ]; } +def AffineRaiseToLinalg : Pass<"raise-affine-to-linalg"> { + let summary = "Raise affine to linalg"; + let constructor = "mlir::polygeist::createRaiseAffineToLinalgPass()"; + let dependentDialects = [ + "affine::AffineDialect", + "linalg::LinalgDialect", + ]; +} + def SCFCanonicalizeFor : Pass<"canonicalize-scf-for"> { let summary = "Run some additional canonicalization for scf::for"; let constructor = "mlir::polygeist::createCanonicalizeForPass()"; diff --git a/lib/polygeist/Ops.cpp b/lib/polygeist/Ops.cpp index 926891b40611..d9a60fbcce45 100644 --- a/lib/polygeist/Ops.cpp +++ b/lib/polygeist/Ops.cpp @@ -784,7 +784,8 @@ bool isStackAlloca(Value v) { v.getDefiningOp() || v.getDefiningOp(); } -static bool mayAlias(Value v, Value v2) { + +bool mayAlias(Value v, Value v2) { v = getBase(v); v2 = getBase(v2); if (v == v2) diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt index 5d6164ef53d7..d6947a1931c5 100644 --- a/lib/polygeist/Passes/CMakeLists.txt +++ b/lib/polygeist/Passes/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms OpenMPOpt.cpp BarrierRemovalContinuation.cpp RaiseToAffine.cpp + RaiseToLinalg.cpp ParallelLower.cpp TrivialUse.cpp ConvertPolygeistToLLVM.cpp diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp new file mode 100644 index 000000000000..254d3a11881b --- /dev/null +++ b/lib/polygeist/Passes/RaiseToLinalg.cpp @@ -0,0 +1,464 @@ +#include "PassDetails.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Passes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Operation.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "polygeist/Passes/Passes.h" +#include "llvm/Support/Debug.h" +#include "mlir/IR/AffineExpr.h" + +#define DEBUG_TYPE "raise-to-linalg" + +using namespace mlir; +using namespace mlir::arith; +using namespace polygeist; +using namespace affine; + +namespace { +struct RaiseAffineToLinalg : public AffineRaiseToLinalgBase { + void runOnOperation() override; +}; +} // namespace + +// Also want to add support for affine.for ( ) { linalg.generic } -> bigger linalg.generic +// Also probably want to try to do { linalg.generc1(); linalg.generic2(); } -> bigger linalg.generic() + +/* + +affine.for() { + affine.for() { + } + affine.for() { + } +} + +*/ +struct Condition { + bool ifTrue; + AffineIfOp op; + Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {} +}; + +bool isLinearInIndex(AffineExpr expr, size_t idx) { + if (!expr.isFunctionOfDim(idx)) { + return true; + } + + if (expr.getKind() == AffineExprKind::DimId) { + return true; + } + + if (expr.getKind() == AffineExprKind::Add) { + auto binop = expr.cast(); + return isLinearInIndex(binop.getLHS(), idx) && isLinearInIndex(binop.getRHS(), idx); + } + if (expr.getKind() == AffineExprKind::Mul) { + auto binop = expr.cast(); + return (isLinearInIndex(binop.getLHS(), idx) && !binop.getRHS().isFunctionOfDim(idx)) || + (isLinearInIndex(binop.getRHS(), idx) && !binop.getLHS().isFunctionOfDim(idx)); + } + + return false; +} + +bool isLinearInIndex(AffineMap map, size_t idx) { + for (auto expr : map.getResults()) { + if (!isLinearInIndex(expr, idx)) + return false; + } + return true; +} + + AffineExpr shiftDimsDown1(AffineExpr expr, unsigned numDims, + unsigned offset) { + SmallVector dims; + for (unsigned idx = 0; idx < offset; ++idx) + dims.push_back(getAffineDimExpr(idx, expr.getContext())); + for (unsigned idx = offset; idx < numDims; ++idx) + dims.push_back(getAffineDimExpr(idx - 1, expr.getContext())); + return expr.replaceDimsAndSymbols(dims, {}); + } + +//This is reducing the number of input dims in expression by 1 + AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim, + unsigned offset) { + assert(offset <= expr.getNumDims()); + return AffineMap::get(expr.getNumDims() - 1, expr.getNumSymbols(), + llvm::map_to_vector<4>( + expr.getResults(), + [&](AffineExpr e) { + return shiftDimsDown1(e, expr.getNumDims(), offset); + }), + expr.getContext()); + } + +// Given an affine map `oldmap`, memref `val`, and corresponding input values (which are a list of indicies, then symbols), +// and a loop index `ind` produce the following: +// 1. A (potentially new) memref value `newval` which does not have any dependence on `ind` +// and +// 2. an affine map `newmap` which takes a single index (`ind`) and produces indices into `newval` such that +// indexing `newval[map(ind)]` produces the same result as indexing the original map. +std::pair remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, int loopLowerBound, int loopStepSize, mlir::OperandRange vals) { + // First we need to remove any dependence on the loop index from the affine map + SmallVector vals_without_idx; + ssize_t dim_idx = -1; + //To check if induction variable of for loop in an operand of this op (load/store) + for (auto &&[i, v] : llvm::enumerate(vals)) { + if (v == idx) { + // Offset we're replacing must be an index (not a symbol). + // If we guarantee to run AffineCFG first, this should always be true. + assert(i < oldmap.getNumDims()); + // There should only be one use of the index. + assert(dim_idx == -1); + dim_idx = i; + continue; + } + vals_without_idx.push_back(v); + } + + if (dim_idx != -1 && !isLinearInIndex(oldmap, dim_idx)) { + legal = false; + return {val, oldmap}; + } + + + // Evaluate offsets as oldmap replacing idx with 0, and evaluating at the remaining variables + + //Instead of lower bound we are using 0 (assumption as the lower bound) + AffineMap offsetMap = oldmap; + if (dim_idx != -1) { + offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound),offsetMap.getNumDims(), offsetMap.getNumSymbols()); + offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx); + } + + //Instead of using loop step we are using 1 (Assumption as the stride size) + AffineMap strideMap = oldmap; + if (dim_idx != -1) { + strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound + loopStepSize),strideMap.getNumDims(), strideMap.getNumSymbols()); + strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx); + } + + //Subtracting maps of stride and offset, gives you the offset value in the result of the map + { + SmallVector subtracts; + for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) { + subtracts.push_back(lhs - rhs); + } + strideMap = AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), subtracts, builder.getContext()); + } + + // Expression to index into the generated subview given the loop index + SmallVector loop_idxs; + + // List of starting offsets into the subview + SmallVector offsets; + SmallVector sizes; + SmallVector strides; + + for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) { + offsets.push_back(builder.create(val.getLoc(),AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), offset_expr, builder.getContext()), vals_without_idx)); //What is there are symbols in the expression? + strides.push_back(builder.create(val.getLoc(),AffineMap::get(strideMap.getNumDims(), strideMap.getNumSymbols(), stride_expr, builder.getContext()), vals_without_idx)); //What is there are symbols in the expression? + if (!expr.isFunctionOfDim(dim_idx)) { + loop_idxs.push_back(builder.getAffineConstantExpr(0)); + sizes.push_back(builder.create(val.getLoc(), 1)); + } else { + loop_idxs.push_back(builder.getAffineDimExpr(0)); + sizes.push_back(idx_size); + } + } + + auto newval = builder.create(val.getLoc(), val, offsets, sizes, strides); + legal = true; + //Does this need fix? Here we are constraining to dims as 1 and symbols as 0, should it be, original + return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())}; +} + + +// store A[...] +// val = load A[...] + +/* prevA : + store A + val is now prevA +*/ + + +struct AffineForOpRaising : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineForOp loop, + PatternRewriter &rewriter) const final { + + // Don't handle accumulations in registers for the moment, we can have + // a separate pattern move them into memref's + if (loop.getNumResults() != 0) { + return failure(); + } + + SmallVector, AffineLoadOp>> loads; + SmallVector, AffineStoreOp>> stores; + // TODO Also collect all the linalg generics! + + // Check that the only operations within the region are either: + // affine.load, affine.store, affine.if, affine.yield + // Additionally, for each load/store, remember what conditions are + // required for that load or store to execute. + auto result = loop->walk([&](Operation* op) { + if (op == loop) return WalkResult::advance(); + // TODO extend this, any non-memory operation is also legal here. + // mul, add, etc (we can just check propety) + if (isa(op)) { + return WalkResult::advance(); + } + if (isa(op)) { + Operation *cur = op->getParentOp(); + std::vector conditions; + while (cur != loop) { + auto ifstmt = dyn_cast(cur); + if (!ifstmt) { + return WalkResult::interrupt(); + } + bool ifTrue = ifstmt.getThenRegion().isAncestor(cur->getParentRegion()); + conditions.emplace_back(ifTrue, ifstmt); + cur = ifstmt->getParentOp(); + } + if (auto load = dyn_cast(op)) { + loads.emplace_back(conditions, load); + } else { + auto store = cast(op); + stores.emplace_back(conditions, store); + } + return WalkResult::advance(); + } + if (isReadNone(op)) { + return WalkResult::advance(); + } + return WalkResult::interrupt(); + }); + + if (result.wasInterrupted()) return failure(); + + DominanceInfo DI(loop); + + // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result) + // TODO we can extend this and handle things like reductions, but we're going to start easy for now + // TODO + DenseMap stores_map; + for (auto &&[_, store] : stores) { + for (auto &&[_, load]: loads) { + if (mayAlias(load.getMemref(), store.getMemref())) { + // We have one exception in this case -- if the load and store are from the exact same location, it is permitted. + if (load.getMemref() == store.getMemref() && + load.getAffineMap() == store.getAffineMap() && + load.getIndices() == store.getIndices() && DI.dominates((Operation*)load,(Operation*)store)) { + stores_map[load] = store; + continue; + } + return failure(); + } + } + for (auto &&[_, store2]: stores) { + if (store == store2) continue; + if (mayAlias(store.getMemref(), store2.getMemref())) { + return failure(); + } + } + } + // Check that any other loads / stores do not alias with any linalg generics + // We're going to need to upgrade the defn of mayAlias for subviews (aka mayAlias(subview, x) -> mayAlias(operand(subview), x)) + + SmallVector inputs; + SmallVector affineMaps; + + //if (loop.getStep() != 1) { + // return failure(); + //} + + // our remapper currently assumes 0 start to bound. + if (!loop.hasConstantLowerBound() /*|| loop.getConstantLowerBound() != 0*/) { + return failure(); + } + + // compute this correctly later. + auto ubMap = loop.getUpperBoundMap(); + auto ubOperands = loop.getUpperBoundOperands(); + if (!ubMap || ubMap.getNumResults() != 1) return failure(); + + // Retrieve the lower bound + auto lbMap = loop.getLowerBoundMap(); + auto lbOperands = loop.getLowerBoundOperands(); + if (!lbMap || lbMap.getNumResults() != 1) return failure(); + + auto ub = loop.getSingleUpperBound(); + if (!ub) return failure(); + + auto lb = loop.getSingleLowerBound(); + if (!lb) return failure(); + + + if (!loop.hasConstantUpperBound()) { + return failure(); + } + + // Retrieve the step size + int64_t step = loop.getStep(); + + // Get the single result expressions + AffineExpr ubExpr = ubMap.getResult(0); + auto ubValue = rewriter.create(loop.getLoc(), ubMap, ubOperands); + + AffineExpr lbExpr = lbMap.getResult(0); + auto lbValue = rewriter.create(loop.getLoc(), lbMap, lbOperands); + + //// Ensure the bounds are constant expressions + auto ubConst = ubExpr.dyn_cast(); + auto lbConst = lbExpr.dyn_cast(); + if (!ubConst || !lbConst) return failure(); + + // Compute the loop size + //int64_t loopSize = ubConst.getValue() - lbConst.getValue(); + auto loopSize = rewriter.create(loop.getLoc(), ubValue, lbValue); + + //Value loopSize = rewriter.create(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create(loop.getLoc(), *ub, *lb); + + // current spec is going to be indexed off of the loop var in isolation + for (auto &&[conds, load] : loads) { + // Only support unconditional loads for the moment + if (conds.size() != 0) return failure(); + + if (stores_map.find(load) != stores_map.end()) { + // We have a store that represents this load. + continue; + } + + bool legal = true; + + auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(), + loopSize, lbConst.getValue(), step, load.getMapOperands()); + + if (!legal) return failure(); + + affineMaps.push_back(newAffineMap); + inputs.push_back(newMemref); + } + // TODO Push all of the inputs to the linalg generics (modifying maps as needed) + + SmallVector outputs; + // Store we may need to reindex into a splat potentially later, but for now we'll be lazy + for (auto &&[conds, store] : stores) { + // Only support unconditional loads for the moment + if (conds.size() != 0) return failure(); + + bool legal = true; + + auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(), + loopSize, lbConst.getValue(), step, store.getMapOperands()); + + if (!legal) return failure(); + + affineMaps.push_back(newAffineMap); + outputs.push_back(newMemref); + } + // TODO Push all of the outputs to the linalg generics + + // TODO presently if linalg generic exists, assert there are no load/stores + // TODO assert only zero or one linalg generic exists + SmallVector iteratorTypes; + // TODO if linalg generic exists, make this iterator type prepend to the existing iterators + iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction); + + + + StringAttr empty = StringAttr::get(loop.getContext()); + auto genericOp = rewriter.create( + loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes, + empty, + empty); + + // TODO if doing the linalg generic case, ignore a lot of the below and instead of injecting the old body of the affine.for, move the inner linalg.generic body + // and also add a new induction variable + auto blk = &*loop.getRegion().begin(); + rewriter.setInsertionPointToStart(blk); + + // This index will replace the use of the affine index + auto idx = rewriter.create(loop.getLoc(), rewriter.getIndexAttr(0)); + rewriter.replaceAllUsesWith(loop.getInductionVar(), idx); + + auto &body = genericOp.getRegion(); + body.takeBody(loop.getRegion()); + + + blk->eraseArguments(0, blk->getNumArguments()); + + for (auto &&[conds, load] : loads) { + if (stores_map.find(load) != stores_map.end()) { + // We have a store that represents this load. + continue; + } + auto arg = blk->addArgument(load.getType(), load.getLoc()); + rewriter.replaceOp(load, arg); + + } + + for (auto &&[conds, store] : stores) { + auto arg = blk->addArgument(store.getValueToStore().getType(), store.getLoc()); + + SmallVector inverted; + for (auto && [map_load, map_store] : stores_map) { + if (map_store == store) { + inverted.push_back(map_load); + } + } + for (size_t i=0; i toreturn; + + for (auto &&[conds, store] : stores) { + toreturn.push_back(store.getValueToStore()); + rewriter.eraseOp(store); + } + + rewriter.eraseOp(blk->getTerminator()); + rewriter.setInsertionPointToEnd(blk); + rewriter.create(loop.getLoc(), toreturn); + + rewriter.eraseOp(loop); + // return success! + return success(); + } +}; + +void RaiseAffineToLinalg::runOnOperation() { + RewritePatternSet patterns(&getContext()); + // TODO add the existing canonicalization patterns + // + subview of an affine apply -> subview + patterns.insert(&getContext()); + + GreedyRewriteConfig config; + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config); +} + +namespace mlir { +namespace polygeist { +std::unique_ptr createRaiseAffineToLinalgPass() { + return std::make_unique(); +} +} // namespace polygeist +} // namespace mlir diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir new file mode 100644 index 000000000000..e0ceffa1849c --- /dev/null +++ b/test/polygeist-opt/linalgraise.mlir @@ -0,0 +1,428 @@ +// RUN: polygeist-opt --raise-affine-to-linalg --split-input-file %s | FileCheck %s + +module { + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + scf.if %12 { + affine.for %arg4 = 0 to %17 { + %ld = affine.load %18[%arg4] : memref + affine.store %ld, %19[%arg4] : memref + } + } + return + } + + + func.func @main2(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + scf.if %12 { + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[3 * %arg4] : memref + %ld2 = affine.load %18[0] : memref + %fadd = arith.addf %ld, %ld2 : f32 + affine.store %fadd, %19[%arg4 + 17] : memref + } + } + return + } + +} + +// CHECK: #map = affine_map<(d0) -> (d0)> +// CHECK: func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref, %[[arg3:.+]]: memref) { +// CHECK-NEXT: %[[c4:.+]] = arith.constant 4 : index +// CHECK-NEXT: %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index +// CHECK-NEXT: %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index +// CHECK-NEXT: %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index +// CHECK-NEXT: scf.if %[[arg0]] { +// TODO note that presently we do not ensure that the memrefs are sliced to the right size as the space requires +// CHECK-NEXT: linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg2 : memref) outs(%alloca : memref) { +// CHECK-NEXT: ^bb0(%in: f32, %out: f32): +// CHECK-NEXT: linalg.yield %in : f32 +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: } + +//constant-access +module @constant_access{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %ci324 = arith.constant 4.0 : f32 + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ci324 : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//constant-mem-access +module @constant_mem_access{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 4 to 17 step 2 { + %ld = affine.load %18[3*%arg4] : memref + %ld2 = affine.load %18[%c4] : memref + %mul = arith.mulf %ld, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//without-if +module @no_if{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + affine.store %ld, %19[%arg4] : memref + } + return + } +} + +//arith.mul +module @arith_mul{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//arith.add +module @arith_add{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg4] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//Conditional arith +module @cond_arith{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %if = scf.if %12 -> f32 { + %mul = arith.mulf %ld, %ld : f32 + scf.yield %mul : f32 + } else { + scf.yield %ld : f32 + } + affine.store %if, %19[%arg4] : memref + } + return + } +} + +//reduction +module @reduction{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + %sum_0 = arith.constant 0.0 : f32 + %red = affine.for %arg4 = 0 to 17 step 1 iter_args(%sum_iter = %sum_0) -> f32 { + %ld1 = affine.load %18[%arg4] : memref + %sum_next = arith.addf %sum_iter, %ld1 : f32 + affine.yield %sum_next : f32 + } + affine.store %red, %19[0] : memref + return + } +} + +//Conditional store-1 +module @cond_store_1 { + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + scf.if %12 { + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//Conditional store-2 +module @cond_store_2{ + func.func @main(%12 : i1, %14 : i32, %18 : memref ) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + scf.if %12 { + %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } else { + affine.store %ld, %19[%arg4] : memref + } + } + return + } +} + +//Parallel for +module @parallel_for{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg4 = 0 to 17 { + %ld = affine.load %18[%arg4] : memref + %mul = arith.mulf %ld, %ld : f32 + affine.store %mul, %19[%arg4] : memref + } + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg4] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + return + } +} + +//Fors inside for +module @for_within_for{ + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %21 = arith.muli %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg3 = 0 to 21 { + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %mul = arith.mulf %ld1, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//Parallel fors inside for +module @parallel_fors_inside_for { + func.func @main(%12 : i1, %14 : i32, %18 : memref, %20 : memref) { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %15 = arith.index_cast %14 : i32 to index + %16 = arith.muli %15, %c4 : index + %17 = arith.divui %16, %c4 : index + %19 = memref.alloca(%17) : memref + affine.for %arg3 = 0 to 17 { + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %mul = arith.mulf %ld1, %ld2 : f32 + affine.store %mul, %19[%arg4] : memref + } + affine.for %arg4 = 0 to 17 { + %ld1 = affine.load %18[%arg3] : memref + %ld2 = affine.load %20[%arg4] : memref + %add = arith.addf %ld1, %ld2 : f32 + %mul = arith.mulf %add, %add : f32 + affine.store %mul, %19[%arg4] : memref + } + } + return + } +} + +//matrix-mul iter arg +module @matmul_1 { + memref.global @out : memref<32x8xi32> = uninitialized + memref.global @im2 : memref<8x8xi32> = uninitialized + memref.global @im1 : memref<32x8xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im1 : memref<32x8xi32> + %1 = memref.get_global @im2 : memref<8x8xi32> + %2 = memref.get_global @out : memref<32x8xi32> + affine.for %arg0 = 0 to 32 { + affine.for %arg1 = 0 to 8 { + %3 = affine.for %arg2 = 0 to 8 iter_args(%arg3 = %c0_i32) -> (i32) { + %4 = affine.load %0[%arg0, %arg2] : memref<32x8xi32> + %5 = affine.load %1[%arg2, %arg1] : memref<8x8xi32> + %6 = arith.muli %4, %5 : i32 + %7 = arith.addi %arg3, %6 : i32 + affine.yield %7 : i32 + } + affine.store %3, %2[%arg0, %arg1] : memref<32x8xi32> + } + } + return %c0_i32 : i32 + } +} + +//matrix-mul alias issue +module @matmul_2 { + memref.global @out : memref<128x32xi32> = uninitialized + memref.global @im2 : memref<64x32xi32> = uninitialized + memref.global @im1 : memref<128x64xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im1 : memref<128x64xi32> + %1 = memref.get_global @im2 : memref<64x32xi32> + %2 = memref.get_global @out : memref<128x32xi32> + affine.for %arg0 = 0 to 128 { + affine.for %arg1 = 0 to 32 { + affine.for %arg2 = 0 to 64 { + %3 = affine.load %0[%arg0, %arg2] : memref<128x64xi32> + %4 = affine.load %1[%arg2, %arg1] : memref<64x32xi32> + %5 = arith.muli %3, %4 : i32 + %6 = affine.load %2[%arg0, %arg1] : memref<128x32xi32> + %7 = arith.addi %6, %5 : i32 + affine.store %7, %2[%arg0, %arg1] : memref<128x32xi32> + } + } + } + return %c0_i32 : i32 + } +} + +//conv (with inner loop accumulate) +//How to deal with IR in outer loops as well? +module @conv_1{ + memref.global @out : memref<512x64xi32> = uninitialized + memref.global @filter : memref<4x4xi32> = uninitialized + memref.global @im : memref<515x67xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im : memref<515x67xi32> + %1 = memref.get_global @filter : memref<4x4xi32> + %2 = memref.get_global @out : memref<512x64xi32> + affine.for %arg0 = 0 to 512 { + affine.for %arg1 = 0 to 64 { + %3 = affine.for %arg2 = 0 to 4 iter_args(%arg3 = %c0_i32) -> (i32) { + %4 = affine.for %arg4 = 0 to 4 iter_args(%arg5 = %arg3) -> (i32) { + %5 = affine.load %0[%arg0 + %arg2, %arg1 + %arg4] : memref<515x67xi32> + %6 = affine.load %1[%arg2, %arg4] : memref<4x4xi32> + %7 = arith.muli %5, %6 : i32 + %8 = arith.addi %arg5, %7 : i32 + affine.yield %8 : i32 + } + affine.yield %4 : i32 + } + affine.store %3, %2[%arg0, %arg1] : memref<512x64xi32> + } + } + return %c0_i32 : i32 + } +} + +//conv (direct store) +module @conv_2{ + memref.global @out : memref<512x64xi32> = uninitialized + memref.global @filter : memref<4x4xi32> = uninitialized + memref.global @im : memref<515x67xi32> = uninitialized + func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @im : memref<515x67xi32> + %1 = memref.get_global @out : memref<512x64xi32> + affine.for %arg0 = 0 to 512 { + affine.for %arg1 = 0 to 64 { + affine.for %arg2 = 0 to 4 { + affine.for %arg3 = 0 to 4 { + %2 = affine.load %0[%arg0 + %arg2, %arg1 + %arg3] : memref<515x67xi32> + %3 = affine.load %1[%arg0, %arg1] : memref<512x64xi32> + %4 = arith.addi %3, %2 : i32 + affine.store %4, %1[%arg0, %arg1] : memref<512x64xi32> + } + } + } + } + return %c0_i32 : i32 + } +} \ No newline at end of file