diff --git a/include/polygeist/Ops.h b/include/polygeist/Ops.h
index cae361d2f399..e3f0cc9db17d 100644
--- a/include/polygeist/Ops.h
+++ b/include/polygeist/Ops.h
@@ -57,6 +57,8 @@ bool mayAlias(mlir::MemoryEffects::EffectInstance a,
 
 bool mayAlias(mlir::MemoryEffects::EffectInstance a, mlir::Value b);
 
+bool mayAlias(mlir::Value v, mlir::Value v2);
+
 extern llvm::cl::opt<bool> BarrierOpt;
 
 template <bool NotTopLevel = false>
diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h
index 5f3777441d1a..92c5812e8c4c 100644
--- a/include/polygeist/Passes/Passes.h
+++ b/include/polygeist/Passes/Passes.h
@@ -31,6 +31,7 @@ std::unique_ptr<Pass> replaceAffineCFGPass();
 std::unique_ptr<Pass> createOpenMPOptPass();
 std::unique_ptr<Pass> createCanonicalizeForPass();
 std::unique_ptr<Pass> createRaiseSCFToAffinePass();
+std::unique_ptr<Pass> createRaiseAffineToLinalgPass();
 std::unique_ptr<Pass> createCPUifyPass(StringRef method = "");
 std::unique_ptr<Pass> createBarrierRemovalContinuation();
 std::unique_ptr<Pass> detectReductionPass();
@@ -123,6 +124,10 @@ namespace affine {
 class AffineDialect;
 }
 
+namespace linalg {
+class LinalgDialect;
+}
+
 namespace LLVM {
 class LLVMDialect;
 }
diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td
index 05c3644c956e..5c17a9d6dc25 100644
--- a/include/polygeist/Passes/Passes.td
+++ b/include/polygeist/Passes/Passes.td
@@ -151,6 +151,15 @@ def SCFRaiseToAffine : Pass<"raise-scf-to-affine"> {
   ];
 }
 
+def AffineRaiseToLinalg : Pass<"raise-affine-to-linalg"> {
+  let summary = "Raise affine to linalg";
+  let constructor = "mlir::polygeist::createRaiseAffineToLinalgPass()";
+  let dependentDialects = [
+    "affine::AffineDialect",
+    "linalg::LinalgDialect",
+  ];
+}
+
 def SCFCanonicalizeFor : Pass<"canonicalize-scf-for"> {
   let summary = "Run some additional canonicalization for scf::for";
   let constructor = "mlir::polygeist::createCanonicalizeForPass()";
diff --git a/lib/polygeist/Ops.cpp b/lib/polygeist/Ops.cpp
index 926891b40611..d9a60fbcce45 100644
--- a/lib/polygeist/Ops.cpp
+++ b/lib/polygeist/Ops.cpp
@@ -784,7 +784,8 @@ bool isStackAlloca(Value v) {
          v.getDefiningOp<memref::AllocOp>() ||
          v.getDefiningOp<LLVM::AllocaOp>();
 }
-static bool mayAlias(Value v, Value v2) {
+
+bool mayAlias(Value v, Value v2) {
   v = getBase(v);
   v2 = getBase(v2);
   if (v == v2)
diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt
index 5d6164ef53d7..d6947a1931c5 100644
--- a/lib/polygeist/Passes/CMakeLists.txt
+++ b/lib/polygeist/Passes/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms
   OpenMPOpt.cpp
   BarrierRemovalContinuation.cpp
   RaiseToAffine.cpp
+  RaiseToLinalg.cpp
   ParallelLower.cpp
   TrivialUse.cpp
   ConvertPolygeistToLLVM.cpp
diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
new file mode 100644
index 000000000000..254d3a11881b
--- /dev/null
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -0,0 +1,464 @@
+#include "PassDetails.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Passes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "polygeist/Passes/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/IR/AffineExpr.h"
+
+#define DEBUG_TYPE "raise-to-linalg"
+
+using namespace mlir;
+using namespace mlir::arith;
+using namespace polygeist;
+using namespace affine;
+
+namespace {
+struct RaiseAffineToLinalg : public AffineRaiseToLinalgBase<RaiseAffineToLinalg> {
+  void runOnOperation() override; // Defined below: greedily applies AffineForOpRaising.
+};
+} // namespace
+
+// Also want to add support for affine.for ( ) { linalg.generic } -> bigger linalg.generic
+// Also probably want to try to do { linalg.generic1(); linalg.generic2(); } -> bigger linalg.generic()
+
+/*
+
+affine.for() {
+    affine.for() {
+    } 
+    affine.for() {
+    }
+}
+
+*/
+struct Condition { // One affine.if enclosing a load/store inside the loop being raised.
+    bool ifTrue;   // Branch flag recorded by the collector for that affine.if.
+    AffineIfOp op; // The enclosing affine.if operation.
+    Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {}
+};
+
+// Returns true when `expr` is at most linear in dim `idx`: it does not use the dim, is the
+// dim itself, or is an add/mul tree where each product has one factor free of the dim.
+bool isLinearInIndex(AffineExpr expr, size_t idx) {
+  if (!expr.isFunctionOfDim(idx))
+    return true;
+  switch (expr.getKind()) {
+  case AffineExprKind::DimId:
+    return true;
+  case AffineExprKind::Add: {
+    auto sum = expr.cast<AffineBinaryOpExpr>();
+    return isLinearInIndex(sum.getLHS(), idx) && isLinearInIndex(sum.getRHS(), idx);
+  }
+  case AffineExprKind::Mul: {
+    auto prod = expr.cast<AffineBinaryOpExpr>();
+    return (!prod.getRHS().isFunctionOfDim(idx) && isLinearInIndex(prod.getLHS(), idx)) ||
+           (!prod.getLHS().isFunctionOfDim(idx) && isLinearInIndex(prod.getRHS(), idx));
+  }
+  default: // Any other kind that still depends on the dim is rejected.
+    return false;
+  }
+}
+
+// Map overload: the map is linear in dim `idx` when every result expression is
+// (so a map with no results is trivially linear).
+bool isLinearInIndex(AffineMap map, size_t idx) {
+  return llvm::all_of(map.getResults(), [idx](AffineExpr result) {
+    return isLinearInIndex(result, idx);
+  });
+}
+
+ // Renumbers dims once dim `offset` has been substituted away: d_i stays for i < offset and becomes
+ // d_{i-1} for i > offset. The removed slot is a constant placeholder (not a wrapped-around d_{-1}).
+ AffineExpr shiftDimsDown1(AffineExpr expr, unsigned numDims, unsigned offset) {
+   assert(!expr.isFunctionOfDim(offset) && "dim being removed must be substituted first");
+   SmallVector<AffineExpr, 4> dims;
+   for (unsigned idx = 0; idx < numDims; ++idx)
+     dims.push_back(idx == offset ? getAffineConstantExpr(0, expr.getContext()) : getAffineDimExpr(idx < offset ? idx : idx - 1, expr.getContext()));
+   return expr.replaceDimsAndSymbols(dims, {});
+ }
+
+// Removes input dim `offset` from `expr`, which must no longer reference it: dims above
+// `offset` are renumbered down by one and symbols are kept. `numDim` is currently unused.
+ AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim,
+                                  unsigned offset) {
+   // Strict bound: offset == getNumDims() names no dim, and a zero-dim map would
+   // underflow the dim count computed below.
+   assert(offset < expr.getNumDims() && "offset must name an existing dim");
+   auto shifted = llvm::map_to_vector<4>(expr.getResults(), [&](AffineExpr e) {
+     return shiftDimsDown1(e, expr.getNumDims(), offset);
+   });
+   return AffineMap::get(expr.getNumDims() - 1, expr.getNumSymbols(), shifted, expr.getContext());
+ }
+
+// Given an affine map `oldmap`, memref `val`, the map operands `vals` (a list of indices, then symbols),
+// and a loop induction variable `idx`, produce the following:
+//  1. a memref.subview `newval` of `val` whose offsets and strides do not depend on `idx`: offsets are
+//     `oldmap` at loopLowerBound, strides are oldmap(lb + step) - oldmap(lb), sizes are `idx_size` or 1;
+//  2. an affine map taking the single iteration index and producing indices into `newval`.
+// `legal` is set to false, and {val, oldmap} returned with no IR created, when `idx` is passed as a
+// symbol, is used more than once, or `oldmap` is not linear in it; otherwise it is set to true.
+std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, int loopLowerBound, int loopStepSize, mlir::OperandRange vals) {
+    // First we need to remove any dependence on the loop index from the affine map
+    SmallVector<Value> vals_without_idx;
+    ssize_t dim_idx = -1;
+    // Locate the operand position of the induction variable, if this access uses it at all.
+    for (auto &&[i, v] : llvm::enumerate(vals)) {
+        if (v == idx) {
+            // Report unsupported uses as illegal instead of asserting (asserts vanish in release).
+            if (i >= oldmap.getNumDims() || dim_idx != -1) {
+                legal = false;
+                return {val, oldmap};
+            }
+            dim_idx = i;
+            continue;
+        }
+        vals_without_idx.push_back(v);
+    }
+
+    if (dim_idx != -1 && !isLinearInIndex(oldmap, dim_idx)) {
+        legal = false;
+        return {val, oldmap};
+    }
+
+    // Offsets: oldmap with the loop dim fixed to the lower bound, over the remaining operands.
+    AffineMap offsetMap = oldmap;
+    if (dim_idx != -1) {
+        offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound),offsetMap.getNumDims(), offsetMap.getNumSymbols());
+        offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx);
+    }
+
+    // Strides, first half: oldmap evaluated one step past the lower bound.
+    AffineMap strideMap = oldmap;
+    if (dim_idx != -1) {
+        strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound + loopStepSize),strideMap.getNumDims(), strideMap.getNumSymbols());
+        strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx);
+    }
+
+    // Second half: subtract the offsets. For a map linear in the loop dim the difference is the
+    // per-iteration stride of each result (zero when the access does not use the loop index).
+    {
+        SmallVector<AffineExpr> subtracts;
+        for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) {
+            subtracts.push_back(lhs - rhs);
+        }
+        strideMap = AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), subtracts, builder.getContext());
+    }
+
+    // Expression to index into the generated subview given the loop index
+    SmallVector<AffineExpr> loop_idxs;
+
+    // Offsets, sizes and strides of the subview, one entry per map result
+    SmallVector<Value> offsets;
+    SmallVector<Value> sizes;
+    SmallVector<Value> strides;
+
+    for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) {
+        offsets.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(),AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), offset_expr, builder.getContext()), vals_without_idx)); // symbols are forwarded via vals_without_idx
+        strides.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(),AffineMap::get(strideMap.getNumDims(), strideMap.getNumSymbols(), stride_expr, builder.getContext()), vals_without_idx));
+        // Results that do not use the loop dim span one element; the others span idx_size.
+        if (dim_idx == -1 || !expr.isFunctionOfDim(dim_idx)) {
+            loop_idxs.push_back(builder.getAffineConstantExpr(0));
+            sizes.push_back(builder.create<arith::ConstantIndexOp>(val.getLoc(), 1));
+        } else {
+            loop_idxs.push_back(builder.getAffineDimExpr(0));
+            sizes.push_back(idx_size);
+        }
+    }
+
+    auto newval = builder.create<memref::SubViewOp>(val.getLoc(), val, offsets, sizes, strides);
+    legal = true;
+    // The returned map has one dim (the iteration index) and no symbols.
+    return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())};
+}
+
+
+// store A[...]
+// val = load A[...]
+
+/*  prevA : 
+    store A
+    val is now prevA
+*/
+
+
+// Raises an affine.for into a one-dimensional linalg.generic.
+// Requirements checked here: the loop has no results and constant bounds; its body holds only
+// affine.load/store, affine.if/yield and read-none ops; loads and stores are unconditional and
+// linear in the induction variable; and accesses do not alias, except for a load that reads
+// exactly the location a store it dominates writes. The body moves into the generic: loads
+// become block arguments and stored values become the yielded results.
+// Every failure() is returned before any IR is created, as the rewriter contract requires.
+struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
+  using OpRewritePattern<affine::AffineForOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineForOp loop,
+                                PatternRewriter &rewriter) const final {
+
+    // Don't handle accumulations in registers for the moment, we can have
+    // a separate pattern move them into memref's
+    if (loop.getNumResults() != 0) {
+        return failure();
+    }
+
+    SmallVector<std::pair<std::vector<Condition>, AffineLoadOp>> loads;
+    SmallVector<std::pair<std::vector<Condition>, AffineStoreOp>> stores;
+    // TODO Also collect all the linalg generics!
+
+    // Check that the only operations within the region are either:
+    //      affine.load, affine.store, affine.if, affine.yield, or read-none ops.
+    // Additionally, for each load/store, remember what conditions are
+    // required for that load or store to execute.
+    auto result = loop->walk<WalkOrder::PreOrder>([&](Operation* op) {
+        if (op == loop) return WalkResult::advance();
+        if (isa<AffineYieldOp, AffineIfOp>(op)) {
+            return WalkResult::advance();
+        }
+        if (isa<AffineLoadOp, AffineStoreOp>(op)) {
+            // `inner` is the op one nesting level below `cur`; its parent region identifies the
+            // branch of `cur` taken (cur's own parent region is never inside cur's then-region).
+            Operation *inner = op;
+            Operation *cur = op->getParentOp();
+            std::vector<Condition> conditions;
+            while (cur != loop) {
+                auto ifstmt = dyn_cast<AffineIfOp>(cur);
+                if (!ifstmt) {
+                    return WalkResult::interrupt();
+                }
+                bool ifTrue = ifstmt.getThenRegion().isAncestor(inner->getParentRegion());
+                conditions.emplace_back(ifTrue, ifstmt);
+                inner = cur;
+                cur = ifstmt->getParentOp();
+            }
+            if (auto load = dyn_cast<AffineLoadOp>(op)) {
+                loads.emplace_back(conditions, load);
+            } else {
+                auto store = cast<AffineStoreOp>(op);
+                stores.emplace_back(conditions, store);
+            }
+            return WalkResult::advance();
+        }
+        if (isReadNone(op)) {
+            return WalkResult::advance();
+        }
+        return WalkResult::interrupt();
+    });
+
+    if (result.wasInterrupted()) return failure();
+
+    // Used below to require that a load comes before the store it is paired with.
+    DominanceInfo DI(loop);
+
+    // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result)
+    // TODO we can extend this and handle things like reductions, but we're going to start easy for now
+    DenseMap<AffineLoadOp, AffineStoreOp> stores_map;
+    for (auto &&[_, store] : stores) {
+        for (auto &&[_, load]: loads) {
+            if (mayAlias(load.getMemref(), store.getMemref())) {
+                // We have one exception in this case -- if the load and store are from the exact same location, it is permitted.
+                if (load.getMemref() == store.getMemref() &&
+                    load.getAffineMap() == store.getAffineMap() &&
+                    load.getIndices() == store.getIndices() && DI.dominates((Operation*)load,(Operation*)store)) {
+                        stores_map[load] = store;
+                        continue;
+                    }
+                return failure();
+            }
+        }
+        for (auto &&[_, store2]: stores) {
+            if (store == store2) continue;
+            if (mayAlias(store.getMemref(), store2.getMemref())) {
+                return failure();
+            }
+        }
+    }
+    // Check that any other loads / stores do not alias with any linalg generics
+    // We're going to need to upgrade the defn of mayAlias for subviews (aka mayAlias(subview, x) -> mayAlias(operand(subview), x))
+
+    SmallVector<Value> inputs;
+    SmallVector<AffineMap> affineMaps;
+
+    // Only constant bounds are supported; read them directly instead of materializing
+    // affine.apply ops for them, so nothing is created before the remaining checks.
+    if (!loop.hasConstantLowerBound() || !loop.hasConstantUpperBound()) {
+        return failure();
+    }
+    int64_t lb = loop.getConstantLowerBound();
+    int64_t ub = loop.getConstantUpperBound();
+
+    // Retrieve the step size
+    int64_t step = loop.getStep();
+
+    // Validate every access before creating IR (the same conditions remap_in_affine_dim relies
+    // on): the induction variable appears at most once, as a dim, and the map is linear in it.
+    auto isRemappable = [&](AffineMap map, mlir::OperandRange operands) {
+        ssize_t ivDim = -1;
+        for (auto &&[i, v] : llvm::enumerate(operands)) {
+            if (v != loop.getInductionVar()) continue;
+            if (i >= map.getNumDims() || ivDim != -1) return false;
+            ivDim = i;
+        }
+        return ivDim == -1 || isLinearInIndex(map, ivDim);
+    };
+    // Only support unconditional loads and stores for the moment
+    for (auto &&[conds, load] : loads) {
+        if (conds.size() != 0 || !isRemappable(load.getAffineMap(), load.getMapOperands())) return failure();
+    }
+    for (auto &&[conds, store] : stores) {
+        if (conds.size() != 0 || !isRemappable(store.getAffineMap(), store.getMapOperands())) return failure();
+    }
+
+    // Number of iterations: ceildiv(ub - lb, step), or 0 for an empty range. This, not ub - lb,
+    // is the extent of the subviews along the loop dimension when step != 1.
+    int64_t tripCount = ub <= lb ? 0 : (ub - lb + step - 1) / step;
+    Value loopSize = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), tripCount);
+
+    // current spec is going to be indexed off of the loop var in isolation
+    for (auto &&[conds, load] : loads) {
+        if (stores_map.find(load) != stores_map.end()) {
+            // We have a store that represents this load.
+            continue;
+        }
+
+        bool legal = true;
+
+        auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(),
+        loopSize, lb, step, load.getMapOperands());
+
+        assert(legal && "access was validated before any IR was created");
+
+        affineMaps.push_back(newAffineMap);
+        inputs.push_back(newMemref);
+    }
+    // TODO Push all of the inputs to the linalg generics (modifying maps as needed)
+
+    SmallVector<Value> outputs;
+    // Store we may need to reindex into a splat potentially later, but for now we'll be lazy
+    for (auto &&[conds, store] : stores) {
+        bool legal = true;
+
+        auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(),
+        loopSize, lb, step, store.getMapOperands());
+
+        assert(legal && "access was validated before any IR was created");
+
+        affineMaps.push_back(newAffineMap);
+        outputs.push_back(newMemref);
+    }
+    // TODO Push all of the outputs to the linalg generics
+
+    // TODO presently  if linalg generic exists, assert there are no load/stores
+    // TODO assert only zero or one linalg generic exists
+    SmallVector<utils::IteratorType> iteratorTypes;
+    // TODO if linalg generic exists, make this iterator type prepend to the existing iterators
+    // A load paired with a store to the same location makes the loop dimension a reduction.
+    iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction);
+
+    StringAttr empty = StringAttr::get(loop.getContext());
+    auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
+      loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
+      empty,
+      empty);
+
+    // TODO if doing the linalg generic case, ignore a lot of the below and instead of injecting the old body of the affine.for, move the inner linalg.generic body
+    // and also add a new induction variable
+    auto blk = &*loop.getRegion().begin();
+    rewriter.setInsertionPointToStart(blk);
+
+    // linalg.index counts iterations from 0, so the remaining users of the affine induction
+    // variable get lb + step * index. The dim attribute is built as an i64 integer attribute.
+    Value newIv = rewriter.create<linalg::IndexOp>(loop.getLoc(), rewriter.getI64IntegerAttr(0));
+    if (step != 1) {
+        Value stepVal = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), step);
+        newIv = rewriter.create<arith::MulIOp>(loop.getLoc(), newIv, stepVal);
+    }
+    if (lb != 0) {
+        Value lbVal = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), lb);
+        newIv = rewriter.create<arith::AddIOp>(loop.getLoc(), newIv, lbVal);
+    }
+    rewriter.replaceAllUsesWith(loop.getInductionVar(), newIv);
+
+    auto &body = genericOp.getRegion();
+    body.takeBody(loop.getRegion());
+
+    // The induction variable has no users left; drop it so the block arguments can be rebuilt
+    // as one per input followed by one per output.
+    blk->eraseArguments(0, blk->getNumArguments());
+
+    // Each load that was remapped above reads its own input block argument.
+    for (auto &&[conds, load] : loads) {
+        if (stores_map.find(load) != stores_map.end()) {
+            // We have a store that represents this load.
+            continue;
+        }
+        auto arg = blk->addArgument(load.getType(), load.getLoc());
+        rewriter.replaceOp(load, arg);
+
+    }
+
+    // One output argument per store; loads paired with that store read it instead.
+    for (auto &&[conds, store] : stores) {
+        auto arg = blk->addArgument(store.getValueToStore().getType(), store.getLoc());
+
+        SmallVector<AffineLoadOp> inverted;
+        for (auto && [map_load, map_store] : stores_map) {
+            if (map_store == store) {
+                inverted.push_back(map_load);
+            }
+        }
+        for (size_t i=0; i<inverted.size(); i++) {
+            stores_map.erase(inverted[i]);
+            auto tmp = inverted[i];
+            inverted[i] = nullptr;
+            rewriter.replaceOp(tmp, arg);
+        }
+    }
+
+    SmallVector<Value> toreturn;
+
+    // The stored values become the yielded results, in store order.
+    for (auto &&[conds, store] : stores) {
+        toreturn.push_back(store.getValueToStore());
+        rewriter.eraseOp(store);
+    }
+
+    // Replace the affine.yield terminator with a linalg.yield of those values.
+    rewriter.eraseOp(blk->getTerminator());
+    rewriter.setInsertionPointToEnd(blk);
+    rewriter.create<linalg::YieldOp>(loop.getLoc(), toreturn);
+
+    rewriter.eraseOp(loop);
+    // return success!
+    return success();
+  }
+};
+
+// Pass entry point: greedily applies AffineForOpRaising to the operation this pass runs on.
+// The driver's convergence result is intentionally ignored.
+void RaiseAffineToLinalg::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  RewritePatternSet raisingPatterns(ctx);
+  // TODO add the existing canonicalization patterns
+  //  + subview of an affine apply -> subview
+  raisingPatterns.insert<AffineForOpRaising>(ctx);
+  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(raisingPatterns), GreedyRewriteConfig());
+}
+
+namespace mlir::polygeist {
+// Factory for the raise-affine-to-linalg pass; Passes.td names it as the pass constructor.
+std::unique_ptr<Pass> createRaiseAffineToLinalgPass() {
+  std::unique_ptr<Pass> pass = std::make_unique<RaiseAffineToLinalg>();
+  return pass;
+}
+} // namespace mlir::polygeist
diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir
new file mode 100644
index 000000000000..e0ceffa1849c
--- /dev/null
+++ b/test/polygeist-opt/linalgraise.mlir
@@ -0,0 +1,428 @@
+// RUN: polygeist-opt --raise-affine-to-linalg --split-input-file %s | FileCheck %s
+
+module {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    scf.if %12 {
+      affine.for %arg4 = 0 to %17 {
+        %ld = affine.load %18[%arg4] : memref<?xf32>
+        affine.store %ld, %19[%arg4] : memref<?xf32>
+      }
+   }
+    return
+  }
+
+
+  func.func @main2(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    scf.if %12 {
+      affine.for %arg4 = 0 to 17 {
+        %ld = affine.load %18[3 * %arg4] : memref<?xf32>
+        %ld2 = affine.load %18[0] : memref<?xf32>
+        %fadd = arith.addf %ld, %ld2 : f32
+        affine.store %fadd, %19[%arg4 + 17] : memref<?xf32>
+      }
+   }
+    return
+  }
+
+}
+
+// CHECK: #map = affine_map<(d0) -> (d0)>
+// CHECK:   func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref<?xf32>, %[[arg3:.+]]: memref<?xf32>) {
+// CHECK-NEXT:     %[[c4:.+]] = arith.constant 4 : index
+// CHECK-NEXT:     %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index
+// CHECK-NEXT:     %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index
+// CHECK-NEXT:     %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index
+// CHECK-NEXT:     scf.if %[[arg0]] {
+// TODO note that presently we do not ensure that the memrefs are sliced to the right size as the space requires
+// CHECK-NEXT:        linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg2 : memref<?xf32>) outs(%alloca : memref<?xf32>) {
+// CHECK-NEXT:        ^bb0(%in: f32, %out: f32):
+// CHECK-NEXT:          linalg.yield %in : f32
+// CHECK-NEXT:        }
+// CHECK-NEXT:      }
+// CHECK-NEXT:     }
+
+//constant-access
+module @constant_access{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %ci324 = arith.constant 4.0 : f32
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ci324 : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//constant-mem-access
+module @constant_mem_access{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 4 to 17 step 2 {
+      %ld = affine.load %18[3*%arg4] : memref<?xf32>
+      %ld2 = affine.load %18[%c4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld2 : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//without-if
+module @no_if{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      affine.store %ld, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//arith.mul
+module @arith_mul{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//arith.add
+module @arith_add{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>  ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %ld2 = affine.load %20[%arg4] : memref<?xf32>
+      %add = arith.addf %ld1, %ld2 : f32
+      %mul = arith.mulf %add, %add : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//Conditional arith
+module @cond_arith{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %if = scf.if %12 -> f32 {
+        %mul = arith.mulf %ld, %ld : f32
+        scf.yield %mul : f32
+      } else {
+        scf.yield %ld : f32
+      }
+      affine.store %if, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//reduction
+module @reduction{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>  ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    %sum_0 = arith.constant 0.0 : f32
+    %red = affine.for %arg4 = 0 to 17 step 1 iter_args(%sum_iter = %sum_0) -> f32 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %sum_next = arith.addf %sum_iter, %ld1 : f32
+      affine.yield %sum_next : f32
+    }
+    affine.store %red, %19[0] : memref<?xf32>
+    return
+  }
+}
+
+//Conditional store-1
+module @cond_store_1 {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      scf.if %12 {
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+//Conditional store-2
+module @cond_store_2{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      scf.if %12 {
+        %mul = arith.mulf %ld, %ld : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      } else {
+        affine.store %ld, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+//Parallel for
+module @parallel_for{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    affine.for %arg4 = 0 to 17 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %ld2 = affine.load %20[%arg4] : memref<?xf32>
+      %add = arith.addf %ld1, %ld2 : f32
+      %mul = arith.mulf %add, %add : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+//Fors inside for
+module @for_within_for{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %21 = arith.muli %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg3 = 0 to 21 {
+      affine.for %arg4 = 0 to 17 {
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %mul = arith.mulf %ld1, %ld2 : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+// Parallel fors inside for: an outer affine.for whose body holds two sibling inner loops, both storing into the alloca %19.
+module @parallel_fors_inside_for {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index // not referenced again in this function
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index // not referenced again in this function
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index // (%14 * 4) / 4, unsigned division
+    %19 = memref.alloca(%17) : memref<?xf32> // dynamically sized output buffer with %17 elements
+    affine.for %arg3 = 0 to 17 { // all bounds here are the integer literal 17, not the SSA value %17
+      affine.for %arg4 = 0 to 17 { // first inner loop: product
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %mul = arith.mulf %ld1, %ld2 : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32> // %19[j] = %18[i] * %20[j]
+      }
+      affine.for %arg4 = 0 to 17 { // second inner loop: squared sum over the same indices
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %add = arith.addf %ld1, %ld2 : f32
+        %mul = arith.mulf %add, %add : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32> // %19[j] = (%18[i] + %20[j])^2, overwriting the first inner loop's store
+      }
+    }
+    return
+  }
+}
+
+// matrix-mul iter arg: the innermost loop carries the running sum in an iter_arg and the result is stored once after it.
+module @matmul_1 {
+  memref.global @out : memref<32x8xi32> = uninitialized
+  memref.global @im2 : memref<8x8xi32> = uninitialized
+  memref.global @im1 : memref<32x8xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32 // initial accumulator value and the function's return value
+    %0 = memref.get_global @im1 : memref<32x8xi32>
+    %1 = memref.get_global @im2 : memref<8x8xi32>
+    %2 = memref.get_global @out : memref<32x8xi32>
+    affine.for %arg0 = 0 to 32 {
+      affine.for %arg1 = 0 to 8 {
+        %3 = affine.for %arg2 = 0 to 8 iter_args(%arg3 = %c0_i32) -> (i32) { // %arg3 is the running sum, starting at 0
+          %4 = affine.load %0[%arg0, %arg2] : memref<32x8xi32>
+          %5 = affine.load %1[%arg2, %arg1] : memref<8x8xi32>
+          %6 = arith.muli %4, %5 : i32
+          %7 = arith.addi %arg3, %6 : i32 // sum += im1[i, k] * im2[k, j]
+          affine.yield %7 : i32
+        }
+        affine.store %3, %2[%arg0, %arg1] : memref<32x8xi32> // single store of the reduced value, outside the reduction loop
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// matrix-mul alias issue: the innermost loop accumulates straight into @out with a load/add/store of %2 (no iter_args).
+module @matmul_2 {
+  memref.global @out : memref<128x32xi32> = uninitialized
+  memref.global @im2 : memref<64x32xi32> = uninitialized
+  memref.global @im1 : memref<128x64xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32 // only used as the return value
+    %0 = memref.get_global @im1 : memref<128x64xi32>
+    %1 = memref.get_global @im2 : memref<64x32xi32>
+    %2 = memref.get_global @out : memref<128x32xi32>
+    affine.for %arg0 = 0 to 128 {
+      affine.for %arg1 = 0 to 32 {
+        affine.for %arg2 = 0 to 64 {
+          %3 = affine.load %0[%arg0, %arg2] : memref<128x64xi32>
+          %4 = affine.load %1[%arg2, %arg1] : memref<64x32xi32>
+          %5 = arith.muli %3, %4 : i32
+          %6 = affine.load %2[%arg0, %arg1] : memref<128x32xi32> // NOTE(review): "alias issue" presumably refers to this load and the store below hitting the same memref - confirm
+          %7 = arith.addi %6, %5 : i32
+          affine.store %7, %2[%arg0, %arg1] : memref<128x32xi32> // out[i, j] += im1[i, k] * im2[k, j]
+        }
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// conv (with inner loop accumulate): a 4x4 window sum-of-products threaded through two nested iter_args loops, stored once per output element.
+// Open question: how to deal with IR in outer loops as well?
+module @conv_1{
+  memref.global @out : memref<512x64xi32> = uninitialized
+  memref.global @filter : memref<4x4xi32> = uninitialized
+  memref.global @im : memref<515x67xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32 // initial accumulator value and the function's return value
+    %0 = memref.get_global @im : memref<515x67xi32>
+    %1 = memref.get_global @filter : memref<4x4xi32>
+    %2 = memref.get_global @out : memref<512x64xi32>
+    affine.for %arg0 = 0 to 512 {
+      affine.for %arg1 = 0 to 64 {
+        %3 = affine.for %arg2 = 0 to 4 iter_args(%arg3 = %c0_i32) -> (i32) { // outer window loop, running sum starts at 0
+          %4 = affine.for %arg4 = 0 to 4 iter_args(%arg5 = %arg3) -> (i32) { // inner window loop continues from the outer running sum
+            %5 = affine.load %0[%arg0 + %arg2, %arg1 + %arg4] : memref<515x67xi32> // input element at output position plus window offset
+            %6 = affine.load %1[%arg2, %arg4] : memref<4x4xi32>
+            %7 = arith.muli %5, %6 : i32
+            %8 = arith.addi %arg5, %7 : i32 // sum += im[i + a, j + b] * filter[a, b]
+            affine.yield %8 : i32
+          }
+          affine.yield %4 : i32 // hand the inner loop's result back as the outer loop's running sum
+        }
+        affine.store %3, %2[%arg0, %arg1] : memref<512x64xi32> // single store of the reduced value, outside both window loops
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// conv (direct store): accumulates the 4x4 input window straight into @out with a load/add/store in the innermost loop (no iter_args).
+module @conv_2{
+  memref.global @out : memref<512x64xi32> = uninitialized
+  memref.global @filter : memref<4x4xi32> = uninitialized // declared but never referenced in @main below
+  memref.global @im : memref<515x67xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32 // only used as the return value
+    %0 = memref.get_global @im : memref<515x67xi32>
+    %1 = memref.get_global @out : memref<512x64xi32>
+    affine.for %arg0 = 0 to 512 {
+      affine.for %arg1 = 0 to 64 {
+        affine.for %arg2 = 0 to 4 {
+          affine.for %arg3 = 0 to 4 {
+            %2 = affine.load %0[%arg0 + %arg2, %arg1 + %arg3] : memref<515x67xi32> // input element at output position plus window offset
+            %3 = affine.load %1[%arg0, %arg1] : memref<512x64xi32> // current value of the output element
+            %4 = arith.addi %3, %2 : i32
+            affine.store %4, %1[%arg0, %arg1] : memref<512x64xi32> // out[i, j] += im[i + a, j + b]; no multiply by a filter value
+          }
+        }
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
\ No newline at end of file