From 61115a86543732f898936c8c3d42519e3dec38be Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Thu, 25 Apr 2024 20:12:45 -0400
Subject: [PATCH 1/7] WIP: Raise to linalg

---
 include/polygeist/Passes/Passes.h      |   1 +
 include/polygeist/Passes/Passes.td     |   9 ++
 lib/polygeist/Passes/CMakeLists.txt    |   1 +
 lib/polygeist/Passes/RaiseToLinalg.cpp | 184 +++++++++++++++++++++++++
 4 files changed, 195 insertions(+)
 create mode 100644 lib/polygeist/Passes/RaiseToLinalg.cpp
diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h
index 5f3777441d1a..29f5a9b3536c 100644
--- a/include/polygeist/Passes/Passes.h
+++ b/include/polygeist/Passes/Passes.h
@@ -31,6 +31,7 @@ std::unique_ptr<Pass> replaceAffineCFGPass();
 std::unique_ptr<Pass> createOpenMPOptPass();
 std::unique_ptr<Pass> createCanonicalizeForPass();
 std::unique_ptr<Pass> createRaiseSCFToAffinePass();
+std::unique_ptr<Pass> createRaiseAffineToLinalgPass();
 std::unique_ptr<Pass> createCPUifyPass(StringRef method = "");
 std::unique_ptr<Pass> createBarrierRemovalContinuation();
 std::unique_ptr<Pass> detectReductionPass();
diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td
index 05c3644c956e..5c17a9d6dc25 100644
--- a/include/polygeist/Passes/Passes.td
+++ b/include/polygeist/Passes/Passes.td
@@ -151,6 +151,15 @@ def SCFRaiseToAffine : Pass<"raise-scf-to-affine"> {
   ];
 }
 
+def AffineRaiseToLinalg : Pass<"raise-affine-to-linalg"> {
+  let summary = "Raise affine to linalg";
+  let constructor = "mlir::polygeist::createRaiseAffineToLinalgPass()";
+  let dependentDialects = [
+    "affine::AffineDialect",
+    "linalg::LinalgDialect",
+  ];
+}
+
 def SCFCanonicalizeFor : Pass<"canonicalize-scf-for"> {
   let summary = "Run some additional canonicalization for scf::for";
   let constructor = "mlir::polygeist::createCanonicalizeForPass()";
diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt
index 5d6164ef53d7..d6947a1931c5 100644
--- a/lib/polygeist/Passes/CMakeLists.txt
+++ b/lib/polygeist/Passes/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms
   OpenMPOpt.cpp
   BarrierRemovalContinuation.cpp
   RaiseToAffine.cpp
+  RaiseToLinalg.cpp
   ParallelLower.cpp
   TrivialUse.cpp
   ConvertPolygeistToLLVM.cpp
diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
new file mode 100644
index 000000000000..d14771c677fd
--- /dev/null
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -0,0 +1,184 @@
+#include "PassDetails.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Passes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "polygeist/Passes/Passes.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "raise-to-affine"
+
+using namespace mlir;
+using namespace mlir::arith;
+using namespace polygeist;
+using namespace affine;
+
+namespace {
+struct RaiseAffineToLinalg : public SCFRaiseToAffineBase<RaiseAffineToLinalg> {
+  void runOnOperation() override;
+};
+} // namespace
+
+// Also want to add support for affine.for ( ) { linalg.generic } -> bigger linalg.generic
+// Also probably want to try to do { linalg.generc1(); linalg.generic2(); } -> bigger linalg.generic()
+
+/*
+
+affine.for() {
+    affine.for() {
+    } 
+    affine.for() {
+    }
+}
+
+*/
+struct Condition {
+    bool ifTrue;
+    AffineIfOp op;
+};
+struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
+  using OpRewritePattern<scf::ForOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(affine::AffineForOp loop,
+                                PatternRewriter &rewriter) const final {
+
+    // Don't handle accumulations in registers for the moment, we can have
+    // a separate pattern move them into memref's
+    if (loop.getNumReductions() != 0) {
+        return failure();
+    }
+
+    SmallVector<std::pair<std::vector<Condition>, AffineLoadOp> loads;
+    SmallVector<std::pair<std::vector<Condition>, AffineStoreOp> stores;
+
+    // Check that the only operations within the region are either:
+    //      affine.load, affine.store, affine.if, affine.yield
+    // Additionally, for each load/store, remember what conditions are
+    // required for that load or store to execute.
+    Walkloop result = loop->walk<WalkOrder::PreOrder>([&](Operation* op) {
+        // TODO extend this, any non-memory operation is also legal here.
+        // mul, add, etc (we can just check propety)
+        if (isa<AffineYieldOp, AffineIfOp>(op)) {
+            return WalkResult::advance();
+        }
+        if (isa<AffineLoadOp, AffineStoreOp>(op)) {
+            Operation *cur = op->getParentOp();
+            std::vector<Condition> conditions;
+            while (cur != loop) {
+                auto ifstmt = dyn_cast<AffineIfOp>(cur);
+                if (!ifstmt) {
+                    return WalkResult::interrupt();
+                }
+                bool ifTrue = ifstmt.getTrueRegion()->isAncestor(cur);
+                conditions.push_back(ifTrue, ifstmt);
+                cur = ifstmt->getParent();
+            }
+            if (auto load = dyn_cast<AffineLoadOp>(cur)) {
+                loads.emplace_back(condition, load);
+            } else {
+                auto store = cast<AffineStoreOp>(cur);
+                stores.emplace_back(condition, store);
+            }
+            return WalkResult::advance();
+        }
+        return WalkResult::interrupt();
+    });
+    
+    if (result.wasInterrupted()) return failure();
+
+    // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result)
+    // TODO we can extend this and handle things like reductions, but we're going to start easy for now
+    for (auto &&[_, store] : stores) {
+        for (auto &&[_, load]: loads) {
+            if (mayAlias(load.getMemref(), stores.getMemref())) {
+                return failure();
+            }
+        }
+        for (auto &&[_, store2]: stores) {
+            if (store == store2) continue;
+            if (mayAlias(store.getMemref(), stores2.getMemref())) {
+                return failure();
+            }
+        }
+    }
+
+
+
+    SmallVector<Value> inputs;
+    SmallVector<AffineMap> affineMaps;
+    for (auto [conds, load]&& : loads) {
+        // Only support unconditional loads for the moment
+        if (conds.size() != 0) return failure();
+        inputs.push_back(load.getMemref());
+        affineMaps.push_back(load.getAffineMap());
+    }
+    
+    SmallVector<Value> outputs;
+    for (auto [conds, store]&& : stores) {
+        // Only support unconditional loads for the moment
+        if (conds.size() != 0) return failure();
+        outputs.push_back(store.getMemref());
+        affineMaps.push_back(store.getAffineMap());
+    }
+
+    ArrayRef<utils::IteratorType> iteratorTypes;
+    // TODO fill this with the for loop bounds
+
+    auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
+      loc, TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
+      StringAttr(),
+      StringAttr());
+
+
+    auto &body = genericOp.getRegion();
+    body.takeBody(loop.getRegion());
+
+    rewriter.setInsertionPointToStart(*body.begin());
+
+    // This index will replace the use of the affine index
+    auto idx = rewriter.create<linalg::IndexOp>(rewriter.getIndexAttr(0));
+    rewriter.replaceAllUsesWith(loop.getInductionVariable(), idx);
+
+    // TODO
+    // replace all loads with the corresponding block arguments we will create in the linalg generic
+    //
+
+
+    SmallVector<Value> toreturn;
+    // push all sotred values into a linalg.yield
+
+    // fixup the return
+
+
+
+    // return success!
+
+
+    return failure();
+  }
+};
+
+void RaiseAffineToLinalg::runOnOperation() {
+  RewritePatternSet patterns(&getContext());
+  patterns.insert<ForOpRaising>(&getContext());
+
+  GreedyRewriteConfig config;
+  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
+                                     config);
+}
+
+namespace mlir {
+namespace polygeist {
+std::unique_ptr<Pass> createRaiseAffineToLinalgPass() {
+  return std::make_unique<RaiseAffineToLinalg>();
+}
+} // namespace polygeist
+} // namespace mlir

From 82e228d27d9f9522ebf5e1c40ddbb289da34f2a2 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 7 May 2024 20:01:57 -0400
Subject: [PATCH 2/7] It builds

---
 include/polygeist/Ops.h                |  2 +
 include/polygeist/Passes/Passes.h      |  4 ++
 lib/polygeist/Ops.cpp                  |  3 +-
 lib/polygeist/Passes/RaiseToLinalg.cpp | 71 +++++++++++++++-----------
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/include/polygeist/Ops.h b/include/polygeist/Ops.h
index cae361d2f399..e3f0cc9db17d 100644
--- a/include/polygeist/Ops.h
+++ b/include/polygeist/Ops.h
@@ -57,6 +57,8 @@ bool mayAlias(mlir::MemoryEffects::EffectInstance a,
 
 bool mayAlias(mlir::MemoryEffects::EffectInstance a, mlir::Value b);
 
+bool mayAlias(mlir::Value v, mlir::Value v2);
+
 extern llvm::cl::opt<bool> BarrierOpt;
 
 template <bool NotTopLevel = false>
diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h
index 29f5a9b3536c..92c5812e8c4c 100644
--- a/include/polygeist/Passes/Passes.h
+++ b/include/polygeist/Passes/Passes.h
@@ -124,6 +124,10 @@ namespace affine {
 class AffineDialect;
 }
 
+namespace linalg {
+class LinalgDialect;
+}
+
 namespace LLVM {
 class LLVMDialect;
 }
diff --git a/lib/polygeist/Ops.cpp b/lib/polygeist/Ops.cpp
index 926891b40611..d9a60fbcce45 100644
--- a/lib/polygeist/Ops.cpp
+++ b/lib/polygeist/Ops.cpp
@@ -784,7 +784,8 @@ bool isStackAlloca(Value v) {
          v.getDefiningOp<memref::AllocOp>() ||
          v.getDefiningOp<LLVM::AllocaOp>();
 }
-static bool mayAlias(Value v, Value v2) {
+
+bool mayAlias(Value v, Value v2) {
   v = getBase(v);
   v2 = getBase(v2);
   if (v == v2)
diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index d14771c677fd..09c19f8df5be 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -14,7 +14,7 @@
 #include "polygeist/Passes/Passes.h"
 #include "llvm/Support/Debug.h"
 
-#define DEBUG_TYPE "raise-to-affine"
+#define DEBUG_TYPE "raise-to-linalg"
 
 using namespace mlir;
 using namespace mlir::arith;
@@ -22,7 +22,7 @@ using namespace polygeist;
 using namespace affine;
 
 namespace {
-struct RaiseAffineToLinalg : public SCFRaiseToAffineBase<RaiseAffineToLinalg> {
+struct RaiseAffineToLinalg : public AffineRaiseToLinalgBase<RaiseAffineToLinalg> {
   void runOnOperation() override;
 };
 } // namespace
@@ -43,27 +43,28 @@ affine.for() {
 struct Condition {
     bool ifTrue;
     AffineIfOp op;
+    Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {}
 };
-struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
-  using OpRewritePattern<scf::ForOp>::OpRewritePattern;
+struct ForOpRaising : public OpRewritePattern<affine::AffineForOp> {
+  using OpRewritePattern<affine::AffineForOp>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(affine::AffineForOp loop,
                                 PatternRewriter &rewriter) const final {
 
     // Don't handle accumulations in registers for the moment, we can have
     // a separate pattern move them into memref's
-    if (loop.getNumReductions() != 0) {
+    if (loop.getNumResults() != 0) {
         return failure();
     }
 
-    SmallVector<std::pair<std::vector<Condition>, AffineLoadOp> loads;
-    SmallVector<std::pair<std::vector<Condition>, AffineStoreOp> stores;
+    SmallVector<std::pair<std::vector<Condition>, AffineLoadOp>> loads;
+    SmallVector<std::pair<std::vector<Condition>, AffineStoreOp>> stores;
 
     // Check that the only operations within the region are either:
     //      affine.load, affine.store, affine.if, affine.yield
     // Additionally, for each load/store, remember what conditions are
     // required for that load or store to execute.
-    Walkloop result = loop->walk<WalkOrder::PreOrder>([&](Operation* op) {
+    auto result = loop->walk<WalkOrder::PreOrder>([&](Operation* op) {
         // TODO extend this, any non-memory operation is also legal here.
         // mul, add, etc (we can just check propety)
         if (isa<AffineYieldOp, AffineIfOp>(op)) {
@@ -77,18 +78,21 @@ struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
                 if (!ifstmt) {
                     return WalkResult::interrupt();
                 }
-                bool ifTrue = ifstmt.getTrueRegion()->isAncestor(cur);
-                conditions.push_back(ifTrue, ifstmt);
-                cur = ifstmt->getParent();
+                bool ifTrue = ifstmt.getThenRegion().isAncestor(cur->getParentRegion());
+                conditions.emplace_back(ifTrue, ifstmt);
+                cur = ifstmt->getParentOp();
             }
             if (auto load = dyn_cast<AffineLoadOp>(cur)) {
-                loads.emplace_back(condition, load);
+                loads.emplace_back(conditions, load);
             } else {
                 auto store = cast<AffineStoreOp>(cur);
-                stores.emplace_back(condition, store);
+                stores.emplace_back(conditions, store);
             }
             return WalkResult::advance();
         }
+        if (isReadNone(op)) {
+            return WalkResult::advance();
+        }
         return WalkResult::interrupt();
     });
     
@@ -98,13 +102,13 @@ struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
     // TODO we can extend this and handle things like reductions, but we're going to start easy for now
     for (auto &&[_, store] : stores) {
         for (auto &&[_, load]: loads) {
-            if (mayAlias(load.getMemref(), stores.getMemref())) {
+            if (mayAlias(load.getMemref(), store.getMemref())) {
                 return failure();
             }
         }
         for (auto &&[_, store2]: stores) {
             if (store == store2) continue;
-            if (mayAlias(store.getMemref(), stores2.getMemref())) {
+            if (mayAlias(store.getMemref(), store2.getMemref())) {
                 return failure();
             }
         }
@@ -114,7 +118,7 @@ struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
 
     SmallVector<Value> inputs;
     SmallVector<AffineMap> affineMaps;
-    for (auto [conds, load]&& : loads) {
+    for (auto &&[conds, load] : loads) {
         // Only support unconditional loads for the moment
         if (conds.size() != 0) return failure();
         inputs.push_back(load.getMemref());
@@ -122,18 +126,19 @@ struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
     }
     
     SmallVector<Value> outputs;
-    for (auto [conds, store]&& : stores) {
+    for (auto &&[conds, store] : stores) {
         // Only support unconditional loads for the moment
         if (conds.size() != 0) return failure();
         outputs.push_back(store.getMemref());
         affineMaps.push_back(store.getAffineMap());
     }
 
-    ArrayRef<utils::IteratorType> iteratorTypes;
-    // TODO fill this with the for loop bounds
+    SmallVector<utils::IteratorType> iteratorTypes;
+    // TODO revisit this later
+    iteratorTypes.push_back(utils::IteratorType::parallel);
 
     auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
-      loc, TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
+      loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
       StringAttr(),
       StringAttr());
 
@@ -141,27 +146,33 @@ struct ForOpRaising : public OpRewritePattern<affine::ForOp> {
     auto &body = genericOp.getRegion();
     body.takeBody(loop.getRegion());
 
-    rewriter.setInsertionPointToStart(*body.begin());
+    auto blk = &*body.begin();
+    rewriter.setInsertionPointToStart(blk);
 
     // This index will replace the use of the affine index
-    auto idx = rewriter.create<linalg::IndexOp>(rewriter.getIndexAttr(0));
-    rewriter.replaceAllUsesWith(loop.getInductionVariable(), idx);
+    auto idx = rewriter.create<linalg::IndexOp>(loop.getLoc(), rewriter.getIndexAttr(0));
+    rewriter.replaceAllUsesWith(loop.getInductionVar(), idx);
 
-    // TODO
-    // replace all loads with the corresponding block arguments we will create in the linalg generic
-    //
+    blk->eraseArguments(0, blk->getNumArguments());
 
+    for (auto &&[conds, load] : loads) {
+        auto arg = blk->addArgument(load.getType(), load.getLoc());
+        rewriter.replaceOp(load, arg);
+    }
 
     SmallVector<Value> toreturn;
-    // push all sotred values into a linalg.yield
 
-    // fixup the return
+    for (auto &&[conds, store] : stores) {
+        toreturn.push_back(store.getValueToStore());
+        rewriter.eraseOp(store);
+    }
 
+    rewriter.eraseOp(blk->getTerminator());
+    rewriter.setInsertionPointToEnd(blk);
+    rewriter.create<linalg::YieldOp>(loop.getLoc(), toreturn);
 
 
     // return success!
-
-
     return failure();
   }
 };

From 17d5c262304f936112c72f055e2ed7ecb9236c68 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 7 May 2024 20:20:16 -0400
Subject: [PATCH 3/7] Fixup

---
 lib/polygeist/Passes/RaiseToLinalg.cpp | 11 +++++----
 test/polygeist-opt/linalgraise.mlir    | 32 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 test/polygeist-opt/linalgraise.mlir

diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index 09c19f8df5be..31dd32205055 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -45,7 +45,7 @@ struct Condition {
     AffineIfOp op;
     Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {}
 };
-struct ForOpRaising : public OpRewritePattern<affine::AffineForOp> {
+struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
   using OpRewritePattern<affine::AffineForOp>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(affine::AffineForOp loop,
@@ -65,6 +65,7 @@ struct ForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     // Additionally, for each load/store, remember what conditions are
     // required for that load or store to execute.
     auto result = loop->walk<WalkOrder::PreOrder>([&](Operation* op) {
+        if (op == loop) return WalkResult::advance();
         // TODO extend this, any non-memory operation is also legal here.
         // mul, add, etc (we can just check propety)
         if (isa<AffineYieldOp, AffineIfOp>(op)) {
@@ -82,10 +83,10 @@ struct ForOpRaising : public OpRewritePattern<affine::AffineForOp> {
                 conditions.emplace_back(ifTrue, ifstmt);
                 cur = ifstmt->getParentOp();
             }
-            if (auto load = dyn_cast<AffineLoadOp>(cur)) {
+            if (auto load = dyn_cast<AffineLoadOp>(op)) {
                 loads.emplace_back(conditions, load);
             } else {
-                auto store = cast<AffineStoreOp>(cur);
+                auto store = cast<AffineStoreOp>(op);
                 stores.emplace_back(conditions, store);
             }
             return WalkResult::advance();
@@ -173,13 +174,13 @@ struct ForOpRaising : public OpRewritePattern<affine::AffineForOp> {
 
 
     // return success!
-    return failure();
+    return success();
   }
 };
 
 void RaiseAffineToLinalg::runOnOperation() {
   RewritePatternSet patterns(&getContext());
-  patterns.insert<ForOpRaising>(&getContext());
+  patterns.insert<AffineForOpRaising>(&getContext());
 
   GreedyRewriteConfig config;
   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir
new file mode 100644
index 000000000000..91f7d86a8fbd
--- /dev/null
+++ b/test/polygeist-opt/linalgraise.mlir
@@ -0,0 +1,32 @@
+// RUN: polygeist-opt --raise-affine-to-linalg --split-input-file %s | FileCheck %s
+
+module {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    scf.if %12 {
+      affine.for %arg4 = 0 to %17 {
+        %ld = affine.load %18[%arg4] : memref<?xf32>
+        affine.store %ld, %19[%arg4] : memref<?xf32>
+      }
+   }
+    return
+  }
+}
+
+// CHECK:   func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref<?xf32>, %[[arg3:.+]]: memref<?xf32>) {
+// CHECK-NEXT:     %[[c4:.+]] = arith.constant 4 : index
+// CHECK-NEXT:     %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index
+// CHECK-NEXT:     %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index
+// CHECK-NEXT:     %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index
+// CHECK-NEXT:     scf.if %[[arg0]] {
+// CHECK-NEXT:       affine.for %[[arg4:.+]] = 0 to %[[V2]] {
+// CHECK-NEXT:         %[[a:.+]] = memref.load %[[arg3]][%[[arg4]]] : memref<?xf32>
+// CHECK-NEXT:         memref.store %[[a]], %[[arg2]][%[[arg4]]] : memref<?xf32>
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }

From b4d199f39aa2903abe0ac92ce6d4ca8dcc58d467 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 7 May 2024 20:42:07 -0400
Subject: [PATCH 4/7] minimal functional

---
 lib/polygeist/Passes/RaiseToLinalg.cpp | 20 +++++++++++++-------
 test/polygeist-opt/linalgraise.mlir    | 11 +++++++----
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index 31dd32205055..75080be2168e 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -138,22 +138,24 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     // TODO revisit this later
     iteratorTypes.push_back(utils::IteratorType::parallel);
 
+    StringAttr empty = StringAttr::get(loop.getContext());
     auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
       loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
-      StringAttr(),
-      StringAttr());
+      empty,
+      empty);
 
 
-    auto &body = genericOp.getRegion();
-    body.takeBody(loop.getRegion());
-
-    auto blk = &*body.begin();
+    auto blk = &*loop.getRegion().begin();
     rewriter.setInsertionPointToStart(blk);
 
     // This index will replace the use of the affine index
     auto idx = rewriter.create<linalg::IndexOp>(loop.getLoc(), rewriter.getIndexAttr(0));
     rewriter.replaceAllUsesWith(loop.getInductionVar(), idx);
 
+    auto &body = genericOp.getRegion();
+    body.takeBody(loop.getRegion());
+
+
     blk->eraseArguments(0, blk->getNumArguments());
 
     for (auto &&[conds, load] : loads) {
@@ -161,6 +163,10 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         rewriter.replaceOp(load, arg);
     }
 
+    for (auto &&[conds, store] : stores) {
+        blk->addArgument(store.getValueToStore().getType(), store.getLoc());
+    }
+
     SmallVector<Value> toreturn;
 
     for (auto &&[conds, store] : stores) {
@@ -172,7 +178,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     rewriter.setInsertionPointToEnd(blk);
     rewriter.create<linalg::YieldOp>(loop.getLoc(), toreturn);
 
-
+    rewriter.eraseOp(loop);
     // return success!
     return success();
   }
diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir
index 91f7d86a8fbd..0470ba91dc74 100644
--- a/test/polygeist-opt/linalgraise.mlir
+++ b/test/polygeist-opt/linalgraise.mlir
@@ -19,14 +19,17 @@ module {
   }
 }
 
+// CHECK: #map = affine_map<(d0) -> (d0)>
 // CHECK:   func.func @main(%[[arg0:.+]]: i1, %[[arg1:.+]]: i32, %[[arg2:.+]]: memref<?xf32>, %[[arg3:.+]]: memref<?xf32>) {
 // CHECK-NEXT:     %[[c4:.+]] = arith.constant 4 : index
 // CHECK-NEXT:     %[[V0:.+]] = arith.index_cast %[[arg1]] : i32 to index
 // CHECK-NEXT:     %[[V1:.+]] = arith.muli %[[V0]], %[[c4]] : index
 // CHECK-NEXT:     %[[V2:.+]] = arith.divui %[[V1]], %[[c4]] : index
 // CHECK-NEXT:     scf.if %[[arg0]] {
-// CHECK-NEXT:       affine.for %[[arg4:.+]] = 0 to %[[V2]] {
-// CHECK-NEXT:         %[[a:.+]] = memref.load %[[arg3]][%[[arg4]]] : memref<?xf32>
-// CHECK-NEXT:         memref.store %[[a]], %[[arg2]][%[[arg4]]] : memref<?xf32>
-// CHECK-NEXT:       }
+// TODO note that presently we do not ensure that the memrefs are sliced to the right size as the space requires
+// CHECK-NEXT:        linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg2 : memref<?xf32>) outs(%alloca : memref<?xf32>) {
+// CHECK-NEXT:        ^bb0(%in: f32, %out: f32):
+// CHECK-NEXT:          linalg.yield %in : f32
+// CHECK-NEXT:        }
+// CHECK-NEXT:      }
 // CHECK-NEXT:     }

From b8b4fe7c12eea8b1eaf7a665174777926536642c Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 14 May 2024 21:30:53 -0400
Subject: [PATCH 5/7] Now actually correct for indexing

---
 lib/polygeist/Passes/RaiseToLinalg.cpp | 183 ++++++++++++++++++++++++-
 test/polygeist-opt/linalgraise.mlir    |  23 +++-
 2 files changed, 199 insertions(+), 7 deletions(-)

diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index 75080be2168e..78c1dbe27f9e 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -1,6 +1,7 @@
 #include "PassDetails.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -13,6 +14,7 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "polygeist/Passes/Passes.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/IR/AffineExpr.h"
 
 #define DEBUG_TYPE "raise-to-linalg"
 
@@ -45,6 +47,135 @@ struct Condition {
     AffineIfOp op;
     Condition(bool ifTrue, AffineIfOp op) : ifTrue(ifTrue), op(op) {}
 };
+
+// Returns true if `expr` depends on dimension `idx` at most linearly: the dimension may only flow
+// through additions and through products in which at most one factor mentions it.
+bool isLinearInIndex(AffineExpr expr, size_t idx) {
+    if (!expr.isFunctionOfDim(idx)) return true; // dimension unused: trivially linear
+    switch (expr.getKind()) {
+    case AffineExprKind::DimId:
+        return true;
+    case AffineExprKind::Add: {
+        auto sum = expr.cast<AffineBinaryOpExpr>();
+        return isLinearInIndex(sum.getLHS(), idx) && isLinearInIndex(sum.getRHS(), idx);
+    }
+    case AffineExprKind::Mul: {
+        auto product = expr.cast<AffineBinaryOpExpr>();
+        AffineExpr lhs = product.getLHS(), rhs = product.getRHS();
+        if (lhs.isFunctionOfDim(idx) && rhs.isFunctionOfDim(idx)) return false;
+        return isLinearInIndex(lhs, idx) && isLinearInIndex(rhs, idx);
+    }
+    default:
+        return false;
+    }
+}
+
+// Map overload: true iff every result expression of `map` is linear in dimension `idx`
+// (as decided by the AffineExpr overload); a map without results is therefore linear.
+bool isLinearInIndex(AffineMap map, size_t idx) {
+    return llvm::all_of(map.getResults(), [idx](AffineExpr result) {
+        return isLinearInIndex(result, idx);
+    });
+}
+
+ // Renumbers the dims of `expr` after dim `offset` was removed from a space of `numDims` dims:
+ // dims below `offset` keep their position, dims above it move down by one. The removed dim is
+ // expected to be substituted away already; any leftover use is replaced by the constant 0.
+ AffineExpr shiftDimsDown1(AffineExpr expr, unsigned numDims, unsigned offset) {
+   SmallVector<AffineExpr, 4> dims;
+   for (unsigned idx = 0; idx < numDims; ++idx) {
+     if (idx == offset) // placeholder; avoids the `offset - 1` wrap-around when offset == 0
+       dims.push_back(getAffineConstantExpr(0, expr.getContext()));
+     else
+       dims.push_back(getAffineDimExpr(idx < offset ? idx : idx - 1, expr.getContext()));
+   }
+   return expr.replaceDimsAndSymbols(dims, {});
+ }
+
+ // Map overload: removes dim `offset` from a map over `numDim` dims, keeping the symbols as-is.
+ AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim, unsigned offset) {
+   assert(numDim > 0 && offset < numDim && "offset must name an existing dim");
+   auto shifted = llvm::map_to_vector<4>(
+       expr.getResults(), [&](AffineExpr e) { return shiftDimsDown1(e, numDim, offset); });
+   return AffineMap::get(numDim - 1, expr.getNumSymbols(), shifted, expr.getContext());
+ }
+
+// Rewrites one affine access `val[oldmap(vals)]` so that the loop index `idx` is its only map input.
+// `vals` are the operands of `oldmap` (dims first, then symbols); `idx_size` becomes the subview size
+// along every result of `oldmap` that depends on `idx`. On success sets `legal = true` and returns:
+//  1. `newval`: a memref.subview of `val` with offsets/strides computed (affine.apply) from the non-`idx` operands, and
+//  2. `newmap`: a map with one dim (the loop index) and no symbols that yields d0 for each result
+//     that depended on `idx` and the constant 0 for every other result; it indexes `newval`.
+// Sets `legal = false` and returns {val, oldmap} without creating any ops when `idx` is passed as a
+// symbol, appears more than once in `vals`, or `oldmap` is not linear in it.
+std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, mlir::OperandRange vals) {
+    // Split the operands: note which dim carries the loop index and keep all the others.
+    SmallVector<Value> vals_without_idx;
+    ssize_t dim_idx = -1;
+    for (auto &&[i, v] : llvm::enumerate(vals)) {
+        if (v == idx) {
+            // The index must be a dim (not a symbol) and may appear only once. Nothing here guarantees
+            // that AffineCFG ran first, so reject the access instead of asserting on the input IR.
+            if (i >= oldmap.getNumDims() || dim_idx != -1) {
+                legal = false;
+                return {val, oldmap};
+            }
+            dim_idx = i;
+            continue;
+        }
+        vals_without_idx.push_back(v);
+    }
+
+    if (dim_idx != -1 && !isLinearInIndex(oldmap, dim_idx)) {
+        legal = false;
+        return {val, oldmap};
+    }
+
+    // Offsets: oldmap with the loop index fixed to 0, re-expressed over the remaining operands.
+    AffineMap offsetMap = oldmap;
+    if (dim_idx != -1) {
+        offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(0), oldmap.getNumDims(), oldmap.getNumSymbols());
+        offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx);
+    }
+    // Strides: (oldmap at idx = 1) - (oldmap at idx = 0). The dim/symbol counts must come from oldmap:
+    // offsetMap has already lost the loop dim, so its counts are one too small for this replace.
+    AffineMap strideMap = oldmap;
+    if (dim_idx != -1) {
+        strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(1), oldmap.getNumDims(), oldmap.getNumSymbols());
+        strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx);
+    }
+    {
+        SmallVector<AffineExpr> subtracts;
+        for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) {
+            subtracts.push_back(lhs - rhs);
+        }
+        strideMap = AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), subtracts, builder.getContext());
+    }
+
+    // Expression to index into the generated subview given the loop index
+    SmallVector<AffineExpr> loop_idxs;
+    // Operands of the subview: starting offsets, sizes and strides, one per result of oldmap
+    SmallVector<Value> offsets;
+    SmallVector<Value> sizes;
+    SmallVector<Value> strides;
+
+    for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) {
+        offsets.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(), offset_expr, vals_without_idx));
+        strides.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(), stride_expr, vals_without_idx));
+        if (dim_idx == -1 || !expr.isFunctionOfDim(dim_idx)) {
+            loop_idxs.push_back(builder.getAffineConstantExpr(0));
+            sizes.push_back(builder.create<arith::ConstantIndexOp>(val.getLoc(), 1));
+        } else {
+            loop_idxs.push_back(builder.getAffineDimExpr(0));
+            sizes.push_back(idx_size);
+        }
+    }
+
+    auto newval = builder.create<memref::SubViewOp>(val.getLoc(), val, offsets, sizes, strides);
+    legal = true;
+    return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())};
+}
+
 struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
   using OpRewritePattern<affine::AffineForOp>::OpRewritePattern;
 
@@ -115,23 +246,63 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         }
     }
 
-
-
     SmallVector<Value> inputs;
     SmallVector<AffineMap> affineMaps;
+
+    if (loop.getStep() != 1) {
+        return failure();
+    }
+
+    // our remapper currently assumes 0 start to bound. 
+    if (!loop.hasConstantLowerBound() || loop.getConstantLowerBound() != 0) {
+        return failure();
+    }
+
+    // compute this correctly later.
+    auto ub = loop.getSingleUpperBound();
+    if (!ub) return failure();
+
+    auto lb = loop.getSingleLowerBound();
+    if (!lb) return failure();
+    
+
+    if (!loop.hasConstantUpperBound()) {
+        return failure();
+    }
+
+    Value loopSize = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create<arith::SubIOp>(loop.getLoc(), *ub, *lb);
+
+    // current spec is going to be indexed off of the loop var in isolation
     for (auto &&[conds, load] : loads) {
         // Only support unconditional loads for the moment
         if (conds.size() != 0) return failure();
-        inputs.push_back(load.getMemref());
-        affineMaps.push_back(load.getAffineMap());
+
+        bool legal = true;
+       
+        auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(),
+        loopSize, load.getMapOperands());
+
+        if (!legal) return failure();
+
+        affineMaps.push_back(newAffineMap);
+        inputs.push_back(newMemref);
     }
     
     SmallVector<Value> outputs;
+    // Store we may need to reindex into a splat potentially later, but for now we'll be lazy
     for (auto &&[conds, store] : stores) {
         // Only support unconditional loads for the moment
         if (conds.size() != 0) return failure();
-        outputs.push_back(store.getMemref());
-        affineMaps.push_back(store.getAffineMap());
+
+        bool legal = true;
+       
+        auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(),
+        loopSize, store.getMapOperands());
+
+        if (!legal) return failure();
+
+        affineMaps.push_back(newAffineMap);
+        outputs.push_back(newMemref);
     }
 
     SmallVector<utils::IteratorType> iteratorTypes;
diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir
index 0470ba91dc74..0e23a7c28a0b 100644
--- a/test/polygeist-opt/linalgraise.mlir
+++ b/test/polygeist-opt/linalgraise.mlir
@@ -10,13 +10,34 @@ module {
     %17 = arith.divui %16, %c4 : index
     %19 = memref.alloca(%17) : memref<?xf32>
     scf.if %12 {
-      affine.for %arg4 = 0 to %17 {
+      affine.for %arg4 = 0 to 17 {
         %ld = affine.load %18[%arg4] : memref<?xf32>
         affine.store %ld, %19[%arg4] : memref<?xf32>
       }
    }
     return
   }
+
+
+  func.func @main2(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    scf.if %12 {
+      affine.for %arg4 = 0 to 17 {
+        %ld = affine.load %18[3 * %arg4] : memref<?xf32>
+        %ld2 = affine.load %18[0] : memref<?xf32>
+        %fadd = arith.addf %ld, %ld2 : f32
+        affine.store %fadd, %19[%arg4 + 17] : memref<?xf32>
+      }
+   }
+    return
+  }
+
 }
 
 // CHECK: #map = affine_map<(d0) -> (d0)>

From e1dd3e414e1a63a78def0d49bcf5c33c2ef88c6c Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 21 May 2024 20:07:41 -0400
Subject: [PATCH 6/7] Now featuring reductions

---
 lib/polygeist/Passes/RaiseToLinalg.cpp | 106 ++++++-
 test/polygeist-opt/linalgraise.mlir    | 374 ++++++++++++++++++++++++-
 2 files changed, 465 insertions(+), 15 deletions(-)

diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index 78c1dbe27f9e..320243ba415a 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "polygeist/Passes/Passes.h"
@@ -88,6 +89,7 @@ bool isLinearInIndex(AffineMap map, size_t idx) {
    return expr.replaceDimsAndSymbols(dims, {});
  }
 
+// Reduces the number of input dims of the map `expr` by 1.
  AffineMap shiftDimsDown1(AffineMap expr, unsigned numDim,
                                   unsigned offset) {
             assert(offset <= expr.getNumDims());
@@ -106,10 +108,11 @@ bool isLinearInIndex(AffineMap map, size_t idx) {
 //     and
 //  2. an affine map `newmap` which takes a single index (`ind`) and produces indices into `newval` such that
 //     indexing `newval[map(ind)]` produces the same result as indexing the original map.
-std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, mlir::OperandRange vals) {
+std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder, AffineMap oldmap, Value val, Value idx, Value idx_size, int loopLowerBound, int loopStepSize, mlir::OperandRange vals) {
     // First we need to remove any dependence on the loop index from the affine map
     SmallVector<Value> vals_without_idx;
     ssize_t dim_idx = -1;
+    // Check whether the loop induction variable is an operand of this op (load/store).
     for (auto &&[i, v] : llvm::enumerate(vals)) {
         if (v == idx) {
             // Offset we're replacing must be an index (not a symbol).
@@ -131,18 +134,21 @@ std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder,
 
     // Evaluate offsets as oldmap replacing idx with 0, and evaluating at the remaining variables
 
+    // The offset is oldmap evaluated at the loop's lower bound (previously 0 was assumed as the lower bound).
     AffineMap offsetMap = oldmap;
     if (dim_idx != -1) {
-        offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(0),offsetMap.getNumDims(), offsetMap.getNumSymbols());
+        offsetMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound),offsetMap.getNumDims(), offsetMap.getNumSymbols());
         offsetMap = shiftDimsDown1(offsetMap, oldmap.getNumDims(), dim_idx);
     }
 
+    // Evaluate oldmap one step past the lower bound (lower bound + step); the offset is subtracted from this below (previously a step of 1 was assumed).
     AffineMap strideMap = oldmap;
     if (dim_idx != -1) {
-        strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(1),offsetMap.getNumDims(), offsetMap.getNumSymbols());
+        strideMap = oldmap.replace(builder.getAffineDimExpr(dim_idx), builder.getAffineConstantExpr(loopLowerBound + loopStepSize),strideMap.getNumDims(), strideMap.getNumSymbols());
         strideMap = shiftDimsDown1(strideMap, oldmap.getNumDims(), dim_idx);
     }
 
+    // Subtracting the offset map from the one-step-ahead map gives, for each result of the map, the stride per loop iteration.
     {
         SmallVector<AffineExpr> subtracts;
         for (auto &&[lhs, rhs] : llvm::zip(strideMap.getResults(), offsetMap.getResults())) {
@@ -160,8 +166,8 @@ std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder,
     SmallVector<Value> strides;
 
     for (auto &&[expr, offset_expr, stride_expr] : llvm::zip(oldmap.getResults(), offsetMap.getResults(),strideMap.getResults() )) {
-        offsets.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(), offset_expr, vals_without_idx));
-        strides.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(), stride_expr, vals_without_idx));
+        offsets.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(),AffineMap::get(offsetMap.getNumDims(), offsetMap.getNumSymbols(), offset_expr, builder.getContext()), vals_without_idx)); // TODO: what if there are symbols in the expression?
+        strides.push_back(builder.create<affine::AffineApplyOp>(val.getLoc(),AffineMap::get(strideMap.getNumDims(), strideMap.getNumSymbols(), stride_expr, builder.getContext()), vals_without_idx)); // TODO: what if there are symbols in the expression?
         if (!expr.isFunctionOfDim(dim_idx)) {
             loop_idxs.push_back(builder.getAffineConstantExpr(0));
             sizes.push_back(builder.create<arith::ConstantIndexOp>(val.getLoc(), 1));
@@ -173,9 +179,20 @@ std::pair<Value, AffineMap> remap_in_affine_dim(bool &legal, OpBuilder &builder,
 
     auto newval = builder.create<memref::SubViewOp>(val.getLoc(), val, offsets, sizes, strides);
     legal = true;
+    // TODO: does this need a fix? The returned map is fixed to 1 dim and 0 symbols; should it keep the original map's dims/symbols instead?
     return {newval, AffineMap::get(/*dims*/1, /*symbols*/0, loop_idxs, builder.getContext())};
 }
 
+
+// store A[...]
+// val = load A[...]
+
+/*  prevA : 
+    store A
+    val is now prevA
+*/
+
+
 struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
   using OpRewritePattern<affine::AffineForOp>::OpRewritePattern;
 
@@ -230,11 +247,21 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     
     if (result.wasInterrupted()) return failure();
 
+    DominanceInfo DI(loop);
+
     // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result)
     // TODO we can extend this and handle things like reductions, but we're going to start easy for now
+    DenseMap<AffineLoadOp, AffineStoreOp> stores_map;
     for (auto &&[_, store] : stores) {
         for (auto &&[_, load]: loads) {
             if (mayAlias(load.getMemref(), store.getMemref())) {
+                // We have one exception in this case -- if the load and store are from the exact same location, it is permitted.
+                if (load.getMemref() == store.getMemref() &&
+                    load.getAffineMap() == store.getAffineMap() &&
+                    load.getIndices() == store.getIndices() && DI.dominates((Operation*)load,(Operation*)store)) {
+                        stores_map[load] = store;
+                        continue;
+                    }
                 return failure();
             }
         }
@@ -249,16 +276,25 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     SmallVector<Value> inputs;
     SmallVector<AffineMap> affineMaps;
 
-    if (loop.getStep() != 1) {
-        return failure();
-    }
+    //if (loop.getStep() != 1) {
+    //    return failure();
+    //}
 
     // our remapper currently assumes 0 start to bound. 
-    if (!loop.hasConstantLowerBound() || loop.getConstantLowerBound() != 0) {
+    if (!loop.hasConstantLowerBound() /*|| loop.getConstantLowerBound() != 0*/) {
         return failure();
     }
 
     // compute this correctly later.
+    auto ubMap = loop.getUpperBoundMap();
+    auto ubOperands = loop.getUpperBoundOperands();
+    if (!ubMap || ubMap.getNumResults() != 1) return failure();
+
+    // Retrieve the lower bound
+    auto lbMap = loop.getLowerBoundMap();
+    auto lbOperands = loop.getLowerBoundOperands();
+    if (!lbMap || lbMap.getNumResults() != 1) return failure();
+    
     auto ub = loop.getSingleUpperBound();
     if (!ub) return failure();
 
@@ -270,17 +306,41 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         return failure();
     }
 
-    Value loopSize = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create<arith::SubIOp>(loop.getLoc(), *ub, *lb);
+    // Retrieve the step size
+    int64_t step = loop.getStep();
+
+    // Get the single result expressions
+    AffineExpr ubExpr = ubMap.getResult(0);
+    auto ubValue = rewriter.create<AffineApplyOp>(loop.getLoc(), ubMap, ubOperands);
+    
+    AffineExpr lbExpr = lbMap.getResult(0);
+    auto lbValue = rewriter.create<AffineApplyOp>(loop.getLoc(), lbMap, lbOperands);
+
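+    // Ensure the bounds are constant expressions. NOTE(review): the affine.apply ops above are created before this failure path -- confirm that is acceptable for a rewrite pattern.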
+    auto ubConst = ubExpr.dyn_cast<AffineConstantExpr>();
+    auto lbConst = lbExpr.dyn_cast<AffineConstantExpr>();
+    if (!ubConst || !lbConst) return failure();
 
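+    // Compute the loop size as ub - lb. NOTE(review): this ignores `step`; for step != 1 the trip count is ceildiv(ub - lb, step) -- confirm.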
+    //int64_t loopSize = ubConst.getValue() - lbConst.getValue();
+    auto loopSize = rewriter.create<SubIOp>(loop.getLoc(), ubValue, lbValue);
+    
+    //Value loopSize = rewriter.create<arith::ConstantIndexOp>(loop.getLoc(), loop.getConstantUpperBound());//rewriter.create<arith::SubIOp>(loop.getLoc(), *ub, *lb);
+    
     // current spec is going to be indexed off of the loop var in isolation
     for (auto &&[conds, load] : loads) {
         // Only support unconditional loads for the moment
         if (conds.size() != 0) return failure();
 
+        if (stores_map.find(load) != stores_map.end()) {
+            // We have a store that represents this load.
+            continue;
+        }
+
         bool legal = true;
        
         auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, load.getAffineMap(), load.getMemref(), loop.getInductionVar(),
-        loopSize, load.getMapOperands());
+        loopSize, lbConst.getValue(), step, load.getMapOperands());
 
         if (!legal) return failure();
 
@@ -297,7 +357,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         bool legal = true;
        
         auto &&[newMemref, newAffineMap] = remap_in_affine_dim(legal, rewriter, store.getAffineMap(), store.getMemref(), loop.getInductionVar(),
-        loopSize, store.getMapOperands());
+        loopSize, lbConst.getValue(), step, store.getMapOperands());
 
         if (!legal) return failure();
 
@@ -307,7 +367,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
 
     SmallVector<utils::IteratorType> iteratorTypes;
     // TODO revisit this later
-    iteratorTypes.push_back(utils::IteratorType::parallel);
+    iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction);
 
     StringAttr empty = StringAttr::get(loop.getContext());
     auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
@@ -330,12 +390,30 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
     blk->eraseArguments(0, blk->getNumArguments());
 
     for (auto &&[conds, load] : loads) {
+        if (stores_map.find(load) != stores_map.end()) {
+            // We have a store that represents this load.
+            continue;
+        }
         auto arg = blk->addArgument(load.getType(), load.getLoc());
         rewriter.replaceOp(load, arg);
+
     }
 
     for (auto &&[conds, store] : stores) {
-        blk->addArgument(store.getValueToStore().getType(), store.getLoc());
+        auto arg = blk->addArgument(store.getValueToStore().getType(), store.getLoc());
+
+        SmallVector<AffineLoadOp> inverted;
+        for (auto && [map_load, map_store] : stores_map) {
+            if (map_store == store) {
+                inverted.push_back(map_load);
+            }
+        }
+        for (size_t i=0; i<inverted.size(); i++) {
+            stores_map.erase(inverted[i]);
+            auto tmp = inverted[i];
+            inverted[i] = nullptr;
+            rewriter.replaceOp(tmp, arg);
+        }
     }
 
     SmallVector<Value> toreturn;
diff --git a/test/polygeist-opt/linalgraise.mlir b/test/polygeist-opt/linalgraise.mlir
index 0e23a7c28a0b..e0ceffa1849c 100644
--- a/test/polygeist-opt/linalgraise.mlir
+++ b/test/polygeist-opt/linalgraise.mlir
@@ -10,7 +10,7 @@ module {
     %17 = arith.divui %16, %c4 : index
     %19 = memref.alloca(%17) : memref<?xf32>
     scf.if %12 {
-      affine.for %arg4 = 0 to 17 {
+      affine.for %arg4 = 0 to %17 {
         %ld = affine.load %18[%arg4] : memref<?xf32>
         affine.store %ld, %19[%arg4] : memref<?xf32>
       }
@@ -54,3 +54,375 @@ module {
 // CHECK-NEXT:        }
 // CHECK-NEXT:      }
 // CHECK-NEXT:     }
+
+// constant-access: each loaded element is multiplied by the constant 4.0 before being stored
+module @constant_access{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %ci324 = arith.constant 4.0 : f32
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ci324 : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// constant-mem-access: non-zero lower bound, step 2, a strided load (3 * iv) and a loop-invariant load (%18[%c4])
+module @constant_mem_access{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 4 to 17 step 2 {
+      %ld = affine.load %18[3*%arg4] : memref<?xf32>
+      %ld2 = affine.load %18[%c4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld2 : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// without-if: plain element-wise copy with no enclosing scf.if
+module @no_if{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      affine.store %ld, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// arith.mulf: element-wise square of the loaded value
+module @arith_mul{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// arith.addf: add the elements of two inputs, then square the sum
+module @arith_add{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>  ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %ld2 = affine.load %20[%arg4] : memref<?xf32>
+      %add = arith.addf %ld1, %ld2 : f32
+      %mul = arith.mulf %add, %add : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// Conditional arith: the stored value comes from an scf.if that yields either the square or the loaded value
+module @cond_arith{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %if = scf.if %12 -> f32 {
+        %mul = arith.mulf %ld, %ld : f32
+        scf.yield %mul : f32
+      } else {
+        scf.yield %ld : f32
+      }
+      affine.store %if, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// reduction: the sum is carried through iter_args and stored after the loop
+module @reduction{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>  ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    %sum_0 = arith.constant 0.0 : f32
+    %red = affine.for %arg4 = 0 to 17 step 1 iter_args(%sum_iter = %sum_0) -> f32 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %sum_next = arith.addf %sum_iter, %ld1 : f32
+      affine.yield %sum_next : f32
+    }
+    affine.store %red, %19[0] : memref<?xf32>
+    return
+  }
+}
+
+// Conditional store-1: the store is guarded by an scf.if with no else branch
+module @cond_store_1 {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      scf.if %12 {
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+// Conditional store-2: both branches of the scf.if store to the same location
+module @cond_store_2{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32> ) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      scf.if %12 {
+        %mul = arith.mulf %ld, %ld : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      } else {
+        affine.store %ld, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+// Parallel for: two sibling (back-to-back) affine.for loops that store to the same buffer
+module @parallel_for{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg4 = 0 to 17 {
+      %ld = affine.load %18[%arg4] : memref<?xf32>
+      %mul = arith.mulf %ld, %ld : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    affine.for %arg4 = 0 to 17 {
+      %ld1 = affine.load %18[%arg4] : memref<?xf32>
+      %ld2 = affine.load %20[%arg4] : memref<?xf32>
+      %add = arith.addf %ld1, %ld2 : f32
+      %mul = arith.mulf %add, %add : f32
+      affine.store %mul, %19[%arg4] : memref<?xf32>
+    }
+    return
+  }
+}
+
+// For inside for: a single affine.for nested inside an outer affine.for
+module @for_within_for{
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %21 = arith.muli %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg3 = 0 to 21 {
+      affine.for %arg4 = 0 to 17 {
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %mul = arith.mulf %ld1, %ld2 : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+// Parallel fors inside for: two sibling affine.for loops nested inside an outer affine.for
+module @parallel_fors_inside_for {
+  func.func @main(%12 : i1, %14 : i32, %18 : memref<?xf32>, %20 : memref<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %15 = arith.index_cast %14 : i32 to index
+    %16 = arith.muli %15, %c4 : index
+    %17 = arith.divui %16, %c4 : index
+    %19 = memref.alloca(%17) : memref<?xf32>
+    affine.for %arg3 = 0 to 17 {
+      affine.for %arg4 = 0 to 17 {
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %mul = arith.mulf %ld1, %ld2 : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+      affine.for %arg4 = 0 to 17 {
+        %ld1 = affine.load %18[%arg3] : memref<?xf32>
+        %ld2 = affine.load %20[%arg4] : memref<?xf32>
+        %add = arith.addf %ld1, %ld2 : f32
+        %mul = arith.mulf %add, %add : f32
+        affine.store %mul, %19[%arg4] : memref<?xf32>
+      }
+    }
+    return
+  }
+}
+
+// matrix-mul (iter_args): the innermost loop accumulates in an iter_arg and the result is stored after it
+module @matmul_1 {
+  memref.global @out : memref<32x8xi32> = uninitialized
+  memref.global @im2 : memref<8x8xi32> = uninitialized
+  memref.global @im1 : memref<32x8xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32
+    %0 = memref.get_global @im1 : memref<32x8xi32>
+    %1 = memref.get_global @im2 : memref<8x8xi32>
+    %2 = memref.get_global @out : memref<32x8xi32>
+    affine.for %arg0 = 0 to 32 {
+      affine.for %arg1 = 0 to 8 {
+        %3 = affine.for %arg2 = 0 to 8 iter_args(%arg3 = %c0_i32) -> (i32) {
+          %4 = affine.load %0[%arg0, %arg2] : memref<32x8xi32>
+          %5 = affine.load %1[%arg2, %arg1] : memref<8x8xi32>
+          %6 = arith.muli %4, %5 : i32
+          %7 = arith.addi %arg3, %6 : i32
+          affine.yield %7 : i32
+        }
+        affine.store %3, %2[%arg0, %arg1] : memref<32x8xi32>
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// matrix-mul (alias issue): the innermost loop loads and stores the same output element (%2[%arg0, %arg1])
+module @matmul_2 {
+  memref.global @out : memref<128x32xi32> = uninitialized
+  memref.global @im2 : memref<64x32xi32> = uninitialized
+  memref.global @im1 : memref<128x64xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32
+    %0 = memref.get_global @im1 : memref<128x64xi32>
+    %1 = memref.get_global @im2 : memref<64x32xi32>
+    %2 = memref.get_global @out : memref<128x32xi32>
+    affine.for %arg0 = 0 to 128 {
+      affine.for %arg1 = 0 to 32 {
+        affine.for %arg2 = 0 to 64 {
+          %3 = affine.load %0[%arg0, %arg2] : memref<128x64xi32>
+          %4 = affine.load %1[%arg2, %arg1] : memref<64x32xi32>
+          %5 = arith.muli %3, %4 : i32
+          %6 = affine.load %2[%arg0, %arg1] : memref<128x32xi32>
+          %7 = arith.addi %6, %5 : i32
+          affine.store %7, %2[%arg0, %arg1] : memref<128x32xi32>
+        }
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// conv (inner loops accumulate via iter_args)
+// TODO: how should IR in the outer loops be handled as well?
+module @conv_1{
+  memref.global @out : memref<512x64xi32> = uninitialized
+  memref.global @filter : memref<4x4xi32> = uninitialized
+  memref.global @im : memref<515x67xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32
+    %0 = memref.get_global @im : memref<515x67xi32>
+    %1 = memref.get_global @filter : memref<4x4xi32>
+    %2 = memref.get_global @out : memref<512x64xi32>
+    affine.for %arg0 = 0 to 512 {
+      affine.for %arg1 = 0 to 64 {
+        %3 = affine.for %arg2 = 0 to 4 iter_args(%arg3 = %c0_i32) -> (i32) {
+          %4 = affine.for %arg4 = 0 to 4 iter_args(%arg5 = %arg3) -> (i32) {
+            %5 = affine.load %0[%arg0 + %arg2, %arg1 + %arg4] : memref<515x67xi32>
+            %6 = affine.load %1[%arg2, %arg4] : memref<4x4xi32>
+            %7 = arith.muli %5, %6 : i32
+            %8 = arith.addi %arg5, %7 : i32
+            affine.yield %8 : i32
+          }
+          affine.yield %4 : i32
+        }
+        affine.store %3, %2[%arg0, %arg1] : memref<512x64xi32>
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
+
+// conv (direct store): the innermost loop loads and stores the output element directly
+module @conv_2{
+  memref.global @out : memref<512x64xi32> = uninitialized
+  memref.global @filter : memref<4x4xi32> = uninitialized
+  memref.global @im : memref<515x67xi32> = uninitialized
+  func.func @main() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+    %c0_i32 = arith.constant 0 : i32
+    %0 = memref.get_global @im : memref<515x67xi32>
+    %1 = memref.get_global @out : memref<512x64xi32>
+    affine.for %arg0 = 0 to 512 {
+      affine.for %arg1 = 0 to 64 {
+        affine.for %arg2 = 0 to 4 {
+          affine.for %arg3 = 0 to 4 {
+            %2 = affine.load %0[%arg0 + %arg2, %arg1 + %arg3] : memref<515x67xi32>
+            %3 = affine.load %1[%arg0, %arg1] : memref<512x64xi32>
+            %4 = arith.addi %3, %2 : i32
+            affine.store %4, %1[%arg0, %arg1] : memref<512x64xi32>
+          }
+        }
+      }
+    }
+    return %c0_i32 : i32
+  }
+}
\ No newline at end of file

From 1bbf3f69623fdeeb5774d387e212c3bb39e75ce0 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 21 May 2024 20:16:42 -0400
Subject: [PATCH 7/7] add comments on the raising fors

---
 lib/polygeist/Passes/RaiseToLinalg.cpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/lib/polygeist/Passes/RaiseToLinalg.cpp b/lib/polygeist/Passes/RaiseToLinalg.cpp
index 320243ba415a..254d3a11881b 100644
--- a/lib/polygeist/Passes/RaiseToLinalg.cpp
+++ b/lib/polygeist/Passes/RaiseToLinalg.cpp
@@ -207,6 +207,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
 
     SmallVector<std::pair<std::vector<Condition>, AffineLoadOp>> loads;
     SmallVector<std::pair<std::vector<Condition>, AffineStoreOp>> stores;
+    // TODO Also collect all the linalg generics!
 
     // Check that the only operations within the region are either:
     //      affine.load, affine.store, affine.if, affine.yield
@@ -251,6 +252,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
 
     // Check that all of the stores do not alias the loaded values (otherwise we could get an incorrect result)
     // TODO we can extend this and handle things like reductions, but we're going to start easy for now
+    // TODO 
     DenseMap<AffineLoadOp, AffineStoreOp> stores_map;
     for (auto &&[_, store] : stores) {
         for (auto &&[_, load]: loads) {
@@ -272,6 +274,8 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
             }
         }
     }
+    // Check that any other loads / stores do not alias with any linalg generics
+    // We're going to need to upgrade the definition of mayAlias for subviews (i.e. mayAlias(subview, x) -> mayAlias(operand(subview), x))
 
     SmallVector<Value> inputs;
     SmallVector<AffineMap> affineMaps;
@@ -347,6 +351,7 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         affineMaps.push_back(newAffineMap);
         inputs.push_back(newMemref);
     }
+    // TODO Push all of the inputs to the linalg generics (modifying maps as needed)
     
     SmallVector<Value> outputs;
     // Store we may need to reindex into a splat potentially later, but for now we'll be lazy
@@ -364,18 +369,24 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
         affineMaps.push_back(newAffineMap);
         outputs.push_back(newMemref);
     }
+    // TODO Push all of the outputs to the linalg generics
 
+    // TODO presently, if a linalg generic exists, assert there are no loads/stores
+    // TODO assert only zero or one linalg generic exists
     SmallVector<utils::IteratorType> iteratorTypes;
-    // TODO revisit this later
+    // TODO if a linalg generic exists, prepend this iterator type to the existing iterators
     iteratorTypes.push_back((stores_map.size() == 0) ? utils::IteratorType::parallel : utils::IteratorType::reduction);
 
+
+
     StringAttr empty = StringAttr::get(loop.getContext());
     auto genericOp = rewriter.create<mlir::linalg::GenericOp>(
       loop.getLoc(), TypeRange(), inputs, outputs, affineMaps, iteratorTypes,
       empty,
       empty);
 
-
+    // TODO if doing the linalg generic case, ignore a lot of the below and instead of injecting the old body of the affine.for, move the inner linalg.generic body 
+    // and also add a new induction variable
     auto blk = &*loop.getRegion().begin();
     rewriter.setInsertionPointToStart(blk);
 
@@ -435,6 +446,8 @@ struct AffineForOpRaising : public OpRewritePattern<affine::AffineForOp> {
 
 void RaiseAffineToLinalg::runOnOperation() {
   RewritePatternSet patterns(&getContext());
+  // TODO add the existing canonicalization patterns
+  //  + subview of an affine apply -> subview
   patterns.insert<AffineForOpRaising>(&getContext());
 
   GreedyRewriteConfig config;