An issue in ConvertKrnlToAffine Pass used in PR#2756 #2773

Open
imaihal opened this issue Mar 24, 2024 · 4 comments

imaihal commented Mar 24, 2024

This issue was observed when applying --convert-krnl-to-affine to the Krnl IR generated in PR #2756. In that PR, the following kind of Krnl IR is generated and then lowered to affine with --convert-krnl-to-affine, but I got an error in some cases.

%loop0 = krnl.define_loops 1
krnl.parallel(%loop0) : !krnl.loop
krnl.iterate(%loop0) with (%loop0 -> %arg2 = 0 to 2){
  %idx0 = krnl.get_induction_var_value(%loop0) : (!krnl.loop) -> (index)
  %0 = arith.cmpi eq, %idx0, 0 : index
  scf.if %0 {
    // Krnl IR for the ONNX operations (Thread 0)
  }
  %1 = arith.cmpi eq, %idx0, 1 : index
  scf.if %1 {
    // Krnl IR for the ONNX operations (Thread 1)
  }
} // krnl.iterate
  • Error message:
    onnx-mlir-opt: /home/imaihal/onnx-mlir/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp:238: void onnx_mlir::krnl::LoopBodyMover::moveOne(mlir::Value, llvm::SmallDenseMap<mlir::Value, mlir::Operation*, 4>&, bool): Assertion `insertPt != loopBody.end() && "Expecting insertPt in the loop"' failed.
  • Example to reproduce the error:
    Command: onnx-mlir-opt --convert-krnl-to-affine issue-ng.mlir.txt
    This file includes a single MatMulOp in a ForkOp, and the result of the MatMulOp is used in an AddOp.

  • Other observations:
    No error: When the AddOp after the ParallelOp is removed, the error disappeared. (file: issue-ok.mlir.txt)
    No error: When the scf.if ops are commented out, the error disappeared. (file: issue-ok2.mlir.txt)
    Error: When krnl.parallel is commented out, the error still appeared.

  func.func @main_graph(%arg0: memref<8x1024x768xf32>) -> memref<8x1024x768xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c768 = arith.constant 768 : index
    %c1024 = arith.constant 1024 : index
    %c8 = arith.constant 8 : index
    %0 = "krnl.global"() {name = "constant_0", shape = [768, 768], value = dense<1.000000e+00> : tensor<768x768xf32>} : () -> memref<768x768xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [768], value = dense<1.000000e+00> : tensor<768xf32>} : () -> memref<768xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %2 = krnl.define_loops 1
    krnl.parallel(%2) : !krnl.loop
    krnl.iterate(%2) with (%2 -> %arg1 = 0 to 1){
      %4 = krnl.get_induction_var_value(%2) : (!krnl.loop) -> index
      %5 = arith.cmpi eq, %4, %c0 : index
      scf.if %5 {
        // MatMul
        krnl.memset %alloc, %cst : memref<8x1024x768xf32>
        %6 = krnl.define_loops 1
        krnl.iterate(%6) with (%6 -> %arg2 = %c0 to %c8){
          %7 = krnl.get_induction_var_value(%6) : (!krnl.loop) -> index
          %8:3 = krnl.define_loops 3
          %loop_block_1, %loop_local_2 = krnl.block %8#0 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_3, %loop_local_4 = krnl.block %8#1 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_5, %loop_local_6 = krnl.block %8#2 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          krnl.permute(%loop_block_1, %loop_local_2, %loop_block_3, %loop_local_4, %loop_block_5, %loop_local_6) [0, 3, 1, 4, 2, 5] : !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop
          krnl.iterate(%loop_block_1, %loop_block_3, %loop_block_5) with (%8#0 -> %arg3 = %c0 to %c1024, %8#1 -> %arg4 = %c0 to %c768, %8#2 -> %arg5 = %c0 to %c768){
            %9:3 = krnl.get_induction_var_value(%loop_block_1, %loop_block_3, %loop_block_5) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
            krnl.matmul %arg0[%7, %c0, %c0], %0[%c0, %c0], %alloc[%7, %c0, %c0], (%loop_local_2, %loop_local_4, %loop_local_6), (%9#0, %9#1, %9#2), (%c1024, %c768, %c768) {aTileSize = [], bTileSize = [], cTileSize = [], computeTileSize = [4, 8, 8]} : memref<8x1024x768xf32>, memref<768x768xf32>, memref<8x1024x768xf32>, (!krnl.loop, !krnl.loop, !krnl.loop)
          }
        }
      }
    }
//    return %alloc : memref<8x1024x768xf32>
    // Add
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %3:3 = krnl.define_loops 3
    %loop_block, %loop_local = krnl.block %3#2 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
    krnl.iterate(%3#0, %3#1, %loop_block) with (%3#0 -> %arg1 = 0 to 8, %3#1 -> %arg2 = 0 to 1024, %3#2 -> %arg3 = 0 to 768){
      %4:3 = krnl.get_induction_var_value(%3#0, %3#1, %loop_block) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
      %5 = vector.load %alloc[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
      %6 = vector.load %1[%4#2] : memref<768xf32>, vector<32xf32>
      %7 = arith.addf %5, %6 : vector<32xf32>
      vector.store %7, %alloc_0[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
    }
    return %alloc_0 : memref<8x1024x768xf32>
  }

imaihal commented Mar 24, 2024

@chentong319 Do you have any suggestions for solving this issue?

@AlexandreEichenberger If you have additional suggestions, please let me know.


imaihal commented Mar 26, 2024

This sample works when I add scf::IfOp as a delimiter op in ConvertKrnlToAffine.cpp#L314-L316 (i.e., scf.if ops also become boundaries when a loop body is split into movable chunks, as in the diff below).
Is this a reasonable update?

diff --git a/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp b/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
index 8db0af1c..3bc2d36c 100644
--- a/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
+++ b/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
@@ -313,6 +314,8 @@ static void markLoopBodyAsMovable(
 
     // Delimeter ops are delimeter of a movable chunk of code.
     llvm::SmallVector<Operation *> delimeterOps(block.getOps<KrnlIterateOp>());
+    for (auto op : block.getOps<scf::IfOp>())
+      delimeterOps.push_back(op);
     delimeterOps.push_back(block.getTerminator());
     Operation *movableBeginOp = &block.front();
     for (Operation *delimeterOp : delimeterOps) {

@chentong319

A quick answer is that Krnl.RegionOp has to be used for nested krnl loops. I will schedule a discussion meeting after I spend some time on this.
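
Roughly, the nested loop nest would be wrapped in a krnl.region, along the lines of this sketch (loop bounds and value names are only placeholders):

  %outer = krnl.define_loops 1
  krnl.iterate(%outer) with (%outer -> %i = 0 to 8){
    "krnl.region"() ({
      // Nested loops are defined and scheduled inside the region.
      %inner = krnl.define_loops 1
      krnl.iterate(%inner) with (%inner -> %j = 0 to 1024){
        // ... computation ...
      }
    }) : () -> ()
  }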


imaihal commented Apr 22, 2024

@chentong319 Thanks for your comments.
Did your comments suggest modifying the above example as follows? This code actually worked.
An additional question: should we always insert krnl.region when using nested krnl loops, or is there a condition that determines when to use it?
I'm trying to update my PR (I updated it before, but my earlier approach was a bit mistaken). Now I'm inserting krnl.region after the krnl loops have been generated, but is this the correct approach?

  func.func @main_graph(%arg0: memref<8x1024x768xf32>) -> memref<8x1024x768xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c768 = arith.constant 768 : index
    %c1024 = arith.constant 1024 : index
    %c8 = arith.constant 8 : index
    %0 = "krnl.global"() {name = "constant_0", shape = [768, 768], value = dense<1.000000e+00> : tensor<768x768xf32>} : () -> memref<768x768xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [768], value = dense<1.000000e+00> : tensor<768xf32>} : () -> memref<768xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %2 = krnl.define_loops 1
    krnl.parallel(%2) : !krnl.loop
    krnl.iterate(%2) with (%2 -> %arg1 = 0 to 1){
      %4 = krnl.get_induction_var_value(%2) : (!krnl.loop) -> index
      %5 = arith.cmpi eq, %4, %c0 : index
      scf.if %5 {
        // MatMul
        krnl.memset %alloc, %cst : memref<8x1024x768xf32>
        %6 = krnl.define_loops 1
        krnl.iterate(%6) with (%6 -> %arg2 = %c0 to %c8){
///
       "krnl.region"() ({
///
          %7 = krnl.get_induction_var_value(%6) : (!krnl.loop) -> index
          %8:3 = krnl.define_loops 3
          %loop_block_1, %loop_local_2 = krnl.block %8#0 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_3, %loop_local_4 = krnl.block %8#1 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_5, %loop_local_6 = krnl.block %8#2 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          krnl.permute(%loop_block_1, %loop_local_2, %loop_block_3, %loop_local_4, %loop_block_5, %loop_local_6) [0, 3, 1, 4, 2, 5] : !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop
          krnl.iterate(%loop_block_1, %loop_block_3, %loop_block_5) with (%8#0 -> %arg3 = %c0 to %c1024, %8#1 -> %arg4 = %c0 to %c768, %8#2 -> %arg5 = %c0 to %c768){
            %9:3 = krnl.get_induction_var_value(%loop_block_1, %loop_block_3, %loop_block_5) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
            krnl.matmul %arg0[%7, %c0, %c0], %0[%c0, %c0], %alloc[%7, %c0, %c0], (%loop_local_2, %loop_local_4, %loop_local_6), (%9#0, %9#1, %9#2), (%c1024, %c768, %c768) {aTileSize = [], bTileSize = [], cTileSize = [], computeTileSize = [4, 8, 8]} : memref<8x1024x768xf32>, memref<768x768xf32>, memref<8x1024x768xf32>, (!krnl.loop, !krnl.loop, !krnl.loop)
          }
///
        }) : () -> ()
///
        }
      }
    }
//    return %alloc : memref<8x1024x768xf32>
    // Add
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %3:3 = krnl.define_loops 3
    %loop_block, %loop_local = krnl.block %3#2 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
    krnl.iterate(%3#0, %3#1, %loop_block) with (%3#0 -> %arg1 = 0 to 8, %3#1 -> %arg2 = 0 to 1024, %3#2 -> %arg3 = 0 to 768){
      %4:3 = krnl.get_induction_var_value(%3#0, %3#1, %loop_block) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
      %5 = vector.load %alloc[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
      %6 = vector.load %1[%4#2] : memref<768xf32>, vector<32xf32>
      %7 = arith.addf %5, %6 : vector<32xf32>
      vector.store %7, %alloc_0[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
    }
    return %alloc_0 : memref<8x1024x768xf32>
  }
