An issue in ConvertKrnlToAffine Pass used in PR#2756 #2773

Open
imaihal opened this issue Mar 24, 2024 · 4 comments

imaihal commented Mar 24, 2024

This issue was observed when applying --convert-krnl-to-affine to the Krnl IR generated in PR #2756. In that PR, the following kind of Krnl IR is generated and then lowered to affine with --convert-krnl-to-affine, but I got an error in some cases.

%loop0 = krnl.define_loops 1
krnl.parallel(%loop0) : !krnl.loop
krnl.iterate(%loop0) with (%loop0 -> %arg2 = 0 to 2){
  %idx0 = krnl.get_induction_var_value(%loop0) : (!krnl.loop) -> (index)
  %0 = arith.cmpi eq, %idx0, 0 : index
  scf.if %0 {
    // Krnl IR for the ONNX operations (Thread 0)
  }
  %1 = arith.cmpi eq, %idx0, 1 : index
  scf.if %1 {
    // Krnl IR for the ONNX operations (Thread 1)
  }
} // krnl.iterate
  • Error message:
    onnx-mlir-opt: /home/imaihal/onnx-mlir/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp:238: void onnx_mlir::krnl::LoopBodyMover::moveOne(mlir::Value, llvm::SmallDenseMap<mlir::Value, mlir::Operation*, 4>&, bool): Assertion `insertPt != loopBody.end() && "Expecting insertPt in the loop"' failed.
  • Example to reproduce the error:
    Command: onnx-mlir-opt --convert-krnl-to-affine issue-ng.mlir.txt
    This file includes a single MatMulOp in a ForkOp, and the result of the MatMulOp is used in an AddOp.

  • Other observations:
    No error: When the AddOp after the ParallelOp is removed, the error disappeared. (file: issue-ok.mlir.txt)
    No error: When the scf.if ops are commented out, the error disappeared. (file: issue-ok2.mlir.txt)
    Error: When krnl.parallel is commented out, the error still appeared.

  func.func @main_graph(%arg0: memref<8x1024x768xf32>) -> memref<8x1024x768xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c768 = arith.constant 768 : index
    %c1024 = arith.constant 1024 : index
    %c8 = arith.constant 8 : index
    %0 = "krnl.global"() {name = "constant_0", shape = [768, 768], value = dense<1.000000e+00> : tensor<768x768xf32>} : () -> memref<768x768xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [768], value = dense<1.000000e+00> : tensor<768xf32>} : () -> memref<768xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %2 = krnl.define_loops 1
    krnl.parallel(%2) : !krnl.loop
    krnl.iterate(%2) with (%2 -> %arg1 = 0 to 1){
      %4 = krnl.get_induction_var_value(%2) : (!krnl.loop) -> index
      %5 = arith.cmpi eq, %4, %c0 : index
      scf.if %5 {
        // MatMul
        krnl.memset %alloc, %cst : memref<8x1024x768xf32>
        %6 = krnl.define_loops 1
        krnl.iterate(%6) with (%6 -> %arg2 = %c0 to %c8){
          %7 = krnl.get_induction_var_value(%6) : (!krnl.loop) -> index
          %8:3 = krnl.define_loops 3
          %loop_block_1, %loop_local_2 = krnl.block %8#0 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_3, %loop_local_4 = krnl.block %8#1 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_5, %loop_local_6 = krnl.block %8#2 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          krnl.permute(%loop_block_1, %loop_local_2, %loop_block_3, %loop_local_4, %loop_block_5, %loop_local_6) [0, 3, 1, 4, 2, 5] : !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop
          krnl.iterate(%loop_block_1, %loop_block_3, %loop_block_5) with (%8#0 -> %arg3 = %c0 to %c1024, %8#1 -> %arg4 = %c0 to %c768, %8#2 -> %arg5 = %c0 to %c768){
            %9:3 = krnl.get_induction_var_value(%loop_block_1, %loop_block_3, %loop_block_5) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
            krnl.matmul %arg0[%7, %c0, %c0], %0[%c0, %c0], %alloc[%7, %c0, %c0], (%loop_local_2, %loop_local_4, %loop_local_6), (%9#0, %9#1, %9#2), (%c1024, %c768, %c768) {aTileSize = [], bTileSize = [], cTileSize = [], computeTileSize = [4, 8, 8]} : memref<8x1024x768xf32>, memref<768x768xf32>, memref<8x1024x768xf32>, (!krnl.loop, !krnl.loop, !krnl.loop)
          }
        }
      }
    }
//    return %alloc : memref<8x1024x768xf32>
    // Add
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %3:3 = krnl.define_loops 3
    %loop_block, %loop_local = krnl.block %3#2 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
    krnl.iterate(%3#0, %3#1, %loop_block) with (%3#0 -> %arg1 = 0 to 8, %3#1 -> %arg2 = 0 to 1024, %3#2 -> %arg3 = 0 to 768){
      %4:3 = krnl.get_induction_var_value(%3#0, %3#1, %loop_block) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
      %5 = vector.load %alloc[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
      %6 = vector.load %1[%4#2] : memref<768xf32>, vector<32xf32>
      %7 = arith.addf %5, %6 : vector<32xf32>
      vector.store %7, %alloc_0[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
    }
    return %alloc_0 : memref<8x1024x768xf32>
  }

imaihal commented Mar 24, 2024

@chentong319 Do you have any suggestions for solving this issue?

@AlexandreEichenberger If you have additional suggestions, please let me know.


imaihal commented Mar 26, 2024

This sample works when I add scf::IfOp as a delimiter op in ConvertKrnlToAffine.cpp#L314-L316 (i.e., scf.if ops also become boundaries when a loop body is split into movable chunks, as in the diff below).
Is this a reasonable update?

diff --git a/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp b/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
index 8db0af1c..3bc2d36c 100644
--- a/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
+++ b/src/Conversion/KrnlToAffine/ConvertKrnlToAffine.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
@@ -313,6 +314,8 @@ static void markLoopBodyAsMovable(
 
     // Delimeter ops are delimeter of a movable chunk of code.
     llvm::SmallVector<Operation *> delimeterOps(block.getOps<KrnlIterateOp>());
+    for (auto op : block.getOps<scf::IfOp>())
+      delimeterOps.push_back(op);
     delimeterOps.push_back(block.getTerminator());
     Operation *movableBeginOp = &block.front();
     for (Operation *delimeterOp : delimeterOps) {

@chentong319

A quick answer is that Krnl.RegionOp has to be used for nested krnl loops. I will schedule a discussion meeting after I spend some time on this.
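
Roughly, the nested loop nest would be wrapped in a krnl.region, along the lines of this sketch (loop bounds and value names are only placeholders):

  %outer = krnl.define_loops 1
  krnl.iterate(%outer) with (%outer -> %i = 0 to 8){
    "krnl.region"() ({
      // Nested loops are defined and scheduled inside the region.
      %inner = krnl.define_loops 1
      krnl.iterate(%inner) with (%inner -> %j = 0 to 1024){
        // ... computation ...
      }
    }) : () -> ()
  }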


imaihal commented Apr 22, 2024

@chentong319 Thanks for your comments.
Did your comments suggest modifying the above example as follows? This code actually worked.
An additional question: should we always insert krnl.region when using nested krnl loops, or is there a condition that determines when to use it?
I'm trying to update my PR (I updated it before, but my earlier approach was a bit mistaken). Now I'm inserting krnl.region after the krnl loops have been generated, but is this the correct approach?

  func.func @main_graph(%arg0: memref<8x1024x768xf32>) -> memref<8x1024x768xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c768 = arith.constant 768 : index
    %c1024 = arith.constant 1024 : index
    %c8 = arith.constant 8 : index
    %0 = "krnl.global"() {name = "constant_0", shape = [768, 768], value = dense<1.000000e+00> : tensor<768x768xf32>} : () -> memref<768x768xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [768], value = dense<1.000000e+00> : tensor<768xf32>} : () -> memref<768xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %2 = krnl.define_loops 1
    krnl.parallel(%2) : !krnl.loop
    krnl.iterate(%2) with (%2 -> %arg1 = 0 to 1){
      %4 = krnl.get_induction_var_value(%2) : (!krnl.loop) -> index
      %5 = arith.cmpi eq, %4, %c0 : index
      scf.if %5 {
        // MatMul
        krnl.memset %alloc, %cst : memref<8x1024x768xf32>
        %6 = krnl.define_loops 1
        krnl.iterate(%6) with (%6 -> %arg2 = %c0 to %c8){
///
       "krnl.region"() ({
///
          %7 = krnl.get_induction_var_value(%6) : (!krnl.loop) -> index
          %8:3 = krnl.define_loops 3
          %loop_block_1, %loop_local_2 = krnl.block %8#0 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_3, %loop_local_4 = krnl.block %8#1 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          %loop_block_5, %loop_local_6 = krnl.block %8#2 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
          krnl.permute(%loop_block_1, %loop_local_2, %loop_block_3, %loop_local_4, %loop_block_5, %loop_local_6) [0, 3, 1, 4, 2, 5] : !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop
          krnl.iterate(%loop_block_1, %loop_block_3, %loop_block_5) with (%8#0 -> %arg3 = %c0 to %c1024, %8#1 -> %arg4 = %c0 to %c768, %8#2 -> %arg5 = %c0 to %c768){
            %9:3 = krnl.get_induction_var_value(%loop_block_1, %loop_block_3, %loop_block_5) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
            krnl.matmul %arg0[%7, %c0, %c0], %0[%c0, %c0], %alloc[%7, %c0, %c0], (%loop_local_2, %loop_local_4, %loop_local_6), (%9#0, %9#1, %9#2), (%c1024, %c768, %c768) {aTileSize = [], bTileSize = [], cTileSize = [], computeTileSize = [4, 8, 8]} : memref<8x1024x768xf32>, memref<768x768xf32>, memref<8x1024x768xf32>, (!krnl.loop, !krnl.loop, !krnl.loop)
          }
///
        }) : () -> ()
///
        }
      }
    }
//    return %alloc : memref<8x1024x768xf32>
    // Add
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
    %3:3 = krnl.define_loops 3
    %loop_block, %loop_local = krnl.block %3#2 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
    krnl.iterate(%3#0, %3#1, %loop_block) with (%3#0 -> %arg1 = 0 to 8, %3#1 -> %arg2 = 0 to 1024, %3#2 -> %arg3 = 0 to 768){
      %4:3 = krnl.get_induction_var_value(%3#0, %3#1, %loop_block) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
      %5 = vector.load %alloc[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
      %6 = vector.load %1[%4#2] : memref<768xf32>, vector<32xf32>
      %7 = arith.addf %5, %6 : vector<32xf32>
      vector.store %7, %alloc_0[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
    }
    return %alloc_0 : memref<8x1024x768xf32>
  }
