// @main_graph: for each of the 8 slices along the leading dimension of %arg0,
// runs a tiled krnl.matmul of that 1024x768 slice against the 768x768 constant
// "constant_0" (all ones), writing into a freshly allocated, zero-filled
// 8x1024x768 buffer that is returned.
//
// The bias-add epilogue using "constant_1" is currently disabled (see the
// commented-out section before the closing brace), so the returned buffer
// holds only what the krnl.matmul ops produce.
func.func @main_graph(%arg0: memref<8x1024x768xf32>) -> memref<8x1024x768xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c768 = arith.constant 768 : index
  %c1024 = arith.constant 1024 : index
  %c8 = arith.constant 8 : index

  // 768x768 operand passed as the second matrix of krnl.matmul; every element is 1.0.
  %0 = "krnl.global"() {name = "constant_0", shape = [768, 768], value = dense<1.000000e+00> : tensor<768x768xf32>} : () -> memref<768x768xf32>
  // 768-element vector, every element 1.0. Only referenced by the disabled
  // bias-add code below, so it is unused in the active code path.
  %1 = "krnl.global"() {name = "constant_1", shape = [768], value = dense<1.000000e+00> : tensor<768xf32>} : () -> memref<768xf32>

  // Output buffer; also used as the third (C) operand of krnl.matmul.
  %alloc = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>

  // Single-iteration loop (0 to 1) marked krnl.parallel. All work is guarded
  // by "induction variable == 0" inside the body.
  %2 = krnl.define_loops 1
  krnl.parallel(%2) : !krnl.loop
  krnl.iterate(%2) with (%2 -> %arg1 = 0 to 1){
    %4 = krnl.get_induction_var_value(%2) : (!krnl.loop) -> index
    %5 = arith.cmpi eq, %4, %c0 : index
    scf.if %5 {
      // Fill the output with 0.0 before the matmul tiles write into it.
      // NOTE(review): presumably required because krnl.matmul accumulates
      // into its C operand -- confirm against the Krnl dialect reference.
      krnl.memset %alloc, %cst : memref<8x1024x768xf32>

      // Loop over the leading dimension (0 to 8); %7 selects the slice.
      %6 = krnl.define_loops 1
      krnl.iterate(%6) with (%6 -> %arg2 = %c0 to %c8){
        %7 = krnl.get_induction_var_value(%6) : (!krnl.loop) -> index

        // Three loops with bounds 1024, 768, 768, blocked by 4, 8 and 8.
        %8:3 = krnl.define_loops 3
        %loop_block_1, %loop_local_2 = krnl.block %8#0 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
        %loop_block_3, %loop_local_4 = krnl.block %8#1 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
        %loop_block_5, %loop_local_6 = krnl.block %8#2 8 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)

        // Reorder so the three block loops come first, followed by the three
        // local (intra-tile) loops.
        krnl.permute(%loop_block_1, %loop_local_2, %loop_block_3, %loop_local_4, %loop_block_5, %loop_local_6) [0, 3, 1, 4, 2, 5] : !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop

        // Iterate over the block loops only; the local loops are handed to
        // krnl.matmul, which covers one 4x8x8 compute tile per iteration.
        krnl.iterate(%loop_block_1, %loop_block_3, %loop_block_5) with (%8#0 -> %arg3 = %c0 to %c1024, %8#1 -> %arg4 = %c0 to %c768, %8#2 -> %arg5 = %c0 to %c768){
          %9:3 = krnl.get_induction_var_value(%loop_block_1, %loop_block_3, %loop_block_5) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
          // Operands: slice %7 of %arg0, %0, and slice %7 of %alloc; tile
          // origin (%9#0, %9#1, %9#2); global upper bounds (1024, 768, 768).
          krnl.matmul %arg0[%7, %c0, %c0], %0[%c0, %c0], %alloc[%7, %c0, %c0],
              (%loop_local_2, %loop_local_4, %loop_local_6),
              (%9#0, %9#1, %9#2),
              (%c1024, %c768, %c768)
              {aTileSize = [], bTileSize = [], cTileSize = [], computeTileSize = [4, 8, 8]}
              : memref<8x1024x768xf32>, memref<768x768xf32>, memref<8x1024x768xf32>, (!krnl.loop, !krnl.loop, !krnl.loop)
        }
      }
    }
  }
  return %alloc : memref<8x1024x768xf32>

  // Disabled bias-add epilogue, kept for reference. It would add %1 to every
  // row of %alloc in 32-wide vectors, store into a second buffer %alloc_0,
  // and return that buffer instead of %alloc.
  //
  // %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<8x1024x768xf32>
  // %3:3 = krnl.define_loops 3
  // %loop_block, %loop_local = krnl.block %3#2 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
  // krnl.iterate(%3#0, %3#1, %loop_block) with (%3#0 -> %arg1 = 0 to 8, %3#1 -> %arg2 = 0 to 1024, %3#2 -> %arg3 = 0 to 768){
  //   %4:3 = krnl.get_induction_var_value(%3#0, %3#1, %loop_block) : (!krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index)
  //   %5 = vector.load %alloc[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
  //   %6 = vector.load %1[%4#2] : memref<768xf32>, vector<32xf32>
  //   %7 = arith.addf %5, %6 : vector<32xf32>
  //   vector.store %7, %alloc_0[%4#0, %4#1, %4#2] : memref<8x1024x768xf32>, vector<32xf32>
  // }
  // return %alloc_0 : memref<8x1024x768xf32>
}