Skip to content

Commit

Permalink
[mlir][spirv] Adapt subview legalization to the updated op semantics.
Browse files Browse the repository at this point in the history
The subview op semantics changed recently to allow for a more natural
representation of constant offsets and strides. The legalization of the
subview op for lowering to SPIR-V needs to account for this.
Also change the linearization to use the strides from the affine map
of a memref.

Differential Revision: https://reviews.llvm.org/D80270
  • Loading branch information
MaheshRavishankar committed May 20, 2020
1 parent 55430f5 commit 0e88eb5
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 89 deletions.
15 changes: 15 additions & 0 deletions mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
Original file line number Diff line number Diff line change
Expand Up @@ -2685,6 +2685,21 @@ def SubViewOp : Std_Op<"subview", [
/// constructed with `b` at location `loc`.
SmallVector<Range, 8> getOrCreateRanges(OpBuilder &b, Location loc);

/// Return the offsets as Values. Each Value is either the dynamic
/// value specified in the op or a ConstantIndexOp constructed
/// with `b` at location `loc`
SmallVector<Value, 4> getOrCreateOffsets(OpBuilder &b, Location loc);

/// Return the sizes as Values. Each Value is either the dynamic
/// value specified in the op or a ConstantIndexOp constructed
/// with `b` at location `loc`
SmallVector<Value, 4> getOrCreateSizes(OpBuilder &b, Location loc);

/// Return the strides as Values. Each Value is either the dynamic
/// value specified in the op or a ConstantIndexOp constructed with
/// `b` at location `loc`
SmallVector<Value, 4> getOrCreateStrides(OpBuilder &b, Location loc);

/// A subview result type can be fully inferred from the source type and the
/// static representation of offsets, sizes and strides. Special sentinels
/// encode the dynamic case.
Expand Down
31 changes: 6 additions & 25 deletions mlir/lib/Conversion/StandardToSPIRV/LegalizeStandardForSPIRV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,34 +64,15 @@ resolveSourceIndices(Location loc, PatternRewriter &rewriter,
// TODO: Aborting when the offsets are static. There might be a way to fold
// the subview op with load even if the offsets have been canonicalized
// away.
if (subViewOp.getNumOffsets() == 0)
return failure();

ValueRange opOffsets = subViewOp.offsets();
SmallVector<Value, 2> opStrides;
if (subViewOp.getNumStrides()) {
// If the strides are dynamic, get the stride operands.
opStrides = llvm::to_vector<2>(subViewOp.strides());
} else {
// When static, the stride operands can be retrieved by taking the strides
// of the result of the subview op, and dividing the strides of the base
// memref.
SmallVector<int64_t, 2> staticStrides;
if (failed(subViewOp.getStaticStrides(staticStrides))) {
return failure();
}
opStrides.reserve(opOffsets.size());
for (auto stride : staticStrides) {
auto constValAttr = rewriter.getIntegerAttr(
IndexType::get(rewriter.getContext()), stride);
opStrides.emplace_back(rewriter.create<ConstantOp>(loc, constValAttr));
}
}
assert(opOffsets.size() == opStrides.size());
SmallVector<Value, 4> opOffsets = subViewOp.getOrCreateOffsets(rewriter, loc);
SmallVector<Value, 4> opStrides = subViewOp.getOrCreateStrides(rewriter, loc);
assert(opOffsets.size() == indices.size() &&
"expected as many indices as rank of subview op result type");
assert(opStrides.size() == indices.size() &&
"expected as many indices as rank of subview op result type");

// New indices for the load are the current indices * subview_stride +
// subview_offset.
assert(indices.size() == opStrides.size());
sourceIndices.resize(indices.size());
for (auto index : llvm::enumerate(indices)) {
auto offset = opOffsets[index.index()];
Expand Down
38 changes: 38 additions & 0 deletions mlir/lib/Dialect/StandardOps/IR/Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2548,6 +2548,44 @@ SmallVector<SubViewOp::Range, 8> SubViewOp::getOrCreateRanges(OpBuilder &b,
return res;
}

/// Returns the subview offsets as Values. A static offset is materialized as
/// a freshly-created ConstantIndexOp at `loc`; a dynamic offset (marked by
/// the dynamic sentinel in the `static_offsets` attribute) is taken from the
/// op's operand list.
SmallVector<Value, 4> SubViewOp::getOrCreateOffsets(OpBuilder &b,
                                                    Location loc) {
  // Dynamic offset operands start at operand 1 (operand 0 is presumably the
  // source memref — confirm against the op definition).
  unsigned operandPos = 1;
  auto offsetAttrs = static_offsets().cast<ArrayAttr>();
  SmallVector<Value, 4> result;
  result.reserve(offsetAttrs.size());
  for (Attribute attr : offsetAttrs) {
    int64_t staticVal = attr.cast<IntegerAttr>().getInt();
    if (ShapedType::isDynamicStrideOrOffset(staticVal)) {
      // Sentinel value: consume the next dynamic operand.
      result.push_back(getOperand(operandPos++));
    } else {
      result.push_back(b.create<ConstantIndexOp>(loc, staticVal));
    }
  }
  return result;
}

/// Returns the subview sizes as Values. A static size is materialized as a
/// freshly-created ConstantIndexOp at `loc`; a dynamic size (marked by the
/// dynamic sentinel in the `static_sizes` attribute) is taken from the op's
/// operand list.
SmallVector<Value, 4> SubViewOp::getOrCreateSizes(OpBuilder &b, Location loc) {
  // Dynamic size operands follow the dynamic offset operands (which start
  // after operand 0 — presumably the source memref).
  unsigned operandPos = 1 + offsets().size();
  auto sizeAttrs = static_sizes().cast<ArrayAttr>();
  SmallVector<Value, 4> result;
  result.reserve(sizeAttrs.size());
  for (Attribute attr : sizeAttrs) {
    int64_t staticVal = attr.cast<IntegerAttr>().getInt();
    if (ShapedType::isDynamic(staticVal)) {
      // Sentinel value: consume the next dynamic operand.
      result.push_back(getOperand(operandPos++));
    } else {
      result.push_back(b.create<ConstantIndexOp>(loc, staticVal));
    }
  }
  return result;
}

/// Returns the subview strides as Values. A static stride is materialized as
/// a freshly-created ConstantIndexOp at `loc`; a dynamic stride (marked by
/// the dynamic sentinel in the `static_strides` attribute) is taken from the
/// op's operand list.
SmallVector<Value, 4> SubViewOp::getOrCreateStrides(OpBuilder &b,
                                                    Location loc) {
  // Dynamic stride operands follow the dynamic offset and size operands
  // (which start after operand 0 — presumably the source memref).
  unsigned operandPos = 1 + offsets().size() + sizes().size();
  auto strideAttrs = static_strides().cast<ArrayAttr>();
  SmallVector<Value, 4> result;
  result.reserve(strideAttrs.size());
  for (Attribute attr : strideAttrs) {
    int64_t staticVal = attr.cast<IntegerAttr>().getInt();
    if (ShapedType::isDynamicStrideOrOffset(staticVal)) {
      // Sentinel value: consume the next dynamic operand.
      result.push_back(getOperand(operandPos++));
    } else {
      result.push_back(b.create<ConstantIndexOp>(loc, staticVal));
    }
  }
  return result;
}

LogicalResult
SubViewOp::getStaticStrides(SmallVectorImpl<int64_t> &staticStrides) {
if (!strides().empty())
Expand Down
62 changes: 31 additions & 31 deletions mlir/test/Conversion/GPUToSPIRV/load-store.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,25 @@ module attributes {

// CHECK-LABEL: spv.module Logical GLSL450
gpu.module @kernels {
// CHECK-DAG: spv.globalVariable [[NUMWORKGROUPSVAR:@.*]] built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
// CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
// CHECK-DAG: spv.globalVariable [[WORKGROUPIDVAR:@.*]] built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// CHECK-DAG: spv.globalVariable @[[NUMWORKGROUPSVAR:.*]] built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
// CHECK-DAG: spv.globalVariable @[[LOCALINVOCATIONIDVAR:.*]] built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
// CHECK-DAG: spv.globalVariable @[[WORKGROUPIDVAR:.*]] built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// CHECK-LABEL: spv.func @load_store_kernel
// CHECK-SAME: [[ARG0:%.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 0)>}
// CHECK-SAME: [[ARG1:%.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 1)>}
// CHECK-SAME: [[ARG2:%.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 2)>}
// CHECK-SAME: [[ARG3:%.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 3), StorageBuffer>}
// CHECK-SAME: [[ARG4:%.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 4), StorageBuffer>}
// CHECK-SAME: [[ARG5:%.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 5), StorageBuffer>}
// CHECK-SAME: [[ARG6:%.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 6), StorageBuffer>}
// CHECK-SAME: %[[ARG0:.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 0)>}
// CHECK-SAME: %[[ARG1:.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 1)>}
// CHECK-SAME: %[[ARG2:.*]]: !spv.ptr<!spv.struct<!spv.array<48 x f32, stride=4> [0]>, StorageBuffer> {spv.interface_var_abi = #spv.interface_var_abi<(0, 2)>}
// CHECK-SAME: %[[ARG3:.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 3), StorageBuffer>}
// CHECK-SAME: %[[ARG4:.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 4), StorageBuffer>}
// CHECK-SAME: %[[ARG5:.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 5), StorageBuffer>}
// CHECK-SAME: %[[ARG6:.*]]: i32 {spv.interface_var_abi = #spv.interface_var_abi<(0, 6), StorageBuffer>}
gpu.func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) kernel
attributes {spv.entry_point_abi = {local_size = dense<[16, 1, 1]>: vector<3xi32>}} {
// CHECK: [[ADDRESSWORKGROUPID:%.*]] = spv._address_of [[WORKGROUPIDVAR]]
// CHECK: [[WORKGROUPID:%.*]] = spv.Load "Input" [[ADDRESSWORKGROUPID]]
// CHECK: [[WORKGROUPIDX:%.*]] = spv.CompositeExtract [[WORKGROUPID]]{{\[}}0 : i32{{\]}}
// CHECK: [[ADDRESSLOCALINVOCATIONID:%.*]] = spv._address_of [[LOCALINVOCATIONIDVAR]]
// CHECK: [[LOCALINVOCATIONID:%.*]] = spv.Load "Input" [[ADDRESSLOCALINVOCATIONID]]
// CHECK: [[LOCALINVOCATIONIDX:%.*]] = spv.CompositeExtract [[LOCALINVOCATIONID]]{{\[}}0 : i32{{\]}}
// CHECK: %[[ADDRESSWORKGROUPID:.*]] = spv._address_of @[[WORKGROUPIDVAR]]
// CHECK: %[[WORKGROUPID:.*]] = spv.Load "Input" %[[ADDRESSWORKGROUPID]]
// CHECK: %[[WORKGROUPIDX:.*]] = spv.CompositeExtract %[[WORKGROUPID]]{{\[}}0 : i32{{\]}}
// CHECK: %[[ADDRESSLOCALINVOCATIONID:.*]] = spv._address_of @[[LOCALINVOCATIONIDVAR]]
// CHECK: %[[LOCALINVOCATIONID:.*]] = spv.Load "Input" %[[ADDRESSLOCALINVOCATIONID]]
// CHECK: %[[LOCALINVOCATIONIDX:.*]] = spv.CompositeExtract %[[LOCALINVOCATIONID]]{{\[}}0 : i32{{\]}}
%0 = "gpu.block_id"() {dimension = "x"} : () -> index
%1 = "gpu.block_id"() {dimension = "y"} : () -> index
%2 = "gpu.block_id"() {dimension = "z"} : () -> index
Expand All @@ -54,26 +54,26 @@ module attributes {
%9 = "gpu.block_dim"() {dimension = "x"} : () -> index
%10 = "gpu.block_dim"() {dimension = "y"} : () -> index
%11 = "gpu.block_dim"() {dimension = "z"} : () -> index
// CHECK: [[INDEX1:%.*]] = spv.IAdd [[ARG3]], [[WORKGROUPIDX]]
// CHECK: %[[INDEX1:.*]] = spv.IAdd %[[ARG3]], %[[WORKGROUPIDX]]
%12 = addi %arg3, %0 : index
// CHECK: [[INDEX2:%.*]] = spv.IAdd [[ARG4]], [[LOCALINVOCATIONIDX]]
// CHECK: %[[INDEX2:.*]] = spv.IAdd %[[ARG4]], %[[LOCALINVOCATIONIDX]]
%13 = addi %arg4, %3 : index
// CHECK: [[STRIDE1_1:%.*]] = spv.constant 4 : i32
// CHECK: [[OFFSET1_1:%.*]] = spv.IMul [[STRIDE1_1]], [[INDEX1]] : i32
// CHECK: [[STRIDE1_2:%.*]] = spv.constant 1 : i32
// CHECK: [[UPDATE1_2:%.*]] = spv.IMul [[STRIDE1_2]], [[INDEX2]] : i32
// CHECK: [[OFFSET1_2:%.*]] = spv.IAdd [[OFFSET1_1]], [[UPDATE1_2]] : i32
// CHECK: [[ZERO1:%.*]] = spv.constant 0 : i32
// CHECK: [[PTR1:%.*]] = spv.AccessChain [[ARG0]]{{\[}}[[ZERO1]], [[OFFSET1_2]]{{\]}}
// CHECK-NEXT: [[VAL1:%.*]] = spv.Load "StorageBuffer" [[PTR1]]
// CHECK: %[[STRIDE1_1:.*]] = spv.constant 4 : i32
// CHECK: %[[OFFSET1_1:.*]] = spv.IMul %[[STRIDE1_1]], %[[INDEX1]] : i32
// CHECK: %[[STRIDE1_2:.*]] = spv.constant 1 : i32
// CHECK: %[[UPDATE1_2:.*]] = spv.IMul %[[STRIDE1_2]], %[[INDEX2]] : i32
// CHECK: %[[OFFSET1_2:.*]] = spv.IAdd %[[OFFSET1_1]], %[[UPDATE1_2]] : i32
// CHECK: %[[ZERO1:.*]] = spv.constant 0 : i32
// CHECK: %[[PTR1:.*]] = spv.AccessChain %[[ARG0]]{{\[}}%[[ZERO1]], %[[OFFSET1_2]]{{\]}}
// CHECK-NEXT: %[[VAL1:.*]] = spv.Load "StorageBuffer" %[[PTR1]]
%14 = load %arg0[%12, %13] : memref<12x4xf32>
// CHECK: [[PTR2:%.*]] = spv.AccessChain [[ARG1]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
// CHECK-NEXT: [[VAL2:%.*]] = spv.Load "StorageBuffer" [[PTR2]]
// CHECK: %[[PTR2:.*]] = spv.AccessChain %[[ARG1]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
// CHECK-NEXT: %[[VAL2:.*]] = spv.Load "StorageBuffer" %[[PTR2]]
%15 = load %arg1[%12, %13] : memref<12x4xf32>
// CHECK: [[VAL3:%.*]] = spv.FAdd [[VAL1]], [[VAL2]]
// CHECK: %[[VAL3:.*]] = spv.FAdd %[[VAL1]], %[[VAL2]]
%16 = addf %14, %15 : f32
// CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
// CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
// CHECK: %[[PTR3:.*]] = spv.AccessChain %[[ARG2]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
// CHECK-NEXT: spv.Store "StorageBuffer" %[[PTR3]], %[[VAL3]]
store %16, %arg2[%12, %13] : memref<12x4xf32>
gpu.return
}
Expand Down
36 changes: 19 additions & 17 deletions mlir/test/Conversion/GPUToSPIRV/loop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,29 @@ module attributes {
gpu.module @kernels {
gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>) kernel
attributes {spv.entry_point_abi = {local_size = dense<[16, 1, 1]>: vector<3xi32>}} {
// CHECK: [[LB:%.*]] = spv.constant 4 : i32
// CHECK: %[[LB:.*]] = spv.constant 4 : i32
%lb = constant 4 : index
// CHECK: [[UB:%.*]] = spv.constant 42 : i32
// CHECK: %[[UB:.*]] = spv.constant 42 : i32
%ub = constant 42 : index
// CHECK: [[STEP:%.*]] = spv.constant 2 : i32
// CHECK: %[[STEP:.*]] = spv.constant 2 : i32
%step = constant 2 : index
// CHECK: spv.loop {
// CHECK-NEXT: spv.Branch [[HEADER:\^.*]]([[LB]] : i32)
// CHECK: [[HEADER]]([[INDVAR:%.*]]: i32):
// CHECK: [[CMP:%.*]] = spv.SLessThan [[INDVAR]], [[UB]] : i32
// CHECK: spv.BranchConditional [[CMP]], [[BODY:\^.*]], [[MERGE:\^.*]]
// CHECK: [[BODY]]:
// CHECK: [[STRIDE1:%.*]] = spv.constant 1 : i32
// CHECK: [[OFFSET1:%.*]] = spv.IMul [[STRIDE1]], [[INDVAR]] : i32
// CHECK: spv.AccessChain {{%.*}}{{\[}}{{%.*}}, [[OFFSET1]]{{\]}} : {{.*}}
// CHECK: [[STRIDE2:%.*]] = spv.constant 1 : i32
// CHECK: [[OFFSET2:%.*]] = spv.IMul [[STRIDE2]], [[INDVAR]] : i32
// CHECK: spv.AccessChain {{%.*}}{{\[}}{{%.*}}, [[OFFSET2]]{{\]}} : {{.*}}
// CHECK: [[INCREMENT:%.*]] = spv.IAdd [[INDVAR]], [[STEP]] : i32
// CHECK: spv.Branch [[HEADER]]([[INCREMENT]] : i32)
// CHECK: [[MERGE]]
// CHECK-NEXT: spv.Branch ^[[HEADER:.*]](%[[LB]] : i32)
// CHECK: ^[[HEADER]](%[[INDVAR:.*]]: i32):
// CHECK: %[[CMP:.*]] = spv.SLessThan %[[INDVAR]], %[[UB]] : i32
// CHECK: spv.BranchConditional %[[CMP]], ^[[BODY:.*]], ^[[MERGE:.*]]
// CHECK: ^[[BODY]]:
// CHECK: %[[STRIDE1:.*]] = spv.constant 1 : i32
// CHECK: %[[INDEX1:.*]] = spv.IMul %[[STRIDE1]], %[[INDVAR]] : i32
// CHECK: %[[ZERO1:.*]] = spv.constant 0 : i32
// CHECK: spv.AccessChain {{%.*}}{{\[}}%[[ZERO1]], %[[INDEX1]]{{\]}}
// CHECK: %[[STRIDE2:.*]] = spv.constant 1 : i32
// CHECK: %[[INDEX2:.*]] = spv.IMul %[[STRIDE2]], %[[INDVAR]] : i32
// CHECK: %[[ZERO2:.*]] = spv.constant 0 : i32
// CHECK: spv.AccessChain {{%.*}}[%[[ZERO2]], %[[INDEX2]]]
// CHECK: %[[INCREMENT:.*]] = spv.IAdd %[[INDVAR]], %[[STEP]] : i32
// CHECK: spv.Branch ^[[HEADER]](%[[INCREMENT]] : i32)
// CHECK: ^[[MERGE]]
// CHECK: spv._merge
// CHECK: }
scf.for %arg4 = %lb to %ub step %step {
Expand Down
32 changes: 16 additions & 16 deletions mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,23 @@ module {
func @fold_static_stride_subview
(%arg0 : memref<12x32xf32>, %arg1 : index,
%arg2 : index, %arg3 : index, %arg4 : index) {
// CHECK: %[[C2:.*]] = constant 2
// CHECK: %[[C3:.*]] = constant 3
// CHECK: %[[T0:.*]] = muli %[[ARG3]], %[[C2]]
// CHECK: %[[T1:.*]] = addi %[[ARG1]], %[[T0]]
// CHECK: %[[T2:.*]] = muli %[[ARG4]], %[[C3]]
// CHECK: %[[T3:.*]] = addi %[[ARG2]], %[[T2]]
// CHECK: %[[LOADVAL:.*]] = load %[[ARG0]][%[[T1]], %[[T3]]]
// CHECK: %[[STOREVAL:.*]] = sqrt %[[LOADVAL]]
// CHECK: %[[T6:.*]] = muli %[[ARG3]], %[[C2]]
// CHECK: %[[T7:.*]] = addi %[[ARG1]], %[[T6]]
// CHECK: %[[T8:.*]] = muli %[[ARG4]], %[[C3]]
// CHECK: %[[T9:.*]] = addi %[[ARG2]], %[[T8]]
// CHECK store %[[STOREVAL]], %[[ARG0]][%[[T7]], %[[T9]]]
%0 = subview %arg0[%arg1, %arg2][4, 4][2, 3] : memref<12x32xf32> to memref<4x4xf32, offset:?, strides: [64, 3]>
%1 = load %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [64, 3]>
// CHECK-DAG: %[[C2:.*]] = constant 2
// CHECK-DAG: %[[C3:.*]] = constant 3
// CHECK: %[[T0:.*]] = muli %[[ARG3]], %[[C3]]
// CHECK: %[[T1:.*]] = addi %[[ARG1]], %[[T0]]
// CHECK: %[[T2:.*]] = muli %[[ARG4]], %[[ARG2]]
// CHECK: %[[T3:.*]] = addi %[[T2]], %[[C2]]
// CHECK: %[[LOADVAL:.*]] = load %[[ARG0]][%[[T1]], %[[T3]]]
// CHECK: %[[STOREVAL:.*]] = sqrt %[[LOADVAL]]
// CHECK: %[[T6:.*]] = muli %[[ARG3]], %[[C3]]
// CHECK: %[[T7:.*]] = addi %[[ARG1]], %[[T6]]
// CHECK: %[[T8:.*]] = muli %[[ARG4]], %[[ARG2]]
// CHECK: %[[T9:.*]] = addi %[[T8]], %[[C2]]
// CHECK: store %[[STOREVAL]], %[[ARG0]][%[[T7]], %[[T9]]]
%0 = subview %arg0[%arg1, 2][4, 4][3, %arg2] : memref<12x32xf32> to memref<4x4xf32, offset:?, strides: [96, ?]>
%1 = load %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [96, ?]>
%2 = sqrt %1 : f32
store %2, %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [64, 3]>
store %2, %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [96, ?]>
return
}

Expand Down

0 comments on commit 0e88eb5

Please sign in to comment.