diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 51be1e4431e70..4d31e86f8dcf6 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -23,6 +23,7 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/UB/IR/UBMatchers.h" #include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Utils/VerificationUtils.h" #include "mlir/IR/AffineExpr.h" @@ -5625,11 +5626,103 @@ struct TransferReadAfterWriteToBroadcast return success(); } }; + +/// Folds a transfer_read that reads from the result of a transfer_write on +/// the same region (Read-After-Write) into arithmetic on the written value, +/// the original tensor, the masks, and the read's padding. +/// +/// The general semantics are: +/// +/// written_tensor[i] = wMask[i] ? valToStore[i] : original[i] +/// result[i] = rMask[i] ? written_tensor[i] : rPad +/// +/// Which gives: +/// result = select(rMask, select(wMask, valToStore, original), +/// broadcast(rPad)) +/// +/// Special cases avoid emitting unnecessary IR: +/// - No wMask (unmasked write): wMask is implicitly all-true, inner select +/// collapses to valToStore. +/// - No rMask (unmasked read): rMask is implicitly all-true, outer select +/// collapses away. +/// - wMask == rMask: the original tensor is never needed (anywhere rMask is +/// true, wMask is also true), so the inner select collapses to valToStore. +/// +/// After bufferization, this generally removes the need for materializing the +/// write to memory. +struct FoldTransferReadAfterTransferWrite + : public OpRewritePattern { + using Base::Base; + + LogicalResult matchAndRewrite(TransferReadOp readOp, + PatternRewriter &rewriter) const override { + if (!readOp.hasPureTensorSemantics()) + return failure(); + + if (readOp->getParentOfType()) + return failure(); + + auto writeOp = + dyn_cast_if_present(readOp.getBase().getDefiningOp()); + if (!writeOp || !writeOp.hasPureTensorSemantics()) + return failure(); + + Value valToStore = writeOp.getValueToStore(); + if (valToStore.getType() != readOp.getType()) + return failure(); + + if ((llvm::any_of(readOp.getIndices(), + [](Value v) { return !isZeroInteger(v); }) || + llvm::any_of(writeOp.getIndices(), + [](Value v) { return !isZeroInteger(v); })) && + (readOp.getIndices() != writeOp.getIndices())) + return failure(); + + if (!readOp.getPermutationMap().isMinorIdentity() || + !writeOp.getPermutationMap().isMinorIdentity()) + return failure(); + + // We cannot fold when both of them are out of bounds. + // If one of them is in bounds but the other one isn't, then + // we can take advantage of undefined behaviour to fold. + if (readOp.hasOutOfBoundsDim() && writeOp.hasOutOfBoundsDim()) + return failure(); + + TypedValue wMask = writeOp.getMask(); + TypedValue rMask = readOp.getMask(); + + // Build the inner value: select(wMask, valToStore, original). + // When wMask is absent (unmasked write) or wMask == rMask (original is + // never accessed), this simplifies to just valToStore. + Value inner = valToStore; + bool needsOriginal = wMask && wMask != rMask; + if (needsOriginal) { + Value originalRead = TransferReadOp::create( + rewriter, readOp.getLoc(), readOp.getType(), writeOp.getBase(), + readOp.getIndices(), readOp.getPermutationMap(), readOp.getPadding(), + /*mask=*/Value(), readOp.getInBoundsAttr()); + inner = arith::SelectOp::create(rewriter, readOp.getLoc(), wMask, + valToStore, originalRead); + } + + if (!rMask) { + rewriter.replaceOp(readOp, inner); + return success(); + } + + Value rPad = readOp.getPadding(); + Value padVal = BroadcastOp::create(rewriter, rPad.getLoc(), + valToStore.getType(), rPad); + rewriter.replaceOpWithNewOp(readOp, rMask, inner, padVal); + return success(); + } +}; } // namespace void TransferReadOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } FailureOr>> diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 6aa92ab79a0dd..3a65a6a70928e 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1991,24 +1991,6 @@ func.func @negative_store_to_load_tensor_memref( // ----- -// CHECK-LABEL: func @negative_store_to_load_tensor_no_actual_broadcast -// CHECK-NOT: vector.broadcast -// CHECK-NOT: vector.transpose -// CHECK: vector.transfer_write -// CHECK: vector.transfer_read -func.func @negative_store_to_load_tensor_no_actual_broadcast(%arg0 : tensor, - %v0 : vector<4x2xf32>) -> vector<4x2xf32> { - %c0 = arith.constant 0 : index - %cf0 = arith.constant 0.0 : f32 - %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] : - vector<4x2xf32>, tensor - %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true]} : - tensor, vector<4x2xf32> - return %0 : vector<4x2xf32> -} - -// ----- - // CHECK-LABEL: func @negative_store_to_load_tensor_broadcast_out_of_bounds // CHECK-NOT: vector.broadcast // CHECK-NOT: vector.transpose @@ -2106,6 +2088,337 @@ func.func @store_to_load_tensor_forwarding_unit_dim_broadcast( // ----- +// Both write and read are masked with the same mask: the original tensor is +// never needed, so the inner select collapses. Result is +// select(mask, val, broadcast(pad)). +// CHECK-LABEL: func @fold_transfer_raw_both_masked +// CHECK-SAME: %[[T:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf16> +// CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<128xf16> +// CHECK: %[[SEL:.*]] = arith.select %[[MASK]], %[[CST_1]], %[[CST_0]] +// CHECK: return %[[SEL]] +func.func @fold_transfer_raw_both_masked(%t: tensor<128xf16>, %mask: vector<128xi1>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%c0], %mask {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.transfer_read %w[%c0], %cst, %mask {in_bounds = [true]} + : tensor<128xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Masked write, unmasked read: replace with select(wMask, val, read(original)). +// CHECK-LABEL: func @fold_transfer_raw_masked_write_unmasked_read +// CHECK-SAME: %[[T:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f16 +// CHECK-DAG: %[[VAL:.*]] = arith.constant dense<1.000000e+00> : vector<128xf16> +// CHECK: %[[READ:.*]] = vector.transfer_read %[[T]]{{.*}}, %[[CST]] +// CHECK-SAME: {in_bounds = [true]} +// CHECK-SAME: : tensor<128xf16>, vector<128xf16> +// CHECK: %[[SEL:.*]] = arith.select %[[MASK]], %[[VAL]], %[[READ]] +// CHECK: return %[[SEL]] +func.func @fold_transfer_raw_masked_write_unmasked_read(%t: tensor<128xf16>, %mask: vector<128xi1>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%c0], %mask {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.transfer_read %w[%c0], %cst {in_bounds = [true]} + : tensor<128xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Both unmasked: the read is directly replaced by the written value. +// CHECK-LABEL: func @fold_transfer_raw_both_unmasked +// CHECK-DAG: %[[VAL:.*]] = arith.constant dense<1.000000e+00> : vector<128xf16> +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: return %[[VAL]] +func.func @fold_transfer_raw_both_unmasked(%t: tensor<128xf16>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%c0] {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.transfer_read %w[%c0], %cst {in_bounds = [true]} + : tensor<128xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Unmasked write, masked read: result is select(rMask, val, broadcast(pad)). +// CHECK-LABEL: func @fold_transfer_raw_unmasked_write_masked_read +// CHECK-SAME: %[[T:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[VAL:.*]] = arith.constant dense<1.000000e+00> : vector<128xf16> +// CHECK-DAG: %[[PAD:.*]] = arith.constant dense<0.000000e+00> : vector<128xf16> +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VAL]], %[[PAD]] +// CHECK-NEXT: return %[[RES]] +func.func @fold_transfer_raw_unmasked_write_masked_read(%t: tensor<128xf16>, %mask: vector<128xi1>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%c0] {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.transfer_read %w[%c0], %cst, %mask {in_bounds = [true]} + : tensor<128xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Negative test: memref semantics — pattern must not fire. +// CHECK-LABEL: func @negative_fold_transfer_raw_memref +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_memref(%m: memref<128xf16>, %mask: vector<128xi1>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + vector.transfer_write %val, %m[%c0], %mask {in_bounds = [true]} + : vector<128xf16>, memref<128xf16> + %r = vector.transfer_read %m[%c0], %cst, %mask {in_bounds = [true]} + : memref<128xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Negative test: type mismatch between written and read vectors. +// CHECK-LABEL: func @negative_fold_transfer_raw_type_mismatch +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_type_mismatch(%t: tensor<128xf16>) -> vector<64xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%c0] {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.transfer_read %w[%c0], %cst {in_bounds = [true]} + : tensor<128xf16>, vector<64xf16> + return %r : vector<64xf16> +} + +// ----- + +// Negative test: different non-zero indices between write and read. +// CHECK-LABEL: func @negative_fold_transfer_raw_different_indices +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_different_indices( + %t: tensor<256xf16>, %i: index, %j: index) -> vector<128xf16> { + %cst = arith.constant 0.0 : f16 + %val = arith.constant dense<1.0> : vector<128xf16> + %w = vector.transfer_write %val, %t[%i] {in_bounds = [true]} + : vector<128xf16>, tensor<256xf16> + %r = vector.transfer_read %w[%j], %cst {in_bounds = [true]} + : tensor<256xf16>, vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + +// Write has OOB dim, read claims in-bounds, same mask: fold is valid because +// the read's in_bounds=true makes an actual OOB access UB. +// CHECK-LABEL: func @fold_transfer_raw_oob_write_same_mask +// CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] +// CHECK-SAME: %{{[a-zA-Z0-9]+}} +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK: %[[PAD:.*]] = arith.constant dense<0.000000e+00> : vector<1x32x16xf16> +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: %[[SEL:.*]] = arith.select %[[MASK]], %[[VAL]], %[[PAD]] +// CHECK: return %[[SEL]] +func.func @fold_transfer_raw_oob_write_same_mask( + %val: vector<1x32x16xf16>, %sz: index, + %mask: vector<1x32x16xi1>) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0], %mask + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad, %mask + {in_bounds = [true, true, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Negative: both write and read have OOB dims, no masks — fold must NOT fire. +// CHECK-LABEL: func @negative_fold_transfer_raw_oob_both_no_masks +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_oob_both_no_masks( + %val: vector<1x32x16xf16>, %sz: index) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0] + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad + {in_bounds = [true, false, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Negative: both write and read have OOB dims with same mask — fold must NOT +// fire. +// CHECK-LABEL: func @negative_fold_transfer_raw_oob_both_same_mask +// CHECK: vector.transfer_write +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_oob_both_same_mask( + %val: vector<1x32x16xf16>, %sz: index, + %mask: vector<1x32x16xi1>) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0], %mask + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad, %mask + {in_bounds = [true, false, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Only read has OOB dim, write claims in-bounds, same mask: fold is valid +// because the write's in_bounds=true makes an actual OOB access UB. +// CHECK-LABEL: func @fold_transfer_raw_oob_read_only +// CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] +// CHECK-SAME: %{{[a-zA-Z0-9]+}} +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK: %[[PAD:.*]] = arith.constant dense<0.000000e+00> : vector<1x32x16xf16> +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: %[[SEL:.*]] = arith.select %[[MASK]], %[[VAL]], %[[PAD]] +// CHECK: return %[[SEL]] +func.func @fold_transfer_raw_oob_read_only( + %val: vector<1x32x16xf16>, %sz: index, + %mask: vector<1x32x16xi1>) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0], %mask + {in_bounds = [true, true, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad, %mask + {in_bounds = [true, false, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Write has OOB dim, read claims in-bounds, no masks: fold is valid. +// CHECK-LABEL: func @fold_transfer_raw_oob_write_no_masks +// CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] +// CHECK-NOT: vector.transfer_write +// CHECK-NOT: vector.transfer_read +// CHECK: return %[[VAL]] +func.func @fold_transfer_raw_oob_write_no_masks( + %val: vector<1x32x16xf16>, %sz: index) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0] + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad + {in_bounds = [true, true, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Write has OOB dim, read claims in-bounds, different masks: fold is valid. +// The inner select reads from the original tensor (tensor.empty), producing +// select(wMask, val, read(tensor.empty)). The outer select then applies rMask. +// CHECK-LABEL: func @fold_transfer_raw_oob_write_different_masks +// CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[SZ:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[WMASK:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[RMASK:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[PAD_VEC:.*]] = arith.constant dense<0.000000e+00> : vector<1x32x16xf16> +// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f16 +// CHECK: %[[EMPTY:.*]] = tensor.empty(%[[SZ]]) +// CHECK: %[[READ:.*]] = vector.transfer_read %[[EMPTY]]{{.*}}, %[[PAD]] +// CHECK: %[[INNER:.*]] = arith.select %[[WMASK]], %[[VAL]], %[[READ]] +// CHECK: %[[OUTER:.*]] = arith.select %[[RMASK]], %[[INNER]], %[[PAD_VEC]] +// CHECK: return %[[OUTER]] +func.func @fold_transfer_raw_oob_write_different_masks( + %val: vector<1x32x16xf16>, %sz: index, + %wmask: vector<1x32x16xi1>, + %rmask: vector<1x32x16xi1>) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0], %wmask + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad, %rmask + {in_bounds = [true, true, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Write has OOB dim, only write is masked, read claims in-bounds: fold is +// valid. The inner select reads from the original tensor (tensor.empty), +// producing select(wMask, val, read(tensor.empty)). No rMask, so the result +// is the inner select. +// CHECK-LABEL: func @fold_transfer_raw_oob_write_only_write_masked +// CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[SZ:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[MASK:[a-zA-Z0-9]+]] +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f16 +// CHECK: %[[EMPTY:.*]] = tensor.empty(%[[SZ]]) +// CHECK: %[[READ:.*]] = vector.transfer_read %[[EMPTY]]{{.*}}, %[[PAD]] +// CHECK: %[[SEL:.*]] = arith.select %[[MASK]], %[[VAL]], %[[READ]] +// CHECK: return %[[SEL]] +func.func @fold_transfer_raw_oob_write_only_write_masked( + %val: vector<1x32x16xf16>, %sz: index, + %mask: vector<1x32x16xi1>) -> vector<1x32x16xf16> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.0 : f16 + %e = tensor.empty(%sz) : tensor<1x?x16xf16> + %w = vector.transfer_write %val, %e[%c0, %c0, %c0], %mask + {in_bounds = [true, false, true]} : vector<1x32x16xf16>, tensor<1x?x16xf16> + %r = vector.transfer_read %w[%c0, %c0, %c0], %pad + {in_bounds = [true, true, true]} : tensor<1x?x16xf16>, vector<1x32x16xf16> + return %r : vector<1x32x16xf16> +} + +// ----- + +// Negative test: transfer_read is inside a vector.mask — the pattern must not +// fold because the external mask is not visible through getMask(). +// CHECK-LABEL: func @negative_fold_transfer_raw_vector_mask +// CHECK: vector.transfer_write +// CHECK: vector.mask +// CHECK: vector.transfer_read +func.func @negative_fold_transfer_raw_vector_mask( + %t: tensor<128xf16>, %val: vector<128xf16>, + %wmask: vector<128xi1>, %rmask: vector<128xi1>) -> vector<128xf16> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f16 + %w = vector.transfer_write %val, %t[%c0], %wmask {in_bounds = [true]} + : vector<128xf16>, tensor<128xf16> + %r = vector.mask %rmask { + vector.transfer_read %w[%c0], %cst {in_bounds = [true]} + : tensor<128xf16>, vector<128xf16> + } : vector<128xi1> -> vector<128xf16> + return %r : vector<128xf16> +} + +// ----- + // CHECK-LABEL: func @dead_store_tensor // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index