Skip to content

Conversation

banach-space
Copy link
Contributor

NOTE: See #149156 for a smilar change for linalg.unpack

This PR makes sure that we don't generate unnecessary tensor.empty
when vectorizing linalg.pack.

To better visualize the changes implemented here, consider this IR:

func.func @example(
    %src: tensor<64x4xf32>,
    %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {

  %pack = linalg.pack %src
    outer_dims_perm = [1, 0]
    inner_dims_pos = [0, 1]
    inner_tiles = [16, 2]
    into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>

  return %pack : tensor<2x4x16x2xf32>
}

Below is the output after vectorization, BEFORE and AFTER this PR.

BEFORE (note tensor.empty and the fact that %arg1 is not used):

  func.func @example(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
    %1 = vector.shape_cast %0 : vector<64x4xf32> to vector<4x16x2x2xf32>
    %2 = vector.transpose %1, [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
    %3 = tensor.empty() : tensor<2x4x16x2xf32>
    %c0_0 = arith.constant 0 : index
    %4 = vector.transfer_write %2, %3[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
    return %4 : tensor<2x4x16x2xf32>
  }

AFTER (note that %arg1 is correctly used):

func.func @example(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
  %1 = vector.shape_cast %0 : vector<64x4xf32> to vector<4x16x2x2xf32>
  %2 = vector.transpose %1, [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
  %c0_0 = arith.constant 0 : index
  %3 = vector.transfer_write %2, %arg1[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
  return %3 : tensor<2x4x16x2xf32>
}

ADDITIONAL CHANGES:

  • Adds missing CHECK-LABEL in tests.
  • Capitalize LIT test variables names.

NOTE: See llvm#149156 for a smilar change for `linalg.unpack`

This PR makes sure that we don't generate unnecessary `tensor.empty`
when vectorizing `linalg.pack`.

To better visualize the changes implemented here, consider this IR:
```mlir
func.func @example(
    %src: tensor<64x4xf32>,
    %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {

  %pack = linalg.pack %src
    outer_dims_perm = [1, 0]
    inner_dims_pos = [0, 1]
    inner_tiles = [16, 2]
    into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>

  return %pack : tensor<2x4x16x2xf32>
}
```

Below is the output after vectorization, BEFORE and AFTER this PR.

BEFORE (note `tensor.empty` and the fact that `%arg1` is not used):
```mlir
  func.func @example(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
    %1 = vector.shape_cast %0 : vector<64x4xf32> to vector<4x16x2x2xf32>
    %2 = vector.transpose %1, [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
    %3 = tensor.empty() : tensor<2x4x16x2xf32>
    %c0_0 = arith.constant 0 : index
    %4 = vector.transfer_write %2, %3[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
    return %4 : tensor<2x4x16x2xf32>
  }
```

AFTER (note that `%arg1` is correctly used):
```mlir
func.func @example(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
  %1 = vector.shape_cast %0 : vector<64x4xf32> to vector<4x16x2x2xf32>
  %2 = vector.transpose %1, [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
  %c0_0 = arith.constant 0 : index
  %3 = vector.transfer_write %2, %arg1[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
  return %3 : tensor<2x4x16x2xf32>
}
```

ADDITIONAL CHANGES:
  * Adds missing `CHECK-LABEL` in tests.
  * Capitalize LIT test variables names.
@llvmbot
Copy link
Member

llvmbot commented Sep 16, 2025

@llvm/pr-subscribers-mlir-linalg

Author: Andrzej Warzyński (banach-space)

Changes

NOTE: See #149156 for a smilar change for linalg.unpack

This PR makes sure that we don't generate unnecessary tensor.empty
when vectorizing linalg.pack.

To better visualize the changes implemented here, consider this IR:

func.func @<!-- -->example(
    %src: tensor&lt;64x4xf32&gt;,
    %dest: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {

  %pack = linalg.pack %src
    outer_dims_perm = [1, 0]
    inner_dims_pos = [0, 1]
    inner_tiles = [16, 2]
    into %dest : tensor&lt;64x4xf32&gt; -&gt; tensor&lt;2x4x16x2xf32&gt;

  return %pack : tensor&lt;2x4x16x2xf32&gt;
}

Below is the output after vectorization, BEFORE and AFTER this PR.

BEFORE (note tensor.empty and the fact that %arg1 is not used):

  func.func @<!-- -->example(%arg0: tensor&lt;64x4xf32&gt;, %arg1: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor&lt;64x4xf32&gt;, vector&lt;64x4xf32&gt;
    %1 = vector.shape_cast %0 : vector&lt;64x4xf32&gt; to vector&lt;4x16x2x2xf32&gt;
    %2 = vector.transpose %1, [2, 0, 1, 3] : vector&lt;4x16x2x2xf32&gt; to vector&lt;2x4x16x2xf32&gt;
    %3 = tensor.empty() : tensor&lt;2x4x16x2xf32&gt;
    %c0_0 = arith.constant 0 : index
    %4 = vector.transfer_write %2, %3[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector&lt;2x4x16x2xf32&gt;, tensor&lt;2x4x16x2xf32&gt;
    return %4 : tensor&lt;2x4x16x2xf32&gt;
  }

AFTER (note that %arg1 is correctly used):

func.func @<!-- -->example(%arg0: tensor&lt;64x4xf32&gt;, %arg1: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor&lt;64x4xf32&gt;, vector&lt;64x4xf32&gt;
  %1 = vector.shape_cast %0 : vector&lt;64x4xf32&gt; to vector&lt;4x16x2x2xf32&gt;
  %2 = vector.transpose %1, [2, 0, 1, 3] : vector&lt;4x16x2x2xf32&gt; to vector&lt;2x4x16x2xf32&gt;
  %c0_0 = arith.constant 0 : index
  %3 = vector.transfer_write %2, %arg1[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector&lt;2x4x16x2xf32&gt;, tensor&lt;2x4x16x2xf32&gt;
  return %3 : tensor&lt;2x4x16x2xf32&gt;
}

ADDITIONAL CHANGES:

  • Adds missing CHECK-LABEL in tests.
  • Capitalize LIT test variables names.

Full diff: https://github.com/llvm/llvm-project/pull/158926.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp (+2-11)
  • (modified) mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir (+85-77)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 406f05c1b08ef..3ee6ae1029f72 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1776,12 +1776,6 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
         rewriter, loc,
         rewriter.getZeroAttr(packOp.getSourceType().getElementType()));
   }
-  ReifiedRankedShapedTypeDims reifiedReturnShapes;
-  LogicalResult status =
-      cast<ReifyRankedShapedTypeOpInterface>(packOp.getOperation())
-          .reifyResultShapes(rewriter, reifiedReturnShapes);
-  (void)status; // prevent unused variable warning on non-assert builds.
-  assert(succeeded(status) && "failed to reify result shapes");
 
   // If the input vector sizes are not provided, then the vector sizes are
   // determined by the result tensor shape. In case the vector sizes aren't
@@ -1823,11 +1817,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
       rewriter, loc, shapeCastOp.getResult(), destPermutation);
 
   // Create TransferWriteOp.
-  Value dest = tensor::EmptyOp::create(
-      rewriter, loc, reifiedReturnShapes[0],
-      transposeOp.getResult().getType().getElementType());
-  Operation *write =
-      createWriteOrMaskedWrite(rewriter, loc, transposeOp.getResult(), dest);
+  Operation *write = createWriteOrMaskedWrite(
+      rewriter, loc, transposeOp.getResult(), packOp.getDest());
   newResults.push_back(write->getResult(0));
   return success();
 }
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 01eb210a8ff5f..9c2ddb78bbee1 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -1301,25 +1301,27 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf
 // different - vector sizes are inferred (rather than user-specified) and hence
 // masking was used.
 
-func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
-  %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x8x16xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<4x1x32x16x2xf32>
+func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+  %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:    {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
 //      CHECK: return %[[write]] : tensor<4x1x32x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.pack"]} in %src : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op
     transform.yield
   }
@@ -1331,26 +1333,28 @@ module attributes {transform.with_named_sequence} {
 // different - vector sizes are inferred (rather than user-specified) and hence
 // masking was used.
 
-func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// CHECK-LABEL: func @test_vectorize_padded_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c32:.*]] = arith.constant 32 : index
-//  CHECK-DAG: %[[c7:.*]] = arith.constant 7 : index
-//  CHECK-DAG: %[[c15:.*]] = arith.constant 15 : index
-//      CHECK: %[[mask:.*]] = vector.create_mask %[[c32]], %[[c7]], %[[c15]] : vector<32x8x16xi1>
-//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
-// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
+//  CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
+//  CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index
+//      CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C7]], %[[C15]] : vector<32x8x16xi1>
+//      CHECK: %[[READ:.*]] = vector.mask %[[MASK]] {
+// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, true, true]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
 // CHECK-SAME: } : vector<32x8x16xi1> -> vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
 //      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
 
@@ -1364,38 +1368,40 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @test_vectorize_dynamic_pack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
-  %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_dynamic_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_pack(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
+  %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
   return %pack : tensor<?x?x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
-//  CHECK-DAG: %[[d0:.*]] = tensor.dim {{.*}} %[[c0]] : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d1:.*]] = tensor.dim {{.*}} %[[c1]] : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c0_0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c1_0:.*]] = arith.constant 1 : index
-//  CHECK-DAG: %[[d0_0:.*]] = tensor.dim {{.*}} %[[c0_0]] : tensor<?x?xf32>
-//  CHECK-DAG: %[[d1_0:.*]] = tensor.dim {{.*}} %[[c1_0]] : tensor<?x?xf32>
-//      CHECK: %[[mask:.*]] = vector.create_mask %[[d0_0]], %[[d1_0]] : vector<8x16xi1>
-//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
-// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0_1]], %[[c0_1]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+//  CHECK-DAG: %[[D0:.*]] = tensor.dim {{.*}} %[[C0]] : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[D1:.*]] = tensor.dim {{.*}} %[[C1]] : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index
+//  CHECK-DAG: %[[D0_0:.*]] = tensor.dim {{.*}} %[[C0_0]] : tensor<?x?xf32>
+//  CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor<?x?xf32>
+//      CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1>
+//      CHECK: %[[READ:.*]] = vector.mask %[[MASK]] {
+// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, true]} : tensor<?x?xf32>, vector<8x16xf32>
 // CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<8x16xf32> to vector<4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_2:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c16:.*]] = arith.constant 16 : index
-//  CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty(%[[d0]], %[[d1]]) : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d2:.*]] = tensor.dim %[[empty]], {{.*}} : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d3:.*]] = tensor.dim %[[empty]], {{.*}} : tensor<?x?x16x2xf32>
-//      CHECK: %[[mask_0:.*]] = vector.create_mask %[[d2]], %[[d3]], %[[c16]], %[[c2]] : vector<4x1x16x2xi1>
-//      CHECK: %[[masked_write:.*]] = vector.mask %[[mask_0]] {
-// CHECK-SAME:   vector.transfer_write %[[transpose]], %[[empty]][%[[c0_2]], %[[c0_2]], %[[c0_2]], %[[c0_2]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+//  CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+//  CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32>
+//      CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[C16]], %[[C2]] : vector<4x1x16x2xi1>
+//      CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] {
+// CHECK-SAME:   vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor<?x?x16x2xf32>
-//      CHECK: return %[[masked_write]] : tensor<?x?x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<?x?x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1407,22 +1413,23 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: test_vectorize_pack_no_vector_sizes
-func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
-  %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes
+// CHECK-SAME:      %[[SRC:.*]]: tensor<64x4xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<2x4x16x2xf32>
+func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
+  %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
   return %pack : tensor<2x4x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:    {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<64x4xf32> to vector<4x16x2x2xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
-//      CHECK: return %[[write]] : tensor<2x4x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<2x4x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1435,22 +1442,23 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes
-func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @test_vectorize_padded_pack_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[transfer_read:.*]] =  vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {

@llvmbot
Copy link
Member

llvmbot commented Sep 16, 2025

@llvm/pr-subscribers-mlir

Author: Andrzej Warzyński (banach-space)

Changes

NOTE: See #149156 for a smilar change for linalg.unpack

This PR makes sure that we don't generate unnecessary tensor.empty
when vectorizing linalg.pack.

To better visualize the changes implemented here, consider this IR:

func.func @<!-- -->example(
    %src: tensor&lt;64x4xf32&gt;,
    %dest: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {

  %pack = linalg.pack %src
    outer_dims_perm = [1, 0]
    inner_dims_pos = [0, 1]
    inner_tiles = [16, 2]
    into %dest : tensor&lt;64x4xf32&gt; -&gt; tensor&lt;2x4x16x2xf32&gt;

  return %pack : tensor&lt;2x4x16x2xf32&gt;
}

Below is the output after vectorization, BEFORE and AFTER this PR.

BEFORE (note tensor.empty and the fact that %arg1 is not used):

  func.func @<!-- -->example(%arg0: tensor&lt;64x4xf32&gt;, %arg1: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor&lt;64x4xf32&gt;, vector&lt;64x4xf32&gt;
    %1 = vector.shape_cast %0 : vector&lt;64x4xf32&gt; to vector&lt;4x16x2x2xf32&gt;
    %2 = vector.transpose %1, [2, 0, 1, 3] : vector&lt;4x16x2x2xf32&gt; to vector&lt;2x4x16x2xf32&gt;
    %3 = tensor.empty() : tensor&lt;2x4x16x2xf32&gt;
    %c0_0 = arith.constant 0 : index
    %4 = vector.transfer_write %2, %3[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector&lt;2x4x16x2xf32&gt;, tensor&lt;2x4x16x2xf32&gt;
    return %4 : tensor&lt;2x4x16x2xf32&gt;
  }

AFTER (note that %arg1 is correctly used):

func.func @<!-- -->example(%arg0: tensor&lt;64x4xf32&gt;, %arg1: tensor&lt;2x4x16x2xf32&gt;) -&gt; tensor&lt;2x4x16x2xf32&gt; {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor&lt;64x4xf32&gt;, vector&lt;64x4xf32&gt;
  %1 = vector.shape_cast %0 : vector&lt;64x4xf32&gt; to vector&lt;4x16x2x2xf32&gt;
  %2 = vector.transpose %1, [2, 0, 1, 3] : vector&lt;4x16x2x2xf32&gt; to vector&lt;2x4x16x2xf32&gt;
  %c0_0 = arith.constant 0 : index
  %3 = vector.transfer_write %2, %arg1[%c0_0, %c0_0, %c0_0, %c0_0] {in_bounds = [true, true, true, true]} : vector&lt;2x4x16x2xf32&gt;, tensor&lt;2x4x16x2xf32&gt;
  return %3 : tensor&lt;2x4x16x2xf32&gt;
}

ADDITIONAL CHANGES:

  • Adds missing CHECK-LABEL in tests.
  • Capitalize LIT test variables names.

Full diff: https://github.com/llvm/llvm-project/pull/158926.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp (+2-11)
  • (modified) mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir (+85-77)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 406f05c1b08ef..3ee6ae1029f72 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1776,12 +1776,6 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
         rewriter, loc,
         rewriter.getZeroAttr(packOp.getSourceType().getElementType()));
   }
-  ReifiedRankedShapedTypeDims reifiedReturnShapes;
-  LogicalResult status =
-      cast<ReifyRankedShapedTypeOpInterface>(packOp.getOperation())
-          .reifyResultShapes(rewriter, reifiedReturnShapes);
-  (void)status; // prevent unused variable warning on non-assert builds.
-  assert(succeeded(status) && "failed to reify result shapes");
 
   // If the input vector sizes are not provided, then the vector sizes are
   // determined by the result tensor shape. In case the vector sizes aren't
@@ -1823,11 +1817,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
       rewriter, loc, shapeCastOp.getResult(), destPermutation);
 
   // Create TransferWriteOp.
-  Value dest = tensor::EmptyOp::create(
-      rewriter, loc, reifiedReturnShapes[0],
-      transposeOp.getResult().getType().getElementType());
-  Operation *write =
-      createWriteOrMaskedWrite(rewriter, loc, transposeOp.getResult(), dest);
+  Operation *write = createWriteOrMaskedWrite(
+      rewriter, loc, transposeOp.getResult(), packOp.getDest());
   newResults.push_back(write->getResult(0));
   return success();
 }
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 01eb210a8ff5f..9c2ddb78bbee1 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -1301,25 +1301,27 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf
 // different - vector sizes are inferred (rather than user-specified) and hence
 // masking was used.
 
-func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
-  %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x8x16xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<4x1x32x16x2xf32>
+func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+  %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:    {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
 //      CHECK: return %[[write]] : tensor<4x1x32x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.pack"]} in %src : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op
     transform.yield
   }
@@ -1331,26 +1333,28 @@ module attributes {transform.with_named_sequence} {
 // different - vector sizes are inferred (rather than user-specified) and hence
 // masking was used.
 
-func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// CHECK-LABEL: func @test_vectorize_padded_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c32:.*]] = arith.constant 32 : index
-//  CHECK-DAG: %[[c7:.*]] = arith.constant 7 : index
-//  CHECK-DAG: %[[c15:.*]] = arith.constant 15 : index
-//      CHECK: %[[mask:.*]] = vector.create_mask %[[c32]], %[[c7]], %[[c15]] : vector<32x8x16xi1>
-//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
-// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
+//  CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
+//  CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index
+//      CHECK: %[[MASK:.*]] = vector.create_mask %[[C32]], %[[C7]], %[[C15]] : vector<32x8x16xi1>
+//      CHECK: %[[READ:.*]] = vector.mask %[[MASK]] {
+// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, true, true]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
 // CHECK-SAME: } : vector<32x8x16xi1> -> vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
 //      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
 
@@ -1364,38 +1368,40 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @test_vectorize_dynamic_pack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
-  %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_dynamic_pack
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_pack(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
+  %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
   return %pack : tensor<?x?x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
-//  CHECK-DAG: %[[d0:.*]] = tensor.dim {{.*}} %[[c0]] : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d1:.*]] = tensor.dim {{.*}} %[[c1]] : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c0_0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c1_0:.*]] = arith.constant 1 : index
-//  CHECK-DAG: %[[d0_0:.*]] = tensor.dim {{.*}} %[[c0_0]] : tensor<?x?xf32>
-//  CHECK-DAG: %[[d1_0:.*]] = tensor.dim {{.*}} %[[c1_0]] : tensor<?x?xf32>
-//      CHECK: %[[mask:.*]] = vector.create_mask %[[d0_0]], %[[d1_0]] : vector<8x16xi1>
-//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
-// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0_1]], %[[c0_1]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+//  CHECK-DAG: %[[D0:.*]] = tensor.dim {{.*}} %[[C0]] : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[D1:.*]] = tensor.dim {{.*}} %[[C1]] : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index
+//  CHECK-DAG: %[[D0_0:.*]] = tensor.dim {{.*}} %[[C0_0]] : tensor<?x?xf32>
+//  CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor<?x?xf32>
+//      CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1>
+//      CHECK: %[[READ:.*]] = vector.mask %[[MASK]] {
+// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, true]} : tensor<?x?xf32>, vector<8x16xf32>
 // CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<8x16xf32> to vector<4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_2:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[c16:.*]] = arith.constant 16 : index
-//  CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty(%[[d0]], %[[d1]]) : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d2:.*]] = tensor.dim %[[empty]], {{.*}} : tensor<?x?x16x2xf32>
-//  CHECK-DAG: %[[d3:.*]] = tensor.dim %[[empty]], {{.*}} : tensor<?x?x16x2xf32>
-//      CHECK: %[[mask_0:.*]] = vector.create_mask %[[d2]], %[[d3]], %[[c16]], %[[c2]] : vector<4x1x16x2xi1>
-//      CHECK: %[[masked_write:.*]] = vector.mask %[[mask_0]] {
-// CHECK-SAME:   vector.transfer_write %[[transpose]], %[[empty]][%[[c0_2]], %[[c0_2]], %[[c0_2]], %[[c0_2]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index
+//  CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+//  CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+//  CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32>
+//  CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor<?x?x16x2xf32>
+//      CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[C16]], %[[C2]] : vector<4x1x16x2xi1>
+//      CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] {
+// CHECK-SAME:   vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor<?x?x16x2xf32>
-//      CHECK: return %[[masked_write]] : tensor<?x?x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<?x?x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1407,22 +1413,23 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: test_vectorize_pack_no_vector_sizes
-func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
-  %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
+// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes
+// CHECK-SAME:      %[[SRC:.*]]: tensor<64x4xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<2x4x16x2xf32>
+func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
+  %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
   return %pack : tensor<2x4x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:    {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<64x4xf32> to vector<4x16x2x2xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
-//      CHECK: return %[[write]] : tensor<2x4x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<2x4x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1435,22 +1442,23 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes
-func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @test_vectorize_padded_pack_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
-//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-//      CHECK: %[[transfer_read:.*]] =  vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
 // CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
-//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {

Comment on lines 1776 to 1778
rewriter, loc,
rewriter.getZeroAttr(packOp.getSourceType().getElementType()));
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ig we cant fix it in this patch but this is wrong and should be using posion

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out! #159536

@banach-space banach-space merged commit 2f6b433 into llvm:main Sep 18, 2025
8 checks passed
@banach-space banach-space deleted the andrzej/linalg/pack_vectorization branch September 18, 2025 09:33
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants