[SLP]Reorder counters for same values, if the root node is reordered.

The counters for the repeated scalars are ordered in the natural order, but the original scalars might be reordered during SLP graph reordering and this order can be dropped. Need to use the scalars after the reordering, not the original ones, to emit correct code for same value counters.
llvm · Apr 3, 2023 · c166000 · c166000
1 parent b15a946
commit c166000
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 3 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1165,6 +1165,12 @@ class BoUpSLP {
            !VectorizableTree.front()->UserTreeIndices.empty();
   }
 
+  /// Return the scalars of the root node.
+  ArrayRef<Value *> getRootNodeScalars() const {
+    assert(!VectorizableTree.empty() && "No graph to get the first node from");
+    return VectorizableTree.front()->Scalars;
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -13355,8 +13361,9 @@ class HorizontalReduction {
 
         // Emit code to correctly handle reused reduced values, if required.
         if (OptReusedScalars && !SameScaleFactor) {
-          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, VL,
-                                         SameValuesCounter, TrackedToOrig);
+          VectorizedRoot =
+              emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
+                            SameValuesCounter, TrackedToOrig);
         }
 
         Value *ReducedSubTree =

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalars-reordered-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalars-reordered-in-reduction.ll
@@ -7,7 +7,7 @@ define i32 @test() {
 ; CHECK-NEXT:    [[SQ:%.*]] = alloca [64 x i32], i32 0, align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [64 x i32], ptr [[SQ]], i64 0, i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], <i32 3, i32 2, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], <i32 2, i32 3, i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;