From 9139d484d46a0b63275e00b988895bfb419bbe71 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Wed, 25 May 2022 07:15:49 -0700
Subject: [PATCH] [SLP]Fix crash on reordering of ScatterVectorize nodes.

ScatterVectorize nodes should be handled the same way as gathers in the
reorderBottomToTop function, since we can simply reorder the loads in
such a node. Because of that, such nodes need to be included in the list
of gathered nodes to fix the compiler crash.

Differential Revision: https://reviews.llvm.org/D126378
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  5 ++
 .../X86/scatter-vectorize-reorder.ll          | 74 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8cedb63eead04..5efb7148cc03d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3810,6 +3810,11 @@ bool BoUpSLP::canReorderOperands(
       // Add the node to the list of the ordered nodes with the identity
       // order.
       Edges.emplace_back(I, TE);
+      // Add ScatterVectorize nodes to the list of operands, where just
+      // reordering of the scalars is required. Similar to the gathers, so
+      // simply add to the list of gathered ops.
+      if (TE->State != TreeEntry::Vectorize)
+        GatherOps.push_back(TE);
       continue;
     }
     ArrayRef<Value *> VL = UserTE->getOperand(I);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
new file mode 100644
index 0000000000000..e269117102fab
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=cascadelake < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX10_I_I86:%.*]] = getelementptr inbounds float, ptr undef, i64 2
+; CHECK-NEXT:    [[ARRAYIDX21_I:%.*]] = getelementptr inbounds [4 x float], ptr undef, i64 2
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> <float 0.000000e+00, float poison>, <2 x float> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx10.i.i86 = getelementptr inbounds float, ptr undef, i64 2
+  %arrayidx6.i66.i = getelementptr inbounds float, ptr undef, i64 1
+  %arrayidx21.i = getelementptr inbounds [4 x float], ptr undef, i64 2
+  %arrayidx6.i109.i = getelementptr inbounds [4 x float], ptr undef, i64 2, i64 1
+  br label %bb1
+
+bb1:
+  %0 = load float, ptr undef, align 4
+  %sub.i71.i = fsub float 0.000000e+00, %0
+  %1 = load float, ptr %arrayidx6.i66.i, align 4
+  %sub5.i74.i = fsub float 0.000000e+00, %1
+  %2 = load float, ptr %arrayidx10.i.i86, align 4
+  %3 = call float @llvm.fmuladd.f32(float %1, float %2, float 0.000000e+00)
+  %4 = load float, ptr undef, align 4
+  %5 = call float @llvm.fmuladd.f32(float 0.000000e+00, float %4, float %2)
+  br i1 false, label %bb2, label %bb3
+
+bb2:
+  %mul.i95 = fmul float %3, 0.000000e+00
+  %mul3.i96 = fmul float %5, 0.000000e+00
+  br label %bb3
+
+bb3:
+  %vddir.sroa.8.0.i = phi float [ %mul3.i96, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %vddir.sroa.0.0.i = phi float [ %mul.i95, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %add.i.i = fadd float %sub.i71.i, %vddir.sroa.0.0.i
+  %add5.i.i = fadd float %sub5.i74.i, %vddir.sroa.8.0.i
+  %add.i105.i = fadd float %add.i.i, 0.000000e+00
+  %add5.i108.i = fadd float %add5.i.i, 0.000000e+00
+  %sub.i114.i = fsub float %add.i105.i, 0.000000e+00
+  %sub4.i.i = fsub float %add5.i108.i, 0.000000e+00
+  %sub.i118.i = fsub float %sub.i114.i, 0.000000e+00
+  store float %sub.i118.i, ptr %arrayidx21.i, align 16
+  %sub4.i121.i = fsub float %sub4.i.i, 0.000000e+00
+  store float %sub4.i121.i, ptr %arrayidx6.i109.i, align 4
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+