
[SLP]Transform stores + reverse to strided stores with stride -1, if profitable. #90464

Conversation

alexey-bataev
Member

Adds a transformation of consecutive vector store + reverse into strided
stores with stride -1, if it is profitable.

Created using spr 1.3.5
@llvmbot
Collaborator

llvmbot commented Apr 29, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Adds a transformation of consecutive vector store + reverse into strided
stores with stride -1, if it is profitable.
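
For context, the RISC-V strided-stores-vectorized.ll test updated below boils down to source of roughly this shape (an illustrative C++ rendering of the test, not code taken from the patch): values computed from ascending addresses are stored to consecutive but descending addresses, which SLP vectorizes as a vector store fed by a reverse shuffle.

```cpp
// Illustrative only: mirrors the store_reverse function in the .ll test below.
void store_reverse(long long *p3) {
  p3[7] = p3[0] << p3[8];
  p3[6] = p3[1] << p3[9];
  p3[5] = p3[2] << p3[10];
  p3[4] = p3[3] << p3[11];
}
```

Where the target reports that a strided store is legal and cheaper than the reverse shuffle plus unit-stride store, the patch emits a single strided store with a stride of minus one element instead.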


Full diff: https://github.com/llvm/llvm-project/pull/90464.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+65-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll (+5-26)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f62270fe62ebea..2cca7130a0be70 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7860,6 +7860,32 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::Store: {
+      Type *ScalarTy = cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+      // Check if it is profitable to represent a consecutive store + reverse
+      // as a strided store with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+                                 BaseSI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
+        if (StridedCost < OriginalVecCost)
+          // Strided store is more profitable than reverse shuffle +
+          // consecutive store - transform the node to strided store.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
@@ -9343,11 +9369,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       // We know that we can merge the stores. Calculate the cost.
-      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
-                                  BaseSI->getPointerAddressSpace(), CostKind,
-                                  OpInfo) +
-             CommonCost;
+      InstructionCost VecStCost;
+      if (E->State == TreeEntry::StridedVectorize) {
+        Align CommonAlignment =
+            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
+        VecStCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind);
+      } else {
+        assert(E->State == TreeEntry::Vectorize &&
+               "Expected either strided or consecutive stores.");
+        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+        VecStCost = TTI->getMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getAlign(),
+            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+      }
+      return VecStCost + CommonCost;
     };
     SmallVector<Value *> PointerOps(VL.size());
     for (auto [I, V] : enumerate(VL)) {
@@ -12251,7 +12288,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
     ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
-    if (E->getOpcode() == Instruction::Store) {
+    if (E->getOpcode() == Instruction::Store &&
+        E->State == TreeEntry::Vectorize) {
       ArrayRef<int> Mask =
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
@@ -12848,8 +12886,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
-      StoreInst *ST =
-          Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      Instruction *ST;
+      if (E->State == TreeEntry::Vectorize) {
+        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      } else {
+        assert(E->State == TreeEntry::StridedVectorize &&
+               "Expected either strided or consecutive stores.");
+        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+        auto *Inst = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vp_strided_store,
+            {VecTy, Ptr->getType(), StrideTy},
+            {VecValue, Ptr,
+             ConstantInt::get(
+                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+             Builder.getAllOnesMask(VecTy->getElementCount()),
+             Builder.getInt32(E->Scalars.size())});
+        Inst->addParamAttr(
+            /*ArgNo=*/1,
+            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+        ST = Inst;
+      }
 
       Value *V = propagateMetadata(ST, E->Scalars);
 
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
index 0dfa45da9d87f4..56e8829b0ec68b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -4,33 +4,12 @@
 define void @store_reverse(ptr %p3) {
 ; CHECK-LABEL: @store_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P3:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 7
-; CHECK-NEXT:    store i64 [[SHL]], ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 9
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    [[SHL5:%.*]] = shl i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 6
-; CHECK-NEXT:    store i64 [[SHL5]], ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX7]], align 8
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 10
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT:    [[SHL9:%.*]] = shl i64 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 5
-; CHECK-NEXT:    store i64 [[SHL9]], ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX11]], align 8
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 11
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[SHL13:%.*]] = shl i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3:%.*]], i64 8
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
-; CHECK-NEXT:    store i64 [[SHL13]], ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP2]], ptr align 8 [[ARRAYIDX14]], i64 -8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    ret void
 ;
 entry:

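For reference, the llvm.experimental.vp.strided.store call in the CHECK lines above writes each active lane i of the value operand to ptr + i * stride bytes, where the stride is given in bytes and may be negative, and the mask plus the explicit vector length select the active lanes. A scalar-equivalent sketch of those semantics (the helper name is illustrative):

```cpp
#include <cstdint>
#include <cstring>

// Scalar model of llvm.experimental.vp.strided.store.v4i64.p0.i64:
// lane I is written to Base + I * StrideBytes when I < EVL and Mask[I] is set.
void vp_strided_store_i64(const int64_t Val[], char *Base, int64_t StrideBytes,
                          const bool Mask[], unsigned EVL) {
  for (unsigned I = 0; I < EVL; ++I)
    if (Mask[I])
      std::memcpy(Base + static_cast<int64_t>(I) * StrideBytes, &Val[I],
                  sizeof(int64_t));
}
```

With a stride of -8, each successive lane lands one i64 below the previous one, starting at the base pointer.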

github-actions bot commented Apr 29, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
Collaborator

@RKSimon left a comment


LGTM

Created using spr 1.3.5
@alexey-bataev merged commit 67e726a into main May 1, 2024
3 of 4 checks passed
@alexey-bataev deleted the users/alexey-bataev/spr/slptransform-stores-reverse-to-strided-stores-with-stride-1-if-profitable branch May 1, 2024 11:32
@rofirrim
Collaborator

rofirrim commented May 6, 2024

Hi @alexey-bataev.

I'm confused by the code generation of this GCC torture test after this change:

https://www.godbolt.org/z/frnz9K1n4

Check line 5. I would expect the pointer of a negative strided store to be moved to the address of the last element of the store. Instead, we store at the original place (i.e., I'd expect something like addi a2, a0, 4 and then vsse32.v v8, (a2), a1).

@alexey-bataev
Member Author


Yes, it looks like you're right; I forgot to take the last pointer. Will fix it ASAP.
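
To make the addressing concrete with the store_reverse test from this patch: a strided store writes lane I to Base + I * StrideBytes, so with StrideBytes = -8 a base of &p3[7] covers p3[7] down to p3[4] (the addresses the scalar stores wrote), whereas a base of &p3[4] (ARRAYIDX14 in the CHECK lines above) covers p3[4] down to p3[1]. A minimal sketch of that lane addressing, assuming the usual base-plus-stride semantics and not taken from the follow-up fix:

```cpp
#include <cstddef>
#include <cstdint>

// Address of lane I for a strided memory access: Base + I * StrideBytes.
// With Base = &p3[7] and StrideBytes = -8 the four lanes cover p3[7]..p3[4];
// with Base = &p3[4] they would instead hit p3[4], p3[3], p3[2], p3[1].
int64_t *lane_address(int64_t *Base, int Lane, std::ptrdiff_t StrideBytes) {
  return reinterpret_cast<int64_t *>(
      reinterpret_cast<char *>(Base) + Lane * StrideBytes);
}
```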

@alexey-bataev
Member Author


Should be fixed in c144157.
