
[SLP]Transform stores + reverse to strided stores with stride -1, if profitable. #90464

Conversation

alexey-bataev
Member

Adds a transformation of consecutive vector store + reverse into strided
stores with stride -1, if it is profitable.

Created using spr 1.3.5
@llvmbot
Collaborator

llvmbot commented Apr 29, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Adds a transformation of consecutive vector store + reverse into strided
stores with stride -1, if it is profitable.
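
For context, the RISC-V strided-stores-vectorized.ll test updated below boils down to source of roughly this shape (an illustrative C++ rendering of the test, not code taken from the patch): values computed from ascending addresses are stored to consecutive but descending addresses, which SLP vectorizes as a vector store fed by a reverse shuffle.

```cpp
// Illustrative only: mirrors the store_reverse function in the .ll test below.
void store_reverse(long long *p3) {
  p3[7] = p3[0] << p3[8];
  p3[6] = p3[1] << p3[9];
  p3[5] = p3[2] << p3[10];
  p3[4] = p3[3] << p3[11];
}
```

Where the target reports that a strided store is legal and cheaper than the reverse shuffle plus unit-stride store, the patch emits a single strided store with a stride of minus one element instead.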


Full diff: https://github.com/llvm/llvm-project/pull/90464.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+65-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll (+5-26)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f62270fe62ebea..2cca7130a0be70 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7860,6 +7860,32 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::Store: {
+      Type *ScalarTy = cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+      // Check if it is profitable to represent a consecutive store + reverse
+      // as a strided store with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+                                 BaseSI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
+        if (StridedCost < OriginalVecCost)
+          // Strided store is more profitable than reverse shuffle +
+          // consecutive store - transform the node to strided store.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
@@ -9343,11 +9369,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       // We know that we can merge the stores. Calculate the cost.
-      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
-                                  BaseSI->getPointerAddressSpace(), CostKind,
-                                  OpInfo) +
-             CommonCost;
+      InstructionCost VecStCost;
+      if (E->State == TreeEntry::StridedVectorize) {
+        Align CommonAlignment =
+            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
+        VecStCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind);
+      } else {
+        assert(E->State == TreeEntry::Vectorize &&
+               "Expected either strided or consecutive stores.");
+        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+        VecStCost = TTI->getMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getAlign(),
+            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+      }
+      return VecStCost + CommonCost;
     };
     SmallVector<Value *> PointerOps(VL.size());
     for (auto [I, V] : enumerate(VL)) {
@@ -12251,7 +12288,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
     ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
-    if (E->getOpcode() == Instruction::Store) {
+    if (E->getOpcode() == Instruction::Store &&
+        E->State == TreeEntry::Vectorize) {
       ArrayRef<int> Mask =
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
@@ -12848,8 +12886,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
-      StoreInst *ST =
-          Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      Instruction *ST;
+      if (E->State == TreeEntry::Vectorize) {
+        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      } else {
+        assert(E->State == TreeEntry::StridedVectorize &&
+               "Expected either strided or consecutive stores.");
+        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+        auto *Inst = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vp_strided_store,
+            {VecTy, Ptr->getType(), StrideTy},
+            {VecValue, Ptr,
+             ConstantInt::get(
+                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+             Builder.getAllOnesMask(VecTy->getElementCount()),
+             Builder.getInt32(E->Scalars.size())});
+        Inst->addParamAttr(
+            /*ArgNo=*/1,
+            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+        ST = Inst;
+      }
 
       Value *V = propagateMetadata(ST, E->Scalars);
 
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
index 0dfa45da9d87f4..56e8829b0ec68b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -4,33 +4,12 @@
 define void @store_reverse(ptr %p3) {
 ; CHECK-LABEL: @store_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P3:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 7
-; CHECK-NEXT:    store i64 [[SHL]], ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 9
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    [[SHL5:%.*]] = shl i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 6
-; CHECK-NEXT:    store i64 [[SHL5]], ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX7]], align 8
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 10
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT:    [[SHL9:%.*]] = shl i64 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 5
-; CHECK-NEXT:    store i64 [[SHL9]], ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX11]], align 8
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 11
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[SHL13:%.*]] = shl i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3:%.*]], i64 8
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
-; CHECK-NEXT:    store i64 [[SHL13]], ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP2]], ptr align 8 [[ARRAYIDX14]], i64 -8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    ret void
 ;
 entry:

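For reference, the llvm.experimental.vp.strided.store call in the CHECK lines above writes each active lane i of the value operand to ptr + i * stride bytes, where the stride is given in bytes and may be negative, and the mask plus the explicit vector length select the active lanes. A scalar-equivalent sketch of those semantics (the helper name is illustrative):

```cpp
#include <cstdint>
#include <cstring>

// Scalar model of llvm.experimental.vp.strided.store.v4i64.p0.i64:
// lane I is written to Base + I * StrideBytes when I < EVL and Mask[I] is set.
void vp_strided_store_i64(const int64_t Val[], char *Base, int64_t StrideBytes,
                          const bool Mask[], unsigned EVL) {
  for (unsigned I = 0; I < EVL; ++I)
    if (Mask[I])
      std::memcpy(Base + static_cast<int64_t>(I) * StrideBytes, &Val[I],
                  sizeof(int64_t));
}
```

With a stride of -8, each successive lane lands one i64 below the previous one, starting at the base pointer.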

github-actions bot commented Apr 29, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5
Collaborator

@RKSimon left a comment


LGTM

Created using spr 1.3.5
@alexey-bataev merged commit 67e726a into main May 1, 2024
3 of 4 checks passed
@alexey-bataev deleted the users/alexey-bataev/spr/slptransform-stores-reverse-to-strided-stores-with-stride-1-if-profitable branch May 1, 2024 11:32
@rofirrim
Collaborator

rofirrim commented May 6, 2024

Hi @alexey-bataev.

I'm confused by the code generation of this GCC torture test after this change:

https://www.godbolt.org/z/frnz9K1n4

Check line 5. I would expect the pointer of a negative strided store to be moved to the address of the last element of the store. Instead, we store at the original place (i.e., I'd expect something like addi a2, a0, 4 and then vsse32.v v8, (a2), a1).

@alexey-bataev
Member Author


Yes, it looks like you're right; I forgot to take the last pointer. Will fix it ASAP.
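
To make the addressing concrete with the store_reverse test from this patch: a strided store writes lane I to Base + I * StrideBytes, so with StrideBytes = -8 a base of &p3[7] covers p3[7] down to p3[4] (the addresses the scalar stores wrote), whereas a base of &p3[4] (ARRAYIDX14 in the CHECK lines above) covers p3[4] down to p3[1]. A minimal sketch of that lane addressing, assuming the usual base-plus-stride semantics and not taken from the follow-up fix:

```cpp
#include <cstddef>
#include <cstdint>

// Address of lane I for a strided memory access: Base + I * StrideBytes.
// With Base = &p3[7] and StrideBytes = -8 the four lanes cover p3[7]..p3[4];
// with Base = &p3[4] they would instead hit p3[4], p3[3], p3[2], p3[1].
int64_t *lane_address(int64_t *Base, int Lane, std::ptrdiff_t StrideBytes) {
  return reinterpret_cast<int64_t *>(
      reinterpret_cast<char *>(Base) + Lane * StrideBytes);
}
```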

@alexey-bataev
Member Author


Should be fixed in c144157.
