-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Model address separately. #72164
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes: Move vector pointer generation to a separate VPInstruction opcode. Patch is 333.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72164.diff 57 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ae8d306c44dd885..e3374724b04a144 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8231,13 +8231,24 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+ if (Decision != LoopVectorizationCostModel::CM_GatherScatter &&
+ Decision != LoopVectorizationCostModel::CM_Interleave) {
+ auto *VectorPtr = Reverse
+ ? new VPInstruction(VPInstruction::CreateVectorPtr,
+ {Ptr, Ptr}, I->getDebugLoc())
+ : new VPInstruction(VPInstruction::CreateVectorPtr,
+ {Ptr}, I->getDebugLoc());
+ Builder.getInsertBlock()->appendRecipe(VectorPtr);
+ Ptr = VectorPtr;
+ }
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
- Consecutive, Reverse);
+ return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+ Reverse);
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
- Mask, Consecutive, Reverse);
+ return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+ Consecutive, Reverse);
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9532,44 +9543,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
BlockInMaskParts[Part] = Mask;
}
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr = nullptr;
-
- // Use i32 for the gep index type when the value is constant,
- // or query DataLayout for a more suitable index type otherwise.
- const DataLayout &DL =
- Builder.GetInsertBlock()->getModule()->getDataLayout();
- Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
- ? DL.getIndexType(PointerType::getUnqual(
- ScalarDataTy->getContext()))
- : Builder.getInt32Ty();
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
- if (isReverse()) {
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- // RunTimeVF = VScale * VF.getKnownMinValue()
- // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
- // NumElt = -Part * RunTimeVF
- Value *NumElt =
- Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane =
- Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
- PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
- PartPtr =
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
- } else {
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
- PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
- }
-
- return PartPtr;
- };
-
// Handle Stores:
if (SI) {
State.setDebugLocFrom(SI->getDebugLoc());
@@ -9590,8 +9563,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
- auto *VecPtr =
- CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
BlockInMaskParts[Part]);
@@ -9615,8 +9587,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
nullptr, "wide.masked.gather");
State.addMetadata(NewLI, LI);
} else {
- auto *VecPtr =
- CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a26308a212bbd3c..be770e33e92a32b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1038,7 +1038,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
// canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
BranchOnCount,
- BranchOnCond
+ BranchOnCond,
+ CreateVectorPtr
};
private:
@@ -1146,6 +1147,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
case VPInstruction::CanonicalIVIncrement:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
+ case VPInstruction::CreateVectorPtr:
return true;
};
llvm_unreachable("switch should return");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6b3218dca1b18b0..1dac8a806d657cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -122,6 +122,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrement:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::CreateVectorPtr:
return false;
default:
return true;
@@ -404,6 +405,49 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
return CondBr;
}
+ case VPInstruction::CreateVectorPtr: {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = nullptr;
+ bool IsReverse = getNumOperands() > 1;
+ auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin());
+ Type *ScalarDataTy =
+ MemR->isStore() ? cast<StoreInst>(&MemR->getIngredient())
+ ->getValueOperand()
+ ->getType()
+ : cast<LoadInst>(&MemR->getIngredient())->getType();
+ // Use i32 for the gep index type when the value is constant,
+ // or query DataLayout for a more suitable index type otherwise.
+ const DataLayout &DL =
+ Builder.GetInsertBlock()->getModule()->getDataLayout();
+ Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+ ? DL.getIndexType(ScalarDataTy->getPointerTo())
+ : Builder.getInt32Ty();
+ Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ if (IsReverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+ // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+ Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt =
+ Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane =
+ Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
+ PartPtr =
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
+ } else {
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+ }
+
+ return PartPtr;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -483,6 +527,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::CreateVectorPtr:
+ O << "create-vector-pointer";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index 4a8e07eaaf757fa..cbc4733cf5cf5fa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -179,8 +179,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]]
-; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16
+; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -193,18 +193,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[A]] to i16
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0
; CHECK-NEXT: [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]]
; CHECK-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], <i16 8, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX3]] to i64
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]]
; CHECK-NEXT: store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 8
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 1000
; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -268,7 +268,7 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
; CHECK: vec.epilog.ph:
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[A]] to i16
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i64 0
; CHECK-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP11]], <i16 99, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>
@@ -276,11 +276,11 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
; CHECK-NEXT: [[TMP14:%.*]] = lshr <8 x i16> [[TMP13]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[C]], <8 x i16> [[TMP14]], <8 x i16> [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i16> [[TMP15]] to <8 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX2]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX1]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP17]]
; CHECK-NEXT: store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1
-; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 1000
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 1000
; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 24d6d2d532aa0c2..24c59fdb47b6133 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -38,8 +38,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
@@ -55,22 +55,22 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX9]], 0
-; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX9]], 1
-; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP10]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP11]], i32 1
+; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX8]], 1
+; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP9]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP10]], i32 1
; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i32 0
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
-; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 2
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 10000
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 10000
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -132,8 +132,8 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4
; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
@@ -156,13 +156,13 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX7]], 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
; CHECK-NEXT: store <2 x i64> [[VEC_IND8]], ptr [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <2 x i64> [[VEC_IND8]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
@@ -224,8 +224,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: stor...
[truncated]
|
Note that this patch depends on #72163 |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Decision != LoopVectorizationCostModel::CM_Interleave) { | ||
auto *VectorPtr = Reverse | ||
? new VPInstruction(VPInstruction::CreateVectorPtr, | ||
{Ptr, Ptr}, I->getDebugLoc()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better to add some kind of bool flag for reverse rather than mimicking it with an unused operand. Could you do that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, that was a left-over from early on. Replaced by a dedicated opcode for reverse.
auto *VectorPtr = Reverse | ||
? new VPInstruction(VPInstruction::CreateVectorPtr, | ||
{Ptr, Ptr}, I->getDebugLoc()) | ||
: new VPInstruction(VPInstruction::CreateVectorPtr, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just VectorPtr maybe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
: Builder.getInt32Ty(); | ||
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); | ||
bool InBounds = false; | ||
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Gep
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
Move vector pointer generation to a separate VPInstruction opcode. This untangles address computation from the memory recipes further and is also needed to enable explicit unrolling in VPlan. Pull Request: #72164
16968a0
to
9bb3ad8
Compare
Rebased and addressed comments, thanks! |
Was this user error or is this an expected result of the spr workflow? |
@nikic Not entirely sure, but probably a user failure. For some reason |
9bb3ad8
to
1aeee98
Compare
Move vector pointer generation to a separate VPInstruction opcode. This untangles address computation from the memory recipes further and is also needed to enable explicit unrolling in VPlan. Pull Request: #72164
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good intermediate refactoring of VPWidenMemoryInstructionRecipe!
Perhaps better to peel off a single GEP-for-consecutive-memory Recipe rather than two VPInstructions, as first step?
May be good to eventually have separate recipes/VPInstructions for load, stores, gathers, scatters, possibly in some hierarchy to share common parts.
// Calculate the pointer for the specific unroll-part. | ||
Value *PartPtr = nullptr; | ||
bool IsReverse = getOpcode() == VPInstruction::VectorPtrReverse; | ||
auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Relying on properties of the first user? This preserves the current behavior, but breaks recipe atomicity. Perhaps better to record ScalarDataTy inside VectorPtr[Reverse] recipes, for now, (or DL's index type for a pointer to it) rather than trying to simplify them immediately into VPInstructions. Can also record IsReverse inside the recipe rather than mangle it inside Opcode.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated to add a separate recipe, thanks!
bool InBounds = false; | ||
if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) | ||
InBounds = GEP->isInBounds(); | ||
if (IsReverse) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Follow-ups should probably continue to break down Reverse case into multiple elementary VPInstructions including Mul and Sub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed!
// wide store needs to start at the last vector element. | ||
// RunTimeVF = VScale * VF.getKnownMinValue() | ||
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() | ||
Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Follow-up should probably use the modelling of VF as VPValue, following the modelling of VFxUF, coupled with unrolling-by-UF as a VPlan-to-VPlan transformation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed!
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); | ||
bool InBounds = false; | ||
if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) | ||
InBounds = GEP->isInBounds(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than replicating InBound of operand at code-gen, perhaps better record it in recipe, or possibly fold two geps into one as VPlan-to-VPlan transform. Fine for current refactoring patch to preserve current behavior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, this should be recorded on the recipe.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to manage inbounds via VPRecipeWithIRFlags in 18ec330
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; | ||
if (Decision != LoopVectorizationCostModel::CM_GatherScatter && | ||
Decision != LoopVectorizationCostModel::CM_Interleave) { | ||
auto *VectorPtr = new VPInstruction( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that tryToWiden*() methods typically return one recipe for the given Instruction `I` (or none), also aligning with VPlan evolution roadmap, which is then placed inside the VPBB and set to correspond to `I`. Here two recipes are being introduced, the first placed at the end of Builder's insert block.
Perhaps a single recipe can continue to be returned (possibly abstract w/o an execute()), and later augmented by introducing a complementary recipe or broken it into two recipes, in a VPlan-to-VPlan transform.
Similar to the premature optimization behind VPRecipeOrVPValueTy which should be cleaned up.
Note that `VectorPtr` may be confusing -- it's a scalar pointer used by a vector load or store, as opposed to `Ptr` which may otherwise be a vector of pointers feeding a gather/scatter. But better keep names consistent in this refactoring patch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, maybe `VPWidenMemoryInstructionRecipe` will be able to serve this role if we decide to split up the recipe? For now, it would mean adding an additional recipe + extra complexity to this patch IIUC?
@@ -8174,13 +8174,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, | |||
bool Consecutive = | |||
Reverse || Decision == LoopVectorizationCostModel::CM_Widen; | |||
|
|||
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; | |||
if (Decision != LoopVectorizationCostModel::CM_GatherScatter && | |||
Decision != LoopVectorizationCostModel::CM_Interleave) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Check instead if Consecutive? I.e., if Decision is Widen or Widen_Reverse.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to simplify the check to `Consecutive`.
@@ -9485,44 +9494,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { | |||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: "only neede[d] for" typo above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 516cc98
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(only pushed the latest changes to address the comments now)
@@ -8174,13 +8174,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, | |||
bool Consecutive = | |||
Reverse || Decision == LoopVectorizationCostModel::CM_Widen; | |||
|
|||
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; | |||
if (Decision != LoopVectorizationCostModel::CM_GatherScatter && | |||
Decision != LoopVectorizationCostModel::CM_Interleave) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to simplify the check to `Consecutive`.
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; | ||
if (Decision != LoopVectorizationCostModel::CM_GatherScatter && | ||
Decision != LoopVectorizationCostModel::CM_Interleave) { | ||
auto *VectorPtr = new VPInstruction( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, maybe `VPWidenMemoryInstructionRecipe` will be able to serve this role if we decide to split up the recipe? For now, it would mean adding an additional recipe + extra complexity to this patch IIUC?
@@ -9485,44 +9494,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { | |||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 516cc98
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, looks good to me, with a final cleanup nit.
Note that VPVectorPointerRecipes could be quite easily introduced as an "adjusting" VPlan-to-VPlan pass, placed between every consecutive VPWidenMemoryInstructionRecipe and its address operand. With some indication marking a VPWidenMemoryInstructionRecipe as adjusted or not, validated at execute time. But further breaking up of VPWidenMemoryInstructionRecipe, unrolling as a VPlan-to-VPlan pass, and starting with initial scalar VPlan, should eventually clarify this.
Hope subsequent scheduling is indifferent to the clustering of loads/stores separate from their GEPs.
// Use i32 for the gep index type when the value is constant, | ||
// or query DataLayout for a more suitable index type otherwise. | ||
const DataLayout &DL = | ||
Builder.GetInsertBlock()->getModule()->getDataLayout(); | ||
Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) | ||
? DL.getIndexType(IndexedTy->getPointerTo()) | ||
: Builder.getInt32Ty(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The recipe holds IndexTy to avoid the need to look it up.
Note that DL is overloaded - as a member of recipe it stands for Debug Location.
// Use i32 for the gep index type when the value is constant, | |
// or query DataLayout for a more suitable index type otherwise. | |
const DataLayout &DL = | |
Builder.GetInsertBlock()->getModule()->getDataLayout(); | |
Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) | |
? DL.getIndexType(IndexedTy->getPointerTo()) | |
: Builder.getInt32Ty(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The recipe stores the indexed type, which is different to the index type (integer type used for the indices of the generated GEP). Left as is for now.
: Builder.getInt32Ty(); | ||
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); | ||
bool InBounds = false; | ||
if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit (unrelated to this patch): should this `if` be an `assert`, i.e., must consecutive `Ptr`'s be `GEP`'s? Do we risk losing inbounds information by retrieving it at code-gen rather than recording it when introducing the recipe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, the inbounds info should be recorded in the recipe, which I'll do as follow-up.
Move vector pointer generation to a separate VPInstruction opcode. This untangles address computation from the memory recipes further and is also needed to enable explicit unrolling in VPlan. Pull Request: #72164
1693f98
to
20d79f7
Compare
As suggested as a follow-up in #72164, manage inbounds via VPRecipeWithIRFlags. Note that we can now preserve inbounds in a few more cases.
As suggested as a follow-up in llvm#72164, manage inbounds via VPRecipeWithIRFlags. Note that we can now preserve inbounds in a few more cases.
Move vector pointer generation to a separate VPInstruction opcode.
This untangles address computation from the memory recipes further
and is also needed to enable explicit unrolling in VPlan.