-
Notifications
You must be signed in to change notification settings - Fork 10.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improve reordering for consts, splats and ops from same nodes + improved analysis. #87091
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesImproved detection of const/splat candidates, their matching and analysis of instructions from same nodes. Metric: size..text Program size..text
MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE - small Patch is 55.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87091.diff 11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2875e71081d928..46243c60324a3d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1334,12 +1334,19 @@ class BoUpSLP {
return LookAheadHeuristics::ScoreSplat;
}
+ auto CheckSameEntryOrFail = [&]() {
+ if (const TreeEntry *TE1 = R.getTreeEntry(V1);
+ TE1 && TE1 == R.getTreeEntry(V2))
+ return LookAheadHeuristics::ScoreSplatLoads;
+ return LookAheadHeuristics::ScoreFail;
+ };
+
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
!LI2->isSimple())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
std::optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
@@ -1351,7 +1358,7 @@ class BoUpSLP {
FixedVectorType::get(LI1->getType(), NumLanes),
LI1->getAlign()))
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
// The distance is too large - still may be profitable to use masked
// loads/gathers.
@@ -1408,14 +1415,14 @@ class BoUpSLP {
}
return LookAheadHeuristics::ScoreAltOpcodes;
}
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
Ops.push_back(I1);
Ops.push_back(I2);
@@ -1436,7 +1443,7 @@ class BoUpSLP {
if (isa<UndefValue>(V2))
return LookAheadHeuristics::ScoreUndef;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
/// Go through the operands of \p LHS and \p RHS recursively until
@@ -1599,6 +1606,7 @@ class BoUpSLP {
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
+ const Loop *L = nullptr;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -1767,8 +1775,9 @@ class BoUpSLP {
// Track if the operand must be marked as used. If the operand is set to
// Score 1 explicitly (because of non power-of-2 unique scalars, we may
// want to reestimate the operands again on the following iterations).
- bool IsUsed =
- RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
+ bool IsUsed = RMode == ReorderingMode::Splat ||
+ RMode == ReorderingMode::Constant ||
+ RMode == ReorderingMode::Load;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
@@ -1789,23 +1798,44 @@ class BoUpSLP {
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
- case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
OpIdx, Idx, IsUsed);
- if (Score > static_cast<int>(BestOp.Score)) {
+ if (Score > static_cast<int>(BestOp.Score) ||
+ (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
+ Idx == OpIdx)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
}
break;
}
+ case ReorderingMode::Constant:
+ if (isa<Constant>(Op) ||
+ (!BestOp.Score && L && L->isLoopInvariant(Op))) {
+ BestOp.Idx = Idx;
+ if (isa<Constant>(Op)) {
+ BestOp.Score = LookAheadHeuristics::ScoreConstants;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreConstants;
+ }
+ if (isa<UndefValue>(Op) || !isa<Constant>(Op))
+ IsUsed = false;
+ }
+ break;
case ReorderingMode::Splat:
- if (Op == OpLastLane)
+ if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
+ IsUsed = Op == OpLastLane;
+ if (Op == OpLastLane) {
+ BestOp.Score = LookAheadHeuristics::ScoreSplat;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreSplat;
+ }
BestOp.Idx = Idx;
+ }
break;
case ReorderingMode::Failed:
llvm_unreachable("Not expected Failed reordering mode.");
@@ -1999,6 +2029,8 @@ class BoUpSLP {
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
bool OpAPO = getData(OpIdx, Lane).APO;
+ bool IsInvariant = L && L->isLoopInvariant(Op);
+ unsigned Cnt = 0;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
continue;
@@ -2008,22 +2040,37 @@ class BoUpSLP {
OperandData &Data = getData(OpI, Ln);
if (Data.APO != OpAPO || Data.IsUsed)
continue;
- if (Data.V == Op) {
+ Value *OpILane = getValue(OpI, Lane);
+ bool IsConstantOp = isa<Constant>(OpILane);
+ if (Data.V == Op ||
+ (!IsConstantOp &&
+ ((Lns > 2 && isa<Constant>(Data.V)) ||
+ (Lns == 2 &&
+ !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
+ .getOpcode() &&
+ isa<Constant>(Data.V)))) ||
+ (IsInvariant && !isa<Constant>(Data.V) &&
+ !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
+ L->isLoopInvariant(Data.V))) {
FoundCandidate = true;
- Data.IsUsed = true;
+ Data.IsUsed = Data.V == Op;
+ if (Data.V == Op)
+ ++Cnt;
break;
}
}
if (!FoundCandidate)
return false;
}
- return true;
+ return getNumLanes() == 2 || Cnt > 1;
}
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
- : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
+ : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
+ L(R.LI->getLoopFor(
+ (cast<Instruction>(RootVL.front())->getParent()))) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -2155,8 +2202,6 @@ class BoUpSLP {
// getBestOperand().
swap(OpIdx, *BestIdx, Lane);
} else {
- // We failed to find a best operand, set mode to 'Failed'.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index d87bdfe2689916..aa9a070a794509 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -64,15 +64,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
-; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT: [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
-; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
+; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]]
+; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
+; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
+; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]]
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,10 +104,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT: [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
@@ -115,19 +115,19 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
-; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
-; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
+; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537
+; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535
; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
@@ -151,21 +151,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
-; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
-; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
-; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]]
+; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
+; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
+; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT: [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
-; CHECK-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
+; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
+; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]]
+; CHECK-NEXT: [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]]
+; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -220,37 +220,37 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
-; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
+; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
-; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32...
[truncated]
|
Ping! |
3 similar comments
Ping! |
Ping! |
Ping! |
Ping! |
Ping! |
1 similar comment
Ping! |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
case ReorderingMode::Opcode: { | ||
bool LeftToRight = Lane > LastLane; | ||
Value *OpLeft = (LeftToRight) ? OpLastLane : Op; | ||
Value *OpRight = (LeftToRight) ? Op : OpLastLane; | ||
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane, | ||
OpIdx, Idx, IsUsed); | ||
if (Score > static_cast<int>(BestOp.Score)) { | ||
if (Score > static_cast<int>(BestOp.Score) || | ||
(Score > 0 && Score == static_cast<int>(BestOp.Score) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm trying to understand this change. Could you please explain the idea behind this == check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If scores are equal, prefer original operand.
Hi Alexey, with this change we are seeing more shuffles created by SLP. Thank you, |
I will check, probably some corner case analysis, thanks. |
That's exactly the reason why I asked about the Score check above. Without this, there are fewer shuffles generated. Could the problem be fixed with shiffles cost model tuning as well? |
I will double-check if something can be improved. Without. this check there are other regressions. |
Improved detection of const/splat candidates, their matching and analysis of instructions from same nodes.
Metric: size..text
Program size..text
results results0 diff
results results0 diff
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 92952.00 93096.00 0.2%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 779832.00 780136.00 0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 839923.00 840179.00 0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 392708.00 392740.00 0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1171131.00 1171147.00 0.0%
MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE - small
reordering
External/SPEC/CINT2006/464.h264ref/464.h264ref - small better code after
reordering
MultiSource/Applications/JM/lencod/lencod - smaller code with less
shuffles
MultiSource/Applications/JM/ldecod/ldecod - same
External/SPEC/CFP2017rate/511.povray_r/511.povray_r - 2 extra loads
vectorized, smaller code
External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r - better code,
size increased because of more constant vectors.
External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s - same
External/SPEC/CFP2017rate/526.blender_r/526.blender_r - small change in
the vectorized code, some code a bit better, some a bit worse.