[VectorCombine] switch on opcode to compile faster
This follows 87debda to further eliminate time wasted
calling helper functions only to early-return to the main
run loop.
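
To illustrate the shape of the change: instead of unconditionally calling each fold helper and letting it dyn_cast-and-bail, the run loop now switches on the opcode once and only calls helpers that could possibly match. A minimal standalone C++ sketch of that pattern (toy types and hypothetical helper names, not the LLVM code):

#include <cstdio>

// Toy stand-ins for LLVM's instruction hierarchy (hypothetical).
enum class Opcode { Load, Store, Other };
struct Inst { Opcode Op; };

// Stand-ins for fold helpers like vectorizeLoadInsert().
static bool foldLoad(Inst &) { return false; }  // real fold logic elided
static bool foldStore(Inst &) { return false; }

// Before: every helper runs; most immediately early-return.
static bool runOld(Inst &I) {
  bool Changed = false;
  Changed |= foldLoad(I);  // wasted call unless I.Op == Load
  Changed |= foldStore(I); // wasted call unless I.Op == Store
  return Changed;
}

// After: one switch dispatches only to the plausible helper.
static bool runNew(Inst &I) {
  switch (I.Op) {
  case Opcode::Load:  return foldLoad(I);
  case Opcode::Store: return foldStore(I);
  default:            return false;
  }
}

int main() {
  Inst I{Opcode::Load};
  std::printf("old=%d new=%d\n", runOld(I), runNew(I));
}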

Once again, this results in significant savings based on
experimental data:
https://llvm-compile-time-tracker.com/compare.php?from=01023bfcd33f922ed8c934ce563e54abe8bfe246&to=3dce4f70b73e48ccb045decb634c185e6b4c67d5&stat=instructions:u

This is NFCI other than making the pass faster. The total
cost of VectorCombine runs in an -O3 build appears to be
well under 0.1% of compile-time now, so there's not much
left to do AFAICT.

There's a TODO about making the code cleaner, but
addressing it probably wouldn't change timing much. I didn't
include those changes here because they require updating
much more code.
rotateright committed Nov 22, 2022
1 parent: 8effceb · commit: ede6d60

Showing 1 changed file with 68 additions and 21 deletions:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -85,6 +85,9 @@ class VectorCombine {
 
   InstructionWorklist Worklist;
 
+  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+  // parameter. That should be updated to specific sub-classes because the
+  // run loop was changed to dispatch on opcode.
   bool vectorizeLoadInsert(Instruction &I);
   bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
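
A sketch of what that TODO cleanup might look like, again with toy stand-in types (hypothetical; the commit deliberately does not include this):

#include <cassert>

// Toy stand-ins (hypothetical), mirroring the TODO's suggestion that fold
// helpers take a concrete sub-class once the run loop dispatches on opcode.
enum class Opcode { Store, Other };
struct Inst { Opcode Op; };
struct StoreLikeInst : Inst { bool Simple = true; };

// Helper takes the sub-class directly; no cast or null check inside.
static bool foldSingleElementStoreLike(StoreLikeInst &SI) {
  return SI.Simple; // placeholder for the real fold
}

// The dispatcher performs the single checked downcast.
static bool dispatch(Inst &I) {
  switch (I.Op) {
  case Opcode::Store:
    return foldSingleElementStoreLike(static_cast<StoreLikeInst &>(I));
  default:
    return false;
  }
}

int main() {
  StoreLikeInst S;
  S.Op = Opcode::Store;
  assert(dispatch(S));
}
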
@@ -271,8 +274,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 /// This removes a shuffle in IR and may allow combining of other loaded values.
 bool VectorCombine::widenSubvectorLoad(Instruction &I) {
   // Match subvector insert of fixed vector.
-  auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
-  if (!Shuf || !Shuf->isIdentityWithPadding())
+  auto *Shuf = cast<ShuffleVectorInst>(&I);
+  if (!Shuf->isIdentityWithPadding())
     return false;
 
   // Allow a non-canonical shuffle mask that is choosing elements from op1.
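
The dyn_cast → cast change above (and the analogous ones below) is safe because the run loop now only reaches widenSubvectorLoad() through the ShuffleVector case, so a type mismatch is a programmer error rather than an expected outcome. A toy sketch of the semantic difference (plain C++, not LLVM's casting machinery):

#include <cassert>

struct Inst { int Kind; };
struct ShufInst : Inst { static constexpr int K = 1; };

// dyn_cast-style: mismatch is expected; returns null and the caller tests.
static ShufInst *dynCastShuf(Inst *I) {
  return I->Kind == ShufInst::K ? static_cast<ShufInst *>(I) : nullptr;
}

// cast-style: mismatch is a bug; checked only by an assertion.
static ShufInst *castShuf(Inst *I) {
  assert(I->Kind == ShufInst::K && "caller must have dispatched on kind");
  return static_cast<ShufInst *>(I);
}

int main() {
  ShufInst S;
  S.Kind = ShufInst::K;
  Inst *I = &S;
  assert(dynCastShuf(I) != nullptr && castShuf(I) != nullptr);
}
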
@@ -1061,8 +1064,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
-  StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() ||
+  auto *SI = cast<StoreInst>(&I);
+  if (!SI->isSimple() ||
       !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
@@ -1371,10 +1374,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 /// architectures with no obvious "select" shuffle, this can reduce the total
 /// number of operations if the target reports them as cheaper.
 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
-  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
-  if (!SVI)
-    return false;
-
+  auto *SVI = cast<ShuffleVectorInst>(&I);
   auto *VT = cast<FixedVectorType>(I.getType());
   auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
@@ -1698,26 +1698,73 @@ bool VectorCombine::run() {
   bool MadeChange = false;
   auto FoldInst = [this, &MadeChange](Instruction &I) {
     Builder.SetInsertPoint(&I);
-    if (!TryEarlyFoldsOnly) {
-      if (isa<FixedVectorType>(I.getType())) {
+    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+    auto Opcode = I.getOpcode();
+
+    // These folds should be beneficial regardless of when this pass is run
+    // in the optimization pipeline.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
+        MadeChange |= vectorizeLoadInsert(I);
+        break;
+      case Instruction::ShuffleVector:
+        MadeChange |= widenSubvectorLoad(I);
+        break;
+      case Instruction::Load:
+        MadeChange |= scalarizeLoadExtract(I);
+        break;
+      default:
+        MadeChange |= scalarizeBinopOrCmp(I);
+        break;
+      }
+    }
+    if (Opcode == Instruction::Store)
+      MadeChange |= foldSingleElementStore(I);
+
+    // If this is an early pipeline invocation of this pass, we are done.
+    if (TryEarlyFoldsOnly)
+      return;
+
+    // Otherwise, try folds that improve codegen but may interfere with
+    // early IR canonicalizations.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
         MadeChange |= foldInsExtFNeg(I);
-        MadeChange |= foldBitcastShuf(I);
+        break;
+      case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
         MadeChange |= foldSelectShuffle(I);
-      } else {
-        MadeChange |= foldExtractExtract(I);
-        MadeChange |= foldExtractedCmps(I);
+        break;
+      case Instruction::BitCast:
+        MadeChange |= foldBitcastShuf(I);
+        break;
+      }
+    } else {
+      switch (Opcode) {
+      case Instruction::Call:
         MadeChange |= foldShuffleFromReductions(I);
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        MadeChange |= foldExtractExtract(I);
+        break;
+      default:
+        if (I.isBinaryOp()) {
+          MadeChange |= foldExtractExtract(I);
+          MadeChange |= foldExtractedCmps(I);
+        }
+        break;
       }
     }
-    if (isa<FixedVectorType>(I.getType())) {
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= widenSubvectorLoad(I);
-      MadeChange |= scalarizeBinopOrCmp(I);
-      MadeChange |= scalarizeLoadExtract(I);
-    }
-    MadeChange |= foldSingleElementStore(I);
   };
 
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))