[VectorCombine] switch on opcode to compile faster
This follows 87debda to further eliminate time wasted
calling helper functions only to early-return to the main
run loop.
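
To illustrate the shape of the change: instead of unconditionally calling each fold helper and letting it dyn_cast-and-bail, the run loop now switches on the opcode once and only calls helpers that could possibly match. A minimal standalone C++ sketch of that pattern (toy types and hypothetical helper names, not the LLVM code):

#include <cstdio>

// Toy stand-ins for LLVM's instruction hierarchy (hypothetical).
enum class Opcode { Load, Store, Other };
struct Inst { Opcode Op; };

// Stand-ins for fold helpers like vectorizeLoadInsert().
static bool foldLoad(Inst &) { return false; }  // real fold logic elided
static bool foldStore(Inst &) { return false; }

// Before: every helper runs; most immediately early-return.
static bool runOld(Inst &I) {
  bool Changed = false;
  Changed |= foldLoad(I);  // wasted call unless I.Op == Load
  Changed |= foldStore(I); // wasted call unless I.Op == Store
  return Changed;
}

// After: one switch dispatches only to the plausible helper.
static bool runNew(Inst &I) {
  switch (I.Op) {
  case Opcode::Load:  return foldLoad(I);
  case Opcode::Store: return foldStore(I);
  default:            return false;
  }
}

int main() {
  Inst I{Opcode::Load};
  std::printf("old=%d new=%d\n", runOld(I), runNew(I));
}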

Once again, this results in significant savings based on
experimental data:
https://llvm-compile-time-tracker.com/compare.php?from=01023bfcd33f922ed8c934ce563e54abe8bfe246&to=3dce4f70b73e48ccb045decb634c185e6b4c67d5&stat=instructions:u

This is NFCI other than making the pass faster. The total
cost of VectorCombine runs in an -O3 build appears to be
well under 0.1% of compile-time now, so there's not much
left to do AFAICT.

There's a TODO about making the code cleaner, but
addressing it probably wouldn't change timing much. I didn't
include those changes here because they require updating
much more code.
rotateright committed Nov 22, 2022
1 parent: 8effceb · commit: ede6d60

Showing 1 changed file with 68 additions and 21 deletions:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -85,6 +85,9 @@ class VectorCombine {
 
   InstructionWorklist Worklist;
 
+  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
+  // parameter. That should be updated to specific sub-classes because the
+  // run loop was changed to dispatch on opcode.
   bool vectorizeLoadInsert(Instruction &I);
   bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
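
A sketch of what that TODO cleanup might look like, again with toy stand-in types (hypothetical; the commit deliberately does not include this):

#include <cassert>

// Toy stand-ins (hypothetical), mirroring the TODO's suggestion that fold
// helpers take a concrete sub-class once the run loop dispatches on opcode.
enum class Opcode { Store, Other };
struct Inst { Opcode Op; };
struct StoreLikeInst : Inst { bool Simple = true; };

// Helper takes the sub-class directly; no cast or null check inside.
static bool foldSingleElementStoreLike(StoreLikeInst &SI) {
  return SI.Simple; // placeholder for the real fold
}

// The dispatcher performs the single checked downcast.
static bool dispatch(Inst &I) {
  switch (I.Op) {
  case Opcode::Store:
    return foldSingleElementStoreLike(static_cast<StoreLikeInst &>(I));
  default:
    return false;
  }
}

int main() {
  StoreLikeInst S;
  S.Op = Opcode::Store;
  assert(dispatch(S));
}
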
@@ -271,8 +274,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 /// This removes a shuffle in IR and may allow combining of other loaded values.
 bool VectorCombine::widenSubvectorLoad(Instruction &I) {
   // Match subvector insert of fixed vector.
-  auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
-  if (!Shuf || !Shuf->isIdentityWithPadding())
+  auto *Shuf = cast<ShuffleVectorInst>(&I);
+  if (!Shuf->isIdentityWithPadding())
     return false;
 
   // Allow a non-canonical shuffle mask that is choosing elements from op1.
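
The dyn_cast → cast change above (and the analogous ones below) is safe because the run loop now only reaches widenSubvectorLoad() through the ShuffleVector case, so a type mismatch is a programmer error rather than an expected outcome. A toy sketch of the semantic difference (plain C++, not LLVM's casting machinery):

#include <cassert>

struct Inst { int Kind; };
struct ShufInst : Inst { static constexpr int K = 1; };

// dyn_cast-style: mismatch is expected; returns null and the caller tests.
static ShufInst *dynCastShuf(Inst *I) {
  return I->Kind == ShufInst::K ? static_cast<ShufInst *>(I) : nullptr;
}

// cast-style: mismatch is a bug; checked only by an assertion.
static ShufInst *castShuf(Inst *I) {
  assert(I->Kind == ShufInst::K && "caller must have dispatched on kind");
  return static_cast<ShufInst *>(I);
}

int main() {
  ShufInst S;
  S.Kind = ShufInst::K;
  Inst *I = &S;
  assert(dynCastShuf(I) != nullptr && castShuf(I) != nullptr);
}
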
@@ -1061,8 +1064,8 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
-  StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() ||
+  auto *SI = cast<StoreInst>(&I);
+  if (!SI->isSimple() ||
       !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
@@ -1371,10 +1374,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 /// architectures with no obvious "select" shuffle, this can reduce the total
 /// number of operations if the target reports them as cheaper.
 bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
-  auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
-  if (!SVI)
-    return false;
-
+  auto *SVI = cast<ShuffleVectorInst>(&I);
   auto *VT = cast<FixedVectorType>(I.getType());
   auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
@@ -1698,26 +1698,73 @@ bool VectorCombine::run() {
   bool MadeChange = false;
   auto FoldInst = [this, &MadeChange](Instruction &I) {
     Builder.SetInsertPoint(&I);
-    if (!TryEarlyFoldsOnly) {
-      if (isa<FixedVectorType>(I.getType())) {
+    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
+    auto Opcode = I.getOpcode();
+
+    // These folds should be beneficial regardless of when this pass is run
+    // in the optimization pipeline.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
+        MadeChange |= vectorizeLoadInsert(I);
+        break;
+      case Instruction::ShuffleVector:
+        MadeChange |= widenSubvectorLoad(I);
+        break;
+      case Instruction::Load:
+        MadeChange |= scalarizeLoadExtract(I);
+        break;
+      default:
+        MadeChange |= scalarizeBinopOrCmp(I);
+        break;
+      }
+    }
+    if (Opcode == Instruction::Store)
+      MadeChange |= foldSingleElementStore(I);
+
+    // If this is an early pipeline invocation of this pass, we are done.
+    if (TryEarlyFoldsOnly)
+      return;
+
+    // Otherwise, try folds that improve codegen but may interfere with
+    // early IR canonicalizations.
+    // The type checking is for run-time efficiency. We can avoid wasting time
+    // dispatching to folding functions if there's no chance of matching.
+    if (IsFixedVectorType) {
+      switch (Opcode) {
+      case Instruction::InsertElement:
         MadeChange |= foldInsExtFNeg(I);
-        MadeChange |= foldBitcastShuf(I);
+        break;
+      case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
         MadeChange |= foldSelectShuffle(I);
-      } else {
-        MadeChange |= foldExtractExtract(I);
-        MadeChange |= foldExtractedCmps(I);
+        break;
+      case Instruction::BitCast:
+        MadeChange |= foldBitcastShuf(I);
+        break;
+      }
+    } else {
+      switch (Opcode) {
+      case Instruction::Call:
         MadeChange |= foldShuffleFromReductions(I);
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        MadeChange |= foldExtractExtract(I);
+        break;
+      default:
+        if (I.isBinaryOp()) {
+          MadeChange |= foldExtractExtract(I);
+          MadeChange |= foldExtractedCmps(I);
+        }
+        break;
       }
     }
-    if (isa<FixedVectorType>(I.getType())) {
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= widenSubvectorLoad(I);
-      MadeChange |= scalarizeBinopOrCmp(I);
-      MadeChange |= scalarizeLoadExtract(I);
-    }
-    MadeChange |= foldSingleElementStore(I);
   };
 
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))