Skip to content

Conversation

skachkov-sc
Copy link
Contributor

Implement "SIMD within a register" (SWAR) loop vectorization. This technique can vectorize some loops on targets without vector registers. Currently supported instructions are:

  1. Consecutive loads/stores
  2. Bitwise operations (add/sub)
  3. Shifts (shl, lshr) with constant 2nd operand
  4. Addition/Subtraction

@skachkov-sc
Copy link
Contributor Author

@skachkov-sc
Copy link
Contributor Author

test-suite+SPEC2006+SPEC2017 vectorization results
Tests: 365
Metric: loop-vectorize.LoopsVectorized

Program                                                                                      loop-vectorize.LoopsVectorized
                                                                                             res                           
               test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 235.00                        
              test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 235.00                        
                               test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 218.00                        
                      test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 217.00                        
                          test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test 172.00                        
                           test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 126.00                        
                       test-suite :: External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test 112.00                        
                              test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test 102.00                        
                          test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test  68.00                        
                           test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test  68.00                        
            test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test  63.00                        
                    test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test  60.00                        
          test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_constant_folding.test  56.00                        
                                 test-suite :: MultiSource/Applications/ClamAV/clamscan.test  52.00                        
                                     test-suite :: MultiSource/Benchmarks/Bullet/bullet.test  41.00                        
                          test-suite :: MultiSource/Benchmarks/Prolangs-C/bison/mybison.test  34.00                        
                             test-suite :: External/SPEC/CINT2017rate/557.xz_r/557.xz_r.test  33.00                        
                            test-suite :: External/SPEC/CINT2017speed/657.xz_s/657.xz_s.test  33.00                        
                                test-suite :: MultiSource/Applications/JM/lencod/lencod.test  31.00                        
                      test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  29.00                        
                     test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CLAMR.test  21.00                        
                                test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test  19.00                        
                            test-suite :: MultiSource/Benchmarks/Prolangs-C/agrep/agrep.test  19.00                        
              test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test  16.00                        
                          test-suite :: External/SPEC/CINT2017speed/605.mcf_s/605.mcf_s.test  16.00                        
               test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test  16.00                        
                           test-suite :: External/SPEC/CINT2017rate/505.mcf_r/505.mcf_r.test  16.00                        
                              test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  15.00                        
                     test-suite :: MultiSource/Benchmarks/MallocBench/espresso/espresso.test  14.00                        
                                   test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test  14.00                        
                       test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test  13.00                        
                                 test-suite :: MultiSource/Applications/SIBsim4/SIBsim4.test  13.00                        
                             test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test  13.00                        
                                     test-suite :: MultiSource/Applications/SPASS/SPASS.test  13.00                        
                               test-suite :: External/SPEC/CINT2006/456.hmmer/456.hmmer.test  13.00                        
                       test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  13.00                        
                              test-suite :: External/SPEC/CFP2006/450.soplex/450.soplex.test  12.00                        
                               test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test  12.00                        
                              test-suite :: MultiSource/Benchmarks/mafft/pairlocalalign.test  11.00                        
                                     test-suite :: SingleSource/Benchmarks/McGill/chomp.test   8.00                        
                         test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   7.00                        
                   test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test   7.00                        
                   test-suite :: External/SPEC/CINT2017rate/520.omnetpp_r/520.omnetpp_r.test   7.00                        
                    test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test   7.00                        
         test-suite :: MultiSource/Benchmarks/MiBench/consumer-typeset/consumer-typeset.test   7.00                        
                        test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   7.00                        
                                       test-suite :: MultiSource/Benchmarks/PAQ8p/paq8p.test   7.00                        
                  test-suite :: External/SPEC/CINT2017speed/620.omnetpp_s/620.omnetpp_s.test   7.00                        
                               test-suite :: External/SPEC/CINT2006/401.bzip2/401.bzip2.test   7.00                        
               test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test   6.00                        
             test-suite :: MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/mpeg2decode.test   6.00                        
                           test-suite :: External/SPEC/CFP2017speed/644.nab_s/644.nab_s.test   5.00                        
                                     test-suite :: MultiSource/Benchmarks/Ptrdist/bc/bc.test   5.00                        
               test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test   5.00                        
                            test-suite :: External/SPEC/CFP2017rate/544.nab_r/544.nab_r.test   5.00                        
                   test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test   4.00                        
                           test-suite :: MultiSource/Benchmarks/ASC_Sequoia/AMGmk/AMGmk.test   4.00                        
                          test-suite :: MicroBenchmarks/LCALS/SubsetBRawLoops/lcalsBRaw.test   4.00                        
                    test-suite :: MicroBenchmarks/LCALS/SubsetBLambdaLoops/lcalsBLambda.test   4.00                        
                          test-suite :: MicroBenchmarks/LCALS/SubsetARawLoops/lcalsARaw.test   4.00                        
                    test-suite :: MicroBenchmarks/LCALS/SubsetALambdaLoops/lcalsALambda.test   4.00                        
                          test-suite :: MicroBenchmarks/LCALS/SubsetCRawLoops/lcalsCRaw.test   4.00                        
                    test-suite :: MicroBenchmarks/LCALS/SubsetCLambdaLoops/lcalsCLambda.test   4.00                        
                test-suite :: MultiSource/Benchmarks/Trimaran/netbench-url/netbench-url.test   4.00                        
                                   test-suite :: MultiSource/Applications/oggenc/oggenc.test   3.00                        
                                 test-suite :: MultiSource/Applications/minisat/minisat.test   3.00                        
                             test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test   3.00                        
                       test-suite :: External/SPEC/CINT2017rate/541.leela_r/541.leela_r.test   2.00                        
                                    test-suite :: SingleSource/Benchmarks/McGill/queens.test   2.00                        
                      test-suite :: External/SPEC/CINT2017speed/641.leela_s/641.leela_s.test   2.00                        
                         test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.test   2.00                        
                           test-suite :: External/SPEC/CINT2006/471.omnetpp/471.omnetpp.test   2.00                        
                            test-suite :: External/SPEC/CFP2006/482.sphinx3/482.sphinx3.test   2.00                        
                  test-suite :: MultiSource/Applications/ALAC/decode/alacconvert-decode.test   2.00                        
                     test-suite :: MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg.test   2.00                        
                                    test-suite :: MultiSource/Applications/kimwitu++/kc.test   2.00                        
                     test-suite :: MultiSource/Benchmarks/Rodinia/pathfinder/pathfinder.test   2.00                        
                  test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test   2.00                        
                 test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PENNANT.test   1.00                        
              test-suite :: External/SPEC/CINT2017speed/631.deepsjeng_s/631.deepsjeng_s.test   1.00                        
                                  test-suite :: MultiSource/Applications/d/make_dparser.test   1.00                        
                               test-suite :: External/SPEC/CINT2006/473.astar/473.astar.test   1.00                        
                                 test-suite :: MultiSource/Applications/sqlite3/sqlite3.test   1.00                        
                                  test-suite :: SingleSource/Benchmarks/Stanford/Puzzle.test   1.00                        
               test-suite :: External/SPEC/CINT2017rate/531.deepsjeng_r/531.deepsjeng_r.test   1.00                        
                                     test-suite :: MultiSource/Benchmarks/nbench/nbench.test   1.00                        
                            test-suite :: MicroBenchmarks/ImageProcessing/Dither/Dither.test   1.00                        
                            test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test   1.00                        
                                   test-suite :: SingleSource/Benchmarks/Misc/whetstone.test   1.00                        
             test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/Pathfinder/PathFinder.test   1.00                        
test-suite :: MicroBenchmarks/ImageProcessing/AnisotropicDiffusion/AnisotropicDiffusion.test   1.00                        
                                 test-suite :: SingleSource/Benchmarks/Misc/ReedSolomon.test   1.00

Implement "SIMD within a register" (SWAR) loop vectorization. This technique can
vectorize some loops on targets without vector registers by using full width of
scalar registers. Currently supported instructions are:
1. Consecutive loads/stores
2. Bitwise operations (add/sub)
3. Shifts (shl, lshr) with constant 2nd operand
4. Addition/Subtraction
@llvmbot
Copy link
Member

llvmbot commented Oct 18, 2023

@llvm/pr-subscribers-llvm-transforms

Author: Sergey Kachkov (skachkov-sc)

Changes

Implement "SIMD within a register" (SWAR) loop vectorization. This technique can vectorize some loops on targets without vector registers. Currently supported instructions are:

  1. Consecutive loads/stores
  2. Bitwise operations (add/sub)
  3. Shifts (shl, lshr) with constant 2nd operand
  4. Addition/Subtraction

Patch is 95.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69306.diff

6 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+223-17)
  • (modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+4)
  • (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+72)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+122)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+3)
  • (added) llvm/test/Transforms/LoopVectorize/swar-vectorization.ll (+1150)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ca7e75f97f0f02..df2bbb509ae091c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,10 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 
+static cl::opt<bool> EnableSWARVectorization(
+    "enable-swar-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable SWAR (SIMD within a register) vectorization"));
+
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
@@ -1203,10 +1207,10 @@ class LoopVectorizationCostModel {
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI, bool UseSWAR)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), UseSWAR(UseSWAR) {}
 
   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1712,6 +1716,9 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
+  /// Calculate cost of SWAR instruction.
+  InstructionCost getSWARInstructionCost(Instruction *I, unsigned VF);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
@@ -1921,6 +1928,9 @@ class LoopVectorizationCostModel {
 
   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// Use SWAR vectorization mode.
+  const bool UseSWAR;
 };
 } // end namespace llvm
 
@@ -5071,9 +5081,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
-  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
+  const TargetTransformInfo::RegisterKind RegKind =
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
-                           : TargetTransformInfo::RGK_FixedWidthVector);
+      : UseSWAR            ? TargetTransformInfo::RGK_Scalar
+                           : TargetTransformInfo::RGK_FixedWidthVector;
+  const TypeSize WidestRegister = TTI.getRegisterBitWidth(RegKind);
 
   // Convenience function to return the minimum of two ElementCounts.
   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
@@ -5128,9 +5140,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
   }
 
-  TargetTransformInfo::RegisterKind RegKind =
-      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
-                           : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
@@ -6684,6 +6693,65 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
+InstructionCost
+LoopVectorizationCostModel::getSWARInstructionCost(Instruction *I,
+                                                   unsigned VF) {
+  uint64_t RegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
+  auto *RegType = IntegerType::get(I->getModule()->getContext(), RegSize);
+  auto GetMultiplier = [&](IntegerType *Ty) -> uint64_t {
+    return divideCeil(Ty->getBitWidth() * VF, RegSize);
+  };
+  if (isa<LoadInst, StoreInst>(I)) {
+    if (getWideningDecision(I, ElementCount::getFixed(VF)) !=
+        LoopVectorizationCostModel::CM_Widen)
+      return InstructionCost::getInvalid();
+    auto *ValTy = dyn_cast<IntegerType>(getLoadStoreType(I));
+    if (!ValTy)
+      return InstructionCost::getInvalid();
+    const auto &DL = I->getModule()->getDataLayout();
+    const Align Alignment = DL.getPrefTypeAlign(RegType);
+    unsigned AddressSpace =
+        getLoadStorePointerOperand(I)->getType()->getPointerAddressSpace();
+    return GetMultiplier(ValTy) * TTI.getMemoryOpCost(I->getOpcode(), RegType,
+                                                      Alignment, AddressSpace);
+  }
+  auto *ValTy = dyn_cast<IntegerType>(I->getType());
+  if (!ValTy)
+    return InstructionCost::getInvalid();
+  if (auto *PN = dyn_cast<PHINode>(I))
+    if (Legal->isReductionVariable(PN))
+      return TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+  auto Multiplier = GetMultiplier(ValTy);
+  if (I->isBitwiseLogicOp())
+    return Multiplier * TTI.getArithmeticInstrCost(I->getOpcode(), RegType);
+  switch (I->getOpcode()) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+    // Shl: (LHS << ShiftAmnt) & Mask
+    // LShr: (LHS >> ShiftAmnt) & Mask
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return InstructionCost::getInvalid();
+    return Multiplier * (TTI.getArithmeticInstrCost(I->getOpcode(), RegType) +
+                         TTI.getArithmeticInstrCost(Instruction::And, RegType));
+  case Instruction::Add:
+    // Add: ((LHS & ~Mask) + (RHS & ~Mask)) ^ ((LHS ^  RHS) & Mask)
+    return Multiplier *
+           (TTI.getArithmeticInstrCost(Instruction::Add, RegType) +
+            2 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType) +
+            3 * TTI.getArithmeticInstrCost(Instruction::And, RegType));
+  case Instruction::Sub:
+    // Sub: ((LHS |  Mask) - (RHS & ~Mask)) ^ ((LHS ^ ~RHS) & Mask)
+    return Multiplier *
+           (TTI.getArithmeticInstrCost(Instruction::Sub, RegType) +
+            TTI.getArithmeticInstrCost(Instruction::Or, RegType) +
+            2 * TTI.getArithmeticInstrCost(Instruction::And, RegType) +
+            3 * TTI.getArithmeticInstrCost(Instruction::Xor, RegType));
+  default:
+    return InstructionCost::getInvalid();
+  }
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
@@ -6706,6 +6774,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
           false);
   }
 
+  if (UseSWAR && VF.isVector()) {
+    assert(!VF.isScalable() && "Scalable VF not supported");
+    if (!I->isTerminator())
+      return VectorizationCostTy(getSWARInstructionCost(I, VF.getFixedValue()),
+                                 true);
+  }
+
   Type *VectorTy;
   InstructionCost C = getInstructionCost(I, VF, VectorTy);
 
@@ -8208,6 +8283,23 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   return BlockMaskCache[BB] = BlockMask;
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToSWARMemory(Instruction *I,
+                                               ArrayRef<VPValue *> Operands,
+                                               VFRange &Range) {
+  if (Legal->isMaskRequired(I))
+    return nullptr;
+  if (CM.getWideningDecision(I, Range.Start) !=
+      LoopVectorizationCostModel::CM_Widen)
+    return nullptr;
+  if (!isa<IntegerType>(getLoadStoreType(I)))
+    return nullptr;
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return new VPSWARMemoryInstructionRecipe(*LI, Operands[0]);
+  if (auto *SI = dyn_cast<StoreInst>(I))
+    return new VPSWARMemoryInstructionRecipe(*SI, Operands[1], Operands[0]);
+  llvm_unreachable("Unhandled instruction!");
+}
+
 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                 ArrayRef<VPValue *> Operands,
                                                 VFRange &Range,
@@ -8474,6 +8566,25 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
                                                              Range);
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToSWAR(Instruction *I,
+                                         ArrayRef<VPValue *> Operands) {
+  switch (I->getOpcode()) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return nullptr;
+    [[fallthrough]];
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+    return new VPSWARRecipe(*I, make_range(Operands.begin(), Operands.end()));
+  default:
+    return nullptr;
+  }
+}
+
 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands,
                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8656,7 +8767,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
-    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
+    return toVPRecipeResult(
+        CM.UseSWAR ? tryToSWARMemory(Instr, Operands, Range)
+                   : tryToWidenMemory(Instr, Operands, Range, Plan));
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8675,7 +8788,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
         new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
   }
 
-  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+  return toVPRecipeResult(CM.UseSWAR ? tryToSWAR(Instr, Operands)
+                                     : tryToWiden(Instr, Operands, VPBB, Plan));
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9117,7 +9231,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
                  "must be a select recipe");
           IndexOfFirstOperand = 1;
         } else {
-          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
+          assert((MinVF.isScalar() || CM.UseSWAR ||
+                  isa<VPWidenRecipe>(CurrentLink)) &&
                  "Expected to replace a VPWidenSC");
           IndexOfFirstOperand = 0;
         }
@@ -9454,6 +9569,49 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
 }
 
+static Type *getSWARType(Type *ScalarTy, ElementCount VF) {
+  assert(isa<IntegerType>(ScalarTy));
+  unsigned ScalarBitWidth = cast<IntegerType>(ScalarTy)->getBitWidth();
+  assert(!VF.isScalable() && "Scalable VF not supported");
+  return IntegerType::get(ScalarTy->getContext(),
+                          ScalarBitWidth * VF.getFixedValue());
+}
+
+void VPSWARMemoryInstructionRecipe::execute(VPTransformState &State) {
+  auto VF = State.VF;
+  Value *Ptr = State.get(getAddr(), VPIteration(0, 0));
+  bool InBounds = false;
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+    InBounds = GEP->isInBounds();
+  Type *ScalarTy = getLoadStoreType(&Ingredient);
+  Type *SWARTy = getSWARType(ScalarTy, VF);
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  const auto &DL = Ingredient.getModule()->getDataLayout();
+  const Align Alignment = DL.getPrefTypeAlign(SWARTy);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(Ingredient.getDebugLoc());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *GEP = Builder.CreateGEP(ScalarTy, Ptr,
+                                   Builder.getInt32(VF.getFixedValue() * Part),
+                                   Ptr->getName() + ".swar", InBounds);
+    Value *SWARPtr = Builder.CreateBitCast(
+        GEP, SWARTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()));
+    Instruction *Res = nullptr;
+    if (isa<LoadInst>(Ingredient)) {
+      Res = Builder.CreateAlignedLoad(SWARTy, SWARPtr, Alignment,
+                                      Ingredient.getName() + ".swar");
+      State.set(getVPSingleValue(), Builder.CreateBitCast(Res, VecTy), Part);
+    } else if (isa<StoreInst>(Ingredient))
+      Res = Builder.CreateAlignedStore(
+          Builder.CreateBitCast(State.get(getStoredValue(), Part), SWARTy),
+          SWARPtr, Alignment);
+    else
+      llvm_unreachable("Unhandled instruction!");
+    State.addMetadata(Res, &Ingredient);
+  }
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
 
@@ -9643,7 +9801,7 @@ static bool processLoopInVPlanNativePath(
     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
-    LoopVectorizationRequirements &Requirements) {
+    LoopVectorizationRequirements &Requirements, bool UseSWAR) {
 
   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9657,7 +9815,7 @@ static bool processLoopInVPlanNativePath(
       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, UseSWAR);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -9841,6 +9999,45 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   return true;
 }
 
+static const SCEVPredicate *getAlignPredicate(ScalarEvolution *SE,
+                                              const DataLayout &DL,
+                                              const SCEV *Start,
+                                              Align Alignment) {
+  Type *IntTy = DL.getIntPtrType(Start->getType());
+  const SCEV *Rem = SE->getURemExpr(SE->getPtrToIntExpr(Start, IntTy),
+                                    SE->getConstant(IntTy, Alignment.value()));
+  if (Rem->isZero())
+    return nullptr;
+  return SE->getEqualPredicate(Rem, SE->getZero(IntTy));
+}
+
+static void generateAlignChecks(PredicatedScalarEvolution &PSE,
+                                const VPlan &Plan, ElementCount VF) {
+  ScalarEvolution *SE = PSE.getSE();
+  const DataLayout &DL = SE->getDataLayout();
+  MapVector<const SCEV *, Align> Checks;
+  for (const auto *VPBlock : vp_depth_first_shallow(Plan.getEntry()))
+    for (const auto &Recipe : *VPBlock->getEntryBasicBlock()) {
+      auto *SWARRecipe = dyn_cast<VPSWARMemoryInstructionRecipe>(&Recipe);
+      if (!SWARRecipe)
+        continue;
+      auto &MemInst = SWARRecipe->getIngredient();
+      const SCEVAddRecExpr *PtrSCEV =
+          PSE.getAsAddRec(getLoadStorePointerOperand(&MemInst));
+      assert(PtrSCEV && "Consecutive Ptr expected");
+      const SCEV *Start = PtrSCEV->getStart();
+      Type *SWARTy = getSWARType(getLoadStoreType(&MemInst), VF);
+      Align Alignment = DL.getPrefTypeAlign(SWARTy);
+      if (auto It = Checks.find(Start); It != Checks.end())
+        It->second = std::max(It->second, Alignment);
+      else
+        Checks.insert({Start, Alignment});
+    }
+  for (auto [Start, Alignment] : Checks)
+    if (auto *Predicate = getAlignPredicate(SE, DL, Start, Alignment))
+      PSE.addPredicate(*Predicate);
+}
+
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                                !EnableLoopInterleaving),
@@ -9905,9 +10102,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // even evaluating whether vectorization is profitable. Since we cannot modify
   // the incoming IR, we need to build VPlan upfront in the vectorization
   // pipeline.
+  bool UseSWAR =
+      !TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+      EnableSWARVectorization;
   if (!L->isInnermost())
     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
-                                        ORE, BFI, PSI, Hints, Requirements);
+                                        ORE, BFI, PSI, Hints, Requirements,
+                                        UseSWAR);
 
   assert(L->isInnermost() && "Inner loop expected.");
 
@@ -10001,7 +10202,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, UseSWAR);
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
@@ -10026,8 +10227,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     unsigned SelectedIC = std::max(IC, UserIC);
     //  Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
-    if (VF.Width.isVector() || SelectedIC > 1)
+    if (VF.Width.isVector() || SelectedIC > 1) {
+      if (UseSWAR)
+        generateAlignChecks(PSE, LVP.getBestPlanFor(VF.Width), VF.Width);
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+    }
 
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
@@ -10299,12 +10503,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
 
   // Don't attempt if
   // 1. the target claims to have no vector registers, and
-  // 2. interleaving won't help ILP.
+  // 2. SWAR vectorization is disabled, and
+  // 3. interleaving won't help ILP.
   //
-  // The second condition is necessary because, even if the target has no
+  // The last condition is necessary because, even if the target has no
   // vector registers, loop vectorization may still enable scalar
   // interleaving.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+      !EnableSWARVectorization &&
       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
     return LoopVectorizeResult(false, false);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e9..7369de4320cddd8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -103,6 +103,10 @@ class VPRecipeBuilder {
   VPRecipeBase *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
                            VPBasicBlock *VPBB, VPlanPtr &Plan);
 
+  VPRecipeBase *tryToSWARMemory(Instruction *I, ArrayRef<VPValue *> Operands,
+                                VFRange &Range);
+  VPRecipeBase *tryToSWAR(Instruction *I, ArrayRef<VPValue *> Operands);
+
   /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue.
   VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..4c29e843401e82c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1148,6 +1148,32 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
   }
 };
 
+// VPSWARRecipe is a recipe for producing a SIMD-within-a-register (SWAR)
+// operation for its ingredient. The operation works on values that are packed
+// into scalar registers. This recipe covers the following cases:
+// 1. Bitwise operations (and, or, xor)
+// 2. Shifts (shl, lshr) with constant second operand
+// 3. Add/Sub operations.
+class VPSWARRecipe : public VPRecipeBase, public VPValue {
+public:
+  template <typename IterT>
+  VPSWARRecipe(Inst...
[truncated]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants