From 5ea0c558937c19f85b2382da17e9399463840772 Mon Sep 17 00:00:00 2001 From: Felipe Magno de Almeida Date: Tue, 26 Aug 2025 17:32:44 -0300 Subject: [PATCH 1/4] RISCV: Add basic test for Strided Loop Unroll passes --- .../Transforms/StridedLoopUnroll/pixel_avg.ll | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll diff --git a/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll b/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll new file mode 100644 index 0000000000000..9b43d4836343f --- /dev/null +++ b/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='function(slp-vectorizer,instcombine<>,simplifycfg<>)' -mcpu=spacemit-x60 -S %s | FileCheck %s + +; ModuleID = 'test-noopt.ll' +source_filename = "test.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; +; Function Attrs: inlinehint nounwind uwtable vscale_range(8,1024) +define dso_local void @pixel_avg(ptr noundef writeonly captures(none) %dst, i32 noundef signext %i_dst_stride, ptr noundef readonly captures(none) %src1, i32 noundef signext %i_src1_stride, ptr noundef readonly captures(none) %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) local_unnamed_addr #0 align 128 { +; CHECK-LABEL: define dso_local void @pixel_avg( +; CHECK-SAME: ptr noundef writeonly captures(none) [[DST:%.*]], i32 noundef signext [[I_DST_STRIDE:%.*]], ptr noundef readonly captures(none) [[SRC1:%.*]], i32 noundef signext [[I_SRC1_STRIDE:%.*]], ptr noundef readonly captures(none) [[SRC2:%.*]], i32 noundef signext [[I_SRC2_STRIDE:%.*]], i32 noundef signext [[I_WIDTH:%.*]], i32 noundef signext [[I_HEIGHT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 128 { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP29:%.*]] = icmp sgt i32 [[I_HEIGHT]], 0 +; CHECK-NEXT: br i1 [[CMP29]], label %[[FOR_COND1_PREHEADER_LR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_COND1_PREHEADER_LR_PH]]: +; CHECK-NEXT: [[CMP227:%.*]] = icmp sgt i32 [[I_WIDTH]], 0 +; CHECK-NEXT: [[IDX_EXT1:%.*]] = sext i32 [[I_DST_STRIDE]] to i64 +; CHECK-NEXT: [[IDX_EXT13:%.*]] = sext i32 [[I_SRC1_STRIDE]] to i64 +; CHECK-NEXT: [[IDX_EXT15:%.*]] = sext i32 [[I_SRC2_STRIDE]] to i64 +; CHECK-NEXT: br i1 [[CMP227]], label %[[FOR_COND1_PREHEADER_US_PREHEADER:.*]], label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND1_PREHEADER_US_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[I_WIDTH]] to i64 +; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US:.*]] +; CHECK: [[FOR_COND1_PREHEADER_US]]: +; CHECK-NEXT: [[Y_033_US_LVER_ORIG:%.*]] = phi i32 [ [[INC17_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[DST_ADDR_032_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[DST]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[SRC1_ADDR_031_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR13_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[SRC1]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[SRC2_ADDR_030_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR15_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[SRC2]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: br label 
%[[FOR_BODY4_US:.*]] +; CHECK: [[FOR_BODY4_US]]: +; CHECK-NEXT: [[INDVARS_IV_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT_LVER_ORIG:%.*]], %[[FOR_BODY4_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US_LVER_ORIG:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC1_ADDR_031_US_LVER_ORIG]], i64 [[INDVARS_IV_LVER_ORIG]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_US_LVER_ORIG]], align 1 +; CHECK-NEXT: [[CONV_US_LVER_ORIG:%.*]] = zext i8 [[TMP10]] to i16 +; CHECK-NEXT: [[ARRAYIDX6_US_LVER_ORIG:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC2_ADDR_030_US_LVER_ORIG]], i64 [[INDVARS_IV_LVER_ORIG]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX6_US_LVER_ORIG]], align 1 +; CHECK-NEXT: [[CONV7_US_LVER_ORIG:%.*]] = zext i8 [[TMP11]] to i16 +; CHECK-NEXT: [[ADD_US_LVER_ORIG:%.*]] = add nuw nsw i16 [[CONV_US_LVER_ORIG]], 1 +; CHECK-NEXT: [[ADD8_US_LVER_ORIG:%.*]] = add nuw nsw i16 [[ADD_US_LVER_ORIG]], [[CONV7_US_LVER_ORIG]] +; CHECK-NEXT: [[SHR_US_LVER_ORIG:%.*]] = lshr i16 [[ADD8_US_LVER_ORIG]], 1 +; CHECK-NEXT: [[CONV9_US_LVER_ORIG:%.*]] = trunc nuw i16 [[SHR_US_LVER_ORIG]] to i8 +; CHECK-NEXT: [[ARRAYIDX11_US_LVER_ORIG:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_032_US_LVER_ORIG]], i64 [[INDVARS_IV_LVER_ORIG]] +; CHECK-NEXT: store i8 [[CONV9_US_LVER_ORIG]], ptr [[ARRAYIDX11_US_LVER_ORIG]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_LVER_ORIG]] = add nuw nsw i64 [[INDVARS_IV_LVER_ORIG]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_LVER_ORIG]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_LVER_ORIG]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label %[[FOR_BODY4_US]] +; CHECK: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]: +; CHECK-NEXT: [[ADD_PTR_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[DST_ADDR_032_US_LVER_ORIG]], i64 [[IDX_EXT1]] +; CHECK-NEXT: [[ADD_PTR13_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[SRC1_ADDR_031_US_LVER_ORIG]], i64 [[IDX_EXT13]] +; CHECK-NEXT: [[ADD_PTR15_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[SRC2_ADDR_030_US_LVER_ORIG]], i64 [[IDX_EXT15]] +; CHECK-NEXT: [[INC17_US_LVER_ORIG]] = add nuw nsw i32 [[Y_033_US_LVER_ORIG]], 1 +; CHECK-NEXT: [[EXITCOND35_NOT_LVER_ORIG:%.*]] = icmp eq i32 [[INC17_US_LVER_ORIG]], [[I_HEIGHT]] +; CHECK-NEXT: br i1 [[EXITCOND35_NOT_LVER_ORIG]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER_US]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; +entry: + %cmp29 = icmp sgt i32 %i_height, 0 + br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %cmp227 = icmp sgt i32 %i_width, 0 + %idx.ext = sext i32 %i_dst_stride to i64 + %idx.ext12 = sext i32 %i_src1_stride to i64 + %idx.ext14 = sext i32 %i_src2_stride to i64 + br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup + +for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph + %wide.trip.count = zext nneg i32 %i_width to i64 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us + %y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ] + %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ] 
+ %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ] + br label %for.body4.us + +for.body4.us: ; preds = %for.cond1.preheader.us, %for.body4.us + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds nuw i8, ptr %src1.addr.031.us, i64 %indvars.iv + %0 = load i8, ptr %arrayidx.us, align 1 + %conv.us = zext i8 %0 to i16 + %arrayidx6.us = getelementptr inbounds nuw i8, ptr %src2.addr.030.us, i64 %indvars.iv + %1 = load i8, ptr %arrayidx6.us, align 1 + %conv7.us = zext i8 %1 to i16 + %add.us = add nuw nsw i16 %conv.us, 1 + %add8.us = add nuw nsw i16 %add.us, %conv7.us + %shr.us = lshr i16 %add8.us, 1 + %conv9.us = trunc nuw i16 %shr.us to i8 + %arrayidx11.us = getelementptr inbounds nuw i8, ptr %dst.addr.032.us, i64 %indvars.iv + store i8 %conv9.us, ptr %arrayidx11.us, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext + %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12 + %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14 + %inc17.us = add nuw nsw i32 %y.033.us, 1 + %exitcond35.not = icmp eq i32 %inc17.us, %i_height + br i1 %exitcond35.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader.us + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond1.preheader.lr.ph, %entry + ret void +} From 4eb3b59e111aafa0e9fde051d9e9cdb2b9b2618b Mon Sep 17 00:00:00 2001 From: Felipe Magno de Almeida Date: Mon, 25 Aug 2025 19:59:38 -0300 Subject: [PATCH 2/4] LoopVersioning: Add option to hoist runtime checks --- llvm/include/llvm/Transforms/Utils/LoopVersioning.h | 4 +++- llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h index ea4fe27c90f5c..f6ccc8803491f 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h +++ b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h @@ -46,7 +46,8 @@ class LoopVersioning { /// object having no checks and we expect the user to add them. LoopVersioning(const LoopAccessInfo &LAI, ArrayRef Checks, Loop *L, LoopInfo *LI, - DominatorTree *DT, ScalarEvolution *SE); + DominatorTree *DT, ScalarEvolution *SE, + bool HoistRuntimeChecks = false); /// Performs the CFG manipulation part of versioning the loop including /// the DominatorTree and LoopInfo updates. @@ -147,6 +148,7 @@ class LoopVersioning { LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; + bool HoistRuntimeChecks; }; /// Expose LoopVersioning as a pass. 
Currently this is only used for diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1ab796b..684f58cc58d22 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -42,9 +42,10 @@ static cl::opt LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, ArrayRef Checks, Loop *L, LoopInfo *LI, DominatorTree *DT, - ScalarEvolution *SE) + ScalarEvolution *SE, bool HoistRuntimeChecks) : VersionedLoop(L), AliasChecks(Checks), Preds(LAI.getPSE().getPredicate()), - LAI(LAI), LI(LI), DT(DT), SE(SE) {} + LAI(LAI), LI(LI), DT(DT), SE(SE), HoistRuntimeChecks(HoistRuntimeChecks) { +} void LoopVersioning::versionLoop( const SmallVectorImpl &DefsUsedOutside) { @@ -63,8 +64,9 @@ void LoopVersioning::versionLoop( SCEVExpander Exp2(*RtPtrChecking.getSE(), VersionedLoop->getHeader()->getDataLayout(), "induction"); - MemRuntimeCheck = addRuntimeChecks(RuntimeCheckBB->getTerminator(), - VersionedLoop, AliasChecks, Exp2); + MemRuntimeCheck = + addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, + AliasChecks, Exp2, HoistRuntimeChecks); SCEVExpander Exp(*SE, RuntimeCheckBB->getDataLayout(), "scev.check"); From 763c7924178eab88b40590b51ab909336e2e47fc Mon Sep 17 00:00:00 2001 From: Felipe Magno de Almeida Date: Tue, 3 Jun 2025 22:37:05 -0300 Subject: [PATCH 3/4] [Vectorize] Add Strided Unroll Loop pass that versions two-dimensional loops Introduce two passes that recognize a common 2-D row/column loop idiom (e.g. image/matrix kernels) and produce: * a **fallback** version for LoopVectorize, and * a **strided** version using widened ops, VP strided loads/stores, and controlled unrolling on scalable-vector targets. * Outer canonical IV (rows) and inner canonical IV (cols), both starting at 0, step = 1, `!=` predicate. * Inner loop: **unit-stride** loads/stores of uniform element size. * Outer loop: base pointers advanced by a **regular (dynamic) stride** SCEV. * Single store in inner body drives the producer graph. * Target supports scalable vectors (`TTI::supportsScalableVectors()`). Function pass (`StridedLoopUnrollVersioningPass`): * Builds LAA with **AssumptionCache**; uses `LoopVersioning` with runtime pointer checks. * Adds guards: inner TC divisible by unroll, outer TC divisible by `vscale`, and alignment if required by target. * Unrolls inner loop (heuristic `8 / elemSize`), hoists invariant loads, eliminates duplicate loads. * Marks loops (alias scopes, `llvm.stride.loop_idiom`, etc.). Loop pass (`StridedLoopUnrollPass`): * On loops marked `llvm.stride.loop_idiom`, widens supported ops by `vscale`. * Lowers unit-stride memory to **`experimental.vp.strided_{load,store}`**, adjusts IV increments, and cleans up dead code. 
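As a concrete picture of the idiom, the kernel below is a hand-written C sketch of the loop shape these passes target; it mirrors the pixel_avg test added in the first patch of this series (the C source itself is illustrative only and is not part of this patch):

    #include <stdint.h>

    void pixel_avg(uint8_t *dst, int i_dst_stride,
                   const uint8_t *src1, int i_src1_stride,
                   const uint8_t *src2, int i_src2_stride,
                   int i_width, int i_height) {
      for (int y = 0; y < i_height; y++) {        // outer canonical IV over rows
        for (int x = 0; x < i_width; x++)         // inner canonical IV, unit-stride accesses
          dst[x] = (src1[x] + src2[x] + 1) >> 1;  // single store drives the producer graph
        dst += i_dst_stride;                      // base pointers advance by dynamic strides
        src1 += i_src1_stride;
        src2 += i_src2_stride;
      }
    }

The versioning pass emits a runtime-checked copy of such a nest next to the fallback; the loop pass then rewrites the selected copy with vscale-widened operations and strided VP loads/stores.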
--- .../Transforms/Vectorize/StridedLoopUnroll.h | 37 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 2 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 13 + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/StridedLoopUnroll.cpp | 1414 +++++++++++++++++ 6 files changed, 1468 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Vectorize/StridedLoopUnroll.h create mode 100644 llvm/lib/Transforms/Vectorize/StridedLoopUnroll.cpp diff --git a/llvm/include/llvm/Transforms/Vectorize/StridedLoopUnroll.h b/llvm/include/llvm/Transforms/Vectorize/StridedLoopUnroll.h new file mode 100644 index 0000000000000..9667851a8ed12 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/StridedLoopUnroll.h @@ -0,0 +1,37 @@ +//===----------StridedLoopUnroll.h -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPSTRIDEIDIOMVECTORIZE_H +#define LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPSTRIDEIDIOMVECTORIZE_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +class ScalarEvolution; +class LoopInfo; + +class StridedLoopUnrollPass : public PassInfoMixin { +public: + StridedLoopUnrollPass() = default; + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +class StridedLoopUnrollVersioningPass + : public PassInfoMixin { +public: + StridedLoopUnrollVersioningPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +} // namespace llvm +#endif // LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMVECTORIZE_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 587f0ece0859b..12d80bd5dfb48 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -378,6 +378,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" +#include "llvm/Transforms/Vectorize/StridedLoopUnroll.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" #include diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 299aaa801439b..adc0f6e5d880f 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -538,6 +538,7 @@ FUNCTION_PASS("sjlj-eh-prepare", SjLjEHPreparePass(TM)) FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass()) FUNCTION_PASS("slsr", StraightLineStrengthReducePass()) FUNCTION_PASS("stack-protector", StackProtectorPass(TM)) +FUNCTION_PASS("strided-loop-unroll-versioning", StridedLoopUnrollVersioningPass()) FUNCTION_PASS("strip-gc-relocates", StripGCRelocates()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) @@ -764,6 +765,7 @@ LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) LOOP_PASS("loop-deletion", LoopDeletionPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-idiom-vectorize", LoopIdiomVectorizePass()) +LOOP_PASS("strided-loop-unroll", StridedLoopUnrollPass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-predication", LoopPredicationPass()) 
LOOP_PASS("loop-reduce", LoopStrengthReducePass()) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 460bb33f2553a..6a8047ebd535c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -38,7 +38,9 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" +#include "llvm/Transforms/Vectorize/StridedLoopUnroll.h" #include using namespace llvm; @@ -655,6 +657,17 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (Level != OptimizationLevel::O0) LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); }); + + PB.registerScalarOptimizerLateEPCallback( + [=](FunctionPassManager &LPM, OptimizationLevel) { + LPM.addPass(StridedLoopUnrollVersioningPass()); + }); + PB.registerOptimizerLastEPCallback([=](ModulePassManager &MPM, + OptimizationLevel Level, + llvm::ThinOrFullLTOPhase) { + MPM.addPass(createModuleToFunctionPassAdaptor( + createFunctionToLoopPassAdaptor(StridedLoopUnrollPass()))); + }); } yaml::MachineFunctionInfo * diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 96670fe3ea195..07f0a10beeb25 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_component_library(LLVMVectorize EVLIndVarSimplify.cpp LoadStoreVectorizer.cpp LoopIdiomVectorize.cpp + StridedLoopUnroll.cpp LoopVectorizationLegality.cpp LoopVectorize.cpp SandboxVectorizer/DependencyGraph.cpp diff --git a/llvm/lib/Transforms/Vectorize/StridedLoopUnroll.cpp b/llvm/lib/Transforms/Vectorize/StridedLoopUnroll.cpp new file mode 100644 index 0000000000000..eb658b398d554 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/StridedLoopUnroll.cpp @@ -0,0 +1,1414 @@ +//===-------- StridedLoopUnroll.cpp - Loop idiom vectorization +//-----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation pass that identifies and optimizes +// a specific class of nested loops operating over 2D data (e.g., image or +// matrix). +// +// The pass looks for loops with the following characteristics: +// +// - An outer loop with canonical induction over rows (y-dimension). +// - An inner loop with canonical induction over columns (x-dimension). +// - Inner loop performs unit-stride loads/stores via pointer induction. +// - Outer loop increments the base pointers with constant strides. +// - Loops have predictable trip counts (starting at zero, step = 1). +// +// When such a structure is recognized, the pass performs loop versioning: +// +// 1. The first version is intended to be consumed by the default +// LoopVectorize pass. +// +// 2. The second version assumes regular strided memory access and is marked +// for further transformation (e.g., unrolling or custom vectorization). +// +// This enables aggressive optimization of memory-bound loop nests, particularly +// for architectures where strided memory patterns can be handled efficiently. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/StridedLoopUnroll.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "strided-loop-unroll" + +namespace { +struct LoadInfo { + LoadInst *Instr; + Value *Stride; +}; + +struct StoreInfo { + StoreInst *Instr; + Value *Stride; +}; + +class StridedLoopUnroll { + Loop *CurLoop = nullptr; + TargetTransformInfo *TTI; + const DataLayout *DL; + ScalarEvolution *SE; + LoopAccessInfoManager LAIs; + +public: + StridedLoopUnroll(DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, + const DataLayout *DL, ScalarEvolution *SE, + AliasAnalysis *AA, AssumptionCache *AC) + : TTI(TTI), DL(DL), SE(SE), LAIs(*SE, *AA, *DT, *LI, TTI, nullptr, AC) {} + + bool run(Loop *L); + +private: + /// \name Stride Loop Idiom Handling + /// @{ + + Value *widenVectorizedInstruction(Instruction *I, SmallVectorImpl &, + FixedVectorType *VecTy, unsigned int VF); + bool recognizeStridedSpecialCases(); + + void transformStridedSpecialCases(BasicBlock *Header, BasicBlock *Latch, + BasicBlock *Preheader, Loop *SubLoop, + SmallVectorImpl &Loads, + StoreInst *Store, + SmallVectorImpl &PostOrder, + SmallVectorImpl &PreOrder); + void changeInductionVarIncrement(Value *IncomingValue, unsigned VF); + std::optional getDynamicStrideFromMemOp(Value *Value, + Instruction *InsertionPt); + std::optional getStrideFromAddRecExpr(const SCEVAddRecExpr *AR, + Instruction *InsertionPt); + + /// @} +}; + +static cl::opt + SkipPass("strided-loop-unroll-disable", cl::init(false), cl::Hidden, + cl::desc("Skip running strided loop unroll optimization.")); + +class StridedLoopUnrollVersioning { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + TargetTransformInfo *TTI; + const DataLayout *DL; + ScalarEvolution *SE; + LoopAccessInfoManager LAIs; + AssumptionCache *AC; + OptimizationRemarkEmitter *ORE; + + const LoopAccessInfo *LAI = nullptr; + +public: + StridedLoopUnrollVersioning(DominatorTree *DT, LoopInfo *LI, + TargetTransformInfo *TTI, const DataLayout *DL, + ScalarEvolution *SE, AliasAnalysis *AA, + AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, Function *F) + : DT(DT), LI(LI), TTI(TTI), DL(DL), SE(SE), + LAIs(*SE, *AA, *DT, *LI, TTI, nullptr, AC), AC(AC), ORE(ORE) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + void setNoAliasToLoop(Loop *VerLoop); + bool recognizeStridedSpecialCases(); + void transformStridedSpecialCases( + PHINode *OuterInductionVar, PHINode *InnerInductionVar, StoreInst 
*Stores, + BasicBlock *PreheaderBB, BasicBlock *BodyBB, BasicBlock *HeaderBB, + BasicBlock *LatchBB, SmallVectorImpl &AlignmentInfo, + unsigned UnrollSize); + void eliminateRedundantLoads(BasicBlock *BB) { + // Map from load pointer to the first load instruction + DenseMap LoadMap; + SmallVector ToDelete; + + // First pass: collect all loads and find duplicates + for (Instruction &I : *BB) { + if (auto *LI = dyn_cast(&I)) { + Value *Ptr = LI->getPointerOperand(); + + // Check if we've seen a load from this address + auto It = LoadMap.find(Ptr); + if (It != LoadMap.end() && !LI->isVolatile()) { + // Found duplicate - check if they're compatible + LoadInst *FirstLoad = It->second; + if (FirstLoad->getType() == LI->getType() && + FirstLoad->getAlign() == LI->getAlign() && + !FirstLoad->isVolatile()) { + // Replace this load with the first one + LI->replaceAllUsesWith(FirstLoad); + ToDelete.push_back(LI); + } + } else { + // First load from this address + LoadMap[Ptr] = LI; + } + } + } + + // Delete redundant loads + for (LoadInst *LI : ToDelete) { + LI->eraseFromParent(); + } + } + + void hoistInvariantLoadsToPreheader(Loop *L); + /// @} +}; + +} // anonymous namespace + +PreservedAnalyses StridedLoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + const auto *DL = &L.getHeader()->getDataLayout(); + + StridedLoopUnroll LIV(&AR.DT, &AR.LI, &AR.TTI, DL, &AR.SE, &AR.AA, &AR.AC); + if (!LIV.run(&L)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +//===----------------------------------------------------------------------===// +// +// Implementation of StridedLoopUnroll +// +//===----------------------------------------------------------------------===// +bool StridedLoopUnroll::run(Loop *L) { + CurLoop = L; + + Function &F = *L->getHeader()->getParent(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" + << CurLoop->getHeader()->getName() << "\n"); + + if (recognizeStridedSpecialCases()) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Will transform: F[" << F.getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + return true; + } + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Will not transform: F[" << F.getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + return false; +} + +bool StridedLoopUnrollVersioning::run(Loop *L) { + CurLoop = L; + + if (!TTI->getVScaleForTuning()) + return false; + + Function &F = *L->getHeader()->getParent(); + if (F.hasOptSize()) + return false; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" + << CurLoop->getHeader()->getName() << "\n"); + + if (recognizeStridedSpecialCases()) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Will transform: F[" << F.getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + return true; + } + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Will not transform: F[" << F.getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + return false; +} + +/// \returns the number of elements for Ty. 
+static unsigned getNumElements(Type *Ty) { + assert(!isa(Ty) && + "ScalableVectorType is not supported."); + if (auto *VecTy = dyn_cast(Ty)) + return VecTy->getNumElements(); + return 1; +} + +/// \returns the vector type of ScalarTy based on vectorization factor. +static FixedVectorType *getGroupedWidenedType(Type *OriginalVecTy, unsigned VF, + const DataLayout &DL) { + auto ScalarTy = OriginalVecTy->getScalarType(); + auto GroupScalarTy = Type::getIntNTy( + ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy).getFixedValue() * + getNumElements(OriginalVecTy)); + return FixedVectorType::get(GroupScalarTy, VF); +} + +/// \returns the vector type of ScalarTy based on vectorization factor. +static FixedVectorType *getWidenedType(Type *VecTy, unsigned VF) { + return FixedVectorType::get(VecTy->getScalarType(), + VF * getNumElements(VecTy)); +} + +static void findUnconnectedToLoad(Instruction *start, + SmallPtrSetImpl &NotConnected, + SmallPtrSetImpl &Connected) { + SmallPtrSet outerVisited; + SmallVector Worklist; + + Worklist.push_back(start); + + while (!Worklist.empty()) { + SmallVector innerWorklist; + SmallPtrSet innerVisited; + bool connected = false; + Value *OuterVal = Worklist.back(); + Worklist.pop_back(); + + if (isa(OuterVal)) + continue; + + innerWorklist.push_back(OuterVal); + if (!outerVisited.insert(OuterVal).second) + continue; + + while (!innerWorklist.empty()) { + Value *val = innerWorklist.back(); + innerWorklist.pop_back(); + + // ignore phinodes + if (dyn_cast(val)) { + continue; + } + + // Only process instructions (skip constants, arguments, etc.) + auto *inst = dyn_cast(val); + if (!inst) { + continue; + } + + // Already innerVisited? + if (!innerVisited.insert(val).second) + continue; + + bool shouldBreak = isa(inst); + // If this is a load, do not proceed from here! 
+ connected = isa(inst) && + start->getParent()->getName() == inst->getParent()->getName(); + if (shouldBreak) + break; + + // Add operands to the worklist + for (auto &op : inst->operands()) { + if (auto I = dyn_cast(op.get())) { + if (I->getParent() == start->getParent()) + innerWorklist.push_back(op.get()); + } else + innerWorklist.push_back(op.get()); + } + } + if (!connected) + NotConnected.insert(OuterVal); + else + Connected.insert(OuterVal); + if (auto I = dyn_cast(OuterVal)) { + for (auto &op : I->operands()) { + Worklist.push_back(op.get()); + } + } + } +} + +void StridedLoopUnroll::changeInductionVarIncrement(Value *IncomingValue, + unsigned VF) { + if (auto I = dyn_cast(IncomingValue)) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::GetElementPtr: { + IRBuilder<> Builder(I); + I->setOperand(1, Builder.CreateMul( + I->getOperand(1), + ConstantInt::get(I->getOperand(1)->getType(), VF))); + break; + } + default: + llvm_unreachable("Can't change increment in this InductionVar"); + } + } +} + +Value *StridedLoopUnroll::widenVectorizedInstruction( + Instruction *I, SmallVectorImpl &Ops, FixedVectorType *VecTy, + unsigned int VF) { + IRBuilder<> Builder(I); + auto Opcode = I->getOpcode(); + switch (Opcode) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Value *V = Builder.CreateNAryOp(Opcode, Ops); + return V; + } + case Instruction::Select: { + Value *V = Builder.CreateSelect(Ops[0], Ops[1], Ops[2]); + return V; + } + case Instruction::ICmp: { + auto Cmp = dyn_cast(I); + Value *V = Builder.CreateICmp(Cmp->getPredicate(), Ops[0], Ops[1]); + return V; + } + case Instruction::ShuffleVector: { + auto SV = dyn_cast(I); + ArrayRef Mask = SV->getShuffleMask(); + std::vector RepeatedMask; + RepeatedMask.reserve(Mask.size() * VF); + + for (unsigned int i = 0; i < VF; ++i) { + llvm::append_range(RepeatedMask, Mask); + } + + ArrayRef NewMask(RepeatedMask); + Value *Shuffle = + Builder.CreateShuffleVector(Ops[0], Ops[1], NewMask, I->getName()); + return Shuffle; + } + case Instruction::InsertElement: { + Value *A = + Builder.CreateInsertElement(Ops[0], Ops[1], Ops[2], I->getName()); + return A; + } + case Instruction::Load: { + Value *L = Builder.CreateLoad(VecTy, Ops[0], I->getName()); + return L; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::FPExt: { + Value *V = Builder.CreateCast(static_cast(Opcode), + Ops[0], VecTy, ""); + return V; + } + default: + llvm_unreachable("Can't handle widening this Opcode"); + } +} +bool isInstructionDepends(Instruction *Dependant, Instruction *Target) { + SmallPtrSet Visited; + SmallVector Worklist; + + // Start from the terminator + Instruction *Terminator = Dependant->getParent()->getTerminator(); + Worklist.push_back(Terminator); + + while (!Worklist.empty()) { + Instruction *Current = Worklist.pop_back_val(); + + // Skip if already visited + if (!Visited.insert(Current).second) + continue; + + // Found our target + if (Current == Target) { + return true; + } + + // Add operands that are 
instructions in the same BB + for (Use &U : Current->operands()) { + if (Instruction *OpInst = dyn_cast(U.get())) { + if (OpInst->getParent() == Dependant->getParent() && + Visited.find(OpInst) == Visited.end()) { + Worklist.push_back(OpInst); + } + } + } + } + + return false; +} + +// InnerInductionVar will be transformed to static +void StridedLoopUnroll::transformStridedSpecialCases( + BasicBlock *Header, BasicBlock *Latch, BasicBlock *Preheader, Loop *SubLoop, + SmallVectorImpl &Loads, StoreInst *Store, + SmallVectorImpl &PostOrder, SmallVectorImpl &PreOrder) { + + // auto InnerPreheader = SubLoop->getLoopPreheader(); + + auto Stride = getDynamicStrideFromMemOp(Store->getPointerOperand(), + Preheader->getTerminator()); + + SmallPtrSet Connected; + SmallPtrSet NotConnected; + SmallDenseMap Replacements; + + auto StoredInstruction = dyn_cast(Store->getValueOperand()); + findUnconnectedToLoad(StoredInstruction, NotConnected, Connected); + + auto convertConstant = [&](auto val) { + auto constVal = cast(val); + unsigned numElements = + cast(val->getType())->getNumElements(); + SmallVector elements; + + // Extract original elements + for (unsigned i = 0; i < numElements; ++i) + elements.push_back(constVal->getAggregateElement(i)); + + auto originalElements = elements; + for (unsigned int copy = 0; copy != (*TTI->getVScaleForTuning()) - 1; + ++copy) + elements.append(originalElements); + Constant *newConst = ConstantVector::get(elements); + return newConst; + }; + + // Process in post-order (leafs to root) + for (Value *val : PostOrder) { + if (Connected.contains(val)) { + if (auto *I = dyn_cast(val)) { + SmallVector Operands(I->operands()); + for (auto op_it = Operands.begin(); op_it != Operands.end(); ++op_it) { + if (Replacements.contains(*op_it)) + *op_it = Replacements[*op_it]; + else if (auto OrigVecTy = + llvm::dyn_cast((*op_it)->getType())) { + if (auto Iop = dyn_cast(*op_it)) { + if (Iop->getParent() != Store->getParent()) { + assert(!Connected.contains(*op_it)); + + IRBuilder<> Builder(I); + + std::vector Consts; + for (unsigned int i = 0; i != *TTI->getVScaleForTuning(); i++) { + for (size_t j = 0; + j != OrigVecTy->getElementCount().getFixedValue(); j++) { + Consts.push_back(llvm::ConstantInt::get( + llvm::Type::getInt32Ty(Builder.getContext()), j)); + } + } + + llvm::Constant *maskConst = llvm::ConstantVector::get(Consts); + assert(maskConst != nullptr); + + llvm::Value *splat = Builder.CreateShuffleVector( + Iop, llvm::PoisonValue::get(Iop->getType()), maskConst); + assert(splat != nullptr); + Replacements.insert({*op_it, splat}); + *op_it = splat; + } + } else if (isa(*op_it)) { // not instruction + auto replacement = convertConstant(*op_it); + assert(!!replacement); + Replacements.insert({*op_it, replacement}); + *op_it = replacement; + } + } + } + + auto NewVecTy = + getWidenedType(I->getType(), *TTI->getVScaleForTuning()); + Value *NI = widenVectorizedInstruction(I, Operands, NewVecTy, + *TTI->getVScaleForTuning()); + + assert(NI != nullptr); + Replacements.insert({I, NI}); + } + } else if (NotConnected.contains(val)) { + if (val->getType()->isVectorTy() && isa(val)) { + auto replacement = convertConstant(val); + Replacements.insert({val, replacement}); + } + } else if (auto Load = dyn_cast(val)) { + auto It = + std::find_if(Loads.begin(), Loads.end(), + [Load](auto &&LoadInstr) { return LoadInstr == Load; }); + if (It != Loads.end()) { + auto Stride = getDynamicStrideFromMemOp((*It)->getPointerOperand(), + Preheader->getTerminator()); + + auto GroupedVecTy = 
getGroupedWidenedType( + Load->getType(), *TTI->getVScaleForTuning(), *DL); + auto VecTy = + getWidenedType(Load->getType(), *TTI->getVScaleForTuning()); + ElementCount NewElementCount = GroupedVecTy->getElementCount(); + + IRBuilder<> Builder(Load); + auto *NewInst = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, + {GroupedVecTy, Load->getPointerOperand()->getType(), + (*Stride)->getType()}, + {Load->getPointerOperand(), *Stride, + Builder.getAllOnesMask(NewElementCount), + Builder.getInt32(NewElementCount.getKnownMinValue())}); + auto Cast = Builder.CreateBitCast(NewInst, VecTy); + Replacements.insert({Load, Cast}); + } + } + } + + IRBuilder<> Builder(Store); + auto VecTy = getGroupedWidenedType(Store->getValueOperand()->getType(), + *TTI->getVScaleForTuning(), *DL); + ElementCount NewElementCount = VecTy->getElementCount(); + + assert(Replacements.find(Store->getValueOperand()) != Replacements.end()); + auto Cast = + Builder.CreateBitCast(Replacements[Store->getValueOperand()], VecTy); + + Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_store, + {VecTy, Store->getPointerOperand()->getType(), (*Stride)->getType()}, + {Cast, Store->getPointerOperand(), *Stride, + Builder.getAllOnesMask(NewElementCount), + Builder.getInt32(NewElementCount.getKnownMinValue())}); + + for (auto &&PN : CurLoop->getHeader()->phis()) { + InductionDescriptor IndDesc; + + if (InductionDescriptor::isInductionPHI(&PN, CurLoop, SE, IndDesc)) { + if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) + changeInductionVarIncrement( + PN.getIncomingValueForBlock(CurLoop->getLoopLatch()), + *TTI->getVScaleForTuning()); + else if (IndDesc.getKind() == InductionDescriptor::IK_IntInduction) + changeInductionVarIncrement(IndDesc.getInductionBinOp(), + *TTI->getVScaleForTuning()); + } + } + + if (Store->use_empty()) + Store->eraseFromParent(); + + for (auto OldOp : PreOrder) + if (OldOp->use_empty()) + if (auto I = dyn_cast(OldOp)) + I->eraseFromParent(); +} + +std::optional +StridedLoopUnroll::getStrideFromAddRecExpr(const SCEVAddRecExpr *AR, + Instruction *InsertionPt) { + auto Step = AR->getStepRecurrence(*SE); + if (isa(Step)) + return std::nullopt; + SCEVExpander Expander(*SE, *DL, "stride"); + Value *StrideValue = + Expander.expandCodeFor(Step, Step->getType(), InsertionPt); + return StrideValue; +} + +std::optional +StridedLoopUnroll::getDynamicStrideFromMemOp(Value *V, + Instruction *InsertionPt) { + const SCEV *S = SE->getSCEV(V); + if (const SCEVAddRecExpr *InnerLoopAR = dyn_cast(S)) { + if (auto *constant = + dyn_cast(InnerLoopAR->getStepRecurrence(*SE))) { + // We need to form 64-bit groups + if (constant->getAPInt() != 8) { + return std::nullopt; + } + + const auto *Add = dyn_cast(InnerLoopAR->getStart()); + if (Add) { + for (const SCEV *Op : Add->operands()) { + // Look for the outer recurrence: { %dst, +, sext(%i_dst_stride) } + // + const auto *AR = dyn_cast(Op); + if (!AR) + continue; + + return getStrideFromAddRecExpr(AR, InsertionPt); + } + } else if (const SCEVAddRecExpr *AR = + dyn_cast(InnerLoopAR->getStart())) { + return getStrideFromAddRecExpr(AR, InsertionPt); + } + } + } + return std::nullopt; +} + +bool StridedLoopUnroll::recognizeStridedSpecialCases() { + auto Stride = getOptionalIntLoopAttribute(CurLoop, "llvm.stride.loop_idiom"); + if (!Stride) + return false; + + auto SubLoops = CurLoop->getSubLoops(); + + if (SubLoops.size() > 2 || SubLoops.empty()) + return false; + + auto SubLoop = SubLoops.size() == 2 ? 
SubLoops[1] : SubLoops[0]; + + auto Preheader = SubLoop->getLoopPreheader(); + auto Header = SubLoop->getHeader(); + auto Latch = SubLoop->getLoopLatch(); + + if (Header != Latch) + return false; + + SmallVector Loads; + SmallVector Stores; + + llvm::SmallPtrSet NotVisited; + llvm::SmallVector WorkList; + + for (auto &&I : *Header) { + if (auto &&Store = dyn_cast(&I)) { + WorkList.push_back(Store); + } else { + NotVisited.insert(&I); + } + } + + if (WorkList.size() != 1) + return false; + + while (!WorkList.empty()) { + llvm::Instruction *I = WorkList.back(); + WorkList.pop_back(); + + if (auto *Load = dyn_cast(I)) { + NotVisited.erase(I); + Loads.push_back(Load); + } else if (auto *Store = dyn_cast(I)) { + if (auto *ValueInst = dyn_cast(Store->getValueOperand())) + WorkList.push_back(ValueInst); + NotVisited.erase(I); + Stores.push_back(Store); + } else { + // Add operand instructions to the worklist + for (llvm::Value *Op : I->operands()) { + if (auto *DepInst = llvm::dyn_cast(Op)) + if (DepInst->getParent() == Header) + WorkList.push_back(DepInst); + } + NotVisited.erase(I); + } + } + + if (Stores.size() != 1) + return false; + + SmallPtrSet Connected; + SmallPtrSet NotConnected; + + auto StoredInstruction = dyn_cast(Stores[0]->getValueOperand()); + findUnconnectedToLoad(StoredInstruction, NotConnected, Connected); + + llvm::SmallVector PostOrder; + llvm::SmallVector PreOrder; + llvm::SmallPtrSet Visited; + llvm::SmallPtrSet InStack; + SmallVector Worklist; + + Worklist.push_back(StoredInstruction); + + auto shouldVisit = [Header](auto *Val) { + return !isa(Val) && + (!isa(Val) || + dyn_cast(Val)->getParent() == Header); + }; + auto shouldVisitOperands = [](auto *Val) { + return !isa(Val) && !isa(Val); + }; + + while (!Worklist.empty()) { + Value *val = Worklist.back(); + assert(!isa(val) || + dyn_cast(val)->getParent() == Header || + dyn_cast(val)->getParent() == Preheader); + + if (InStack.contains(val)) { + // We've finished processing children, add to post-order + Worklist.pop_back(); + InStack.erase(val); + PostOrder.push_back(val); + } else if (!Visited.contains(val)) { + // First time seeing this node + Visited.insert(val); + InStack.insert(val); + PreOrder.push_back(val); + + // Add children to worklist + if (auto I = dyn_cast(val)) + if (shouldVisitOperands(I)) + for (auto &Op : I->operands()) + if (shouldVisit(Op.get()) && !Visited.contains(Op.get())) { + Worklist.push_back(Op.get()); + } + } else { + // Already visited, skip + Worklist.pop_back(); + } + } + + // Process in post-order (leafs to root) + for (Value *val : PostOrder) { + if (Connected.contains(val)) { + if (auto *I = dyn_cast(val)) { + SmallVector Operands(I->operands()); + for (auto op_it = Operands.begin(); op_it != Operands.end(); ++op_it) { + if (isa((*op_it)->getType())) { + if (!isa(*op_it) && !isa(*op_it)) { + return false; + } + } + } + } else { // We don't handle Non-instructions connected to Load + return false; + } + } else if (NotConnected.contains(val) && + (!val->getType()->isVectorTy() || !isa(val))) { + return false; + } else if (auto Load = dyn_cast(val)) { + if (std::find(Loads.begin(), Loads.end(), Load) == Loads.end()) + return false; + } + } + + transformStridedSpecialCases(Header, Latch, Preheader, SubLoop, Loads, + Stores[0], PostOrder, PreOrder); + + return true; +} + +namespace { + +bool canHandleInstruction(Instruction *I) { + auto Opcode = I->getOpcode(); + switch (Opcode) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case 
Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::ShuffleVector: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::ICmp: + return true; + default: + return false; + } +} + +} // anonymous namespace + +PreservedAnalyses +StridedLoopUnrollVersioningPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = false; + + if (SkipPass) + return PreservedAnalyses::all(); + + auto &LI = FAM.getResult(F); + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = FAM.getResult(F); + auto &DT = FAM.getResult(F); + auto &TTI = FAM.getResult(F); + auto &AA = FAM.getResult(F); + auto &AC = FAM.getResult(F); + auto &ORE = FAM.getResult(F); + + // Iterate over all loops in the function + std::vector Loops(LI.begin(), LI.end()); + for (Loop *L : Loops) { + // L may be deleted, so check it's still valid! + if (std::find(LI.begin(), LI.end(), L) == LI.end()) + continue; + + const auto *DL = &L->getHeader()->getDataLayout(); + + StridedLoopUnrollVersioning LIV(&DT, &LI, &TTI, DL, &SE, &AA, &AC, &ORE, + &F); + bool ThisChanged = LIV.run(L); + Changed |= ThisChanged; + } + + if (Changed) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +void StridedLoopUnrollVersioning::setNoAliasToLoop(Loop *VerLoop) { + // Get latch terminator instruction. + Instruction *I = VerLoop->getLoopLatch()->getTerminator(); + // Create alias scope domain. + MDBuilder MDB(I->getContext()); + MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LIVVDomain"); + StringRef Name = "LVAliasScope"; + MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name); + SmallVector Scopes{NewScope}, NoAliases{NewScope}; + // Iterate over each instruction of loop. + // set no-alias for all load & store instructions. + for (auto *Block : CurLoop->getBlocks()) { + for (auto &Inst : *Block) { + // Only interested in instruction that may modify or read memory. + if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory()) + continue; + // Set no-alias for current instruction. + Inst.setMetadata( + LLVMContext::MD_noalias, + MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias), + MDNode::get(Inst.getContext(), NoAliases))); + // set alias-scope for current instruction. 
+ Inst.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Inst.getContext(), Scopes))); + } + } +} + +void StridedLoopUnrollVersioning::hoistInvariantLoadsToPreheader(Loop *L) { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + // If no preheader, try the header + Preheader = L->getHeader(); + } + + // Find all invariant loads in the loop + SmallVector InvariantLoads; + + for (BasicBlock *BB : L->blocks()) { + for (Instruction &I : *BB) { + if (auto *LI = dyn_cast(&I)) { + Value *Ptr = LI->getPointerOperand(); + + if (L->isLoopInvariant(Ptr)) { + InvariantLoads.push_back(LI); + } + } + } + } + + // Move loads to preheader and eliminate duplicates + DenseMap HoistedLoads; + + for (LoadInst *LI : InvariantLoads) { + Value *Ptr = LI->getPointerOperand(); + + if (HoistedLoads.count(Ptr)) { + // Already hoisted this load, replace uses + LI->replaceAllUsesWith(HoistedLoads[Ptr]); + LI->eraseFromParent(); + } else { + // Move to preheader + LI->moveBefore(*Preheader, Preheader->getTerminator()->getIterator()); + HoistedLoads[Ptr] = LI; + } + } +} + +// InnerInductionVar will be transformed to static +void StridedLoopUnrollVersioning::transformStridedSpecialCases( + PHINode *OuterInductionVar, PHINode *InnerInductionVar, StoreInst *Store, + BasicBlock *PreheaderBB, BasicBlock *BodyBB, BasicBlock *HeaderBB, + BasicBlock *LatchBB, SmallVectorImpl &AlignmentInfo, + unsigned UnrollSize) { + + PredicatedScalarEvolution PSE(*SE, *CurLoop); + + auto VLAI = &LAIs.getInfo(*CurLoop); + LoopVersioning LVer2(*VLAI, VLAI->getRuntimePointerChecking()->getChecks(), + CurLoop, LI, DT, SE, true); + LVer2.versionLoop(); + +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif + + addStringMetadataToLoop(LVer2.getNonVersionedLoop(), + "llvm.mem.string_loop_idiom"); + setNoAliasToLoop(LVer2.getVersionedLoop()); + + auto VersionedLoop = LVer2.getVersionedLoop(); + auto NewInnerLoop = VersionedLoop->getSubLoops()[0]; + auto InnerLoopBounds = NewInnerLoop->getBounds(*SE); + auto OuterLoopBounds = VersionedLoop->getBounds(*SE); + + for (BasicBlock *BB : VersionedLoop->blocks()) { + BB->setName(BB->getName() + ".strided.vectorized"); + } + + UnrollLoopOptions ULO; + ULO.Count = UnrollSize; + ULO.Force = true; + ULO.Runtime = true; + ULO.AllowExpensiveTripCount = false; + ULO.UnrollRemainder = false; + ULO.SCEVExpansionBudget = -1; + + UnrollLoop(NewInnerLoop, ULO, LI, SE, DT, AC, TTI, ORE, false); + + hoistInvariantLoadsToPreheader(VersionedLoop); + + for (BasicBlock *BB : VersionedLoop->blocks()) { + eliminateRedundantLoads(BB); + } + + for (BasicBlock *BB : VersionedLoop->blocks()) { + DenseMap LoadCSE; + SmallVector DeadInsts; + + for (Instruction &I : *BB) { + if (auto *LI = dyn_cast(&I)) { + if (!LI->isVolatile()) { + Value *Ptr = LI->getPointerOperand(); + if (LoadCSE.count(Ptr)) { + // Reuse previous load + LI->replaceAllUsesWith(LoadCSE[Ptr]); + DeadInsts.push_back(LI); + } else { + LoadCSE[Ptr] = LI; + } + } + } + } + + for (auto *I : DeadInsts) { + I->eraseFromParent(); + } + } + + for (BasicBlock *BB : VersionedLoop->blocks()) { + eliminateRedundantLoads(BB); + } + + if (InnerLoopBounds) { + setNoAliasToLoop(VersionedLoop); + setNoAliasToLoop(VersionedLoop->getSubLoops()[0]); + addStringMetadataToLoop(VersionedLoop, "llvm.stride.loop_idiom"); + VersionedLoop->setLoopAlreadyUnrolled(); + + assert(std::distance(pred_begin(VersionedLoop->getLoopPreheader()), + 
pred_end(VersionedLoop->getLoopPreheader())) == 1); + for (auto *Pred : predecessors(VersionedLoop->getLoopPreheader())) { + BranchInst *PHBranch = cast(Pred->getTerminator()); + IRBuilder<> Builder(PHBranch); + + Value *innerZero = + Constant::getNullValue(InnerLoopBounds->getFinalIVValue().getType()); + Value *outerZero = + Constant::getNullValue(OuterLoopBounds->getFinalIVValue().getType()); + + Value *innerMask = Builder.getIntN( + InnerLoopBounds->getFinalIVValue().getType()->getIntegerBitWidth(), + UnrollSize - 1); + Value *innerAndResult = Builder.CreateAnd( + &InnerLoopBounds->getFinalIVValue(), innerMask, "inner_mod_unroll"); + Value *innerIsNotDivisible = + Builder.CreateICmpNE(innerAndResult, innerZero, "innerIsDivUnroll"); + + Value *const32 = Builder.getIntN( + InnerLoopBounds->getFinalIVValue().getType()->getIntegerBitWidth(), + 32); + Value *innerNotSmallerThan = Builder.CreateICmpUGE( + &InnerLoopBounds->getFinalIVValue(), const32, "inner_not_less_32"); + + auto o = TTI->getVScaleForTuning(); + assert(!!o); + + Value *mask = Builder.getIntN( + OuterLoopBounds->getFinalIVValue().getType()->getIntegerBitWidth(), + *o - 1); + Value *andResult = Builder.CreateAnd(&OuterLoopBounds->getFinalIVValue(), + mask, "div_unroll"); + Value *isNotDivisible = + Builder.CreateICmpNE(andResult, outerZero, "is_div_unroll"); + Value *Check1 = Builder.CreateOr(innerIsNotDivisible, isNotDivisible); + Value *Check2 = Builder.CreateOr(Check1, innerNotSmallerThan); + + Value *AlignmentCheck = Builder.getFalse(); + + for (auto &&PtrSCEV : AlignmentInfo) { + const unsigned Alignment = 8; + // Expand SCEV to get runtime value + SCEVExpander Expander(*SE, *DL, "align.check"); + Value *PtrValue = + Expander.expandCodeFor(PtrSCEV, Builder.getPtrTy(), PHBranch); + + Type *I64 = Type::getInt64Ty(PtrValue->getContext()); + bool AllowsMisaligned = TTI->isLegalStridedLoadStore( + VectorType::get(I64, ElementCount::getFixed(8)), Align(1)); + + if (!AllowsMisaligned) { + // Create alignment check: (ptr & (alignment-1)) == 0 + Value *PtrInt = + Builder.CreatePtrToInt(PtrValue, Builder.getInt64Ty()); + Value *Mask = Builder.getInt64(Alignment - 1); + Value *Masked = Builder.CreateAnd(PtrInt, Mask); + Value *IsAligned = Builder.CreateICmpNE(Masked, Builder.getInt64(0)); + + AlignmentCheck = Builder.CreateOr(AlignmentCheck, IsAligned); + } + } + Value *Check3 = Builder.CreateOr(Check2, AlignmentCheck); + + PHBranch->setCondition(Check3); + +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif + } + } +} + +bool StridedLoopUnrollVersioning::recognizeStridedSpecialCases() { + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value()) + return false; + + auto LoopBlocks = CurLoop->getBlocks(); + + auto SubLoops = CurLoop->getSubLoops(); + + if (SubLoops.size() != 1) + return false; + + auto InnerLoop = SubLoops[0]; + + auto OuterLoopBounds = CurLoop->getBounds(*SE); + auto InnerLoopBounds = InnerLoop->getBounds(*SE); + + if (!OuterLoopBounds || !InnerLoopBounds) + return false; + + // We want both loops to be straightforward loops + if (!OuterLoopBounds->getStepValue() || !InnerLoopBounds->getStepValue()) + return false; + + // We want for-loops that start in zero and end in a variable that + // is immutable inside the loop + if (!isa(&OuterLoopBounds->getInitialIVValue()) || + !isa(&InnerLoopBounds->getInitialIVValue()) || + isa(&OuterLoopBounds->getFinalIVValue()) || + isa(&InnerLoopBounds->getFinalIVValue()) || + 
!dyn_cast<ConstantInt>(&OuterLoopBounds->getInitialIVValue())
+           ->isZeroValue() ||
+      !dyn_cast<ConstantInt>(&InnerLoopBounds->getInitialIVValue())
+           ->isZeroValue())
+    return false;
+
+  // We want the loops to step by one, and the exit condition must compare
+  // against the specified final value.
+  if (!isa<ConstantInt>(OuterLoopBounds->getStepValue()) ||
+      !isa<ConstantInt>(InnerLoopBounds->getStepValue()) ||
+      !dyn_cast<ConstantInt>(OuterLoopBounds->getStepValue())->isOneValue() ||
+      !dyn_cast<ConstantInt>(InnerLoopBounds->getStepValue())->isOneValue() ||
+      OuterLoopBounds->getCanonicalPredicate() != ICmpInst::ICMP_NE ||
+      InnerLoopBounds->getCanonicalPredicate() != ICmpInst::ICMP_NE)
+    return false;
+
+  BasicBlock *OuterLoopHeader = CurLoop->getHeader();
+  BasicBlock *OuterLoopLatch = CurLoop->getLoopLatch();
+
+  // In StridedLoopUnrollVersioning::run we have already checked that the loop
+  // has a preheader, so we can assume it is in canonical form.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 3 ||
+      !OuterLoopHeader || !OuterLoopLatch)
+    return false;
+
+  // The remaining block, i.e. the one that is neither the outer header nor
+  // the outer latch, is the inner-loop body.
+  BasicBlock *ForLoop =
+      OuterLoopHeader != LoopBlocks[0] && OuterLoopLatch != LoopBlocks[0]
+          ? LoopBlocks[0]
+          : OuterLoopHeader != LoopBlocks[1] && OuterLoopLatch != LoopBlocks[1]
+                ? LoopBlocks[1]
+                : LoopBlocks[2];
+
+  // We must have two canonical induction variables.
+  auto OuterInductionVariable = CurLoop->getCanonicalInductionVariable();
+  auto InnerInductionVariable = InnerLoop->getCanonicalInductionVariable();
+
+  SmallVector<LoadInst *> Loads;
+  SmallVector<StoreInst *> Stores;
+
+  if (!OuterInductionVariable || !InnerInductionVariable)
+    return false;
+
+  for (auto &&PN : OuterLoopHeader->phis()) {
+    if (PN.getNumIncomingValues() != 2)
+      return false;
+
+    InductionDescriptor IndDesc;
+
+    // Check if PN is a simple induction PHI:
+    //  - For pointer IVs: require exactly one increment (feeds back into PN)
+    //    and one mem-op address (feeding a single load/store).
+    //  - For integer IVs: only accept the designated outer IV.
+    // Reject if the shape is more complex (multiple users, non-load/store
+    // ops).
+    if (InductionDescriptor::isInductionPHI(&PN, CurLoop, SE, IndDesc)) {
+      if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) {
+        Value *IncrementGEP = nullptr, *MemOpGEP = nullptr;
+        for (auto &&User : PN.uses()) {
+          if (std::distance(User.getUser()->use_begin(),
+                            User.getUser()->use_end()) != 1)
+            return false;
+          if (User.getUser()->use_begin()->getUser() == &PN)
+            IncrementGEP = User.getUser();
+          else if (!MemOpGEP)
+            MemOpGEP = User.getUser();
+          else
+            return false;
+        }
+
+        if (!MemOpGEP || !IncrementGEP)
+          return false;
+
+        auto MemOp = MemOpGEP->use_begin()->getUser();
+        if (!isa<LoadInst>(MemOp) && !isa<StoreInst>(MemOp))
+          return false;
+      } else if (IndDesc.getKind() == InductionDescriptor::IK_IntInduction)
+        if (&PN != OuterInductionVariable)
+          return false;
+    } else
+      return false;
+  }
+
+  llvm::SmallPtrSet<Instruction *, 16> NotVisited;
+  llvm::SmallVector<Instruction *> WorkList;
+
+  for (auto &&BB : CurLoop->getBlocks())
+    for (auto &&V : *BB)
+      if (BB != ForLoop)
+        if (!canHandleInstruction(&V))
+          return false;
+  for (auto &&Loop : CurLoop->getSubLoops())
+    for (auto &&BB : Loop->getBlocks())
+      for (auto &&V : *BB)
+        if (BB != ForLoop)
+          if (!canHandleInstruction(&V))
+            return false;
+
+  // Collect pointers needing alignment.
+  SmallVector<const SCEV *> AlignmentInfo;
+  unsigned UnrollSize = 8;
+
+  for (BasicBlock *BB : CurLoop->blocks()) {
+    for (Instruction &I : *BB) {
+      Value *Ptr = nullptr;
+      uint64_t size = 0;
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+        Ptr = LI->getPointerOperand();
+        TypeSize typeSize = DL->getTypeAllocSize(I.getType());
+        if (size == 0)
+          size = typeSize;
+        else if (size != typeSize)
+          return false;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+        Ptr = SI->getPointerOperand();
+        TypeSize typeSize =
+            DL->getTypeAllocSize(SI->getValueOperand()->getType());
+        if (size == 0)
+          size = typeSize;
+        else if (size != typeSize)
+          return false;
+        // Unroll so that one iteration covers 8 bytes of stored data.
+        UnrollSize = 8 / size;
+      } else
+        continue;
+
+      const SCEV *S = SE->getSCEV(Ptr);
+
+      if (const SCEVAddRecExpr *InnerLoopAR = dyn_cast<SCEVAddRecExpr>(S)) {
+        if (auto *constant =
+                dyn_cast<SCEVConstant>(InnerLoopAR->getStepRecurrence(*SE))) {
+          if (constant->getAPInt() != size)
+            return false; // must be contiguous
+
+          if (const SCEVAddRecExpr *AR =
+                  dyn_cast<SCEVAddRecExpr>(InnerLoopAR->getStart())) {
+            auto Step = AR->getStepRecurrence(*SE);
+            if (isa<SCEVConstant>(Step))
+              return false;
+            else {
+              const SCEVUnknown *Unknown = nullptr;
+
+              if (size > 1) {
+                if (auto mul = dyn_cast<SCEVMulExpr>(Step)) {
+                  if (mul->getNumOperands() == 2) {
+                    if (auto constant =
+                            dyn_cast<SCEVConstant>(mul->getOperand(0))) {
+                      if (constant->getAPInt() != size)
+                        return false;
+                    } else
+                      return false;
+                    Unknown = dyn_cast<SCEVUnknown>(mul->getOperand(1));
+                    if (auto CastExtend =
+                            dyn_cast<SCEVCastExpr>(mul->getOperand(1)))
+                      Unknown = dyn_cast<SCEVUnknown>(CastExtend->getOperand());
+                  } else
+                    return false;
+                } else
+                  return false;
+              }
+              if (!Unknown) {
+                Unknown = dyn_cast<SCEVUnknown>(Step);
+                if (auto CastExtend = dyn_cast<SCEVCastExpr>(Step))
+                  Unknown = dyn_cast<SCEVUnknown>(CastExtend->getOperand());
+              }
+              if (Unknown) { // stride should be fixed but not constant
+                if (isa<Constant>(Unknown->getValue()))
+                  return false;
+              } else
+                return false;
+            }
+
+            AlignmentInfo.push_back(AR->getStart());
+          } else
+            return false;
+        } else
+          return false;
+      } else if (!CurLoop->isLoopInvariant(Ptr))
+        return false;
+    }
+  }
+
+  // Initialize NotVisited and WorkList.
+  // Check that we can handle all instructions during the Strided Loop Unroll
+  // pass. We will ignore the exit condition and the increment of the
+  // induction variable.
+  for (auto &&I : *ForLoop) {
+    if (auto *Store = dyn_cast<StoreInst>(&I)) {
+      WorkList.push_back(Store);
+      Stores.push_back(Store);
+    } else if (&I != OuterInductionVariable && &I != InnerInductionVariable) {
+      if (I.getParent() != InnerLoop->getHeader() &&
+          &I != InnerLoop->getHeader()->getTerminator() &&
+          &I != dyn_cast<BranchInst>(InnerLoop->getHeader()->getTerminator())
+                    ->getCondition())
+        NotVisited.insert(&I);
+    }
+  }
+
+  if (WorkList.size() != 1 || Stores.size() != 1)
+    return false;
+
+  // Check dependencies between instructions so that the outer-loop
+  // arithmetic is self-contained.
+  while (!WorkList.empty()) {
+    llvm::Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    /* Should check for loops, possibly with NotVisited */
+    if (auto *Load = dyn_cast<LoadInst>(I)) {
+      // We stop at loads.
+      NotVisited.erase(I);
+
+      auto Pointer = Load->getPointerOperand();
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(Pointer)) {
+        NotVisited.erase(GEP);
+        Loads.push_back(Load);
+      } else
+        return false;
+    } else if (auto *Store = dyn_cast<StoreInst>(I)) {
+      if (auto *ValueInst = dyn_cast<Instruction>(Store->getValueOperand()))
+        WorkList.push_back(ValueInst);
+      NotVisited.erase(I);
+      auto Pointer = Store->getPointerOperand();
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(Pointer)) {
+        NotVisited.erase(GEP);
+      } else
+        return false;
+    } else {
+      // Add operand instructions to the worklist.
+      for (llvm::Value *Op : I->operands())
+        if (auto *DepInst = llvm::dyn_cast<Instruction>(Op))
+          WorkList.push_back(DepInst);
+      NotVisited.erase(I);
+    }
+  }
+
+  if (!NotVisited.empty())
+    return false;
+
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+  LAI = &LAIs.getInfo(*SubLoops[0]);
+
+  if (LAI->getRuntimePointerChecking()->getChecks().empty())
+    return false;
+
+  transformStridedSpecialCases(OuterInductionVariable, InnerInductionVariable,
+                               Stores[0], Preheader, ForLoop, OuterLoopHeader,
+                               OuterLoopLatch, AlignmentInfo, UnrollSize);
+
+  return true;
+}

From 33e47f7ae1e39c0f9bd7ab199231d6a055c4ef32 Mon Sep 17 00:00:00 2001
From: Felipe Magno de Almeida
Date: Tue, 26 Aug 2025 17:39:53 -0300
Subject: [PATCH 4/4] [Vectorize] Update Strided Loop Unroll test with optimization

Adds an initial test to show the difference in code generation and to serve
as a regression test for the Strided Loop Unroll passes.
---
 .../Transforms/StridedLoopUnroll/pixel_avg.ll | 103 +++++++++++++++---
 1 file changed, 88 insertions(+), 15 deletions(-)

diff --git a/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll b/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll
index 9b43d4836343f..043c431423c80 100644
--- a/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll
+++ b/llvm/test/Transforms/StridedLoopUnroll/pixel_avg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='function(slp-vectorizer,instcombine<>,simplifycfg<>)' -mcpu=spacemit-x60 -S %s | FileCheck %s
+; RUN: opt -passes='function(strided-loop-unroll-versioning,verify,slp-vectorizer,instcombine<>,simplifycfg<>,loop(strided-loop-unroll))' -mcpu=spacemit-x60 -S %s | FileCheck %s
 
 ; ModuleID = 'test-noopt.ll'
 source_filename = "test.c"
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
 target triple = "riscv64-unknown-linux-gnu"
@@ -19,18 +19,37 @@ define dso_local void @pixel_avg(ptr noundef writeonly captures(none) %dst, i32
 ; CHECK-NEXT: [[IDX_EXT1:%.*]] = sext i32 [[I_DST_STRIDE]] to i64
 ; CHECK-NEXT: [[IDX_EXT13:%.*]] = sext i32 [[I_SRC1_STRIDE]] to i64
 ; CHECK-NEXT: [[IDX_EXT15:%.*]] = sext i32 [[I_SRC2_STRIDE]] to i64
-; CHECK-NEXT: br i1 [[CMP227]], label %[[FOR_COND1_PREHEADER_US_PREHEADER:.*]], label %[[FOR_COND_CLEANUP]]
-; CHECK: [[FOR_COND1_PREHEADER_US_PREHEADER]]:
+; CHECK-NEXT: br i1 [[CMP227]], label %[[FOR_COND1_PREHEADER_US_LVER_CHECK:.*]], label %[[FOR_COND_CLEANUP]]
+; CHECK: 
[[FOR_COND1_PREHEADER_US_LVER_CHECK]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[I_WIDTH]] to i64 -; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US:.*]] -; CHECK: [[FOR_COND1_PREHEADER_US]]: -; CHECK-NEXT: [[Y_033_US_LVER_ORIG:%.*]] = phi i32 [ [[INC17_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; CHECK-NEXT: [[DST_ADDR_032_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[DST]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; CHECK-NEXT: [[SRC1_ADDR_031_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR13_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[SRC1]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; CHECK-NEXT: [[SRC2_ADDR_030_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR15_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[SRC2]], %[[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; CHECK-NEXT: br label %[[FOR_BODY4_US:.*]] -; CHECK: [[FOR_BODY4_US]]: -; CHECK-NEXT: [[INDVARS_IV_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT_LVER_ORIG:%.*]], %[[FOR_BODY4_US]] ] +; CHECK-NEXT: [[INNER_MOD_UNROLL:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7 +; CHECK-NEXT: [[INNERISDIVUNROLL:%.*]] = icmp ne i64 [[INNER_MOD_UNROLL]], 0 +; CHECK-NEXT: [[INNER_NOT_LESS_32:%.*]] = icmp ugt i32 [[I_WIDTH]], 31 +; CHECK-NEXT: [[DIV_UNROLL:%.*]] = and i32 [[I_HEIGHT]], 3 +; CHECK-NEXT: [[IS_DIV_UNROLL:%.*]] = icmp ne i32 [[DIV_UNROLL]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[INNERISDIVUNROLL]], [[IS_DIV_UNROLL]] +; CHECK-NEXT: [[TMP1:%.*]] = or i1 [[TMP0]], [[INNER_NOT_LESS_32]] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[SRC1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[SRC2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 7 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP1]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER:.*]], label %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER:.*]] +; CHECK: [[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED:.*]] +; CHECK: [[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_LVER_ORIG:.*]] +; CHECK: [[FOR_COND1_PREHEADER_US_LVER_ORIG]]: +; CHECK-NEXT: [[Y_033_US_LVER_ORIG:%.*]] = phi i32 [ [[INC17_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER]] ] +; CHECK-NEXT: [[DST_ADDR_032_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG]] ], [ [[DST]], %[[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER]] ] +; CHECK-NEXT: [[SRC1_ADDR_031_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR13_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG]] ], [ [[SRC1]], %[[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER]] ] +; CHECK-NEXT: [[SRC2_ADDR_030_US_LVER_ORIG:%.*]] = phi ptr [ [[ADD_PTR15_US_LVER_ORIG:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG]] ], [ [[SRC2]], %[[FOR_COND1_PREHEADER_US_LVER_ORIG_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY4_US_LVER_ORIG:.*]] +; CHECK: [[FOR_BODY4_US_LVER_ORIG]]: +; CHECK-NEXT: 
[[INDVARS_IV_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER_US_LVER_ORIG]] ], [ [[INDVARS_IV_NEXT_LVER_ORIG:%.*]], %[[FOR_BODY4_US_LVER_ORIG]] ] ; CHECK-NEXT: [[ARRAYIDX_US_LVER_ORIG:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC1_ADDR_031_US_LVER_ORIG]], i64 [[INDVARS_IV_LVER_ORIG]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_US_LVER_ORIG]], align 1 ; CHECK-NEXT: [[CONV_US_LVER_ORIG:%.*]] = zext i8 [[TMP10]] to i16 @@ -45,14 +64,61 @@ define dso_local void @pixel_avg(ptr noundef writeonly captures(none) %dst, i32 ; CHECK-NEXT: store i8 [[CONV9_US_LVER_ORIG]], ptr [[ARRAYIDX11_US_LVER_ORIG]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT_LVER_ORIG]] = add nuw nsw i64 [[INDVARS_IV_LVER_ORIG]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_LVER_ORIG]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_LVER_ORIG]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label %[[FOR_BODY4_US]] -; CHECK: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]: +; CHECK-NEXT: br i1 [[EXITCOND_NOT_LVER_ORIG]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG]], label %[[FOR_BODY4_US_LVER_ORIG]] +; CHECK: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LVER_ORIG]]: ; CHECK-NEXT: [[ADD_PTR_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[DST_ADDR_032_US_LVER_ORIG]], i64 [[IDX_EXT1]] ; CHECK-NEXT: [[ADD_PTR13_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[SRC1_ADDR_031_US_LVER_ORIG]], i64 [[IDX_EXT13]] ; CHECK-NEXT: [[ADD_PTR15_US_LVER_ORIG]] = getelementptr inbounds i8, ptr [[SRC2_ADDR_030_US_LVER_ORIG]], i64 [[IDX_EXT15]] ; CHECK-NEXT: [[INC17_US_LVER_ORIG]] = add nuw nsw i32 [[Y_033_US_LVER_ORIG]], 1 ; CHECK-NEXT: [[EXITCOND35_NOT_LVER_ORIG:%.*]] = icmp eq i32 [[INC17_US_LVER_ORIG]], [[I_HEIGHT]] -; CHECK-NEXT: br i1 [[EXITCOND35_NOT_LVER_ORIG]], label %[[FOR_COND_CLEANUP]], label %[[FOR_COND1_PREHEADER_US]] +; CHECK-NEXT: br i1 [[EXITCOND35_NOT_LVER_ORIG]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[FOR_COND1_PREHEADER_US_LVER_ORIG]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED]]: +; CHECK-NEXT: [[Y_033_US:%.*]] = phi i32 [ [[INC17_US:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[DST_ADDR_032_US:%.*]] = phi ptr [ [[ADD_PTR_US:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]] ], [ [[DST]], %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[SRC1_ADDR_031_US:%.*]] = phi ptr [ [[ADD_PTR13_US:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]] ], [ [[SRC1]], %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[SRC2_ADDR_030_US:%.*]] = phi ptr [ [[ADD_PTR15_US:%.*]], %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]] ], [ [[SRC2]], %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult i32 [[I_WIDTH]], 8 +; CHECK-NEXT: br i1 [[TMP12]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]], label %[[FOR_BODY4_US_STRIDED_VECTORIZED_PREHEADER:.*]] +; CHECK: [[FOR_BODY4_US_STRIDED_VECTORIZED_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY4_US_STRIDED_VECTORIZED:.*]] +; CHECK: [[FOR_BODY4_US_STRIDED_VECTORIZED]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY4_US_STRIDED_VECTORIZED]] ], [ 0, %[[FOR_BODY4_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ [[NITER_NEXT_7:%.*]], 
%[[FOR_BODY4_US_STRIDED_VECTORIZED]] ], [ 0, %[[FOR_BODY4_US_STRIDED_VECTORIZED_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC1_ADDR_031_US]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC2_ADDR_030_US]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX11_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_032_US]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr [[ARRAYIDX_US]], i64 [[IDX_EXT13]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = zext <32 x i8> [[TMP14]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr [[ARRAYIDX6_US]], i64 [[IDX_EXT15]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i64> [[TMP16]] to <32 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = zext <32 x i8> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = add <32 x i16> [[TMP15]], splat (i16 1) +; CHECK-NEXT: [[TMP20:%.*]] = add <32 x i16> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = lshr <32 x i16> [[TMP20]], splat (i16 1) +; CHECK-NEXT: [[TMP22:%.*]] = trunc <32 x i16> [[TMP21]] to <32 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <32 x i8> [[TMP22]] to <4 x i64> +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP23]], ptr [[ARRAYIDX11_US]], i64 [[IDX_EXT1]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 +; CHECK-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8 +; CHECK-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED_LOOPEXIT:.*]], label %[[FOR_BODY4_US_STRIDED_VECTORIZED]] +; CHECK: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]] +; CHECK: [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_STRIDED_VECTORIZED]]: +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[IDX_EXT1]], 4 +; CHECK-NEXT: [[ADD_PTR_US]] = getelementptr inbounds i8, ptr [[DST_ADDR_032_US]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[IDX_EXT13]], 4 +; CHECK-NEXT: [[ADD_PTR13_US]] = getelementptr inbounds i8, ptr [[SRC1_ADDR_031_US]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[IDX_EXT15]], 4 +; CHECK-NEXT: [[ADD_PTR15_US]] = getelementptr inbounds i8, ptr [[SRC2_ADDR_030_US]], i64 [[TMP26]] +; CHECK-NEXT: [[INC17_US]] = add nuw nsw i32 [[Y_033_US]], 4 +; CHECK-NEXT: [[EXITCOND35_NOT:%.*]] = icmp eq i32 [[INC17_US]], [[I_HEIGHT]] +; CHECK-NEXT: br i1 [[EXITCOND35_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT2:.*]], label %[[FOR_COND1_PREHEADER_US_STRIDED_VECTORIZED]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT2]]: +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; @@ -110,3 +176,10 @@ for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond. for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond1.preheader.lr.ph, %entry ret void } +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.mem.string_loop_idiom", i32 0} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[META3]] = !{!"llvm.stride.loop_idiom", i32 0} +; CHECK: [[META4]] = !{!"llvm.loop.unroll.disable"} +;.
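
For reference, the kernel that pixel_avg.ll encodes is a rounding byte average
over two planes with independent row strides. The C sketch below is
reconstructed from the IR in the test for illustration only; the function and
parameter names follow the IR, but the original test.c is not part of the
patch, so treat it as an assumption rather than the actual source.

#include <stdint.h>

/* Rounding average of two byte planes with per-plane row strides.
   Reconstructed from pixel_avg.ll; assumed, not the original test.c. */
void pixel_avg(uint8_t *dst, int i_dst_stride,
               const uint8_t *src1, int i_src1_stride,
               const uint8_t *src2, int i_src2_stride,
               int i_width, int i_height) {
  for (int y = 0; y < i_height; y++) {
    for (int x = 0; x < i_width; x++)
      dst[x] = (src1[x] + src2[x] + 1) >> 1; /* (a + b + 1) / 2 */
    dst += i_dst_stride;
    src1 += i_src1_stride;
    src2 += i_src2_stride;
  }
}

In the CHECK lines above, the versioned vector path is taken only when the
runtime checks pass: i_width is a multiple of 8 and below 32, i_height is a
multiple of 4, and dst, src1 and src2 are 8-byte aligned; otherwise control
falls back to the original scalar loop nest (the *.lver.orig blocks).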