From f8a80b12ddaea0a47577edc163f68f9f7dc94b3e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 12 Nov 2025 14:57:18 +0800 Subject: [PATCH 1/5] [VPlan] Handle ExplicitVectorLength in opcodeMayReadOrWriteFromMemory This has no effect for now from what I can tell but is needed if we ever want to extend narrowInterleaveGroups to handle EVL tail folded loops. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 80cd112dbcd8..488470d24796 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1259,6 +1259,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: + case VPInstruction::ExplicitVectorLength: case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 80cd112dbcd8a..488470d247968 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1259,6 +1259,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: + case VPInstruction::ExplicitVectorLength: case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: From 6bb2fe095f6695b08eb6216cc6219ad4cd5aa8ff Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 12 Nov 2025 20:57:13 +0800 Subject: [PATCH 2/5] Simplify known EVLs --- .../Transforms/Vectorize/VPlanTransforms.cpp | 30 +++++++++++++++++++ .../LoopVectorize/RISCV/low-trip-count.ll | 28 
++++++++--------- ...ctor-loop-backedge-elimination-with-evl.ll | 3 +- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 48bd697397f41..e540498acb8b8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1816,6 +1816,35 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, return true; } +/// From the definition of llvm.experimental.get.vector.length, +/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF. +static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, + PredicatedScalarEvolution &PSE) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + VPValue *AVL; + if (!match(&R, m_EVL(m_VPValue(AVL)))) + continue; + + const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, *PSE.getSE()); + if (isa<SCEVCouldNotCompute>(AVLSCEV)) + continue; + const SCEV *VFSCEV = PSE.getSE()->getElementCount(AVLSCEV->getType(), VF); + if (!PSE.getSE()->isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV)) + continue; + + VPBuilder Builder(&R); + VPValue *AVLZExt = Builder.createScalarZExtOrTrunc( + AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(), + R.getDebugLoc()); + R.getVPSingleValue()->replaceAllUsesWith(AVLZExt); + return true; + } + } + return false; +} + void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { @@ -1825,6 +1854,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); + MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE); if (MadeChange) { 
Plan.setVF(BestVF); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 8ef53cade01ac..83e044e2b4a78 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -125,12 +125,11 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true) -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 4 x i1> splat (i1 true), i32 8) ; CHECK-NEXT: [[TMP6:%.*]] = shl <vscale x 4 x i8> [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[DST:%.*]], <vscale x 4 x i1> splat (i1 true), i32 8) ; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i8> [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP7]], ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP7]], ptr align 1 [[DST]], <vscale x 4 x i1> splat (i1 true), i32 8) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -164,12 +163,11 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 16, i32 8, i1 true) -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP1:%.*]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> 
@llvm.vp.load.nxv8i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 8 x i1> splat (i1 true), i32 16) ; CHECK-NEXT: [[TMP6:%.*]] = shl <vscale x 8 x i8> [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP4:%.*]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[DST:%.*]], <vscale x 8 x i1> splat (i1 true), i32 16) ; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 8 x i8> [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr align 1 [[TMP4]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr align 1 [[DST]], <vscale x 8 x i1> splat (i1 true), i32 16) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -204,12 +202,11 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 32, i32 16, i1 true) -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP1:%.*]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 16 x i1> splat (i1 true), i32 32) ; CHECK-NEXT: [[TMP6:%.*]] = shl <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP4:%.*]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[DST:%.*]], <vscale x 16 x i1> splat (i1 true), i32 32) ; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 16 x i8> [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP11]], ptr align 1 [[TMP4]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP11]], ptr align 1 [[DST]], <vscale x 16 x i1> splat (i1 true), i32 32) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ 
-243,12 +240,11 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 24, i32 16, i1 true) -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[SRC:%.*]], <vscale x 16 x i1> splat (i1 true), i32 24) ; CHECK-NEXT: [[TMP6:%.*]] = shl <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[DST:%.*]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[DST:%.*]], <vscale x 16 x i1> splat (i1 true), i32 24) ; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 16 x i8> [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr align 1 [[DST]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP7]], ptr align 1 [[DST]], <vscale x 16 x i1> splat (i1 true), i32 24) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll index 1676461863583..f1dda3d5a2f91 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll @@ -10,8 +10,7 @@ define void @foo(ptr %arg) #0 { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 3, i32 2, i1 true) -; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[ARG]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]]) 
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[ARG]], <vscale x 2 x i1> splat (i1 true), i32 3) ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] From a4aa19e31f28421f51d9ee4c0765125c53c94eef Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 12 Nov 2025 21:05:17 +0800 Subject: [PATCH 3/5] Rename ZExt -> Trunc to reflect that we're casting from i64 -> i32 --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e540498acb8b8..8cdc859267cf3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1835,10 +1835,10 @@ static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, continue; VPBuilder Builder(&R); - VPValue *AVLZExt = Builder.createScalarZExtOrTrunc( + VPValue *Trunc = Builder.createScalarZExtOrTrunc( AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(), R.getDebugLoc()); - R.getVPSingleValue()->replaceAllUsesWith(AVLZExt); + R.getVPSingleValue()->replaceAllUsesWith(Trunc); return true; } } From 027c4623ee09c54005d9647234d145e86f9afeb9 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 13 Nov 2025 00:17:38 +0800 Subject: [PATCH 4/5] Store SE in var --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8cdc859267cf3..2a9c9b5495bcc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1827,11 +1827,12 @@ static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, if (!match(&R, m_EVL(m_VPValue(AVL)))) continue; - const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, *PSE.getSE()); + 
ScalarEvolution &SE = *PSE.getSE(); + const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE); if (isa<SCEVCouldNotCompute>(AVLSCEV)) continue; - const SCEV *VFSCEV = PSE.getSE()->getElementCount(AVLSCEV->getType(), VF); - if (!PSE.getSE()->isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV)) + const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF); + if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV)) continue; VPBuilder Builder(&R); From fa1844c86aade7dd7a81a33b47e36e26e3386242 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 13 Nov 2025 00:19:17 +0800 Subject: [PATCH 5/5] Remove single use builder var --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2a9c9b5495bcc..9ea74d58a67e4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1835,8 +1835,7 @@ static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV)) continue; - VPBuilder Builder(&R); - VPValue *Trunc = Builder.createScalarZExtOrTrunc( + VPValue *Trunc = VPBuilder(&R).createScalarZExtOrTrunc( AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(), R.getDebugLoc()); R.getVPSingleValue()->replaceAllUsesWith(Trunc);