Skip to content

Conversation

@Mel-Chen
Copy link
Contributor

@Mel-Chen Mel-Chen commented Nov 5, 2025

This patch narrows uniform VPWidenCastRecipe instances into scalar cast recipes. Since a cast operation involves a type conversion, directly cloning such a recipe would produce an incorrect destination type. Therefore, we emit scalar cast recipes instead of using VPReplicateRecipe.

@llvmbot
Copy link
Member

llvmbot commented Nov 5, 2025

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-backend-risc-v

Author: Mel Chen (Mel-Chen)

Changes

This patch narrows uniform VPWidenCastRecipe instances into scalar cast recipes. Since a cast operation involves a type conversion, directly cloning such a recipe would produce an incorrect destination type. Therefore, we emit scalar cast recipes instead of using VPReplicateRecipe.

Based on #143552


Patch is 30.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166514.diff

8 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+11-1)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+4)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll (+5-5)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll (+124-37)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll (+14-43)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+3-3)
  • (modified) llvm/test/Transforms/LoopVectorize/cse-casts.ll (+1-4)
  • (modified) llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll (+3-5)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14530539..2740fcd1a21fc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1386,7 +1386,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
+               VPReplicateRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
@@ -1422,6 +1423,15 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
           }))
         continue;
 
+      if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
+        VPBuilder Builder(CastR);
+        auto *Clone = Builder.createScalarCast(
+            CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
+            CastR->getDebugLoc());
+        CastR->replaceAllUsesWith(Clone);
+        CastR->eraseFromParent();
+        continue;
+      }
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
                                           RepOrWidenR->operands(),
                                           true /*IsSingleScalar*/);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d30ab2e2..6dd4e5ac697c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -171,6 +171,10 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
     return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
+  if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+    return PreservesUniformity(CastR->getOpcode()) &&
+           all_of(CastR->operands(), isSingleScalar);
+  }
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b430efc9e5283..a5f7764898055 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -492,18 +492,18 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
 ; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
@@ -520,7 +520,7 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT:    [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
 ; CHECK-NEXT:    [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
 ; CHECK-NEXT:    [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[ADD_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index 1dcd665817196..4642144f8d6d4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -66,63 +66,150 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
 ; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE46:.*]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i8 [[TMP25]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[VEC_IND]] to <16 x i64>
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
-; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
+; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0
 ; CHECK-NEXT:    [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
-; CHECK-NEXT:    [[TMP105:%.*]] = or i64 [[TMP104]], 1
-; CHECK-NEXT:    store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
 ; CHECK:       [[PRED_STORE_IF17]]:
-; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
+; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1
 ; CHECK-NEXT:    [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
-; CHECK-NEXT:    [[TMP111:%.*]] = or i64 [[TMP110]], 1
-; CHECK-NEXT:    store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
 ; CHECK:       [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; CHECK-NEXT:    br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
 ; CHECK:       [[PRED_STORE_IF19]]:
-; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
+; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2
 ; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
-; CHECK-NEXT:    [[TMP117:%.*]] = or i64 [[TMP116]], 1
-; CHECK-NEXT:    store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
 ; CHECK:       [[PRED_STORE_CONTINUE20]]:
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT:    br i1 [[TMP37]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
 ; CHECK:       [[PRED_STORE_IF21]]:
-; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3
 ; CHECK-NEXT:    [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
-; CHECK-NEXT:    [[TMP123:%.*]] = or i64 [[TMP122]], 1
-; CHECK-NEXT:    store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
 ; CHECK:       [[PRED_STORE_CONTINUE22]]:
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
+; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; CHECK:       [[PRED_STORE_IF23]]:
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP41]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP42]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; CHECK:       [[PRED_STORE_CONTINUE24]]:
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
+; CHECK-NEXT:    br i1 [[TMP43]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; CHECK:       [[PRED_STORE_IF25]]:
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP44]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP45]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; CHECK:       [[PRED_STORE_CONTINUE26]]:
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
+; CHECK-NEXT:    br i1 [[TMP46]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; CHECK:       [[PRED_STORE_IF27]]:
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP76]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP77]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; CHECK:       [[PRED_STORE_CONTINUE28]]:
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
+; CHECK-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; CHECK:       [[PRED_STORE_IF29]]:
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP50]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP51]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; CHECK:       [[PRED_STORE_CONTINUE30]]:
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
+; CHECK-NEXT:    br i1 [[TMP52]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
+; CHECK:       [[PRED_STORE_IF31]]:
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP53]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP54]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE32]]
+; CHECK:       [[PRED_STORE_CONTINUE32]]:
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 9
+; CHECK-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
+; CHECK:       [[PRED_STORE_IF33]]:
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP56]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP57]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE34]]
+; CHECK:       [[PRED_STORE_CONTINUE34]]:
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 10
+; CHECK-NEXT:    br i1 [[TMP58]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]]
+; CHECK:       [[PRED_STORE_IF35]]:
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP59]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP60]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
+; CHECK:       [[PRED_STORE_CONTINUE36]]:
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 11
+; CHECK-NEXT:    br i1 [[TMP61]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]]
+; CHECK:       [[PRED_STORE_IF37]]:
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP62]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP63]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE38]]
+; CHECK:       [[PRED_STORE_CONTINUE38]]:
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 12
+; CHECK-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]]
+; CHECK:       [[PRED_STORE_IF39]]:
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP65]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP66]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE40]]
+; CHECK:       [[PRED_STORE_CONTINUE40]]:
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 13
+; CHECK-NEXT:    br i1 [[TMP67]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]]
+; CHECK:       [[PRED_STORE_IF41]]:
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP68]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP69]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE42]]
+; CHECK:       [[PRED_STORE_CONTINUE42]]:
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 14
+; CHECK-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]]
+; CHECK:       [[PRED_STORE_IF43]]:
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP71]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP72]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE44]]
+; CHECK:       [[PRED_STORE_CONTINUE44]]:
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 15
+; CHECK-NEXT:    br i1 [[TMP73]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46]]
+; CHECK:       [[PRED_STORE_IF45]]:
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP74]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP75]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE46]]
+; CHECK:       [[PRED_STORE_CONTINUE46]]:
 ; CHECK-NEXT:    store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = xor i1 [[TMP47]], true
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 64)
 ; CHECK-NEXT:    br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/L...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Nov 5, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes

This patch narrows uniform VPWidenCastRecipe instances into scalar cast recipes. Since a cast operation involves a type conversion, directly cloning such a recipe would produce an incorrect destination type. Therefore, we emit scalar cast recipes instead of using VPReplicateRecipe.

Based on #143552


Patch is 30.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166514.diff

8 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+11-1)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+4)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll (+5-5)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll (+124-37)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll (+14-43)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/cost-model.ll (+3-3)
  • (modified) llvm/test/Transforms/LoopVectorize/cse-casts.ll (+1-4)
  • (modified) llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll (+3-5)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14530539..2740fcd1a21fc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1386,7 +1386,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
+               VPReplicateRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
@@ -1422,6 +1423,15 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
           }))
         continue;
 
+      if (auto *CastR = dyn_cast<VPWidenCastRecipe>(RepOrWidenR)) {
+        VPBuilder Builder(CastR);
+        auto *Clone = Builder.createScalarCast(
+            CastR->getOpcode(), CastR->getOperand(0), CastR->getResultType(),
+            CastR->getDebugLoc());
+        CastR->replaceAllUsesWith(Clone);
+        CastR->eraseFromParent();
+        continue;
+      }
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
                                           RepOrWidenR->operands(),
                                           true /*IsSingleScalar*/);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d30ab2e2..6dd4e5ac697c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -171,6 +171,10 @@ bool vputils::isSingleScalar(const VPValue *VPV) {
     return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isSingleScalar);
   }
+  if (auto *CastR = dyn_cast<VPWidenCastRecipe>(VPV)) {
+    return PreservesUniformity(CastR->getOpcode()) &&
+           all_of(CastR->operands(), isSingleScalar);
+  }
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
index b430efc9e5283..a5f7764898055 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
@@ -492,18 +492,18 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
 ; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
@@ -520,7 +520,7 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT:    [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
 ; CHECK-NEXT:    [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
 ; CHECK-NEXT:    [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[ADD_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
index 1dcd665817196..4642144f8d6d4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -66,63 +66,150 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
 ; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
-; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64>
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE46:.*]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE46]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i8 [[TMP25]] to i64
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[VEC_IND]] to <16 x i64>
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
-; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0
+; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0
 ; CHECK-NEXT:    [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0
-; CHECK-NEXT:    [[TMP105:%.*]] = or i64 [[TMP104]], 1
-; CHECK-NEXT:    store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP103]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
 ; CHECK:       [[PRED_STORE_IF17]]:
-; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1
+; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1
 ; CHECK-NEXT:    [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1
-; CHECK-NEXT:    [[TMP111:%.*]] = or i64 [[TMP110]], 1
-; CHECK-NEXT:    store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP109]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
 ; CHECK:       [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; CHECK-NEXT:    br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
 ; CHECK:       [[PRED_STORE_IF19]]:
-; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2
+; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2
 ; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2
-; CHECK-NEXT:    [[TMP117:%.*]] = or i64 [[TMP116]], 1
-; CHECK-NEXT:    store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP115]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
 ; CHECK:       [[PRED_STORE_CONTINUE20]]:
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT:    br i1 [[TMP37]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
 ; CHECK:       [[PRED_STORE_IF21]]:
-; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3
 ; CHECK-NEXT:    [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]]
-; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3
-; CHECK-NEXT:    [[TMP123:%.*]] = or i64 [[TMP122]], 1
-; CHECK-NEXT:    store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP121]], align 8, !alias.scope [[META3]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
 ; CHECK:       [[PRED_STORE_CONTINUE22]]:
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
+; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; CHECK:       [[PRED_STORE_IF23]]:
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP41]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP42]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; CHECK:       [[PRED_STORE_CONTINUE24]]:
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
+; CHECK-NEXT:    br i1 [[TMP43]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; CHECK:       [[PRED_STORE_IF25]]:
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP44]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP45]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; CHECK:       [[PRED_STORE_CONTINUE26]]:
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
+; CHECK-NEXT:    br i1 [[TMP46]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; CHECK:       [[PRED_STORE_IF27]]:
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP76]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP77]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; CHECK:       [[PRED_STORE_CONTINUE28]]:
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
+; CHECK-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; CHECK:       [[PRED_STORE_IF29]]:
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP50]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP51]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; CHECK:       [[PRED_STORE_CONTINUE30]]:
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
+; CHECK-NEXT:    br i1 [[TMP52]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
+; CHECK:       [[PRED_STORE_IF31]]:
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP53]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP54]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE32]]
+; CHECK:       [[PRED_STORE_CONTINUE32]]:
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 9
+; CHECK-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
+; CHECK:       [[PRED_STORE_IF33]]:
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP56]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP57]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE34]]
+; CHECK:       [[PRED_STORE_CONTINUE34]]:
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 10
+; CHECK-NEXT:    br i1 [[TMP58]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]]
+; CHECK:       [[PRED_STORE_IF35]]:
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP59]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP60]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
+; CHECK:       [[PRED_STORE_CONTINUE36]]:
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 11
+; CHECK-NEXT:    br i1 [[TMP61]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]]
+; CHECK:       [[PRED_STORE_IF37]]:
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11
+; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP62]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP63]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE38]]
+; CHECK:       [[PRED_STORE_CONTINUE38]]:
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 12
+; CHECK-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]]
+; CHECK:       [[PRED_STORE_IF39]]:
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12
+; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP65]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP66]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE40]]
+; CHECK:       [[PRED_STORE_CONTINUE40]]:
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 13
+; CHECK-NEXT:    br i1 [[TMP67]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]]
+; CHECK:       [[PRED_STORE_IF41]]:
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP68]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP69]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE42]]
+; CHECK:       [[PRED_STORE_CONTINUE42]]:
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 14
+; CHECK-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]]
+; CHECK:       [[PRED_STORE_IF43]]:
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP71]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP72]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE44]]
+; CHECK:       [[PRED_STORE_CONTINUE44]]:
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 15
+; CHECK-NEXT:    br i1 [[TMP73]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46]]
+; CHECK:       [[PRED_STORE_IF45]]:
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15
+; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP74]], i64 [[OFF]]
+; CHECK-NEXT:    store i64 [[TMP30]], ptr [[TMP75]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE46]]
+; CHECK:       [[PRED_STORE_CONTINUE46]]:
 ; CHECK-NEXT:    store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = xor i1 [[TMP47]], true
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 64)
 ; CHECK-NEXT:    br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/L...
[truncated]

Copy link
Contributor

@lukel97 lukel97 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have thought that a VPWidenCastRecipe could have been replicated with its original underlying instruction, but I see now that it's sometimes used for synthetic instructions with no underlying value. We should probably finish moving those over to VPInstructionWithType

Copy link
Contributor

@artagnon artagnon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks!

; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[COUNT_NEXT]], 0
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is an interesting change, do you know if this is actually profitable? Going from VF vscale x 8 with 2 scalar stores to VF 2 with 2 scatters.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants