From cb03669fcfdd799c38413ff3db2245f700373a04 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 23 Sep 2025 18:55:52 +0800 Subject: [PATCH 1/4] [VPlan] Fix packed replication of struct types I ran into this crash when #158690 caused a loop with a struct call to be vectorized. If we have a replicate recipe in a branch-on-mask predicated region that's used by a widened recipe in another block then it will be packed together with the other lanes via a VPPredInstPHIRecipe. If we're replicating a call with a struct return type then we currently crash. The code that handles structs in packScalarIntoVectorizedValue seemed to be untested, at least in test/Transforms/LoopVectorize. There are two places that need to be fixed. The poison value that the scalar is packed into needs to use toVectorizedTy to correctly handle structs (not to be confused with toVectorTy!). The other is that VPPredInstPHIRecipe expects its operand to be an InsertElementInst when stringing together the different lanes. For structs this will be an InsertValueInst, and the value for the previous lane will be at the back of a chain of InsertValueInsts. --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 22 +- .../LoopVectorize/struct-return-replicate.ll | 227 +++++++++++++++++- 2 files changed, 239 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aa3de3613b68e..d7f8c832ed9ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3051,7 +3051,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { if (State.VF.isVector() && shouldPack()) { Value *WideValue = State.Lane->isFirstLane() - ? PoisonValue::get(VectorType::get(UI->getType(), State.VF)) + ?
PoisonValue::get(toVectorizedTy(UI->getType(), State.VF)) : State.get(this); State.set(this, State.packScalarIntoVectorizedValue(this, WideValue, *State.Lane)); @@ -3267,11 +3267,21 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // also do that packing, thereby "hoisting" the insert-element sequence. // Otherwise, a phi node for the scalar value is needed. if (State.hasVectorValue(getOperand(0))) { - Value *VectorValue = State.get(getOperand(0)); - InsertElementInst *IEI = cast(VectorValue); - PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); - VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. - VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. + auto *VecI = cast(State.get(getOperand(0))); + assert(isa(VecI) || isa(VecI)); + + // If VectorI is a struct, it will be a sequence like: + // %1 = insertvalue %unmodified, %x, 0 + // %2 = insertvalue %1, %y, 1 + // %VectorI = insertvalue %2, %z, 2 + // To get the unmodified vector we need to look through the chain. + if (auto *StructTy = dyn_cast(VecI->getType())) + for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++) + VecI = cast(VecI->getOperand(0)); + + PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2); + VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector. + VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element. 
if (State.hasVectorValue(this)) State.reset(this, VPhi); else diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll index 5c622f825beaf..93054bb275560 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll @@ -166,7 +166,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP45]], align 4 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4: [[MIDDLE_BLOCK]]: ; ; VF2IC2-LABEL: define void @struct_return_2xf32_replicate( @@ -233,7 +233,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: store <2 x float> [[TMP44]], ptr [[TMP50]], align 4 ; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF2IC2-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF2IC2: [[MIDDLE_BLOCK]]: ; entry: @@ -336,7 +336,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP64]], align 4 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP4:![0-9]+]] ; VF4: [[MIDDLE_BLOCK]]: ; ; VF2IC2-LABEL: define void @struct_return_3xi32_replicate( @@ -425,7 +425,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4 ; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF2IC2-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2IC2: [[MIDDLE_BLOCK]]: ; entry: @@ -453,6 +453,224 @@ exit: ret void } +define void @struct_return_2xf32_replicate_predicated(ptr %a) { +; CHECK-LABEL: define void @scalarized_predicated_struct_return +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; VF4-LABEL: define void @struct_return_2xf32_replicate_predicated( +; VF4-SAME: ptr [[A:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 8 +; VF4-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; VF4-NEXT: br i1 [[TMP2]], label %[[PRED_CALL_IF:.*]], label %[[PRED_CALL_CONTINUE:.*]] +; VF4: [[PRED_CALL_IF]]: +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 +; VF4-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR3:[0-9]+]] +; VF4-NEXT: [[TMP5:%.*]] = extractvalue { float, float } 
[[TMP4]], 0 +; VF4-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 +; VF4-NEXT: [[TMP7:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP6]], 0 +; VF4-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP4]], 1 +; VF4-NEXT: [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP7]], 1 +; VF4-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP8]], i32 0 +; VF4-NEXT: [[TMP11:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP7]], <4 x float> [[TMP10]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE]] +; VF4: [[PRED_CALL_CONTINUE]]: +; VF4-NEXT: [[TMP12:%.*]] = phi { <4 x float>, <4 x float> } [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_CALL_IF]] ] +; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; VF4-NEXT: br i1 [[TMP13]], label %[[PRED_CALL_IF1:.*]], label %[[PRED_CALL_CONTINUE2:.*]] +; VF4: [[PRED_CALL_IF1]]: +; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 +; VF4-NEXT: [[TMP15:%.*]] = tail call { float, float } @fn2(float [[TMP14]]) #[[ATTR3]] +; VF4-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP15]], 0 +; VF4-NEXT: [[TMP17:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 0 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP16]], i32 1 +; VF4-NEXT: [[TMP19:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP18]], 0 +; VF4-NEXT: [[TMP20:%.*]] = extractvalue { float, float } [[TMP15]], 1 +; VF4-NEXT: [[TMP21:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP19]], 1 +; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i32 1 +; VF4-NEXT: [[TMP23:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP19]], <4 x float> [[TMP22]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE2]] +; VF4: [[PRED_CALL_CONTINUE2]]: +; VF4-NEXT: [[TMP24:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP12]], %[[PRED_CALL_CONTINUE]] ], [ [[TMP19]], 
%[[PRED_CALL_IF1]] ] +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; VF4-NEXT: br i1 [[TMP25]], label %[[PRED_CALL_IF3:.*]], label %[[PRED_CALL_CONTINUE4:.*]] +; VF4: [[PRED_CALL_IF3]]: +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = tail call { float, float } @fn2(float [[TMP26]]) #[[ATTR3]] +; VF4-NEXT: [[TMP28:%.*]] = extractvalue { float, float } [[TMP27]], 0 +; VF4-NEXT: [[TMP29:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0 +; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP28]], i32 2 +; VF4-NEXT: [[TMP31:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP30]], 0 +; VF4-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP27]], 1 +; VF4-NEXT: [[TMP33:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP31]], 1 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP32]], i32 2 +; VF4-NEXT: [[TMP35:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP31]], <4 x float> [[TMP34]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE4]] +; VF4: [[PRED_CALL_CONTINUE4]]: +; VF4-NEXT: [[TMP36:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP24]], %[[PRED_CALL_CONTINUE2]] ], [ [[TMP31]], %[[PRED_CALL_IF3]] ] +; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; VF4-NEXT: br i1 [[TMP37]], label %[[PRED_CALL_IF5:.*]], label %[[PRED_CALL_CONTINUE6:.*]] +; VF4: [[PRED_CALL_IF5]]: +; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP39:%.*]] = tail call { float, float } @fn2(float [[TMP38]]) #[[ATTR3]] +; VF4-NEXT: [[TMP40:%.*]] = extractvalue { float, float } [[TMP39]], 0 +; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 0 +; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP40]], i32 3 +; VF4-NEXT: [[TMP43:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP42]], 
0 +; VF4-NEXT: [[TMP44:%.*]] = extractvalue { float, float } [[TMP39]], 1 +; VF4-NEXT: [[TMP45:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP43]], 1 +; VF4-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP44]], i32 3 +; VF4-NEXT: [[TMP47:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP43]], <4 x float> [[TMP46]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE6]] +; VF4: [[PRED_CALL_CONTINUE6]]: +; VF4-NEXT: [[TMP48:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP36]], %[[PRED_CALL_CONTINUE4]] ], [ [[TMP43]], %[[PRED_CALL_IF5]] ] +; VF4-NEXT: [[TMP49:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP48]], 0 +; VF4-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP49]], [[WIDE_LOAD]] +; VF4-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; VF4-NEXT: br i1 [[TMP51]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF4: [[PRED_STORE_IF]]: +; VF4-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP52]] +; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP50]], i32 0 +; VF4-NEXT: store float [[TMP54]], ptr [[TMP53]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF4: [[PRED_STORE_CONTINUE]]: +; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; VF4-NEXT: br i1 [[TMP55]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF4: [[PRED_STORE_IF7]]: +; VF4-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP56]] +; VF4-NEXT: [[TMP58:%.*]] = extractelement <4 x float> [[TMP50]], i32 1 +; VF4-NEXT: store float [[TMP58]], ptr [[TMP57]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF4: [[PRED_STORE_CONTINUE8]]: +; VF4-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; VF4-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF4: [[PRED_STORE_IF9]]: +; 
VF4-NEXT: [[TMP60:%.*]] = add i64 [[INDEX]], 2 +; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP60]] +; VF4-NEXT: [[TMP62:%.*]] = extractelement <4 x float> [[TMP50]], i32 2 +; VF4-NEXT: store float [[TMP62]], ptr [[TMP61]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF4: [[PRED_STORE_CONTINUE10]]: +; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; VF4-NEXT: br i1 [[TMP63]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF4: [[PRED_STORE_IF11]]: +; VF4-NEXT: [[TMP64:%.*]] = add i64 [[INDEX]], 3 +; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP64]] +; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x float> [[TMP50]], i32 3 +; VF4-NEXT: store float [[TMP66]], ptr [[TMP65]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF4: [[PRED_STORE_CONTINUE12]]: +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; +; VF2IC2-LABEL: define void @struct_return_2xf32_replicate_predicated( +; VF2IC2-SAME: ptr [[A:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 8 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP1]], align 8 +; VF2IC2-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], zeroinitializer +; VF2IC2-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x float> 
[[WIDE_LOAD1]], zeroinitializer +; VF2IC2-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR3:[0-9]+]] +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0 +; VF2IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP11:%.*]] = fdiv float [[TMP8]], [[TMP10]] +; VF2IC2-NEXT: store float [[TMP11]], ptr [[TMP9]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] +; VF2IC2: [[PRED_STORE_IF2]]: +; VF2IC2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP14:%.*]] = tail call { float, float } @fn2(float [[TMP13]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP14]], 0 +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]] +; VF2IC2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP19:%.*]] = fdiv float [[TMP16]], [[TMP18]] +; VF2IC2-NEXT: store float [[TMP19]], ptr [[TMP17]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE3]] +; VF2IC2: [[PRED_STORE_CONTINUE3]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] +; VF2IC2: [[PRED_STORE_IF4]]: +; 
VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP22:%.*]] = tail call { float, float } @fn2(float [[TMP21]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP24:%.*]] = extractvalue { float, float } [[TMP22]], 0 +; VF2IC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; VF2IC2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP27:%.*]] = fdiv float [[TMP24]], [[TMP26]] +; VF2IC2-NEXT: store float [[TMP27]], ptr [[TMP25]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF2IC2: [[PRED_STORE_CONTINUE5]]: +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] +; VF2IC2: [[PRED_STORE_IF6]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP30:%.*]] = tail call { float, float } @fn2(float [[TMP29]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP30]], 0 +; VF2IC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] +; VF2IC2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP35:%.*]] = fdiv float [[TMP32]], [[TMP34]] +; VF2IC2-NEXT: store float [[TMP35]], ptr [[TMP33]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; VF2IC2: [[PRED_STORE_CONTINUE7]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF2IC2-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %in_val = 
load float, ptr %arrayidx, align 8 + %sgt_zero = fcmp ogt float %in_val, 0.0 + br i1 %sgt_zero, label %if.then, label %for.inc + +if.then: + %call = tail call { float, float } @fn2(float %in_val) #3 + %extract_a = extractvalue { float, float } %call, 0 + %div = fdiv float %extract_a, %in_val + store float %div, ptr %arrayidx, align 8 + br label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + declare { i64 } @fn1(float) declare { float, float } @fn2(float) declare { i32, i32, i32 } @fn3(i32) @@ -464,3 +682,4 @@ declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" } attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" } +attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVnM8v_fn2(fixed_vec_fn2)" } From 767ec0445922cb5a452d2badd35f1233d854b320 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 24 Sep 2025 21:46:11 +0800 Subject: [PATCH 2/4] Add assert message, tighten cast, remove test diffs and leftover check line --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++++-- .../LoopVectorize/struct-return-replicate.ll | 11 ++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d7f8c832ed9ec..6c8b18117c75d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3268,7 +3268,9 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // Otherwise, a phi node for the scalar value is needed. 
if (State.hasVectorValue(getOperand(0))) { auto *VecI = cast(State.get(getOperand(0))); - assert(isa(VecI) || isa(VecI)); + assert(isa(VecI) || + isa(VecI) && + "Packed recipes must generate an insertelement or insertvalue"); // If VectorI is a struct, it will be a sequence like: // %1 = insertvalue %unmodified, %x, 0 @@ -3277,7 +3279,7 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // To get the unmodified vector we need to look through the chain. if (auto *StructTy = dyn_cast(VecI->getType())) for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++) - VecI = cast(VecI->getOperand(0)); + VecI = cast(VecI->getOperand(0)); PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2); VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector. diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll index 93054bb275560..99916a503750a 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll @@ -166,7 +166,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP45]], align 4 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF4: [[MIDDLE_BLOCK]]: ; ; VF2IC2-LABEL: define void @struct_return_2xf32_replicate( @@ -233,7 +233,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: store <2 x float> [[TMP44]], ptr [[TMP50]], align 4 ; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF2IC2-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; 
VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2IC2: [[MIDDLE_BLOCK]]: ; entry: @@ -336,7 +336,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP64]], align 4 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF4: [[MIDDLE_BLOCK]]: ; ; VF2IC2-LABEL: define void @struct_return_3xi32_replicate( @@ -425,7 +425,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4 ; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF2IC2-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF2IC2: [[MIDDLE_BLOCK]]: ; entry: @@ -454,9 +454,6 @@ exit: } define void @struct_return_2xf32_replicate_predicated(ptr %a) { -; CHECK-LABEL: define void @scalarized_predicated_struct_return -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; VF4-LABEL: define void @struct_return_2xf32_replicate_predicated( ; VF4-SAME: ptr [[A:%.*]]) { ; VF4-NEXT: [[ENTRY:.*:]] From 35cf689bb30791431a24622cb478752b89c82d1f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 25 Sep 2025 17:15:54 +0800 Subject: [PATCH 3/4] Update assert message --- 
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6c8b18117c75d..f5a308dbfb701 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3270,7 +3270,7 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { auto *VecI = cast(State.get(getOperand(0))); assert(isa(VecI) || isa(VecI) && - "Packed recipes must generate an insertelement or insertvalue"); + "Packed operands must generate an insertelement or insertvalue"); // If VectorI is a struct, it will be a sequence like: // %1 = insertvalue %unmodified, %x, 0 From b70ffc2182b21b2d058c7a3c1408761c136de48f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 25 Sep 2025 18:39:27 +0800 Subject: [PATCH 4/4] Use parameter pack --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f5a308dbfb701..ef0fa375ed24d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3268,9 +3268,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { // Otherwise, a phi node for the scalar value is needed. if (State.hasVectorValue(getOperand(0))) { auto *VecI = cast(State.get(getOperand(0))); - assert(isa(VecI) || - isa(VecI) && - "Packed operands must generate an insertelement or insertvalue"); + assert((isa(VecI)) && + "Packed operands must generate an insertelement or insertvalue"); // If VectorI is a struct, it will be a sequence like: // %1 = insertvalue %unmodified, %x, 0