Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3051,7 +3051,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.VF.isVector() && shouldPack()) {
Value *WideValue =
State.Lane->isFirstLane()
? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
: State.get(this);
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
*State.Lane));
Expand Down Expand Up @@ -3267,11 +3267,22 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
if (State.hasVectorValue(getOperand(0))) {
Value *VectorValue = State.get(getOperand(0));
InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
auto *VecI = cast<Instruction>(State.get(getOperand(0)));
assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
"Packed operands must generate an insertelement or insertvalue");

// If VectorI is a struct, it will be a sequence like:
// %1 = insertvalue %unmodified, %x, 0
// %2 = insertvalue %1, %y, 1
// %VectorI = insertvalue %2, %z, 2
// To get the unmodified vector we need to look through the chain.
if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
VecI = cast<InsertValueInst>(VecI->getOperand(0));

PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
if (State.hasVectorValue(this))
State.reset(this, VPhi);
else
Expand Down
216 changes: 216 additions & 0 deletions llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,221 @@ exit:
ret void
}

define void @struct_return_2xf32_replicate_predicated(ptr %a) {
; VF4-LABEL: define void @struct_return_2xf32_replicate_predicated(
; VF4-SAME: ptr [[A:%.*]]) {
; VF4-NEXT: [[ENTRY:.*:]]
; VF4-NEXT: br label %[[VECTOR_PH:.*]]
; VF4: [[VECTOR_PH]]:
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 8
; VF4-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
; VF4-NEXT: br i1 [[TMP2]], label %[[PRED_CALL_IF:.*]], label %[[PRED_CALL_CONTINUE:.*]]
; VF4: [[PRED_CALL_IF]]:
; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
; VF4-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR3:[0-9]+]]
; VF4-NEXT: [[TMP5:%.*]] = extractvalue { float, float } [[TMP4]], 0
; VF4-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0
; VF4-NEXT: [[TMP7:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP6]], 0
; VF4-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP4]], 1
; VF4-NEXT: [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP7]], 1
; VF4-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP8]], i32 0
; VF4-NEXT: [[TMP11:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP7]], <4 x float> [[TMP10]], 1
; VF4-NEXT: br label %[[PRED_CALL_CONTINUE]]
; VF4: [[PRED_CALL_CONTINUE]]:
; VF4-NEXT: [[TMP12:%.*]] = phi { <4 x float>, <4 x float> } [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_CALL_IF]] ]
; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
; VF4-NEXT: br i1 [[TMP13]], label %[[PRED_CALL_IF1:.*]], label %[[PRED_CALL_CONTINUE2:.*]]
; VF4: [[PRED_CALL_IF1]]:
; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
; VF4-NEXT: [[TMP15:%.*]] = tail call { float, float } @fn2(float [[TMP14]]) #[[ATTR3]]
; VF4-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP15]], 0
; VF4-NEXT: [[TMP17:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 0
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP16]], i32 1
; VF4-NEXT: [[TMP19:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP18]], 0
; VF4-NEXT: [[TMP20:%.*]] = extractvalue { float, float } [[TMP15]], 1
; VF4-NEXT: [[TMP21:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP19]], 1
; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i32 1
; VF4-NEXT: [[TMP23:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP19]], <4 x float> [[TMP22]], 1
Comment on lines +488 to +495
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do account for the cost of this sequence somewhat accurately? I think you mentioned that you discovered an end-to-end crash for this. From looking at the code we need to generate, I am curious if there are cases where this is ever profitable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is. All the crashes happened in test files with -force-vector-width. It just so happened that changing the predication discount with BFI changed the scalarization decision which led to this in struct-return.ll

; VF4-NEXT: br label %[[PRED_CALL_CONTINUE2]]
; VF4: [[PRED_CALL_CONTINUE2]]:
; VF4-NEXT: [[TMP24:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP12]], %[[PRED_CALL_CONTINUE]] ], [ [[TMP19]], %[[PRED_CALL_IF1]] ]
; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
; VF4-NEXT: br i1 [[TMP25]], label %[[PRED_CALL_IF3:.*]], label %[[PRED_CALL_CONTINUE4:.*]]
; VF4: [[PRED_CALL_IF3]]:
; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
; VF4-NEXT: [[TMP27:%.*]] = tail call { float, float } @fn2(float [[TMP26]]) #[[ATTR3]]
; VF4-NEXT: [[TMP28:%.*]] = extractvalue { float, float } [[TMP27]], 0
; VF4-NEXT: [[TMP29:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0
; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP28]], i32 2
; VF4-NEXT: [[TMP31:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP30]], 0
; VF4-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP27]], 1
; VF4-NEXT: [[TMP33:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP31]], 1
; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP32]], i32 2
; VF4-NEXT: [[TMP35:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP31]], <4 x float> [[TMP34]], 1
; VF4-NEXT: br label %[[PRED_CALL_CONTINUE4]]
; VF4: [[PRED_CALL_CONTINUE4]]:
; VF4-NEXT: [[TMP36:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP24]], %[[PRED_CALL_CONTINUE2]] ], [ [[TMP31]], %[[PRED_CALL_IF3]] ]
; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
; VF4-NEXT: br i1 [[TMP37]], label %[[PRED_CALL_IF5:.*]], label %[[PRED_CALL_CONTINUE6:.*]]
; VF4: [[PRED_CALL_IF5]]:
; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
; VF4-NEXT: [[TMP39:%.*]] = tail call { float, float } @fn2(float [[TMP38]]) #[[ATTR3]]
; VF4-NEXT: [[TMP40:%.*]] = extractvalue { float, float } [[TMP39]], 0
; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 0
; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP40]], i32 3
; VF4-NEXT: [[TMP43:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP42]], 0
; VF4-NEXT: [[TMP44:%.*]] = extractvalue { float, float } [[TMP39]], 1
; VF4-NEXT: [[TMP45:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP43]], 1
; VF4-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP44]], i32 3
; VF4-NEXT: [[TMP47:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP43]], <4 x float> [[TMP46]], 1
; VF4-NEXT: br label %[[PRED_CALL_CONTINUE6]]
; VF4: [[PRED_CALL_CONTINUE6]]:
; VF4-NEXT: [[TMP48:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP36]], %[[PRED_CALL_CONTINUE4]] ], [ [[TMP43]], %[[PRED_CALL_IF5]] ]
; VF4-NEXT: [[TMP49:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP48]], 0
; VF4-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP49]], [[WIDE_LOAD]]
; VF4-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
; VF4-NEXT: br i1 [[TMP51]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF4: [[PRED_STORE_IF]]:
; VF4-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], 0
; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP52]]
; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP50]], i32 0
; VF4-NEXT: store float [[TMP54]], ptr [[TMP53]], align 8
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE]]
; VF4: [[PRED_STORE_CONTINUE]]:
; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
; VF4-NEXT: br i1 [[TMP55]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; VF4: [[PRED_STORE_IF7]]:
; VF4-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], 1
; VF4-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP56]]
; VF4-NEXT: [[TMP58:%.*]] = extractelement <4 x float> [[TMP50]], i32 1
; VF4-NEXT: store float [[TMP58]], ptr [[TMP57]], align 8
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; VF4: [[PRED_STORE_CONTINUE8]]:
; VF4-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
; VF4-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; VF4: [[PRED_STORE_IF9]]:
; VF4-NEXT: [[TMP60:%.*]] = add i64 [[INDEX]], 2
; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP60]]
; VF4-NEXT: [[TMP62:%.*]] = extractelement <4 x float> [[TMP50]], i32 2
; VF4-NEXT: store float [[TMP62]], ptr [[TMP61]], align 8
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; VF4: [[PRED_STORE_CONTINUE10]]:
; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
; VF4-NEXT: br i1 [[TMP63]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
; VF4: [[PRED_STORE_IF11]]:
; VF4-NEXT: [[TMP64:%.*]] = add i64 [[INDEX]], 3
; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP64]]
; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x float> [[TMP50]], i32 3
; VF4-NEXT: store float [[TMP66]], ptr [[TMP65]], align 8
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; VF4: [[PRED_STORE_CONTINUE12]]:
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
;
; VF2IC2-LABEL: define void @struct_return_2xf32_replicate_predicated(
; VF2IC2-SAME: ptr [[A:%.*]]) {
; VF2IC2-NEXT: [[ENTRY:.*:]]
; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
; VF2IC2: [[VECTOR_PH]]:
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 8
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP1]], align 8
; VF2IC2-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], zeroinitializer
; VF2IC2-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], zeroinitializer
; VF2IC2-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; VF2IC2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF2IC2: [[PRED_STORE_IF]]:
; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR3:[0-9]+]]
; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0
; VF2IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
; VF2IC2-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; VF2IC2-NEXT: [[TMP11:%.*]] = fdiv float [[TMP8]], [[TMP10]]
; VF2IC2-NEXT: store float [[TMP11]], ptr [[TMP9]], align 8
; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
; VF2IC2: [[PRED_STORE_CONTINUE]]:
; VF2IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; VF2IC2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]]
; VF2IC2: [[PRED_STORE_IF2]]:
; VF2IC2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; VF2IC2-NEXT: [[TMP14:%.*]] = tail call { float, float } @fn2(float [[TMP13]]) #[[ATTR3]]
; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP14]], 0
; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]]
; VF2IC2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; VF2IC2-NEXT: [[TMP19:%.*]] = fdiv float [[TMP16]], [[TMP18]]
; VF2IC2-NEXT: store float [[TMP19]], ptr [[TMP17]], align 8
; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE3]]
; VF2IC2: [[PRED_STORE_CONTINUE3]]:
; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
; VF2IC2: [[PRED_STORE_IF4]]:
; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
; VF2IC2-NEXT: [[TMP22:%.*]] = tail call { float, float } @fn2(float [[TMP21]]) #[[ATTR3]]
; VF2IC2-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 2
; VF2IC2-NEXT: [[TMP24:%.*]] = extractvalue { float, float } [[TMP22]], 0
; VF2IC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]]
; VF2IC2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
; VF2IC2-NEXT: [[TMP27:%.*]] = fdiv float [[TMP24]], [[TMP26]]
; VF2IC2-NEXT: store float [[TMP27]], ptr [[TMP25]], align 8
; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]]
; VF2IC2: [[PRED_STORE_CONTINUE5]]:
; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
; VF2IC2: [[PRED_STORE_IF6]]:
; VF2IC2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
; VF2IC2-NEXT: [[TMP30:%.*]] = tail call { float, float } @fn2(float [[TMP29]]) #[[ATTR3]]
; VF2IC2-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 3
; VF2IC2-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP30]], 0
; VF2IC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]]
; VF2IC2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
; VF2IC2-NEXT: [[TMP35:%.*]] = fdiv float [[TMP32]], [[TMP34]]
; VF2IC2-NEXT: store float [[TMP35]], ptr [[TMP33]], align 8
; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]]
; VF2IC2: [[PRED_STORE_CONTINUE7]]:
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF2IC2-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2IC2: [[MIDDLE_BLOCK]]:
;
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%in_val = load float, ptr %arrayidx, align 8
%sgt_zero = fcmp ogt float %in_val, 0.0
br i1 %sgt_zero, label %if.then, label %for.inc

if.then:
%call = tail call { float, float } @fn2(float %in_val) #3
%extract_a = extractvalue { float, float } %call, 0
%div = fdiv float %extract_a, %in_val
store float %div, ptr %arrayidx, align 8
br label %for.inc

for.inc:
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %for.body

exit:
ret void
}

declare { i64 } @fn1(float)
declare { float, float } @fn2(float)
declare { i32, i32, i32 } @fn3(i32)
Expand All @@ -464,3 +679,4 @@ declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>)
attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" }
attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" }
attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }
attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVnM8v_fn2(fixed_vec_fn2)" }