-
Notifications
You must be signed in to change notification settings - Fork 15k
Description
This bug manifests when IndVarSimplify
transforms the PHI value of an exit block to take the value at the first iteration of the loop (rewriteFirstIterationLoopExitValues
at https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp#L431).
The transform relies on the condition to the exit block being a loop invariant one. Then loop predication's predicateLoopExits
transform comes along and changes this loop-invariant condition to a varying one. This means we can now get an incorrect deopt state passed to the interpreter.
Here is the initial IR:
; ModuleID = 'bugpoint.ll'
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
target triple = "x86_64-unknown-linux-gnu"
@global = external global ptr addrspace(1)
define i32 @foo(ptr addrspace(3) %arg) !prof !0 {
bb:
%alloca = alloca i1, align 1
%getelementptr1 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 32
%load2 = load i32, ptr addrspace(3) %getelementptr1, align 4
%getelementptr3 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 40
%load4 = load i32, ptr addrspace(3) %getelementptr3, align 4
%getelementptr5 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 56
%load6 = load i32, ptr addrspace(3) %getelementptr5, align 4
%getelementptr7 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 72
%load8 = load ptr addrspace(1), ptr addrspace(3) %getelementptr7, align 8
%getelementptr9 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 80
%init_val = load i32, ptr addrspace(3) %getelementptr9, align 4
%getelementptr11 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 88
%load12 = load i32, ptr addrspace(3) %getelementptr11, align 4
store volatile i1 true, ptr %alloca, align 1
%load13 = load volatile i1, ptr %alloca, align 1
%icmp = icmp eq ptr addrspace(1) %load8, null
%getelementptr14 = getelementptr inbounds i8, ptr addrspace(1) %load8, i64 8
%call = call i1 @llvm.experimental.widenable.condition() #1
%getelementptr15 = getelementptr inbounds i8, ptr addrspace(1) %load8, i64 20
%widenable_cond11 = call i1 @llvm.experimental.widenable.condition() #1
br label %loop_outer
loop_outer: ; preds = %bb34, %bb
%phi = phi i32 [ %phi36, %bb34 ], [ 42, %bb ]
%phi18 = phi i32 [ 2, %bb34 ], [ %load2, %bb ]
%phi19 = phi i32 [ 0, %bb34 ], [ %load4, %bb ]
%phi20 = phi i32 [ 1, %bb34 ], [ %load6, %bb ]
%phi21 = phi i32 [ %add39, %bb34 ], [ %init_val, %bb ]
%phi22 = phi i32 [ %phi35, %bb34 ], [ %load12, %bb ]
%icmp23 = icmp sgt i32 %phi19, 37
%add = add nsw i32 %phi19, 1
%load24 = load atomic ptr addrspace(1), ptr @global unordered, align 8, !nonnull !1
%getelementptr25 = getelementptr i8, ptr addrspace(1) %load24, i64 16
%mul = mul i32 %phi22, 16777216
%add26 = add i32 %mul, -16777216
%ashr = ashr exact i32 %add26, 24
%add27 = add i32 %phi, 1
%icmp28 = icmp eq i32 %add27, 0
%load29 = load atomic i32, ptr addrspace(1) %getelementptr14 unordered, align 8, !range !5, !invariant.load !1, !noundef !1
%icmp30 = icmp ugt i32 %load29, 1
%and = and i1 %icmp30, %call
%load31 = load atomic i32, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !6, !noundef !1
%icmp32 = icmp eq i32 %load31, 0
store atomic ptr addrspace(1) null, ptr addrspace(1) %getelementptr25 unordered, align 8
store ptr addrspace(1) null, ptr addrspace(256) inttoptr (i64 8 to ptr addrspace(256)), align 8, !tbaa !8
br i1 %widenable_cond11, label %bb33, label %deopt9, !prof !10
bb33: ; preds = %loop_outer
store atomic i32 606, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !6
br label %inner_loop
bb34: ; preds = %bb54
%phi35 = phi i32 [ %ashr47, %bb54 ]
%phi36 = phi i32 [ %add48, %bb54 ]
%add37 = add i32 %phi18, 1
%icmp38 = icmp sgt i32 %add37, 12
%add39 = add i32 %phi21, 1
%icmp40 = icmp sgt i32 %add39, 4
br label %loop_outer
inner_loop: ; preds = %bb54, %bb33
%phi42 = phi i32 [ %ashr, %bb33 ], [ %ashr47, %bb54 ]
%phi43 = phi i32 [ 1, %bb33 ], [ %add55, %bb54 ]
%phi44 = phi i32 [ %add27, %bb33 ], [ %add48, %bb54 ]
%mul45 = mul i32 %phi42, 16777216
%add46 = add i32 %mul45, -16777216
%ashr47 = ashr exact i32 %add46, 24
%add48 = add i32 %phi44, 1
%icmp49 = icmp eq i32 %add48, 0
br i1 %icmp49, label %bb57, label %bb54, !prof !11, !make.implicit !1
deopt9: ; preds = %loop_outer
%lcssa = phi i32 [ %phi21, %loop_outer ]
%phi51 = phi i32 [ %ashr, %loop_outer ]
%phi52 = phi i32 [ %add27, %loop_outer ]
%call53 = call i32 (...) @llvm.experimental.deoptimize.i32(i32 13) #2 [ "deopt"(i32 0, i32 1, i32 1291853205, i32 127, i32 3, i32 13, i32 0, i32 0, ptr addrspace(1) %load8, i32 3, i32 1, i32 3, i32 606, i32 7, ptr null, i32 7, ptr null, i32 3, i32 %phi52, i32 7, ptr null, i32 3, i32 %load2, i32 7, ptr null, i32 7, ptr null, i32 3, i32 1, i32 3, i32 0, i32 0, ptr addrspace(1) %load8, i32 3, i32 %lcssa, i32 3, i32 %phi51, i32 7, ptr null) ]
ret i32 %call53
bb54: ; preds = %inner_loop
store atomic i32 606, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !6
%add55 = add nuw nsw i32 %phi43, 1
%icmp56 = icmp ugt i32 %add55, 9
br i1 %icmp56, label %bb34, label %inner_loop, !llvm.loop !12
bb57: ; preds = %inner_loop
%phi58 = phi i32 [ %phi18, %inner_loop ]
%phi59 = phi i32 [ %phi21, %inner_loop ]
%phi60 = phi i32 [ %phi43, %inner_loop ]
%phi61 = phi i32 [ %ashr47, %inner_loop ]
%call62 = call i32 (...) @llvm.experimental.deoptimize.i32(i32 12) #2 [ "deopt"(i32 0, i32 1, i32 1291853205, i32 99, i32 2, i32 13, i32 0, i32 3, i32 0, i32 3, i32 0, i32 7, ptr null, i32 7, ptr null, i32 3, i32 0, i32 7, ptr null, i32 3, i32 %phi58, i32 7, ptr null, i32 7, ptr null, i32 3, i32 1, i32 3, i32 %phi60, i32 0, ptr addrspace(1) %load8, i32 3, i32 %phi59, i32 3, i32 %phi61, i32 7, ptr null) ]
ret i32 %call62
}
declare i32 @llvm.experimental.deoptimize.i32(...)
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite)
declare noundef i1 @llvm.experimental.widenable.condition() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) }
attributes #1 = { "inline-remark"="cost=never, reason=unavailable definition" }
attributes #2 = { "deopt-lowering"="live-in" "inline-remark"="cost=never, reason=unavailable definition" }
!0 = !{!"function_entry_count", i64 32768}
!1 = !{}
!4 = !{!"tbaa-access-type"}
!5 = !{i32 0, i32 2147483646}
!6 = !{!7, !7, i64 0}
!7 = !{!"int.array", !4}
!8 = !{!9, !9, i64 0}
!9 = !{!"pendingException_access", !4}
!10 = !{!"branch_weights", i32 1048576, i32 1}
!11 = !{!"branch_weights", i32 1, i32 983039}
!12 = distinct !{!12, !13}
!13 = !{!"llvm.loop.peeled.count", i32 1}
See that init_val
is the incoming value for phi21
which is in loop_outer
(header of outer loop). deopt9
is an exit block out of the outer_loop
which being taken or not depends on invariant condition widenable_cond11
. Given these facts, we know that if the exit is taken, it will be taken on first iteration. So, indvars
went ahead and did this transform for deopt9
:
%lcssa = phi i32 [ %phi_21, %loop_outer ]
=> %lcssa = phi i32 [ %init_val, %loop_outer ]
.
Until this point the IR is correct. Here is the transform in complete: https://godbolt.org/z/PrTz6KK6b
Then loop predication predicateLoopExit
transform folds the check in inner_loop
to false by widening the widenable branch outside the inner_loop
. This is the widenable branch at br i1 %widenable_cond11, label %bb33, label %deopt9,
.
This is the result after loop-predication is run: https://godbolt.org/z/sq6d1vMWa
Loop predication's predicateLoopExit
pass does two incorrect things:
- It sinks the widenable call into the loop, thereby converting an invariant condition to a variant one (this was being fixed here: https://reviews.llvm.org/D146792)
- It widens the widenable call thereby converting it into a loop-varying one.
Both of these together now mean that if deopt9
is taken, it may no longer be taken on first iteration. This means the deopt state in deopt9
block is incorrect, since it directly uses init_val
.
The complete fix for this is by making sure we do the widening at the right point: which is, we need to widen at the widenable call (not at the widenable branch as it is done today, at loop_outer
block) and do not sink the widenable call down the loop.