Skip to content

Commit

Permalink
[AMDGPU] Pre-sink IR input for some tests
Browse files Browse the repository at this point in the history
Edit the IR input for some codegen tests to simulate what the IR code
sinking pass would do to it. This makes the tests immune to the presence
or absence of the code sinking pass in the codegen pass pipeline — a
pass that does not belong in that pipeline in the first place.

Differential Revision: https://reviews.llvm.org/D130169
  • Loading branch information
jayfoad committed Jul 21, 2022
1 parent 140bcd3 commit 716ca2e
Show file tree
Hide file tree
Showing 15 changed files with 89 additions and 87 deletions.
Expand Up @@ -168,11 +168,11 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = load i32, i32 addrspace(4)* @external_constant
%ptr = load float*, float* addrspace(4)* @const.ptr
%tmp1 = icmp ne i32 %tmp, 0
br i1 %tmp1, label %bb12, label %bb2

bb2:
%ptr = load float*, float* addrspace(4)* @const.ptr
%tmp4 = load float, float* %ptr, align 4
%tmp5 = fcmp olt float %tmp4, 1.0
%tmp6 = or i1 %tmp5, false
Expand Down
Expand Up @@ -1536,7 +1536,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX11_W64-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
Expand All @@ -1555,6 +1554,7 @@ bb:

exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
store float %result, float addrspace(1)* %gep.out, align 4
ret void
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Expand Up @@ -793,14 +793,14 @@ bb:

bb9: ; preds = %bb12, %bb
%i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
%i11 = icmp slt i64 %i10, 0
br i1 undef, label %bb14, label %bb12

bb12: ; preds = %bb58, %bb9
%i13 = add nuw nsw i64 %i10, %i8
br label %bb9

bb14: ; preds = %bb9
%i11 = icmp slt i64 %i10, 0
%i15 = load i64, i64 addrspace(1)* null, align 8
br label %bb16

Expand All @@ -825,23 +825,23 @@ bb16: ; preds = %bb58, %bb14
%i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
%i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
%i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
%i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
fence syncscope("workgroup") acquire
br i1 %i11, label %bb58, label %bb51

bb51: ; preds = %bb16
%i37 = fpext <2 x half> %arg4 to <2 x float>
%i39 = fpext <2 x half> %i27 to <2 x float>
%i40 = fpext <2 x half> %i30 to <2 x float>
%i41 = fpext <2 x half> %i33 to <2 x float>
%i42 = fpext <2 x half> %i36 to <2 x float>
%i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i44 = fadd contract <2 x float> %i37, %i43
%i45 = fadd contract <2 x float> %i43, zeroinitializer
%i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
%i47 = fadd contract <2 x float> %i39, %i46
%i48 = fadd contract <2 x float> %i40, %i43
%i49 = fadd contract <2 x float> %i41, zeroinitializer
%i50 = fadd contract <2 x float> %i42, zeroinitializer
fence syncscope("workgroup") acquire
br i1 %i11, label %bb58, label %bb51

bb51: ; preds = %bb16
%i52 = fadd contract <2 x float> %i18, %i44
%i53 = fadd contract <2 x float> %i19, %i45
%i54 = fadd contract <2 x float> %i20, %i47
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
Expand Up @@ -508,11 +508,11 @@ define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i3
bb:
%tmp = icmp slt i32 %arg2, 9
%tmp6 = icmp eq i32 %arg1, 0
%tmp7 = icmp sgt i32 %arg4, 0
%tmp8 = icmp sgt i32 %arg4, 5
br i1 %tmp8, label %bb9, label %bb13

bb9: ; preds = %bb
%tmp7 = icmp sgt i32 %arg4, 0
%tmp10 = and i1 %tmp7, %tmp
%tmp11 = icmp slt i32 %arg3, %arg4
%tmp12 = or i1 %tmp11, %tmp7
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
Expand Up @@ -8,19 +8,19 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT: entry:
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2)
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7
; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2)
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4
; OPT-NEXT: br label [[DONE:%.*]]
; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
Expand All @@ -43,18 +43,18 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if

if:
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2)
br label %endif

endif:
%x = phi i32 [ %val, %if ], [ 0, %entry ]
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
store i32 %x, i32 addrspace(1)* %out.gep
br label %done

Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
Expand Up @@ -7,20 +7,20 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
; OPT-NEXT: entry:
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00)
; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4
; OPT-NEXT: br label [[DONE:%.*]]
; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32:
Expand All @@ -45,19 +45,19 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
%in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if

if:
%in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
%val = load volatile float, float addrspace(1)* undef
br label %endif

endif:
%x = phi float [ %val, %if ], [ 0.0, %entry ]
%out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
store float %x, float addrspace(1)* %out.gep
br label %done

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
Expand Up @@ -11,13 +11,13 @@

define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
entry:
%rem.lhs.trunc = trunc i32 %s to i16
%rem4 = urem i16 %rem.lhs.trunc, 12
%rem.zext = zext i16 %rem4 to i32
%cmp = icmp eq i32 %s, 3
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
%rem.lhs.trunc = trunc i32 %s to i16
%rem4 = urem i16 %rem.lhs.trunc, 12
%rem.zext = zext i16 %rem4 to i32
%idxprom = zext i32 %s to i64
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom
%div = lshr i32 %rem.zext, 3
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
Expand Up @@ -254,12 +254,6 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
main_body:
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%data.sample = extractelement <4 x float> %dtex, i32 0

%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE

Expand All @@ -271,6 +265,12 @@ IF:
br label %END

ELSE:
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%data.sample = extractelement <4 x float> %dtex, i32 0

call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
br label %END

Expand Down
33 changes: 17 additions & 16 deletions llvm/test/CodeGen/AMDGPU/multilevel-break.ll
Expand Up @@ -10,36 +10,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
; OPT-NEXT: main_body:
; OPT-NEXT: br label [[LOOP_OUTER:%.*]]
; OPT: LOOP.outer:
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
; OPT-NEXT: br label [[LOOP:%.*]]
; OPT: LOOP:
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ]
; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ]
; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
; OPT: Flow:
; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ]
; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]])
; OPT-NEXT: [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]])
; OPT-NEXT: [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]])
; OPT-NEXT: br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]]
; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]])
; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]])
; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]]
; OPT: Flow1:
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
; OPT-NEXT: [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]])
; OPT-NEXT: br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]]
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]])
; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]]
; OPT: IF:
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]])
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
; OPT-NEXT: ret void
; OPT: ENDIF:
; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]]
; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true
; OPT-NEXT: br label [[FLOW]]
Expand Down Expand Up @@ -98,14 +99,14 @@ LOOP.outer: ; preds = %ENDIF, %main_body

LOOP: ; preds = %ENDIF, %LOOP.outer
%tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
%tmp47 = add i32 %tmp45, 1
%tmp48 = icmp slt i32 %tmp45, %ub
br i1 %tmp48, label %ENDIF, label %IF

IF: ; preds = %LOOP
ret void

ENDIF: ; preds = %LOOP
%tmp47 = add i32 %tmp45, 1
%tmp51 = icmp eq i32 %tmp47, %cont
br i1 %tmp51, label %LOOP, label %LOOP.outer
}
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
Expand Up @@ -190,16 +190,16 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN-NEXT: s_endpgm
; IR-LABEL: @nested_loop_conditions(
; IR-NEXT: bb:
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
; IR: bb14.lr.ph:
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
; IR: bb14.lr.ph:
; IR-NEXT: br label [[BB14:%.*]]
; IR: Flow3:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
Expand Down Expand Up @@ -277,17 +277,17 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
; IR-NEXT: ret void
bb:
%my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
%my.tmp1235 = icmp slt i32 %my.tmp1134, 9
br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13

bb14.lr.ph: ; preds = %bb
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%my.tmp1 = zext i32 %my.tmp to i64
%my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1
%my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16
%my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
%my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0
%my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
%my.tmp1235 = icmp slt i32 %my.tmp1134, 9
br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13

bb14.lr.ph: ; preds = %bb
br label %bb14

bb4.bb13_crit_edge: ; preds = %bb21
Expand Down
23 changes: 12 additions & 11 deletions llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
Expand Up @@ -21,7 +21,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s1, s0, s1
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_4
; GFX10-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: .LBB0_2: ; %bb
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_or_b32 s2, s2, exec_lo
; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB0_1
Expand Down Expand Up @@ -50,20 +51,20 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
branch1_true:
br label %2
br label %bb

2: ; preds = %branch2_merge, %branch1_true
bb: ; preds = %branch2_merge, %branch1_true
%r1.8.vec.insert14.i1 = phi float [ 0.000000e+00, %branch1_true ], [ %0, %branch2_merge ]
%3 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%4 = icmp eq i32 %1, 0
br i1 %4, label %loop0_merge, label %branch2_merge
%i = icmp eq i32 %1, 0
br i1 %i, label %loop0_merge, label %branch2_merge

branch2_merge: ; preds = %2
%5 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %3, float %0, float 0.000000e+00)
%6 = fcmp ult float %5, 0.000000e+00
br i1 %6, label %2, label %loop0_merge
branch2_merge: ; preds = %bb
%i2 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i3 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %i2, float %0, float 0.000000e+00)
%i4 = fcmp ult float %i3, 0.000000e+00
br i1 %i4, label %bb, label %loop0_merge

loop0_merge: ; preds = %branch2_merge, %2
loop0_merge: ; preds = %branch2_merge, %bb
ret void
}

Expand Down

0 comments on commit 716ca2e

Please sign in to comment.