Skip to content

Commit

Permalink
[AMDGPU] Pre-sink IR input for some tests
Browse files Browse the repository at this point in the history
Edit the IR input for some codegen tests to simulate what the IR code
sinking pass would do to it. This makes the tests immune to the presence
or absence of the code sinking pass in the codegen pass pipeline — a
pass that does not belong in that pipeline in the first place.

Differential Revision: https://reviews.llvm.org/D130169
  • Loading branch information
jayfoad committed Jul 21, 2022
1 parent 140bcd3 commit 716ca2e
Show file tree
Hide file tree
Showing 15 changed files with 89 additions and 87 deletions.
Expand Up @@ -168,11 +168,11 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = load i32, i32 addrspace(4)* @external_constant
%ptr = load float*, float* addrspace(4)* @const.ptr
%tmp1 = icmp ne i32 %tmp, 0
br i1 %tmp1, label %bb12, label %bb2

bb2:
%ptr = load float*, float* addrspace(4)* @const.ptr
%tmp4 = load float, float* %ptr, align 4
%tmp5 = fcmp olt float %tmp4, 1.0
%tmp6 = or i1 %tmp5, false
Expand Down
Expand Up @@ -1536,7 +1536,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX11_W64-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
Expand All @@ -1555,6 +1554,7 @@ bb:

exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
store float %result, float addrspace(1)* %gep.out, align 4
ret void
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Expand Up @@ -793,14 +793,14 @@ bb:

bb9: ; preds = %bb12, %bb
%i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
%i11 = icmp slt i64 %i10, 0
br i1 undef, label %bb14, label %bb12

bb12: ; preds = %bb58, %bb9
%i13 = add nuw nsw i64 %i10, %i8
br label %bb9

bb14: ; preds = %bb9
%i11 = icmp slt i64 %i10, 0
%i15 = load i64, i64 addrspace(1)* null, align 8
br label %bb16

Expand All @@ -825,23 +825,23 @@ bb16: ; preds = %bb58, %bb14
%i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
%i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
%i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
%i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
fence syncscope("workgroup") acquire
br i1 %i11, label %bb58, label %bb51

bb51: ; preds = %bb16
%i37 = fpext <2 x half> %arg4 to <2 x float>
%i39 = fpext <2 x half> %i27 to <2 x float>
%i40 = fpext <2 x half> %i30 to <2 x float>
%i41 = fpext <2 x half> %i33 to <2 x float>
%i42 = fpext <2 x half> %i36 to <2 x float>
%i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i44 = fadd contract <2 x float> %i37, %i43
%i45 = fadd contract <2 x float> %i43, zeroinitializer
%i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
%i47 = fadd contract <2 x float> %i39, %i46
%i48 = fadd contract <2 x float> %i40, %i43
%i49 = fadd contract <2 x float> %i41, zeroinitializer
%i50 = fadd contract <2 x float> %i42, zeroinitializer
fence syncscope("workgroup") acquire
br i1 %i11, label %bb58, label %bb51

bb51: ; preds = %bb16
%i52 = fadd contract <2 x float> %i18, %i44
%i53 = fadd contract <2 x float> %i19, %i45
%i54 = fadd contract <2 x float> %i20, %i47
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
Expand Up @@ -508,11 +508,11 @@ define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i3
bb:
%tmp = icmp slt i32 %arg2, 9
%tmp6 = icmp eq i32 %arg1, 0
%tmp7 = icmp sgt i32 %arg4, 0
%tmp8 = icmp sgt i32 %arg4, 5
br i1 %tmp8, label %bb9, label %bb13

bb9: ; preds = %bb
%tmp7 = icmp sgt i32 %arg4, 0
%tmp10 = and i1 %tmp7, %tmp
%tmp11 = icmp slt i32 %arg3, %arg4
%tmp12 = or i1 %tmp11, %tmp7
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
Expand Up @@ -8,19 +8,19 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT: entry:
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2)
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7
; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2)
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4
; OPT-NEXT: br label [[DONE:%.*]]
; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
Expand All @@ -43,18 +43,18 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if

if:
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2)
br label %endif

endif:
%x = phi i32 [ %val, %if ], [ 0, %entry ]
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
store i32 %x, i32 addrspace(1)* %out.gep
br label %done

Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
Expand Up @@ -7,20 +7,20 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
; OPT-NEXT: entry:
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00)
; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4
; OPT-NEXT: br label [[DONE:%.*]]
; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32:
Expand All @@ -45,19 +45,19 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
%out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
%in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if

if:
%in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
%val = load volatile float, float addrspace(1)* undef
br label %endif

endif:
%x = phi float [ %val, %if ], [ 0.0, %entry ]
%out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
store float %x, float addrspace(1)* %out.gep
br label %done

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
Expand Up @@ -11,13 +11,13 @@

define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
entry:
%rem.lhs.trunc = trunc i32 %s to i16
%rem4 = urem i16 %rem.lhs.trunc, 12
%rem.zext = zext i16 %rem4 to i32
%cmp = icmp eq i32 %s, 3
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
%rem.lhs.trunc = trunc i32 %s to i16
%rem4 = urem i16 %rem.lhs.trunc, 12
%rem.zext = zext i16 %rem4 to i32
%idxprom = zext i32 %s to i64
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom
%div = lshr i32 %rem.zext, 3
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
Expand Up @@ -254,12 +254,6 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
main_body:
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%data.sample = extractelement <4 x float> %dtex, i32 0

%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE

Expand All @@ -271,6 +265,12 @@ IF:
br label %END

ELSE:
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
%data.sample = extractelement <4 x float> %dtex, i32 0

call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
br label %END

Expand Down
33 changes: 17 additions & 16 deletions llvm/test/CodeGen/AMDGPU/multilevel-break.ll
Expand Up @@ -10,36 +10,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
; OPT-NEXT: main_body:
; OPT-NEXT: br label [[LOOP_OUTER:%.*]]
; OPT: LOOP.outer:
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
; OPT-NEXT: br label [[LOOP:%.*]]
; OPT: LOOP:
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ]
; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ]
; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
; OPT: Flow:
; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ]
; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]])
; OPT-NEXT: [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]])
; OPT-NEXT: [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]])
; OPT-NEXT: br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]]
; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]])
; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]])
; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]]
; OPT: Flow1:
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
; OPT-NEXT: [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]])
; OPT-NEXT: br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]]
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]])
; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]]
; OPT: IF:
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]])
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
; OPT-NEXT: ret void
; OPT: ENDIF:
; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]]
; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true
; OPT-NEXT: br label [[FLOW]]
Expand Down Expand Up @@ -98,14 +99,14 @@ LOOP.outer: ; preds = %ENDIF, %main_body

LOOP: ; preds = %ENDIF, %LOOP.outer
%tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
%tmp47 = add i32 %tmp45, 1
%tmp48 = icmp slt i32 %tmp45, %ub
br i1 %tmp48, label %ENDIF, label %IF

IF: ; preds = %LOOP
ret void

ENDIF: ; preds = %LOOP
%tmp47 = add i32 %tmp45, 1
%tmp51 = icmp eq i32 %tmp47, %cont
br i1 %tmp51, label %LOOP, label %LOOP.outer
}
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
Expand Up @@ -190,16 +190,16 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN-NEXT: s_endpgm
; IR-LABEL: @nested_loop_conditions(
; IR-NEXT: bb:
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
; IR: bb14.lr.ph:
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
; IR: bb14.lr.ph:
; IR-NEXT: br label [[BB14:%.*]]
; IR: Flow3:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
Expand Down Expand Up @@ -277,17 +277,17 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
; IR-NEXT: ret void
bb:
%my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
%my.tmp1235 = icmp slt i32 %my.tmp1134, 9
br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13

bb14.lr.ph: ; preds = %bb
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%my.tmp1 = zext i32 %my.tmp to i64
%my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1
%my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16
%my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
%my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0
%my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
%my.tmp1235 = icmp slt i32 %my.tmp1134, 9
br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13

bb14.lr.ph: ; preds = %bb
br label %bb14

bb4.bb13_crit_edge: ; preds = %bb21
Expand Down
23 changes: 12 additions & 11 deletions llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
Expand Up @@ -21,7 +21,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s1, s0, s1
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_4
; GFX10-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: .LBB0_2: ; %bb
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_or_b32 s2, s2, exec_lo
; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB0_1
Expand Down Expand Up @@ -50,20 +51,20 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
branch1_true:
br label %2
br label %bb

2: ; preds = %branch2_merge, %branch1_true
bb: ; preds = %branch2_merge, %branch1_true
%r1.8.vec.insert14.i1 = phi float [ 0.000000e+00, %branch1_true ], [ %0, %branch2_merge ]
%3 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%4 = icmp eq i32 %1, 0
br i1 %4, label %loop0_merge, label %branch2_merge
%i = icmp eq i32 %1, 0
br i1 %i, label %loop0_merge, label %branch2_merge

branch2_merge: ; preds = %2
%5 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %3, float %0, float 0.000000e+00)
%6 = fcmp ult float %5, 0.000000e+00
br i1 %6, label %2, label %loop0_merge
branch2_merge: ; preds = %bb
%i2 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i3 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %i2, float %0, float 0.000000e+00)
%i4 = fcmp ult float %i3, 0.000000e+00
br i1 %i4, label %bb, label %loop0_merge

loop0_merge: ; preds = %branch2_merge, %2
loop0_merge: ; preds = %branch2_merge, %bb
ret void
}

Expand Down

0 comments on commit 716ca2e

Please sign in to comment.