8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ body: |
; GFX9: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[V_AND_B32_e32_]], implicit $exec
; GFX9: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[V_AND_B32_e32_]], implicit $exec
; GFX9: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec
; GFX9: S_SET_GPR_IDX_ON [[V_READFIRSTLANE_B32_]], 1, implicit-def $m0, implicit undef $m0
; GFX9: S_SET_GPR_IDX_ON [[V_READFIRSTLANE_B32_]], 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef %18.sub0, implicit $exec, implicit %18, implicit $m0
; GFX9: S_SET_GPR_IDX_OFF
; GFX9: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
; GFX9: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def dead $scc
; GFX9: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; GFX9: bb.2:
Expand Down Expand Up @@ -97,9 +97,9 @@ body: |
%21:sgpr_32 = V_READFIRSTLANE_B32 %19, implicit $exec
%22:sreg_64 = V_CMP_EQ_U32_e64 %21, %19, implicit $exec
%23:sreg_64 = S_AND_SAVEEXEC_B64 killed %22, implicit-def $exec, implicit-def dead $scc, implicit $exec
S_SET_GPR_IDX_ON killed %21, 1, implicit-def $m0, implicit undef $m0
S_SET_GPR_IDX_ON killed %21, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
%24:vgpr_32 = V_MOV_B32_e32 undef %18.sub0, implicit $exec, implicit %18, implicit $m0
S_SET_GPR_IDX_OFF
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
$exec = S_XOR_B64_term $exec, killed %23, implicit-def dead $scc
S_CBRANCH_EXECNZ %bb.1, implicit $exec
Expand Down
108 changes: 108 additions & 0 deletions llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
# Make sure mandatory skips are not removed around mode defs.
# FIXME: -amdgpu-skip-threshold seems to be backwards.

---

# bb.1 contains S_SET_GPR_IDX_MODE, which defines $mode and $m0. The expected
# output still has the S_CBRANCH_EXECZ in bb.0 that skips over bb.1.
name: need_skip_gpr_idx_mode
body: |
; CHECK-LABEL: name: need_skip_gpr_idx_mode
; CHECK: bb.0:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SET_GPR_IDX_MODE 0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SET_GPR_IDX_MODE 0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_SET_GPR_IDX_ON (index taken from live-in $sgpr0), which
# defines $mode and $m0. The expected output still has the S_CBRANCH_EXECZ
# in bb.0 that skips over bb.1.
name: need_skip_gpr_idx_on
body: |
; CHECK-LABEL: name: need_skip_gpr_idx_on
; CHECK: bb.0:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SET_GPR_IDX_ON $sgpr0, 0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
liveins: $sgpr0
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
liveins: $sgpr0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SET_GPR_IDX_ON $sgpr0, 0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_SET_GPR_IDX_OFF, which defines $mode. The expected output
# still has the S_CBRANCH_EXECZ in bb.0 that skips over bb.1.
name: need_skip_gpr_idx_off
body: |
; CHECK-LABEL: name: need_skip_gpr_idx_off
; CHECK: bb.0:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_SET_GPR_IDX_IDX (operand from live-in $sgpr0), which defines
# $mode and $m0. The expected output still has the S_CBRANCH_EXECZ in bb.0
# that skips over bb.1.
name: need_skip_gpr_idx_idx
body: |
; CHECK-LABEL: name: need_skip_gpr_idx_idx
; CHECK: bb.0:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SET_GPR_IDX_IDX $sgpr0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
liveins: $sgpr0
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
liveins: $sgpr0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SET_GPR_IDX_IDX $sgpr0, implicit-def $mode, implicit-def $m0, implicit $mode, implicit $m0
bb.2:
S_ENDPGM 0
...
111 changes: 111 additions & 0 deletions llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-mode-def.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
# Make sure mandatory skips are not removed around mode defs.
# FIXME: -amdgpu-skip-threshold seems to be backwards.

---

# bb.1 contains S_SETREG_IMM32_B32, which defines $mode. The expected output
# still has the S_CBRANCH_EXECZ in bb.0 that skips over bb.1.
name: need_skip_setreg_imm32_b32
body: |
; CHECK-LABEL: name: need_skip_setreg_imm32_b32
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
successors: %bb.2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_SETREG_B32 (value from live-in $sgpr0), which defines $mode.
# The expected output still has the S_CBRANCH_EXECZ in bb.0 that skips over
# bb.1.
name: need_skip_setreg_b32
body: |
; CHECK-LABEL: name: need_skip_setreg_b32
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_SETREG_B32 $sgpr0, 3, implicit-def $mode, implicit $mode
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
liveins: $sgpr0
successors: %bb.1, %bb.2
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
liveins: $sgpr0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SETREG_B32 $sgpr0, 3, implicit-def $mode, implicit $mode
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_DENORM_MODE, which defines $mode. The expected output still
# has the S_CBRANCH_EXECZ in bb.0 that skips over bb.1.
# Note: the input bb.1 has no explicit successors list; the expected output
# shows %bb.2 as its successor.
name: need_skip_denorm_mode
body: |
; CHECK-LABEL: name: need_skip_denorm_mode
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_DENORM_MODE 3, implicit-def $mode, implicit $mode
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_DENORM_MODE 3, implicit-def $mode, implicit $mode
bb.2:
S_ENDPGM 0
...

---

# bb.1 contains S_ROUND_MODE, which defines $mode. The expected output still
# has the S_CBRANCH_EXECZ in bb.0 that skips over bb.1.
# Note: the input bb.1 has no explicit successors list; the expected output
# shows %bb.2 as its successor.
name: need_skip_round_mode
body: |
; CHECK-LABEL: name: need_skip_round_mode
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: S_ROUND_MODE 3, implicit-def $mode, implicit $mode
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_ROUND_MODE 3, implicit-def $mode, implicit $mode
bb.2:
S_ENDPGM 0
...
264 changes: 132 additions & 132 deletions llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,24 @@ latch:
ret void
}

; The conditionally executed block bb.0 only calls llvm.amdgcn.s.setreg.
; The expected assembly has an s_cbranch_execz immediately after the
; s_and_saveexec_b64, before the s_setreg_imm32, i.e. the skip branch over
; the mode-register write is present.
; CHECK-LABEL: {{^}}skip_mode_switch:
; CHECK: s_and_saveexec_b64
; CHECK-NEXT: s_cbranch_execz
; CHECK: s_setreg_imm32
; CHECK: s_or_b64 exec, exec
define void @skip_mode_switch(i32 %arg) {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb.0, label %bb.1

bb.0:
call void @llvm.amdgcn.s.setreg(i32 2049, i32 3)
br label %bb.1

bb.1:
ret void
}

declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
Expand All @@ -494,6 +512,8 @@ declare float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 immarg, float, flo
declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare void @llvm.amdgcn.kill(i1) #0

declare void @llvm.amdgcn.s.setreg(i32 immarg, i32)

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone speculatable }
Expand Down
9 changes: 5 additions & 4 deletions llvm/test/CodeGen/AMDGPU/spill-agpr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,22 @@ bb:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
br label %use
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %use, label %st

use:
call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
store <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
br label %st

st:
Expand Down