432 changes: 288 additions & 144 deletions llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

Large diffs are not rendered by default.

42 changes: 28 additions & 14 deletions llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=MUBUF %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck --check-prefix=FLATSCR %s

# Test what happens when an SGPR is unavailable for the unused add. The non-inline constant needs to be folded into the add instruction and not materialized in a register.

Expand All @@ -21,19 +22,32 @@ body: |
bb.0:
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
; CHECK: liveins: $sgpr27, $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
; CHECK: S_ENDPGM 0, implicit $vcc
; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs
; MUBUF: liveins: $sgpr27, $vgpr1
; MUBUF: $sgpr27 = frame-setup COPY $sgpr33
; MUBUF: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; MUBUF: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
; MUBUF: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; MUBUF: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; MUBUF: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
; MUBUF: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec
; MUBUF: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
; MUBUF: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; MUBUF: $sgpr33 = frame-setup COPY $sgpr27
; MUBUF: S_ENDPGM 0, implicit $vcc
; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs
; FLATSCR: liveins: $sgpr27, $vgpr1
; FLATSCR: $sgpr27 = frame-setup COPY $sgpr33
; FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc
; FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc
; FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc
; FLATSCR: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; FLATSCR: $sgpr33 = S_ADD_U32 $sgpr33, 8192, implicit-def $scc
; FLATSCR: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
; FLATSCR: $sgpr33 = S_SUB_U32 $sgpr33, 8192, implicit-def $scc
; FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc
; FLATSCR: $sgpr33 = frame-setup COPY $sgpr27
; FLATSCR: S_ENDPGM 0, implicit $vcc
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
S_ENDPGM 0, implicit $vcc
Expand Down
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck -check-prefix=GFX9-FLATSCR %s

# Test case where spilling a VGPR to an emergency slot is needed during frame index elimination.

Expand Down Expand Up @@ -55,6 +56,17 @@ body: |
; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc
; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs
; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill
; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2
; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2
; GFX9-FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc
; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc
; GFX9-FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc
; GFX9-FLATSCR: $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc
; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec
; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc
; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0
; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec
S_ENDPGM 0, csr_amdgpu_allvgprs
...
173 changes: 139 additions & 34 deletions llvm/test/CodeGen/AMDGPU/scratch-simple.ll

Large diffs are not rendered by default.

89 changes: 59 additions & 30 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN32 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,MUBUF %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN32,MUBUF %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,FLATSCR %s


# CHECK-LABEL: name: check_spill

# FLATSCR: $sgpr33 = S_MOV_B32 0
# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc
# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc

# S32 with kill
# CHECK: V_WRITELANE
# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 4
# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12

# S32 without kill
# CHECK: V_WRITELANE
# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 4
# CHECK: $sgpr12 = V_READLANE

# S64 with kill
Expand All @@ -25,7 +32,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 8
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand All @@ -36,7 +44,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 8
# GCN32: $exec_lo = S_MOV_B32 $sgpr12
# GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13
# GCN64: $sgpr13 = V_READLANE
Expand All @@ -50,7 +59,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 7
# GCN64: $exec = S_MOV_B64 7
# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 16
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand All @@ -63,7 +73,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 15
# GCN64: $exec = S_MOV_B64 15
# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 28
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand All @@ -77,7 +88,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 31
# GCN64: $exec = S_MOV_B64 31
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 44
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand All @@ -94,7 +106,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 255
# GCN64: $exec = S_MOV_B64 255
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 64
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand All @@ -119,7 +132,8 @@
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 65535
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 96
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13

Expand Down Expand Up @@ -160,7 +174,8 @@
# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 4294967295
# GCN64: $exec = S_MOV_B64 4294967295
# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 160
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65

Expand Down Expand Up @@ -203,11 +218,12 @@ machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
argumentInfo:
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
dispatchPtr: { reg: '$sgpr4_sgpr5' }
kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
workGroupIDX: { reg: '$sgpr8' }
privateSegmentWaveByteOffset: { reg: '$sgpr9' }
flatScratchInit: { reg: '$sgpr0_sgpr1' }
dispatchPtr: { reg: '$sgpr2_sgpr3' }
privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' }
kernargSegmentPtr: { reg: '$sgpr8_sgpr9' }
workGroupIDX: { reg: '$sgpr10' }
privateSegmentWaveByteOffset: { reg: '$sgpr11' }
body: |
bb.0:
liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7
Expand Down Expand Up @@ -245,10 +261,15 @@ body: |
# CHECK-LABEL: name: check_reload

# FLATSCR: $sgpr33 = S_MOV_B32 0
# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc
# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc

# S32
# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
# CHECK: $exec_lo = S_MOV_B32 1
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4
# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12
# CHECK: $sgpr12 = V_READLANE

Expand All @@ -257,7 +278,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 3
# GCN64: $exec = S_MOV_B64 3
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -268,7 +290,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 7
# GCN64: $exec = S_MOV_B64 7
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -280,7 +303,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 15
# GCN64: $exec = S_MOV_B64 15
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -293,7 +317,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 31
# GCN64: $exec = S_MOV_B64 31
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -307,7 +332,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 255
# GCN64: $exec = S_MOV_B64 255
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -324,7 +350,8 @@ body: |
# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 65535
# GCN64: $exec = S_MOV_B64 65535
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
# CHECK: $sgpr12 = V_READLANE
Expand All @@ -349,7 +376,8 @@ body: |
# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
# GCN32: $exec_lo = S_MOV_B32 4294967295
# GCN64: $exec = S_MOV_B64 4294967295
# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160
# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65
# CHECK: $sgpr64 = V_READLANE
Expand Down Expand Up @@ -412,11 +440,12 @@ machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
argumentInfo:
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
dispatchPtr: { reg: '$sgpr4_sgpr5' }
kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
workGroupIDX: { reg: '$sgpr8' }
privateSegmentWaveByteOffset: { reg: '$sgpr9' }
flatScratchInit: { reg: '$sgpr0_sgpr1' }
dispatchPtr: { reg: '$sgpr2_sgpr3' }
privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' }
kernargSegmentPtr: { reg: '$sgpr8_sgpr9' }
workGroupIDX: { reg: '$sgpr10' }
privateSegmentWaveByteOffset: { reg: '$sgpr11' }
body: |
bb.0:
liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7
Expand Down
22 changes: 20 additions & 2 deletions llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s
; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s
;
; There is something about Tonga that causes this test to spend a lot of time
; in the default register allocator.
Expand All @@ -11,6 +13,16 @@

; Just test that it compiles successfully.
; CHECK-LABEL: test

; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 4{{$}}
; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill
; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] offset:{{[0-9]+}} ; 4-byte Folded Spill
; GFX9-FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x{{[0-9a-f]+}}{{$}}
; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload
; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] offset:{{[0-9]+}} ; 4-byte Folded Reload

; GFX10-FLATSCR: scratch_store_dword off, v{{[0-9]+}}, off offset:{{[0-9]+}} ; 4-byte Folded Spill
; GFX10-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, off offset:{{[0-9]+}} ; 4-byte Folded Reload
define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
entry:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
Expand All @@ -35,11 +47,17 @@ entry:
}

; CHECK-LABEL: test_limited_sgpr
; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]]
; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32
; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]]
; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]]
; GFX6: NumSgprs: 48
; GFX6: ScratchSize: 8608

; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x
; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload
define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 {
entry:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
Expand Down
124 changes: 85 additions & 39 deletions llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -amdgpu-enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s

; FIXME: The MUBUF loads in this test output are incorrect, their SOffset
; should use the frame offset register, not the ABI stack pointer register. We
Expand All @@ -13,44 +14,89 @@

; An assert was hit when frame offset register was used to address FrameIndex.
define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) {
; GCN-LABEL: kernel_background_evaluate:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s38, -1
; GCN-NEXT: s_mov_b32 s39, 0x31c16000
; GCN-NEXT: s_add_u32 s36, s36, s3
; GCN-NEXT: s_addc_u32 s37, s37, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0x2000
; GCN-NEXT: v_mov_b32_e32 v2, 0x4000
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_mov_b32_e32 v4, 0x400000
; GCN-NEXT: s_mov_b32 s32, 0xc0000
; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000
; GCN-NEXT: ; implicit-def: $vcc_hi
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b64 s[0:1], s[36:37]
; GCN-NEXT: s_mov_b64 s[2:3], s[38:39]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_cbranch_execz BB0_2
; GCN-NEXT: ; %bb.1: ; %if.then4.i
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen
; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen
; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit
; GCN-NEXT: s_endpgm
; MUBUF-LABEL: kernel_background_evaluate:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24
; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; MUBUF-NEXT: s_mov_b32 s38, -1
; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000
; MUBUF-NEXT: s_add_u32 s36, s36, s3
; MUBUF-NEXT: s_addc_u32 s37, s37, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v3, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000
; MUBUF-NEXT: s_mov_b32 s32, 0xc0000
; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000
; MUBUF-NEXT: ; implicit-def: $vcc_hi
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s0
; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37]
; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39]
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo
; MUBUF-NEXT: s_cbranch_execz BB0_2
; MUBUF-NEXT: ; %bb.1: ; %if.then4.i
; MUBUF-NEXT: s_clause 0x1
; MUBUF-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen
; MUBUF-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_nc_u32_e32 v0, v1, v0
; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen
; MUBUF-NEXT: BB0_2: ; %shader_eval_surface.exit
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_background_evaluate:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 s2, s2, s5
; FLATSCR-NEXT: s_movk_i32 s32, 0x6000
; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24
; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
; FLATSCR-NEXT: s_mov_b32 s38, -1
; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000
; FLATSCR-NEXT: s_add_u32 s36, s36, s5
; FLATSCR-NEXT: s_addc_u32 s37, s37, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
; FLATSCR-NEXT: v_mov_b32_e32 v3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR-NEXT: ; implicit-def: $vcc_hi
; FLATSCR-NEXT: s_getpc_b64 s[4:5]
; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37]
; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39]
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo
; FLATSCR-NEXT: s_cbranch_execz BB0_2
; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i
; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000
; FLATSCR-NEXT: s_nop 1
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4
; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3
; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000
; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0
; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
; FLATSCR-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: BB0_2: ; %shader_eval_surface.exit
; FLATSCR-NEXT: s_endpgm
entry:
%sd = alloca < 1339 x i32>, align 8192, addrspace(5)
%state = alloca <4 x i32>, align 16, addrspace(5)
Expand Down
41 changes: 28 additions & 13 deletions llvm/test/CodeGen/AMDGPU/store-hi16.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s

; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt
Expand Down Expand Up @@ -389,7 +390,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
Expand All @@ -408,7 +410,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2f16:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
Expand All @@ -427,7 +430,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_i32_shift:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
Expand All @@ -445,7 +449,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
Expand All @@ -464,7 +469,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_i8_shift:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
Expand All @@ -481,7 +487,8 @@ entry:

; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}}
Expand All @@ -502,7 +509,9 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}}

; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}}
Expand All @@ -522,7 +531,9 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
; GCN: s_waitcnt

; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}}

; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}}
Expand Down Expand Up @@ -634,8 +645,10 @@ entry:

; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
entry:
%obj0 = alloca [10 x i32], align 4, addrspace(5)
Expand All @@ -651,8 +664,10 @@ entry:

; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
; GCN: s_waitcnt
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
entry:
%obj0 = alloca [10 x i32], align 4, addrspace(5)
Expand Down