302 changes: 167 additions & 135 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Large diffs are not rendered by default.

28 changes: 21 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 2047
Expand Down Expand Up @@ -361,7 +363,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 2048
Expand Down Expand Up @@ -405,7 +409,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 -2047
Expand Down Expand Up @@ -449,7 +455,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 -2048
Expand Down Expand Up @@ -491,7 +499,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 4095
Expand Down Expand Up @@ -581,7 +591,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 -4095
Expand Down Expand Up @@ -625,7 +637,9 @@ body: |
; GFX11: liveins: $vgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
%0:vgpr(p5) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 -4096
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,10 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
%2:sgpr(s32) = G_CONSTANT i32 4095
Expand Down Expand Up @@ -793,7 +796,9 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_LSHRREV_B32_e64_]], 4095, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 5)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
%2:vgpr(s32) = G_CONSTANT i32 4095
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -734,11 +734,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
; FLATSCR-NEXT: v_add_u32_e32 v2, 2, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
; FLATSCR-NEXT: scratch_load_short_d16 v0, v2, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
Expand All @@ -758,23 +758,23 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
; FLATSCR_GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v0
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v0, v2, off
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2
; GFX11-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
; GFX11-NEXT: scratch_load_d16_b16 v0, v2, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
Expand Down
54 changes: 30 additions & 24 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,11 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2
; GFX9-FLASTSCR-NEXT: v_add_u32_e32 v1, 2, v0
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v3, v1, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align2:
Expand All @@ -67,33 +68,36 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT: v_add_nc_u32_e32 v1, 2, v0
; GFX10-FLASTSCR-NEXT: s_clause 0x1
; GFX10-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off
; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2
; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off
; GFX10-FLASTSCR-NEXT: scratch_load_ushort v3, v1, off
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_u16 v1, v0, off
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: scratch_load_u16 v0, v0, off
; GFX11-NEXT: scratch_load_u16 v1, v1, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT: v_add_nc_u32_e32 v1, 2, v0
; GFX11-FLASTSCR-NEXT: s_clause 0x1
; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v0, off
; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off
; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v1, off
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 2
Expand Down Expand Up @@ -142,10 +146,11 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1
; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v0, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 2
; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v0, off offset:2
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v2, 1
; GFX9-FLASTSCR-NEXT: v_add_u32_e32 v0, 2, v1
; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v2, off
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 2
; GFX9-FLASTSCR-NEXT: scratch_store_short v0, v1, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -165,33 +170,34 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v2, 2
; GFX10-FLASTSCR-NEXT: v_add_nc_u32_e32 v2, 2, v1
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v3, 2
; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v0, off
; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v2, off offset:2
; GFX10-FLASTSCR-NEXT: scratch_store_short v2, v3, off
; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: v_mov_b32_e32 v2, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b16 v1, v0, off
; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:2
; GFX11-NEXT: scratch_store_b16 v2, v3, off
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1
; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v2, 2
; GFX11-FLASTSCR-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2
; GFX11-FLASTSCR-NEXT: v_add_nc_u32_e32 v2, 2, v1
; GFX11-FLASTSCR-NEXT: s_clause 0x1
; GFX11-FLASTSCR-NEXT: scratch_store_b16 v1, v0, off
; GFX11-FLASTSCR-NEXT: scratch_store_b16 v1, v2, off offset:2
; GFX11-FLASTSCR-NEXT: scratch_store_b16 v2, v3, off
; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
Expand Down
258 changes: 168 additions & 90 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll

Large diffs are not rendered by default.

488 changes: 305 additions & 183 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll

Large diffs are not rendered by default.

472 changes: 257 additions & 215 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Large diffs are not rendered by default.

111 changes: 59 additions & 52 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13845,6 +13845,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v41, s0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
Expand Down Expand Up @@ -13883,8 +13885,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s25, 21
; GFX11-NEXT: s_mov_b32 s24, s40
; GFX11-NEXT: s_mov_b32 s25, s41
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
; GFX11-NEXT: s_mov_b32 s26, s42
Expand Down Expand Up @@ -13952,6 +13953,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
Expand Down Expand Up @@ -13993,7 +13995,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
Expand Down Expand Up @@ -14321,6 +14323,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12
; GFX11-NEXT: s_add_i32 s3, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s20, 16
; GFX11-NEXT: v_writelane_b32 v40, s21, 17
; GFX11-NEXT: v_writelane_b32 v40, s22, 18
Expand All @@ -14331,19 +14334,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
; GFX11-NEXT: v_writelane_b32 v40, s24, 20
; GFX11-NEXT: v_mov_b32_e32 v2, s48
; GFX11-NEXT: s_add_i32 s2, s32, 24
; GFX11-NEXT: s_mov_b32 s20, s36
; GFX11-NEXT: s_mov_b32 s21, s37
; GFX11-NEXT: s_mov_b32 s22, s38
; GFX11-NEXT: v_writelane_b32 v40, s25, 21
; GFX11-NEXT: s_mov_b32 s22, s38
; GFX11-NEXT: s_mov_b32 s23, s39
; GFX11-NEXT: s_mov_b32 s24, s40
; GFX11-NEXT: s_mov_b32 s25, s41
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_store_b32 off, v6, s32 offset:24
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
; GFX11-NEXT: s_mov_b32 s26, s42
; GFX11-NEXT: scratch_store_b32 off, v6, s2
; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s27, 23
; GFX11-NEXT: s_mov_b32 s27, s43
; GFX11-NEXT: v_writelane_b32 v40, s28, 24
Expand Down Expand Up @@ -14433,11 +14436,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12
; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2
; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
Expand All @@ -14453,8 +14458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s32 offset:24
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
Expand Down Expand Up @@ -15115,15 +15120,16 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9
; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11
; GFX11-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
; GFX11-NEXT: v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
; GFX11-NEXT: v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
; GFX11-NEXT: v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v41, s0, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1
Expand All @@ -15140,7 +15146,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5
; GFX11-NEXT: v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5
; GFX11-NEXT: v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7
; GFX11-NEXT: v_writelane_b32 v41, s0, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
Expand Down Expand Up @@ -15171,18 +15176,20 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 8
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 11
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 15
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
Expand Down Expand Up @@ -15215,7 +15222,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7
; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
Expand Down Expand Up @@ -15417,19 +15423,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x41700000
; GFX11-NEXT: v_mov_b32_e32 v4, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41200000
; GFX11-NEXT: v_mov_b32_e32 v3, 0x41300000
; GFX11-NEXT: v_mov_b32_e32 v4, 0x41400000
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v6, 0x41600000
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41700000
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v41, s0, 0
; GFX11-NEXT: s_add_i32 s0, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT: v_mov_b32_e32 v6, 1.0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
Expand All @@ -15448,7 +15455,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v29, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX11-NEXT: v_writelane_b32 v41, s0, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
Expand Down Expand Up @@ -15479,18 +15485,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41700000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41000000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x41300000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0x41400000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41500000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41600000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41700000
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
Expand Down Expand Up @@ -15523,7 +15531,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
Expand Down
388 changes: 257 additions & 131 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Large diffs are not rendered by default.

38 changes: 22 additions & 16 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -310,10 +310,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
Expand All @@ -322,33 +322,37 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, s4 nt
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -657,24 +661,24 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_1:
Expand All @@ -684,9 +688,10 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
Expand All @@ -697,9 +702,10 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(5) %out) {
Expand Down
22 changes: 14 additions & 8 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -222,23 +222,27 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX11-WGP-LABEL: private_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_clause 0x1
; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_clause 0x1
; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -480,9 +484,10 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
Expand All @@ -494,9 +499,10 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
Expand Down
18 changes: 12 additions & 6 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,26 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31
; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v6, 16, v0
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v10, 32, v0
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v14, 48, v0
; GCN-SCRATCH-NEXT: s_clause 0x3
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v0, off offset:16
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[10:13], v0, off offset:32
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[14:17], v0, off offset:48
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[6:9], v6, off
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[10:13], v10, off
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[14:17], v14, off
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v1, v18
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v1, 16, v0
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v18, 32, v0
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v19, 48, v0
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(3)
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[2:5], off
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2)
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v1, v[6:9], off
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v18, v[10:13], off
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48
; GCN-SCRATCH-NEXT: scratch_store_dwordx4 v19, v[14:17], off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: s_setpc_b64 s[30:31]
bb:
Expand Down