diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index d00a446ae0397..c0590ce38f28c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -29,6 +29,11 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -38,6 +43,7 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -49,12 +55,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_kernel: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -65,6 +74,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -93,6 +105,11 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_kernel: @@ -107,6 +124,11 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: @@ -132,10 +154,14 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:52 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:36 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_kernel: @@ -150,6 +176,7 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -161,12 +188,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_kernel: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -177,11 +207,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -201,6 +235,11 @@ define void @zero_init_foo() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -209,6 +248,7 @@ define void @zero_init_foo() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -220,6 +260,9 @@ define void @zero_init_foo() { ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -228,7 +271,7 @@ define void @zero_init_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s32 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -239,6 +282,9 @@ define void @zero_init_foo() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -257,6 +303,11 @@ define void @zero_init_foo() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -273,6 +324,11 @@ define void @zero_init_foo() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -281,6 +337,7 @@ define void @zero_init_foo() { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -292,6 +349,9 @@ define void @zero_init_foo() { ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -300,7 +360,7 @@ define void @zero_init_foo() { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_mov_b32_e32 v4, s32 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -311,26 +371,15 @@ define void @zero_init_foo() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: zero_init_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -351,6 +400,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_kernel: @@ -371,6 +424,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_kernel: @@ -387,6 +444,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: @@ -409,6 +470,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_kernel: @@ -425,6 +490,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: @@ -450,6 +519,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: @@ -466,22 +539,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_sindex_kernel: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 15 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s1, s0, 2 -; GCN-NEXT: s_and_b32 s0, s0, 15 -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_add_u32 s1, 4, s1 -; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_add_u32 s0, 4, s0 -; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -492,6 +554,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -510,6 +573,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_foo: @@ -528,6 +595,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_foo: @@ -542,6 +613,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_foo: @@ -563,6 +638,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_foo: @@ -577,6 +656,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_foo: @@ -600,6 +683,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_foo: @@ -614,20 +701,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_sindex_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_lshl_b32 s1, s0, 2 -; GCN-NEXT: s_and_b32 s0, s0, 15 -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_add_u32 s1, 4, s1 -; GCN-NEXT: v_mov_b32_e32 v0, 15 -; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_add_u32 s0, 4, s0 -; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -638,6 +716,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -654,6 +733,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_kernel: @@ -670,6 +753,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: @@ -681,6 +768,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: @@ -700,6 +791,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_kernel: @@ -711,6 +806,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: @@ -732,6 +831,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: @@ -743,17 +846,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_vindex_kernel: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_u32_e32 v0, 4, v0 -; GCN-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -766,6 +863,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -782,6 +880,9 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_foo: @@ -796,6 +897,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_foo: @@ -810,6 +915,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_foo: @@ -824,6 +933,9 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_foo: @@ -837,6 +949,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_foo: @@ -851,6 +967,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_foo: @@ -865,19 +985,11 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -888,6 +1000,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -951,13 +1064,6 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: private_ptr_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 store float 1.000000e+01, float addrspace(5)* %gep, align 4 ret void @@ -987,6 +1093,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -998,6 +1113,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1005,10 +1121,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_small_offset_kernel: @@ -1016,7 +1139,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -1027,6 +1150,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1058,6 +1187,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_small_offset_kernel: @@ -1074,6 +1212,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: @@ -1103,9 +1250,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:260 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:276 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v5 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: @@ -1122,6 +1277,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -1129,10 +1285,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v5 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_small_offset_kernel: @@ -1140,7 +1303,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -1151,6 +1314,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1159,6 +1328,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1180,6 +1351,15 @@ define void @zero_init_small_offset_foo() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,6 +1370,7 @@ define void @zero_init_small_offset_foo() { ; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1197,10 +1378,18 @@ define void @zero_init_small_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s32 +; GFX10-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1211,17 +1400,24 @@ define void @zero_init_small_offset_foo() { ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1242,6 +1438,15 @@ define void @zero_init_small_offset_foo() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1260,6 +1465,16 @@ define void @zero_init_small_offset_foo() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1270,6 +1485,7 @@ define void @zero_init_small_offset_foo() { ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -1277,10 +1493,18 @@ define void @zero_init_small_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v5 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1291,42 +1515,34 @@ define void @zero_init_small_offset_foo() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: zero_init_small_offset_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1349,6 +1565,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: @@ -1361,6 +1585,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -1371,6 +1596,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: @@ -1378,7 +1610,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -1389,6 +1621,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1414,6 +1653,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: @@ -1432,6 +1679,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1447,6 +1702,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1460,6 +1716,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1477,6 +1740,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1487,6 +1751,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1494,7 +1765,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1505,6 +1776,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1519,6 +1797,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1540,6 +1820,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_foo: @@ -1560,13 +1848,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -1576,6 +1872,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1600,6 +1903,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_foo: @@ -1616,6 +1927,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1642,6 +1961,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1667,13 +1994,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -1683,8 +2018,15 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_endpgm -bb: +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: s_endpgm +bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef @@ -1697,6 +2039,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1716,6 +2060,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: @@ -1734,6 +2086,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: @@ -1744,8 +2104,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1768,6 +2136,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_small_offset_kernel: @@ -1781,6 +2157,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1805,6 +2189,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1828,6 +2220,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1838,8 +2238,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1856,6 +2264,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1875,6 +2285,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_small_offset_foo: @@ -1882,17 +2299,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0 ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_small_offset_foo: @@ -1900,6 +2326,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1908,6 +2335,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1925,6 +2359,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_small_offset_foo: @@ -1940,6 +2381,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1947,17 +2397,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v1 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1965,6 +2424,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1973,21 +2433,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_small_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -2001,6 +2454,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2028,6 +2483,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_large_offset_kernel: @@ -2050,10 +2514,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX10-NEXT: s_movk_i32 s2, 0x4004 ; GFX10-NEXT: s_movk_i32 s1, 0x4004 ; GFX10-NEXT: s_movk_i32 s0, 0x4004 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_large_offset_kernel: @@ -2070,11 +2542,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2106,6 +2585,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_large_offset_kernel: @@ -2126,6 +2614,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: @@ -2155,9 +2652,17 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v5 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: @@ -2185,10 +2690,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s1, 0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v5 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_large_offset_kernel: @@ -2205,11 +2718,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2218,6 +2738,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -2235,14 +2757,24 @@ define void @zero_init_large_offset_foo() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX9-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2253,6 +2785,7 @@ define void @zero_init_large_offset_foo() { ; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -2260,14 +2793,23 @@ define void @zero_init_large_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_add_i32 s3, s32, 4 ; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2278,21 +2820,29 @@ define void @zero_init_large_offset_foo() { ; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s4, s32, 4 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2309,14 +2859,24 @@ define void @zero_init_large_offset_foo() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX9-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2331,14 +2891,25 @@ define void @zero_init_large_offset_foo() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX940-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2349,6 +2920,7 @@ define void @zero_init_large_offset_foo() { ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -2356,14 +2928,23 @@ define void @zero_init_large_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: s_add_i32 s3, s32, 4 ; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s4 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v5 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,21 +2955,29 @@ define void @zero_init_large_offset_foo() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 s4, s32, 4 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-PAL-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2397,6 +2986,8 @@ define void @zero_init_large_offset_foo() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -2419,6 +3010,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: @@ -2431,6 +3030,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -2441,6 +3041,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: @@ -2448,7 +3055,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -2459,6 +3066,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2484,6 +3098,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: @@ -2502,6 +3124,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2517,6 +3147,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2530,6 +3161,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2547,6 +3185,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -2557,6 +3196,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2564,7 +3210,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -2575,6 +3221,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2589,6 +3242,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2610,6 +3265,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_foo: @@ -2630,13 +3293,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -2646,6 +3317,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2670,6 +3348,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_foo: @@ -2686,6 +3372,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2712,6 +3406,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2737,13 +3439,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -2753,6 +3463,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2767,6 +3484,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2786,6 +3505,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: @@ -2804,6 +3531,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: @@ -2815,8 +3550,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2839,6 +3582,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_large_offset_kernel: @@ -2853,6 +3604,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2877,6 +3636,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2900,6 +3667,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2911,8 +3686,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2929,6 +3712,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2938,8 +3723,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_lo ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 @@ -2948,6 +3733,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_large_offset_foo: @@ -2955,17 +3748,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s1 ; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 s0, s32, 4 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_large_offset_foo: @@ -2973,17 +3776,25 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s0, s32, 4 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, s2 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -2991,8 +3802,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 @@ -3001,6 +3812,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_large_offset_foo: @@ -3010,14 +3829,24 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 +; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3025,17 +3854,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s1 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 s0, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v1 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3043,34 +3882,26 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s0, s32, 4 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s2 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_large_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GCN-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GCN-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -3084,6 +3915,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3103,6 +3936,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: @@ -3121,6 +3958,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: @@ -3133,6 +3974,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3155,6 +4000,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_large_imm_offset_kernel: @@ -3168,6 +4017,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3192,6 +4045,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3215,6 +4072,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3227,6 +4088,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -3236,6 +4101,7 @@ bb: store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -3245,15 +4111,20 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi +; GFX9-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_large_imm_offset_foo: @@ -3263,14 +4134,19 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-NEXT: s_add_i32 s1, s32, 4 +; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_large_imm_offset_foo: @@ -3279,12 +4155,17 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3292,15 +4173,20 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi +; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_large_imm_offset_foo: @@ -3315,6 +4201,11 @@ define void @store_load_large_imm_offset_foo() { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3324,14 +4215,19 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3340,26 +4236,18 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_large_imm_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 13 -; GCN-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0x3000 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -3368,6 +4256,7 @@ bb: store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -3378,14 +4267,17 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-NEXT: scratch_store_dword v0, v2, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vidx_sidx_offset: @@ -3403,6 +4295,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: @@ -3415,6 +4311,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3424,17 +4324,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-PAL-NEXT: scratch_store_dword v0, v2, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vidx_sidx_offset: @@ -3447,6 +4350,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3469,6 +4376,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3481,18 +4392,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_vidx_sidx_offset: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -3501,6 +4405,7 @@ bb: %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 store volatile i32 15, i32 addrspace(5)* %gep, align 4 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 + call void asm sideeffect "; use $0", "s"([32 x i32] addrspace(5)* %alloca) #0 ret void } @@ -3583,16 +4488,6 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_i64_aligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 @@ -3678,16 +4573,6 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_i64_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 @@ -3780,17 +4665,6 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) ; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_v3i32_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, 2 -; GCN-NEXT: v_mov_b32_e32 v4, 3 -; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 @@ -3888,18 +4762,6 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_v4i32_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, 2 -; GCN-NEXT: v_mov_b32_e32 v4, 3 -; GCN-NEXT: v_mov_b32_e32 v5, 4 -; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll index 6be01670a7beb..3e1cb1ffb2995 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -10,6 +10,7 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) { ; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 ; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off +; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; MUBUF-NEXT: s_waitcnt vmcnt(2) ; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 @@ -23,6 +24,9 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) { ; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use v0 +; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -32,17 +36,22 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) { ; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 ; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off +; FLATSCR-NEXT: v_mov_b32_e32 v0, s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v0 +; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false) + call void asm sideeffect "; use $0", "s"([40 x i8] addrspace(5)* %alloca) #0 ret void }