115 changes: 54 additions & 61 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s3
Expand All @@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s6
; GFX10-NEXT: v_mov_b32_e32 v1, s7
Expand All @@ -30,7 +30,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
;
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-LABEL: update_dppi64_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -64,25 +64,24 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -102,7 +101,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-LABEL: update_dppf64_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -120,25 +119,24 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -158,7 +156,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
; GFX8-LABEL: update_dppv2i32_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -176,7 +174,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
Expand All @@ -190,11 +188,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
;
; GFX11-LABEL: update_dppv2i32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -214,7 +211,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
; GFX8-LABEL: update_dppv2f32_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -232,7 +229,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
Expand All @@ -246,11 +243,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
;
; GFX11-LABEL: update_dppv2f32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -270,7 +266,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -288,25 +284,24 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -326,7 +321,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p3_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -341,11 +336,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_read_b32 v1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand All @@ -354,8 +349,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
Expand All @@ -377,11 +371,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
; GFX8-LABEL: update_dpp_p5_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s90, -1
; GFX8-NEXT: s_mov_b32 s91, 0xe80000
; GFX8-NEXT: s_add_u32 s88, s88, s9
; GFX8-NEXT: s_add_u32 s88, s88, s3
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -396,27 +390,26 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31c16000
; GFX10-NEXT: s_add_u32 s4, s4, s3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s14, -1
; GFX10-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-NEXT: s_add_u32 s12, s12, s9
; GFX10-NEXT: s_addc_u32 s13, s13, 0
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
Expand Down
20 changes: 15 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -1054,7 +1056,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_i96_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -1091,7 +1095,9 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -1128,7 +1134,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -1166,7 +1174,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s1
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
Expand Down
124 changes: 53 additions & 71 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,23 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down Expand Up @@ -58,13 +56,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-LABEL: v_mul_i64_zext_src1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: global_load_dword v4, v3, s[0:1]
; GFX10-NEXT: global_load_dword v4, v3, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0
; GFX10-NEXT: v_mov_b32_e32 v0, v3
Expand All @@ -77,10 +75,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-LABEL: v_mul_i64_zext_src1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -112,13 +108,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-LABEL: v_mul_i64_zext_src0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
; GFX10-NEXT: v_mov_b32_e32 v0, v3
Expand All @@ -131,10 +127,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-LABEL: v_mul_i64_zext_src0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -166,13 +160,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX10-LABEL: v_mul_i64_zext_src0_src1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
Expand All @@ -182,12 +176,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-LABEL: v_mul_i64_zext_src0_src1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
Expand Down Expand Up @@ -215,13 +207,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX10-LABEL: v_mul_i64_masked_src0_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v4, v2, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
; GFX10-NEXT: v_mov_b32_e32 v0, v3
Expand All @@ -234,10 +226,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down Expand Up @@ -269,13 +259,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX10-LABEL: v_mul_i64_masked_src0_lo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -286,10 +276,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX11-LABEL: v_mul_i64_masked_src0_lo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down Expand Up @@ -319,16 +307,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX10-LABEL: v_mul_i64_masked_src1_lo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: ; kill: killed $vgpr3
; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX10-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
; GFX10-NEXT: ; kill: killed $sgpr2_sgpr3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v1, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -338,10 +326,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX11-LABEL: v_mul_i64_masked_src1_lo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down Expand Up @@ -369,17 +355,17 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -403,13 +389,13 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX10-LABEL: v_mul_i64_partially_masked_src0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -426,10 +412,8 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down Expand Up @@ -466,17 +450,17 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -514,13 +498,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX10-LABEL: v_mul64_masked_before_and_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
Expand Down Expand Up @@ -549,10 +533,8 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-LABEL: v_mul64_masked_before_and_in_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
Expand Down
122 changes: 61 additions & 61 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2542,7 +2542,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_zext_with_sregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
Expand All @@ -2559,52 +2559,52 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
Expand All @@ -2619,7 +2619,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
Expand Down Expand Up @@ -2718,7 +2718,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_sext_with_sregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
Expand All @@ -2738,61 +2738,61 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: s_mulk_i32 s3, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_add_u32 s3, s3, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: s_mulk_i32 s1, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_add_u32 s1, s1, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: s_mulk_i32 s4, 0x50
; GFX9-NEXT: s_add_u32 s3, s4, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: s_mulk_i32 s2, 0x50
; GFX9-NEXT: s_add_u32 s1, s2, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
; GFX10-NEXT: s_mulk_i32 s3, 0x50
; GFX10-NEXT: s_mulk_i32 s2, 0x50
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
; GFX10-NEXT: s_mulk_i32 s1, 0x50
; GFX10-NEXT: s_mulk_i32 s0, 0x50
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
Expand All @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
Expand Down
930 changes: 465 additions & 465 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Large diffs are not rendered by default.

823 changes: 411 additions & 412 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll

Large diffs are not rendered by default.

188 changes: 94 additions & 94 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

Large diffs are not rendered by default.

476 changes: 234 additions & 242 deletions llvm/test/CodeGen/AMDGPU/add.ll

Large diffs are not rendered by default.

284 changes: 132 additions & 152 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll

Large diffs are not rendered by default.

34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -41,49 +41,49 @@ define void @test1() {
define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-LABEL: test2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-NEXT: s_cmp_lt_i32 s2, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lt_i32 s0, 1
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
Expand Down
3,304 changes: 1,650 additions & 1,654 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_lshl_b32 [[A]], [[A]], 1
; SI: s_lshl_b32 [[B]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
Expand Down Expand Up @@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad

; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AMDGPU/anyext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
; GCN-LABEL: anyext_i1_i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s4, s[2:3], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -22,8 +22,8 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
;
; GFX8-LABEL: anyext_i1_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -37,17 +37,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
;
; GFX9-LABEL: anyext_i1_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_cmp_eq_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp = icmp eq i32 %cond, 0
Expand All @@ -62,8 +62,8 @@ entry:
define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
; GCN-LABEL: s_anyext_i16_i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s15, s11
Expand All @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: s_anyext_i16_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s7
Expand All @@ -113,13 +113,13 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX9-LABEL: s_anyext_i16_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v3, v1, s[0:1]
; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down
1,717 changes: 766 additions & 951 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Large diffs are not rendered by default.

6,994 changes: 1,684 additions & 5,310 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Large diffs are not rendered by default.

20,184 changes: 3,708 additions & 16,476 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Large diffs are not rendered by default.

1,364 changes: 608 additions & 756 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

Large diffs are not rendered by default.

1,514 changes: 684 additions & 830 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
;
; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
Expand All @@ -33,7 +33,7 @@ entry:
define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32
;
; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
Expand All @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
Expand All @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
Expand All @@ -92,15 +92,15 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
Expand All @@ -114,7 +114,7 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
Expand All @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac
;
; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
Expand All @@ -140,11 +140,11 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
Expand All @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
Expand All @@ -175,7 +175,7 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
Expand All @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
Expand All @@ -201,7 +201,7 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
Expand All @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr,
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
Expand All @@ -227,7 +227,7 @@ entry:
define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
Expand All @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in,
;
; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/bfe-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
Expand All @@ -24,11 +24,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
;
; VI-SDWA-LABEL: bfe_combine8:
; VI-SDWA: ; %bb.0:
; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
Expand All @@ -42,13 +42,13 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
;
; CI-LABEL: bfe_combine8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s4, s[2:3], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_mov_b32 s7, s3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 6, v0
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: v_and_b32_e32 v0, 0x3fc, v0
Expand All @@ -71,11 +71,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_bfe_u32 v0, v0, 16, 16
; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
Expand All @@ -91,11 +91,11 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x
;
; VI-SDWA-LABEL: bfe_combine16:
; VI-SDWA: ; %bb.0:
; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0
; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
Expand All @@ -111,13 +111,13 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x
;
; CI-LABEL: bfe_combine16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s4, s[2:3], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; CI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
Expand Down
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
;
; VI-LABEL: s_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
Expand All @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
;
; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
Expand Down Expand Up @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -329,7 +329,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
;
; VI-LABEL: s_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
Expand All @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Expand All @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
;
; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
Expand Down Expand Up @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i
define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[6:7], 0x0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand All @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out,
;
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[6:7], 0x0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand All @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out,
define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[6:7], 0x0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand All @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou
;
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[6:7], 0x0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand Down Expand Up @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou
define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[6:7], 0x0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand All @@ -509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out,
;
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[6:7], 0x0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
Expand Down
236 changes: 121 additions & 115 deletions llvm/test/CodeGen/AMDGPU/bfi_int.ll

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/bfm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
; SI-LABEL: s_bfm_pattern:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfm_b32 s2, s2, s3
Expand All @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y)
;
; VI-LABEL: s_bfm_pattern:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfm_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -36,22 +36,22 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y)
define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 {
; SI-LABEL: s_bfm_pattern_simple:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfm_b32 s4, s4, 0
; SI-NEXT: s_bfm_b32 s4, s2, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_bfm_pattern_simple:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfm_b32 s2, s4, 0
; VI-NEXT: s_bfm_b32 s2, s2, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand Down
162 changes: 73 additions & 89 deletions llvm/test/CodeGen/AMDGPU/bitreverse.ll

Large diffs are not rendered by default.

64 changes: 32 additions & 32 deletions llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@
define amdgpu_kernel void @br_cc_f16(
; SI-LABEL: br_cc_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
Expand All @@ -29,28 +29,28 @@ define amdgpu_kernel void @br_cc_f16(
; SI-NEXT: .LBB0_2: ; %two
; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NEXT: .LBB0_3: ; %one
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: br_cc_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: s_cbranch_vccnz .LBB0_2
; VI-NEXT: ; %bb.1: ; %one
Expand All @@ -63,8 +63,8 @@ define amdgpu_kernel void @br_cc_f16(
; GFX11-LABEL: br_cc_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[8:9], s[2:3], 0x34
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s2
Expand Down Expand Up @@ -111,7 +111,7 @@ two:
define amdgpu_kernel void @br_cc_f16_imm_a(
; SI-LABEL: br_cc_f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -137,7 +137,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
;
; VI-LABEL: br_cc_f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -157,7 +157,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
;
; GFX11-LABEL: br_cc_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -195,7 +195,7 @@ two:
define amdgpu_kernel void @br_cc_f16_imm_b(
; SI-LABEL: br_cc_f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -221,7 +221,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
;
; VI-LABEL: br_cc_f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -243,7 +243,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
;
; GFX11-LABEL: br_cc_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AMDGPU/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ declare i48 @llvm.bswap.i48(i48) #1
define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s4, s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
Expand All @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX11-LABEL: test_bswap_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
Expand All @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -87,7 +87,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
Expand All @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX11-LABEL: test_bswap_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
Expand All @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
Expand All @@ -166,7 +166,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX11-LABEL: test_bswap_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
Expand All @@ -189,7 +189,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand Down Expand Up @@ -226,7 +226,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v8i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v4, 0x10203
; VI-NEXT: s_mov_b32 s15, 0xf000
; VI-NEXT: s_mov_b32 s14, -1
Expand All @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX11-LABEL: test_bswap_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
Expand Down Expand Up @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
Expand All @@ -312,7 +312,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX11-LABEL: test_bswap_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
Expand All @@ -333,7 +333,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -357,7 +357,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
Expand All @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; GFX11-LABEL: test_bswap_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
Expand All @@ -398,7 +398,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
Expand Down Expand Up @@ -435,7 +435,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v4, 0x10203
; VI-NEXT: s_mov_b32 s15, 0xf000
; VI-NEXT: s_mov_b32 s14, -1
Expand All @@ -458,7 +458,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
;
; GFX11-LABEL: test_bswap_v4i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
Expand Down
Loading