46 changes: 38 additions & 8 deletions llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2360,6 +2360,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
Expand Down Expand Up @@ -2397,6 +2398,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
Expand Down Expand Up @@ -2794,6 +2796,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
Expand All @@ -2807,7 +2810,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
Expand Down Expand Up @@ -3454,6 +3457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3
; GFX12-NEXT: v_and_b32_e32 v44, 1, v1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s4
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2
Expand All @@ -3467,14 +3471,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5
; GFX12-NEXT: s_and_b32 s7, s2, 1
; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4
; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3
; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3
Expand All @@ -3489,30 +3495,34 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13
; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17
; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5
; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14
; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11
; GFX12-NEXT: v_and_b32_e32 v11, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5
; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7
; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5
; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10
; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5
; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5
; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4
; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
Expand All @@ -3522,7 +3532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2
Expand All @@ -3541,6 +3551,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 1, v17
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23
Expand Down Expand Up @@ -4266,6 +4277,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
Expand Down Expand Up @@ -4311,7 +4323,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016
; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
Expand Down Expand Up @@ -6791,6 +6803,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
Expand All @@ -6808,6 +6821,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
Expand All @@ -6817,6 +6831,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
Expand All @@ -6827,6 +6842,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
Expand All @@ -6842,6 +6858,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
Expand Down Expand Up @@ -7554,6 +7571,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48
; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16
Expand Down Expand Up @@ -8449,6 +8467,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v43, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3
; GFX12-NEXT: v_mov_b32_e32 v66, v1
; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1
Expand All @@ -8457,6 +8476,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016
Expand All @@ -8465,6 +8485,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v45, 1, v12
; GFX12-NEXT: v_and_b32_e32 v41, 1, v16
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
Expand All @@ -8473,6 +8494,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8
; GFX12-NEXT: v_and_b32_e32 v44, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5
Expand All @@ -8483,6 +8505,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
; GFX12-NEXT: v_and_b32_e32 v33, 1, v20
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3
Expand All @@ -8509,6 +8532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
Expand All @@ -8518,6 +8542,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35
; GFX12-NEXT: v_and_b32_e32 v35, 1, v27
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1
; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4
Expand All @@ -8529,6 +8554,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31
; GFX12-NEXT: v_and_b32_e32 v31, 1, v29
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
Expand All @@ -8538,6 +8564,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2
; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2
Expand All @@ -8561,6 +8588,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39
; GFX12-NEXT: v_and_b32_e32 v39, 1, v25
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7
; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5
Expand Down Expand Up @@ -9818,6 +9846,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43
; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v65, s40
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144
Expand Down Expand Up @@ -9903,6 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49
; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87
; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4
; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
Expand Down
37 changes: 34 additions & 3 deletions llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2974,6 +2974,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: s_lshr_b32 s25, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_lshr_b32 s22, s5, 16
Expand Down Expand Up @@ -3464,6 +3465,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_ashr_i32 s25, s6, 16
; GFX12-NEXT: s_sext_i32_i16 s7, s7
; GFX12-NEXT: s_sext_i32_i16 s6, s6
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_ashr_i32 s22, s5, 16
Expand Down Expand Up @@ -5795,10 +5797,10 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_nop 0
Expand Down Expand Up @@ -6030,6 +6032,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
Expand Down Expand Up @@ -6370,23 +6373,27 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
Expand Down Expand Up @@ -6966,36 +6973,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s7, 16
; GFX12-NEXT: s_and_b32 s5, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s6, 16
; GFX12-NEXT: s_and_b32 s5, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
Expand Down Expand Up @@ -8047,76 +8061,91 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s15, s14, 16
; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s14
; GFX12-NEXT: v_mov_b32_e32 v2, s15
; GFX12-NEXT: s_lshr_b32 s14, s13, 16
; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224
; GFX12-NEXT: v_mov_b32_e32 v0, s13
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s14
; GFX12-NEXT: s_lshr_b32 s13, s12, 16
; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s12
; GFX12-NEXT: v_mov_b32_e32 v2, s13
; GFX12-NEXT: s_lshr_b32 s12, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s12
; GFX12-NEXT: s_lshr_b32 s11, s10, 16
; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s10
; GFX12-NEXT: v_mov_b32_e32 v2, s11
; GFX12-NEXT: s_lshr_b32 s10, s9, 16
; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160
; GFX12-NEXT: v_mov_b32_e32 v0, s9
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s9, s8, 16
; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: s_lshr_b32 s8, s7, 16
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_lshr_b32 s7, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, s7
; GFX12-NEXT: s_lshr_b32 s6, s5, 16
; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
Expand Down Expand Up @@ -8926,6 +8955,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61
; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59
; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57
; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55
Expand All @@ -8937,6 +8967,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192
; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44
; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4390,6 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22
; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23
; GFX12-NEXT: v_mov_b32_e32 v5, s56
Expand Down
108 changes: 94 additions & 14 deletions llvm/test/CodeGen/AMDGPU/load-constant-i8.ll

Large diffs are not rendered by default.

113 changes: 81 additions & 32 deletions llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll

Large diffs are not rendered by default.

80 changes: 60 additions & 20 deletions llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -816,13 +816,15 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16:
Expand Down Expand Up @@ -1129,13 +1131,15 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset:
Expand Down Expand Up @@ -1450,12 +1454,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16:
Expand Down Expand Up @@ -1752,12 +1758,14 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset:
Expand Down Expand Up @@ -2053,13 +2061,15 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4:
Expand Down Expand Up @@ -2297,12 +2307,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4:
Expand Down Expand Up @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16:
Expand Down Expand Up @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset:
Expand Down Expand Up @@ -3272,12 +3288,14 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16:
Expand Down Expand Up @@ -3616,12 +3634,14 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset:
Expand Down Expand Up @@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
Expand Down Expand Up @@ -4252,12 +4274,14 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
Expand Down Expand Up @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16:
Expand Down Expand Up @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset:
Expand Down Expand Up @@ -5073,12 +5101,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16:
Expand Down Expand Up @@ -5334,12 +5364,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset:
Expand Down Expand Up @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16:
Expand Down Expand Up @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset:
Expand Down Expand Up @@ -6370,12 +6406,14 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16:
Expand Down Expand Up @@ -6733,12 +6771,14 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
Expand Down
80 changes: 60 additions & 20 deletions llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
Original file line number Diff line number Diff line change
Expand Up @@ -816,13 +816,15 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16:
Expand Down Expand Up @@ -1129,13 +1131,15 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset:
Expand Down Expand Up @@ -1450,12 +1454,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16:
Expand Down Expand Up @@ -1752,12 +1758,14 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset:
Expand Down Expand Up @@ -2053,13 +2061,15 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4:
Expand Down Expand Up @@ -2297,12 +2307,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4:
Expand Down Expand Up @@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16:
Expand Down Expand Up @@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset:
Expand Down Expand Up @@ -3272,12 +3288,14 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16:
Expand Down Expand Up @@ -3616,12 +3634,14 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset:
Expand Down Expand Up @@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
Expand Down Expand Up @@ -4252,12 +4274,14 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
Expand Down Expand Up @@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16:
Expand Down Expand Up @@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset:
Expand Down Expand Up @@ -5073,12 +5101,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16:
Expand Down Expand Up @@ -5334,12 +5364,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset:
Expand Down Expand Up @@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16:
Expand Down Expand Up @@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset:
Expand Down Expand Up @@ -6370,12 +6406,14 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16:
Expand Down Expand Up @@ -6733,12 +6771,14 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
Expand Down
120 changes: 90 additions & 30 deletions llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
Expand Down Expand Up @@ -149,6 +149,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_add_co_i32 s2, s2, -1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ define amdgpu_kernel void @caller() {
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-SDAG-NEXT: s_endpgm
;
Expand All @@ -212,6 +213,7 @@ define amdgpu_kernel void @caller() {
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
Expand Down Expand Up @@ -276,9 +278,10 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ define amdgpu_cs void @caller() {
; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
;
Expand All @@ -116,6 +117,7 @@ define amdgpu_cs void @caller() {
; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
Expand Down Expand Up @@ -182,9 +184,10 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
Expand Down
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
Original file line number Diff line number Diff line change
Expand Up @@ -106,26 +106,31 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_bitset0_b32 s0, 0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
Expand Down Expand Up @@ -313,10 +318,12 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
Expand All @@ -327,8 +334,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
Expand Down Expand Up @@ -530,28 +539,33 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s33
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_bitset0_b32 s0, 0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
Expand Down Expand Up @@ -745,6 +759,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s32
Expand All @@ -756,8 +771,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
Expand Down Expand Up @@ -911,6 +928,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s32
Expand All @@ -921,8 +939,10 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
Expand Down Expand Up @@ -1092,6 +1112,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
Expand All @@ -1103,10 +1124,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
Expand Down Expand Up @@ -1292,6 +1315,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s33
Expand All @@ -1303,10 +1327,12 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
Expand Down Expand Up @@ -1492,9 +1518,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
Expand All @@ -1509,8 +1537,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: v_readlane_b32 s59, v2, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
Expand Down Expand Up @@ -1710,10 +1740,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s1
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
Expand All @@ -1728,8 +1760,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: v_readlane_b32 s59, v2, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
Expand Down
17 changes: 14 additions & 3 deletions llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
; GFX12-NEXT: v_mov_b32_e32 v0, s32
Expand Down Expand Up @@ -711,13 +712,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s32, 0
; GFX12-NEXT: v_writelane_b32 v23, s59, 28
; GFX12-NEXT: s_bitset0_b32 s32, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s32
; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s32, 0
; GFX12-NEXT: s_bitset0_b32 s32, 0
; GFX12-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -754,8 +756,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: v_readlane_b32 s30, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca i32, align 4, addrspace(5)
Expand Down Expand Up @@ -1396,6 +1400,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v21, s30, 0
; GFX12-NEXT: v_writelane_b32 v21, s31, 1
Expand Down Expand Up @@ -1466,8 +1471,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: v_readlane_b32 s30, v21, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 16, addrspace(5)

Expand Down Expand Up @@ -2196,16 +2203,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v23, s31, 1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v1
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX12-NEXT: v_writelane_b32 v23, s33, 2
; GFX12-NEXT: v_writelane_b32 v23, s34, 3
Expand Down Expand Up @@ -2271,8 +2280,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: v_readlane_b32 s30, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca [4096 x i32], align 4, addrspace(5)
Expand Down
156 changes: 156 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
Expand All @@ -29,18 +30,23 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-NEXT: s_mov_b32 s2, 2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-NEXT: s_mov_b32 s2, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s3, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_mov_b32 s2, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
Expand All @@ -64,6 +70,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
Expand All @@ -88,6 +95,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
Expand Down
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down Expand Up @@ -475,18 +477,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
Expand All @@ -504,18 +511,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
Expand Down Expand Up @@ -688,6 +700,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -703,6 +716,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down Expand Up @@ -1007,17 +1021,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
Expand All @@ -1036,17 +1055,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
Expand Down Expand Up @@ -1224,6 +1248,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -1242,6 +1267,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down
156 changes: 156 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll

Large diffs are not rendered by default.

156 changes: 156 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -128,6 +129,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down Expand Up @@ -329,18 +331,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
Expand All @@ -361,18 +368,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
Expand Down Expand Up @@ -498,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -518,6 +531,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down Expand Up @@ -727,17 +741,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
Expand All @@ -761,17 +780,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
Expand Down Expand Up @@ -896,6 +920,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
Expand All @@ -914,6 +939,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
Expand Down
Loading