diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7003a40a940aa..9446144d30e9b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2126,6 +2126,8 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, Feature45BitNumRecordsBufferResource, + FeatureSupportsXNACK, + FeatureXNACK, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 41fda6de82181..efa51ead0d196 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -90,26 +90,24 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 -; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v13, v[0:1], off offset:10 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa ; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 -; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1 -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4 -; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v7, 24, v12 :: v_dual_lshlrev_b32 v8, 16, v13 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3 +; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6 ; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -942,7 +940,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX1250-NOUNALIGNED: ; %bb.0: -; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa +; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2 @@ -954,27 +952,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa -; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s13, s[0:1], 0x8 ; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s5, 8 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s2 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s6, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s7, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s8, 8 ; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s9, 24 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s12, 16 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s3, s11 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s5 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s6, s13 ; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1: @@ -1351,11 +1348,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v3i32_align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align4: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align4: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v3i32_align4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: @@ -1388,11 +1399,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg } define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_i96_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_i96_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_i96_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_i96_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: @@ -1425,11 +1450,25 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v3i32_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v3i32_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: @@ -1462,11 +1501,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v6i16_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v6i16_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v6i16_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v6i16_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: @@ -1500,24 +1553,64 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg } define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v12i8_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s13, s0, 8 -; GFX12-NEXT: s_lshr_b32 s12, s0, 16 -; GFX12-NEXT: s_lshr_b32 s3, s0, 24 -; GFX12-NEXT: s_lshr_b32 s5, s1, 8 -; GFX12-NEXT: s_lshr_b32 s6, s1, 16 -; GFX12-NEXT: s_lshr_b32 s7, s1, 24 -; GFX12-NEXT: s_lshr_b32 s9, s2, 8 -; GFX12-NEXT: s_lshr_b32 s10, s2, 16 -; GFX12-NEXT: s_lshr_b32 s11, s2, 24 -; GFX12-NEXT: s_mov_b32 s4, s1 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s1, s13 -; GFX12-NEXT: s_mov_b32 s2, s12 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v12i8_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s13, s0, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s3, s0, 24 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s5, s1, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s6, s1, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s7, s1, 24 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s9, s2, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s11, s2, 24 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s4, s1 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s8, s2 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s1, s13 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s2, s12 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v12i8_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s13, s0, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s3, s0, 24 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s5, s1, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s6, s1, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s7, s1, 24 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s9, s2, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s11, s2, 24 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s4, s1 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s8, s2 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s1, s13 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s2, s12 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v12i8_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s13, s0, 8 +; GFX1250-NEXT: s_lshr_b32 s12, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s0, 24 +; GFX1250-NEXT: s_lshr_b32 s5, s1, 8 +; GFX1250-NEXT: s_lshr_b32 s6, s1, 16 +; GFX1250-NEXT: s_lshr_b32 s7, s1, 24 +; GFX1250-NEXT: s_lshr_b32 s9, s2, 8 +; GFX1250-NEXT: s_lshr_b32 s10, s2, 16 +; GFX1250-NEXT: s_lshr_b32 s11, s2, 24 +; GFX1250-NEXT: s_mov_b32 s4, s1 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s1, s13 +; GFX1250-NEXT: s_mov_b32 s2, s12 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v12i8_align8: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 94ba5cdd09df4..6b5647e696356 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -569,10 +569,10 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 +; GFX1250-NEXT: global_load_b128 v[0:3], v[8:9], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[8:9], off offset:16 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr @@ -752,12 +752,12 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 +; GFX1250-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_mov_b32 v16, v0 ; GFX1250-NEXT: s_clause 0x3 -; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 -; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 -; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 +; GFX1250-NEXT: global_load_b128 v[0:3], v[16:17], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[16:17], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[16:17], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[16:17], off offset:48 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr @@ -1055,16 +1055,16 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 +; GFX1250-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 ; GFX1250-NEXT: s_clause 0x7 -; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 -; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 -; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 -; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 -; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 -; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 -; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 +; GFX1250-NEXT: global_load_b128 v[0:3], v[32:33], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[32:33], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[32:33], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[32:33], off offset:48 +; GFX1250-NEXT: global_load_b128 v[16:19], v[32:33], off offset:64 +; GFX1250-NEXT: global_load_b128 v[20:23], v[32:33], off offset:80 +; GFX1250-NEXT: global_load_b128 v[24:27], v[32:33], off offset:96 +; GFX1250-NEXT: global_load_b128 v[28:31], v[32:33], off offset:112 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <64 x bfloat>, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ddd3b1520bf5e..363a248ead8d5 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2700,142 +2700,142 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v32i8: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16 ; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshr_b32 s16, s0, 16 -; GFX1250-NEXT: s_lshr_b32 s17, s0, 24 -; GFX1250-NEXT: s_lshr_b32 s20, s2, 16 -; GFX1250-NEXT: s_lshr_b32 s21, s2, 24 -; GFX1250-NEXT: s_lshr_b32 s14, s7, 16 -; GFX1250-NEXT: s_lshr_b32 s15, s7, 24 -; GFX1250-NEXT: s_bfe_u32 s27, s7, 0x80008 +; GFX1250-NEXT: s_lshr_b32 s16, s8, 16 +; GFX1250-NEXT: s_lshr_b32 s17, s8, 24 +; GFX1250-NEXT: s_lshr_b32 s6, s15, 16 +; GFX1250-NEXT: s_lshr_b32 s7, s15, 24 +; GFX1250-NEXT: s_bfe_u32 s27, s15, 0x80008 ; GFX1250-NEXT: s_add_co_i32 s17, s17, s17 ; GFX1250-NEXT: s_add_co_i32 s16, s16, s16 -; GFX1250-NEXT: s_lshr_b32 s18, s1, 16 -; GFX1250-NEXT: s_lshr_b32 s19, s1, 24 -; GFX1250-NEXT: s_lshr_b32 s22, s3, 16 -; GFX1250-NEXT: s_lshr_b32 s23, s3, 24 -; GFX1250-NEXT: s_bfe_u32 s29, s1, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x80008 -; GFX1250-NEXT: s_add_co_i32 s21, s21, s21 -; GFX1250-NEXT: s_add_co_i32 s20, s20, s20 ; GFX1250-NEXT: s_lshl_b32 s17, s17, 8 ; GFX1250-NEXT: s_and_b32 s16, s16, 0xff -; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 -; GFX1250-NEXT: s_add_co_i32 s27, s27, s27 ; GFX1250-NEXT: s_add_co_i32 s15, s15, s15 -; GFX1250-NEXT: s_add_co_i32 s14, s14, s14 -; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s27, s27, s27 +; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 +; GFX1250-NEXT: s_or_b32 s16, s16, s17 +; GFX1250-NEXT: s_and_b32 s15, s15, 0xff +; GFX1250-NEXT: s_lshl_b32 s17, s27, 8 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 8 +; GFX1250-NEXT: s_and_b32 s6, s6, 0xff +; GFX1250-NEXT: s_or_b32 s15, s15, s17 +; GFX1250-NEXT: s_or_b32 s6, s6, s7 +; GFX1250-NEXT: s_bfe_u32 s26, s14, 0x80008 +; GFX1250-NEXT: s_and_b32 s7, s15, 0xffff +; GFX1250-NEXT: s_lshl_b32 s6, s6, 16 +; GFX1250-NEXT: s_lshr_b32 s20, s10, 16 +; GFX1250-NEXT: s_lshr_b32 s21, s10, 24 +; GFX1250-NEXT: s_lshr_b32 s4, s14, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s14, 24 +; GFX1250-NEXT: s_or_b32 s6, s7, s6 +; GFX1250-NEXT: s_add_co_i32 s7, s14, s14 +; GFX1250-NEXT: s_add_co_i32 s26, s26, s26 +; GFX1250-NEXT: s_lshr_b32 s18, s9, 16 +; GFX1250-NEXT: s_lshr_b32 s19, s9, 24 +; GFX1250-NEXT: s_lshr_b32 s22, s11, 16 +; GFX1250-NEXT: s_lshr_b32 s23, s11, 24 +; GFX1250-NEXT: s_bfe_u32 s29, s9, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s30, s11, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s21, s21, s21 +; GFX1250-NEXT: s_add_co_i32 s20, s20, s20 +; GFX1250-NEXT: s_lshr_b32 s2, s13, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s13, 24 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xff +; GFX1250-NEXT: s_lshl_b32 s14, s26, 8 +; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 +; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 ; GFX1250-NEXT: s_add_co_i32 s30, s30, s30 ; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 ; GFX1250-NEXT: s_add_co_i32 s22, s22, s22 ; GFX1250-NEXT: s_lshl_b32 s21, s21, 8 ; GFX1250-NEXT: s_and_b32 s20, s20, 0xff -; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 ; GFX1250-NEXT: s_add_co_i32 s29, s29, s29 ; GFX1250-NEXT: s_add_co_i32 s19, s19, s19 ; GFX1250-NEXT: s_add_co_i32 s18, s18, s18 -; GFX1250-NEXT: s_lshr_b32 s10, s5, 16 -; GFX1250-NEXT: s_lshr_b32 s11, s5, 24 -; GFX1250-NEXT: s_lshr_b32 s12, s6, 16 -; GFX1250-NEXT: s_lshr_b32 s13, s6, 24 -; GFX1250-NEXT: s_or_b32 s16, s16, s17 -; GFX1250-NEXT: s_and_b32 s7, s7, 0xff -; GFX1250-NEXT: s_lshl_b32 s17, s27, 8 -; GFX1250-NEXT: s_lshl_b32 s15, s15, 8 -; GFX1250-NEXT: s_and_b32 s14, s14, 0xff -; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_bfe_u32 s25, s13, 0x80008 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 8 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_or_b32 s7, s7, s14 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_and_b32 s11, s11, 0xff ; GFX1250-NEXT: s_lshl_b32 s30, s30, 8 ; GFX1250-NEXT: s_lshl_b32 s23, s23, 8 ; GFX1250-NEXT: s_and_b32 s22, s22, 0xff ; GFX1250-NEXT: s_or_b32 s20, s20, s21 -; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_and_b32 s9, s9, 0xff ; GFX1250-NEXT: s_lshl_b32 s21, s29, 8 ; GFX1250-NEXT: s_lshl_b32 s19, s19, 8 ; GFX1250-NEXT: s_and_b32 s18, s18, 0xff -; GFX1250-NEXT: s_lshr_b32 s8, s4, 16 -; GFX1250-NEXT: s_lshr_b32 s9, s4, 24 -; GFX1250-NEXT: s_bfe_u32 s24, s4, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s25, s5, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s26, s6, 0x80008 -; GFX1250-NEXT: s_or_b32 s7, s7, s17 -; GFX1250-NEXT: s_or_b32 s14, s14, s15 -; GFX1250-NEXT: s_add_co_i32 s13, s13, s13 -; GFX1250-NEXT: s_add_co_i32 s12, s12, s12 -; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 -; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 -; GFX1250-NEXT: s_bfe_u32 s28, s0, 0x80008 -; GFX1250-NEXT: s_or_b32 s3, s3, s30 +; GFX1250-NEXT: s_lshr_b32 s0, s12, 16 +; GFX1250-NEXT: s_lshr_b32 s1, s12, 24 +; GFX1250-NEXT: s_bfe_u32 s24, s12, 0x80008 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s5, s7, 0xffff +; GFX1250-NEXT: s_add_co_i32 s7, s13, s13 +; GFX1250-NEXT: s_add_co_i32 s25, s25, s25 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 8 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_bfe_u32 s28, s8, 0x80008 +; GFX1250-NEXT: s_or_b32 s11, s11, s30 ; GFX1250-NEXT: s_or_b32 s22, s22, s23 -; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x80008 -; GFX1250-NEXT: s_or_b32 s1, s1, s21 +; GFX1250-NEXT: s_bfe_u32 s23, s10, 0x80008 +; GFX1250-NEXT: s_or_b32 s9, s9, s21 ; GFX1250-NEXT: s_or_b32 s18, s18, s19 -; GFX1250-NEXT: s_and_b32 s7, s7, 0xffff -; GFX1250-NEXT: s_lshl_b32 s14, s14, 16 -; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 -; GFX1250-NEXT: s_add_co_i32 s26, s26, s26 -; GFX1250-NEXT: s_lshl_b32 s13, s13, 8 -; GFX1250-NEXT: s_and_b32 s12, s12, 0xff -; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 -; GFX1250-NEXT: s_add_co_i32 s25, s25, s25 -; GFX1250-NEXT: s_lshl_b32 s11, s11, 8 -; GFX1250-NEXT: s_and_b32 s10, s10, 0xff -; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_lshl_b32 s4, s4, 16 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xff +; GFX1250-NEXT: s_lshl_b32 s13, s25, 8 +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_add_co_i32 s3, s12, s12 ; GFX1250-NEXT: s_add_co_i32 s24, s24, s24 -; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 -; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 -; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_and_b32 s11, s11, 0xffff ; GFX1250-NEXT: s_lshl_b32 s22, s22, 16 -; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 ; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 -; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff +; GFX1250-NEXT: s_and_b32 s9, s9, 0xffff ; GFX1250-NEXT: s_lshl_b32 s18, s18, 16 -; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 ; GFX1250-NEXT: s_add_co_i32 s28, s28, s28 -; GFX1250-NEXT: s_or_b32 s7, s7, s14 -; GFX1250-NEXT: s_and_b32 s6, s6, 0xff -; GFX1250-NEXT: s_lshl_b32 s14, s26, 8 -; GFX1250-NEXT: s_or_b32 s12, s12, s13 -; GFX1250-NEXT: s_and_b32 s5, s5, 0xff -; GFX1250-NEXT: s_lshl_b32 s13, s25, 8 -; GFX1250-NEXT: s_or_b32 s10, s10, s11 -; GFX1250-NEXT: s_and_b32 s4, s4, 0xff -; GFX1250-NEXT: s_lshl_b32 s11, s24, 8 -; GFX1250-NEXT: s_lshl_b32 s9, s9, 8 -; GFX1250-NEXT: s_and_b32 s8, s8, 0xff -; GFX1250-NEXT: s_or_b32 s3, s3, s22 -; GFX1250-NEXT: s_and_b32 s2, s2, 0xff -; GFX1250-NEXT: s_lshl_b32 s22, s23, 8 -; GFX1250-NEXT: s_or_b32 s1, s1, s18 +; GFX1250-NEXT: s_or_b32 s4, s5, s4 +; GFX1250-NEXT: s_or_b32 s5, s7, s13 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_lshl_b32 s7, s24, 8 +; GFX1250-NEXT: s_lshl_b32 s1, s1, 8 ; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_or_b32 s11, s11, s22 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xff +; GFX1250-NEXT: s_lshl_b32 s22, s23, 8 +; GFX1250-NEXT: s_or_b32 s9, s9, s18 +; GFX1250-NEXT: s_and_b32 s8, s8, 0xff ; GFX1250-NEXT: s_lshl_b32 s18, s28, 8 -; GFX1250-NEXT: s_or_b32 s6, s6, s14 -; GFX1250-NEXT: s_or_b32 s5, s5, s13 -; GFX1250-NEXT: s_or_b32 s4, s4, s11 -; GFX1250-NEXT: s_or_b32 s8, s8, s9 -; GFX1250-NEXT: s_or_b32 s2, s2, s22 -; GFX1250-NEXT: s_or_b32 s0, s0, s18 -; GFX1250-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1250-NEXT: s_lshl_b32 s12, s12, 16 +; GFX1250-NEXT: s_or_b32 s3, s3, s7 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s10, s10, s22 +; GFX1250-NEXT: s_or_b32 s8, s8, s18 ; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff -; GFX1250-NEXT: s_lshl_b32 s8, s8, 16 -; GFX1250-NEXT: s_lshl_b32 s9, s10, 16 -; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff +; GFX1250-NEXT: s_and_b32 s1, s3, 0xffff +; GFX1250-NEXT: s_lshl_b32 s0, s0, 16 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xffff ; GFX1250-NEXT: s_lshl_b32 s20, s20, 16 -; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_and_b32 s8, s8, 0xffff ; GFX1250-NEXT: s_lshl_b32 s16, s16, 16 -; GFX1250-NEXT: s_or_b32 s6, s6, s12 -; GFX1250-NEXT: s_or_b32 s4, s4, s8 -; GFX1250-NEXT: s_or_b32 s5, s5, s9 -; GFX1250-NEXT: s_or_b32 s2, s2, s20 -; GFX1250-NEXT: s_or_b32 s0, s0, s16 -; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 -; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 +; GFX1250-NEXT: s_or_b32 s0, s1, s0 +; GFX1250-NEXT: s_or_b32 s1, s5, s2 +; GFX1250-NEXT: s_or_b32 s10, s10, s20 +; GFX1250-NEXT: s_or_b32 s8, s8, s16 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s6 +; GFX1250-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX1250-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX1250-NEXT: global_store_b128 v[10:11], v[4:7], off diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 51652a09863e0..2ae6fc2081ad9 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -117,12 +117,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: sadd64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -818,17 +818,17 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: suaddo64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[12:13], s[14:15] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 @@ -1096,12 +1096,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: ssub64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1798,17 +1798,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: susubo64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 @@ -3099,70 +3099,70 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX1250-LABEL: sudiv64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_b64 s[6:7], s[6:7], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: -; GFX1250-NEXT: s_cvt_f32_u32 s6, s4 -; GFX1250-NEXT: s_cvt_f32_u32 s7, s5 -; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[4:5] +; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 +; GFX1250-NEXT: s_cvt_f32_u32 s5, s7 +; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_fmac_f32 s6, s7, 0x4f800000 -; GFX1250-NEXT: v_s_rcp_f32 s6, s6 +; GFX1250-NEXT: s_fmac_f32 s4, s5, 0x4f800000 +; GFX1250-NEXT: v_s_rcp_f32 s4, s4 ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_mul_f32 s6, s6, 0x5f7ffffc -; GFX1250-NEXT: s_mul_f32 s7, s6, 0x2f800000 +; GFX1250-NEXT: s_mul_f32 s4, s4, 0x5f7ffffc +; GFX1250-NEXT: s_mul_f32 s5, s4, 0x2f800000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_trunc_f32 s7, s7 -; GFX1250-NEXT: s_fmac_f32 s6, s7, 0xcf800000 -; GFX1250-NEXT: s_cvt_u32_f32 s9, s7 -; GFX1250-NEXT: s_mov_b32 s7, 0 +; GFX1250-NEXT: s_trunc_f32 s5, s5 +; GFX1250-NEXT: s_fmac_f32 s4, s5, 0xcf800000 +; GFX1250-NEXT: s_cvt_u32_f32 s9, s5 +; GFX1250-NEXT: s_mov_b32 s5, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_cvt_u32_f32 s8, s6 +; GFX1250-NEXT: s_cvt_u32_f32 s8, s4 ; GFX1250-NEXT: s_mul_u64 s[12:13], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s15, s8, s13 ; GFX1250-NEXT: s_mul_i32 s14, s8, s13 -; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s12 +; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s12 ; GFX1250-NEXT: s_mul_i32 s17, s9, s12 -; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], s[14:15] +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[4:5], s[14:15] ; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s12 ; GFX1250-NEXT: s_mul_hi_u32 s18, s9, s13 -; GFX1250-NEXT: s_add_co_u32 s6, s14, s17 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s15, s16 +; GFX1250-NEXT: s_add_co_u32 s4, s14, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s15, s16 ; GFX1250-NEXT: s_mul_i32 s12, s9, s13 ; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 -; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s10 +; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 ; GFX1250-NEXT: s_mul_i32 s15, s9, s10 -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_mul_hi_u32 s14, s9, s10 ; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s11 -; GFX1250-NEXT: s_add_co_u32 s6, s12, s15 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s13, s14 +; GFX1250-NEXT: s_add_co_u32 s4, s12, s15 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s13, s14 ; GFX1250-NEXT: s_mul_i32 s10, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s16, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[6:7], s[10:11] +; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 ; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s6, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 ; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 ; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 @@ -3170,33 +3170,33 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 -; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[8:9] +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s6, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[10:11] +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] ; GFX1250-NEXT: s_and_b64 s[10:11], s[8:9], 0xffffffff00000000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_or_b32 s10, s10, s8 -; GFX1250-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] +; GFX1250-NEXT: s_mul_u64 s[8:9], s[6:7], s[10:11] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_sub_co_u32 s6, s2, s8 +; GFX1250-NEXT: s_sub_co_u32 s4, s2, s8 ; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1250-NEXT: s_sub_co_i32 s12, s3, s9 ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s5 -; GFX1250-NEXT: s_sub_co_u32 s13, s6, s4 +; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 +; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s12, s5 +; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 ; GFX1250-NEXT: s_cselect_b32 s15, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s12, s5 +; GFX1250-NEXT: s_cmp_eq_u32 s12, s7 ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[10:11], 1 ; GFX1250-NEXT: s_cselect_b32 s16, s15, s14 ; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[10:11], 2 @@ -3206,20 +3206,20 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s3, s3, s9 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_ge_u32 s3, s5 +; GFX1250-NEXT: s_cmp_ge_u32 s3, s7 ; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s6, s4 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s3, s5 -; GFX1250-NEXT: s_cselect_b32 s3, s6, s8 +; GFX1250-NEXT: s_cmp_ge_u32 s4, s6 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s3, s7 +; GFX1250-NEXT: s_cselect_b32 s3, s4, s8 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-NEXT: s_cselect_b32 s9, s13, s11 ; GFX1250-NEXT: s_cselect_b32 s8, s12, s10 ; GFX1250-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1250-NEXT: .LBB16_2: -; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1250-NEXT: s_sub_co_i32 s5, 0, s4 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX1250-NEXT: s_sub_co_i32 s4, 0, s6 ; GFX1250-NEXT: s_mov_b32 s9, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -3228,23 +3228,23 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1250-NEXT: s_mul_i32 s5, s5, s3 +; GFX1250-NEXT: s_mul_i32 s4, s4, s3 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX1250-NEXT: s_add_co_i32 s3, s3, s5 +; GFX1250-NEXT: s_mul_hi_u32 s4, s3, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s4 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s3, s2, s3 -; GFX1250-NEXT: s_mul_i32 s5, s3, s4 +; GFX1250-NEXT: s_mul_i32 s4, s3, s6 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_sub_co_i32 s2, s2, s5 -; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 -; GFX1250-NEXT: s_sub_co_i32 s6, s2, s4 -; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1250-NEXT: s_cselect_b32 s3, s5, s3 -; GFX1250-NEXT: s_cselect_b32 s2, s6, s2 -; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 -; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1250-NEXT: s_cselect_b32 s8, s5, s3 +; GFX1250-NEXT: s_sub_co_i32 s2, s2, s4 +; GFX1250-NEXT: s_add_co_i32 s4, s3, 1 +; GFX1250-NEXT: s_sub_co_i32 s5, s2, s6 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1250-NEXT: s_cselect_b32 s3, s4, s3 +; GFX1250-NEXT: s_cselect_b32 s2, s5, s2 +; GFX1250-NEXT: s_add_co_i32 s4, s3, 1 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1250-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1250-NEXT: .LBB16_3: ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index be60a00145c8a..0cae0e51107df 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -705,12 +705,13 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scale_offset -; GFX1250-NEXT: global_load_b32 v0, v0, s[2:3] scale_offset -; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scale_offset +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NEXT: ds_store_b32 v2, v1 offset:32 +; GFX1250-NEXT: ds_store_b32 v0, v1 offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_store_b32 v3, v0 offset:32 +; GFX1250-NEXT: ds_store_b32 v3, v2 offset:32 ; GFX1250-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i @@ -1282,14 +1283,14 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX1250-LABEL: simple_write2_v4f32_superreg_align4: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s8, s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[6:7], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s4 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX1250-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 20795431b4cd8..b5b2655246c3f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2179,6 +2179,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm @@ -2190,15 +2191,16 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] +; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4827f752d9f7c..5e6de6d66ccc1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_0_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_1_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 6dfefd8a6052a..6a6f232c55e24 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_min3_num_f32 v0, v2, v0, v1 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -1217,36 +1217,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: s_mov_b32 s12, s6 -; GFX1250-NEXT: s_mov_b32 s13, s7 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s14 +; GFX1250-NEXT: s_mov_b32 s5, s15 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 @@ -1427,36 +1427,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: s_mov_b32 s12, s6 -; GFX1250-NEXT: s_mov_b32 s13, s7 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s14 +; GFX1250-NEXT: s_mov_b32 s5, s15 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index e532deaca98a8..f80716939f618 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -11,22 +11,20 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: global_load_u8 v2, v[2:3], off -; GCN-SDAG-NEXT: global_load_u8 v3, v[4:5], off -; GCN-SDAG-NEXT: global_load_u8 v0, v[0:1], off +; GCN-SDAG-NEXT: global_load_u8 v6, v[2:3], off +; GCN-SDAG-NEXT: global_load_u8 v7, v[4:5], off +; GCN-SDAG-NEXT: global_load_u8 v10, v[0:1], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 +; GCN-SDAG-NEXT: v_lshlrev_b16 v0, 8, v6 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 -; GCN-SDAG-NEXT: v_lshlrev_b16 v2, 8, v3 +; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v7 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-SDAG-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-SDAG-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_bitop2_b32 v0, v10, v0 bitop3:0x54 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -35,13 +33,15 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 -; GCN-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GCN-GISEL-NEXT: global_load_u8 v1, v[2:3], off -; GCN-GISEL-NEXT: global_load_u8 v2, v[4:5], off +; GCN-GISEL-NEXT: global_load_u8 v6, v[0:1], off +; GCN-GISEL-NEXT: global_load_u8 v7, v[2:3], off +; GCN-GISEL-NEXT: global_load_u8 v10, v[4:5], off ; GCN-GISEL-NEXT: s_wait_loadcnt 0x1 -; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; GCN-GISEL-NEXT: s_wait_xcnt 0x2 +; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v7, 8, v6 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v2 :: v_dual_lshlrev_b32 v2, 24, v2 +; GCN-GISEL-NEXT: s_wait_xcnt 0x1 +; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v10 :: v_dual_lshlrev_b32 v2, 24, v10 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off @@ -64,21 +64,21 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 0 -; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2 -; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3 +; GCN-SDAG-NEXT: global_load_b128 v[8:11], v[2:3], off ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 12 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-SDAG-NEXT: v_pk_add_u16 v12, v7, v11 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[6:7], 8 -; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0 -; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v8 ; GCN-SDAG-NEXT: s_clause 0x2 -; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off -; GCN-SDAG-NEXT: global_store_b32 v[6:7], v10, off -; GCN-SDAG-NEXT: global_store_b64 v[8:9], v[4:5], off +; GCN-SDAG-NEXT: global_store_b16 v[2:3], v12, off +; GCN-SDAG-NEXT: global_store_b32 v[6:7], v1, off +; GCN-SDAG-NEXT: global_store_b64 v[10:11], v[4:5], off ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GCN-GISEL-LABEL: test_v7i16_load_store: @@ -86,28 +86,29 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 +; GCN-GISEL-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 2 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 4 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 6 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 8 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 10 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[22:23], 12 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2 -; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0 -; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3 +; GCN-GISEL-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v8 +; GCN-GISEL-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-GISEL-NEXT: v_pk_add_u16 v6, v7, v11 ; GCN-GISEL-NEXT: s_clause 0x6 -; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v4, off -; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off -; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off -; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off -; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-GISEL-NEXT: global_store_b16 v[2:3], v4, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[12:13], v4, off +; GCN-GISEL-NEXT: global_store_b16 v[14:15], v5, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[16:17], v5, off +; GCN-GISEL-NEXT: global_store_b16 v[18:19], v1, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[20:21], v1, off +; GCN-GISEL-NEXT: global_store_b16 v[22:23], v6, off +; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1 %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 @@ -253,8 +254,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32 ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70 +; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 @@ -262,14 +263,15 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 -; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, 0xc8 :: v_dual_mov_b32 v1, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off +; GCN-SDAG-NEXT: global_store_b128 v[2:3], v[6:9], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 ; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x5 ; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17 +; GCN-SDAG-NEXT: v_dual_mov_b32 v2, v16 :: v_dual_mov_b32 v3, v17 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] @@ -286,8 +288,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7] ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25] @@ -298,8 +300,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19] ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off -; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off +; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[0:3], off +; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[34:37], off ; GCN-SDAG-NEXT: s_clause 0x7 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112 @@ -309,7 +311,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16 -; GCN-SDAG-NEXT: s_wait_xcnt 0x8 +; GCN-SDAG-NEXT: s_wait_xcnt 0x9 ; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -325,7 +327,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:48 ; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96 ; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 +; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[38:39], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32 @@ -333,7 +335,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8 +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], 0xc8 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x6 ; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off @@ -349,7 +352,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off ; GCN-GISEL-NEXT: s_wait_xcnt 0x5 ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9] +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[8:9] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] ; GCN-GISEL-NEXT: s_wait_xcnt 0x4 ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15] @@ -361,8 +364,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25] ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] -; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] ; GCN-GISEL-NEXT: s_wait_xcnt 0x1 @@ -372,8 +375,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33] ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off -; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off +; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[34:37], off +; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[0:3], off ; GCN-GISEL-NEXT: s_clause 0x7 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16 @@ -383,7 +386,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112 -; GCN-GISEL-NEXT: s_wait_xcnt 0x9 +; GCN-GISEL-NEXT: s_wait_xcnt 0x8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13 ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4 @@ -402,16 +405,17 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GCN-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset -; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7 ; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6 @@ -428,10 +432,9 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GCN-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 ; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 @@ -440,8 +443,10 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset -; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4 ; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll index 90fcb5191c353..fa97380583798 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll @@ -11,14 +11,11 @@ declare i32 @llvm.amdgcn.cluster.id.z() #0 define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-MESA3D-LABEL: test_cluster_id_x: ; CHECK-MESA3D: .amd_kernel_code_t @@ -68,7 +65,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -98,14 +95,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_x: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -155,7 +149,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -190,14 +184,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-MESA3D-LABEL: test_cluster_id_y: ; CHECK-MESA3D: .amd_kernel_code_t @@ -247,7 +238,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -277,14 +268,11 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_y: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -334,7 +322,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -369,16 +357,14 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; CHECK-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_wait_xcnt 0x0 +; CHECK-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ; CHECK-MESA3D-LABEL: test_cluster_id_z: ; CHECK-MESA3D: .amd_kernel_code_t @@ -428,7 +414,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -460,16 +446,14 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_wait_xcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_z: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -519,7 +503,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll index aa3b7b3606fd8..3ef84a3943d14 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1219,7 +1219,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll index afe37e371fbc3..b8ff9e5ae0366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll @@ -65,7 +65,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -153,7 +153,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll index 7ea4fa5373e57..9bca696b73437 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -954,7 +954,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1038,7 +1038,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll index 4f7bbf8f3746f..42a50bb304bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll @@ -5,13 +5,13 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_bcast_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) @@ -92,13 +92,13 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_down_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) @@ -179,13 +179,13 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_up_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) @@ -266,13 +266,13 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_xor_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir index 76e2092c8b57a..abcae69c1c589 100644 --- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir @@ -69,9 +69,9 @@ body: | bb.0: ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GCN-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 1e6b77ecea85e..4ad161c03f5b7 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -471,13 +471,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader -; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB4_2: ; %for.body @@ -602,13 +602,13 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader -; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB5_2: ; %for.body diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index dbcd3700a1605..08ec0c847e941 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1117,18 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; GFX1250-LABEL: mad_i64_i32_uniform: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s7, 0 +; GFX1250-NEXT: s_mov_b32 s5, 0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s2 ; GFX1250-NEXT: s_mov_b32 s2, s3 -; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s3, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] -; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_mul_u64 s[2:3], s[4:5], s[2:3] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index fef9a9ae07fb1..ae0805448d693 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -257,16 +257,15 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_imax_sge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_i8 s2, s[2:3], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_i8 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_i8 s4, s[2:3], 0x0 +; GFX1250-NEXT: s_load_i8 s5, s[6:7], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_max_i32 s2, s2, s3 +; GFX1250-NEXT: s_max_i32 s2, s4, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -701,16 +700,15 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_umax_uge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_u8 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_u8 s4, s[2:3], 0x0 +; GFX1250-NEXT: s_load_u8 s5, s[6:7], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_max_u32 s2, s2, s3 +; GFX1250-NEXT: s_max_u32 s2, s4, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -777,13 +775,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u32_e32 v0, s2, v0 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 +; GFX1250-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -1122,12 +1119,12 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_ugt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1175,12 +1172,12 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_uge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1228,12 +1225,12 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sgt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1281,12 +1278,12 @@ define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 311527d5d04cc..6a3d31ffcb9f1 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -131,14 +131,14 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_sle_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -1172,14 +1172,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX1250-LABEL: s_test_imin_sle_v4i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1307,14 +1307,14 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_slt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -1484,14 +1484,14 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_slt_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_u16 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i16 v1, v1, v2 ; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset @@ -1686,16 +1686,16 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX1250-LABEL: s_test_imin_slt_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_min_i32 s0, s0, s2 ; GFX1250-NEXT: s_min_i32 s1, s1, s3 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b @@ -2011,14 +2011,14 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ule_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -2171,16 +2171,16 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b96 v[0:2], v3, s[2:3] -; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[4:5] +; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[6:7] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v2, v2, v6 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v5 @@ -2374,14 +2374,14 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -2611,14 +2611,14 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ult_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -2771,14 +2771,14 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_umin_ult_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX1250-NEXT: global_load_u8 v2, v0, s[4:5] +; GFX1250-NEXT: global_load_u8 v2, v0, s[6:7] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u16 v1, v1, v2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -3023,23 +3023,22 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i32_multi_use: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s5, s[6:7], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[12:13], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[14:15], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_lt_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_cmp_lt_u32 s0, s1 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX1250-NEXT: s_and_b32 s6, s6, exec_lo -; GFX1250-NEXT: s_cselect_b32 s4, s4, s5 -; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1250-NEXT: s_and_b32 s2, s2, exec_lo +; GFX1250-NEXT: s_cselect_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b32 v1, v2, s[0:1] -; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX1250-NEXT: global_store_b32 v1, v2, s[8:9] +; GFX1250-NEXT: global_store_b8 v1, v0, s[10:11] ; GFX1250-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 %b = load i32, ptr addrspace(1) %bptr, align 4 @@ -3220,12 +3219,12 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i16_multi_use: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX1250-NEXT: global_load_u16 v1, v0, s[14:15] +; GFX1250-NEXT: global_load_u16 v2, v0, s[12:13] ; GFX1250-NEXT: s_wait_loadcnt 0x1 ; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -3235,8 +3234,8 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: global_store_b16 v0, v1, s[8:9] +; GFX1250-NEXT: global_store_b8 v0, v2, s[10:11] ; GFX1250-NEXT: s_endpgm %a = load i16, ptr addrspace(1) %aptr, align 2 %b = load i16, ptr addrspace(1) %bptr, align 2 @@ -4338,12 +4337,12 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ult_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp ult i64 %a, %b @@ -4462,12 +4461,12 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ule_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp ule i64 %a, %b @@ -4586,12 +4585,12 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_slt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp slt i64 %a, %b @@ -4710,12 +4709,12 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_sle_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp sle i64 %a, %b @@ -4872,14 +4871,14 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_sle_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -5042,14 +5041,14 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_ule_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index baccb4c7d0859..d29847e40dc8b 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -450,6 +450,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34 +; GFX1250-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mul_i32 s2, s3, s2 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -613,25 +614,25 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; ; GFX1250-LABEL: v_trunc_i64_mul_to_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s6, s10 -; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s6 +; GFX1250-NEXT: s_mov_b32 s15, s7 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s12, s2 ; GFX1250-NEXT: s_mov_b32 s13, s3 ; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null -; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_trunc_i64_mul_to_i32: @@ -2091,11 +2092,11 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX1250-LABEL: s_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7] ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX1250-NEXT: s_mov_b32 s2, -1 @@ -2292,25 +2293,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; ; GFX1250-LABEL: v_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s6, s10 -; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s6 +; GFX1250-NEXT: s_mov_b32 s15, s7 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s12, s2 ; GFX1250-NEXT: s_mov_b32 s13, s3 ; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null -; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_mul_i64: @@ -2845,30 +2846,30 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-LABEL: mul64_in_branch: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1250-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1250-NEXT: ; %bb.1: ; %else -; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] +; GFX1250-NEXT: s_mul_u64 s[0:1], s[12:13], s[14:15] ; GFX1250-NEXT: s_cbranch_execnz .LBB16_4 ; GFX1250-NEXT: .LBB16_2: ; %if -; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_mov_b32 s4, s2 -; GFX1250-NEXT: s_mov_b32 s5, s3 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s0, s10 +; GFX1250-NEXT: s_mov_b32 s1, s11 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_branch .LBB16_5 ; GFX1250-NEXT: .LBB16_3: -; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX1250-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1250-NEXT: s_branch .LBB16_2 ; GFX1250-NEXT: .LBB16_4: -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-NEXT: .LBB16_5: ; %endif -; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, -1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: mul64_in_branch: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index b0651ef53dd1b..78207c2cf605e 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -340,46 +340,46 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35] @@ -395,58 +395,58 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43] @@ -466,14 +466,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -1597,46 +1597,46 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35] @@ -1652,58 +1652,58 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43] @@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -2428,46 +2428,46 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fma_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[12:13] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[14:15] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[44:45] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[46:47] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33] @@ -2482,58 +2482,58 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43] @@ -2553,14 +2553,14 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -3529,9 +3529,9 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; ; GFX1250-SDAG-LABEL: fadd_fadd_fsub: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3 @@ -3541,14 +3541,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1] -; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[6:7] ; GFX1250-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index b717f85e179b3..6671201ca2b94 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -186,12 +186,12 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 ; ; GFX1250-LABEL: mixed_inreg_block_count_x: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX1250-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index 4d367ef7ffd9d..c1764c94ea2de 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -346,10 +346,10 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; ; GFX1250-LABEL: byref_preload_arg: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100 +; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS @@ -404,10 +404,10 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: byref_staggered_preload_arg: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100 +; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll index b5bb68e1eaa89..e0ea08d276979 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -97,9 +97,9 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b64 s[4:5], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -111,10 +111,10 @@ entry: define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -126,10 +126,10 @@ entry: define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -141,12 +141,12 @@ entry: define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b256_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -158,16 +158,16 @@ entry: define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b512_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13 +; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15 +; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17 +; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -275,11 +275,11 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], s4 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -294,10 +294,10 @@ define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -312,10 +312,10 @@ define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -330,12 +330,12 @@ define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -350,16 +350,16 @@ define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13 +; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15 +; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17 +; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll index f2ecfe8fc9a1f..3d74b171400ac 100644 --- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll +++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll @@ -17,16 +17,16 @@ define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_i8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_and_b32 s2, s2, 31 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_ashr_pk_i8_i32 v0, s0, s1, v0 -; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7] ; GFX1250-NEXT: s_endpgm %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0 %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1 @@ -58,16 +58,16 @@ define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_u8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_and_b32 s2, s2, 31 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_ashr_pk_u8_i32 v0, s0, s1, v0 -; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7] ; GFX1250-NEXT: s_endpgm %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0 %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index a392692e618cd..6636eb544343b 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -211,38 +211,39 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX1250-SDAG-LABEL: workgroup_id_xyz: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 -; GFX1250-SDAG-NEXT: s_lshr_b32 s6, ttmp7, 16 -; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s0, 1 +; GFX1250-SDAG-NEXT: s_lshr_b32 s8, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s0, 1 +; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-SDAG-NEXT: s_mul_i32 s4, s8, s9 ; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40010 -; GFX1250-SDAG-NEXT: s_mul_i32 s7, s6, s7 -; GFX1250-SDAG-NEXT: s_bfe_u32 s8, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_bfe_u32 s5, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_and_b32 s10, ttmp7, 0xffff ; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, 1 ; GFX1250-SDAG-NEXT: s_bfe_u32 s11, ttmp6, 0x4000c -; GFX1250-SDAG-NEXT: s_add_co_i32 s8, s8, s7 -; GFX1250-SDAG-NEXT: s_mul_i32 s7, s10, s9 +; GFX1250-SDAG-NEXT: s_add_co_i32 s5, s5, s4 +; GFX1250-SDAG-NEXT: s_mul_i32 s4, s10, s9 ; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 ; GFX1250-SDAG-NEXT: s_add_co_i32 s11, s11, 1 -; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s7 -; GFX1250-SDAG-NEXT: s_and_b32 s7, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s4 +; GFX1250-SDAG-NEXT: s_and_b32 s4, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_mul_i32 s11, ttmp9, s11 ; GFX1250-SDAG-NEXT: s_getreg_b32 s12, hwreg(HW_REG_IB_STS2, 6, 4) -; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s7, s11 +; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s11 ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s12, 0 -; GFX1250-SDAG-NEXT: s_cselect_b32 s7, ttmp9, s7 +; GFX1250-SDAG-NEXT: s_cselect_b32 s4, ttmp9, s4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7 -; GFX1250-SDAG-NEXT: s_cselect_b32 s7, s10, s9 -; GFX1250-SDAG-NEXT: s_cselect_b32 s6, s6, s8 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: s_cselect_b32 s4, s10, s9 +; GFX1250-SDAG-NEXT: s_cselect_b32 s5, s8, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-SDAG-NEXT: global_store_b32 v0, v2, s[2:3] -; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[4:5] +; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[6:7] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: workgroup_id_xyz: @@ -250,39 +251,40 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 -; GFX1250-GISEL-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_getreg_b32 s8, hwreg(HW_REG_IB_STS2, 6, 4) ; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: s_cselect_b32 s7, ttmp9, s1 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s9, ttmp9, s1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 -; GFX1250-GISEL-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_and_b32 s10, ttmp7, 0xffff ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 -; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 -; GFX1250-GISEL-NEXT: s_mul_i32 s10, s8, s0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s12, s10, s0 +; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s12 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, s10 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s7 -; GFX1250-GISEL-NEXT: s_cselect_b32 s8, s8, s9 -; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_cselect_b32 s4, s10, s11 +; GFX1250-GISEL-NEXT: s_bfe_u32 s5, ttmp6, 0x40014 ; GFX1250-GISEL-NEXT: s_lshr_b32 s10, ttmp7, 16 -; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, 1 +; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40008 -; GFX1250-GISEL-NEXT: s_mul_i32 s9, s10, s9 +; GFX1250-GISEL-NEXT: s_mul_i32 s5, s10, s5 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s9 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: s_cselect_b32 s6, s10, s11 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s6 +; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s5 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s5, s10, s11 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x2 ; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-GISEL-NEXT: global_store_b32 v1, v2, s[2:3] -; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[4:5] +; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[6:7] ; GFX1250-GISEL-NEXT: s_endpgm ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index 3c693610bee51..80a340c1f6261 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -178,6 +178,7 @@ max_vgprs: // ASM-NEXT: .amdhsa_next_free_sgpr 32 // ASM-NEXT: .amdhsa_named_barrier_count 3 // ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 // ASM-NEXT: .amdhsa_float_round_mode_32 1 // ASM-NEXT: .amdhsa_float_round_mode_16_64 1 // ASM-NEXT: .amdhsa_float_denorm_mode_32 1 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s index 776006bdfba28..642e62df0437a 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s @@ -178,6 +178,7 @@ max_vgprs: // ASM-NEXT: .amdhsa_next_free_sgpr 32 // ASM-NEXT: .amdhsa_named_barrier_count 3 // ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_xnack_mask 1 // ASM-NEXT: .amdhsa_float_round_mode_32 1 // ASM-NEXT: .amdhsa_float_round_mode_16_64 1 // ASM-NEXT: .amdhsa_float_denorm_mode_32 1 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s index 3e96ea3c67380..13f20bfd1081a 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s @@ -20,7 +20,7 @@ ; CHECK-NEXT: ; IMAGE_OP 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 ; CHECK-NEXT: .amdhsa_reserve_vcc 0 -; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 1 ; CHECK-NEXT: .amdhsa_next_free_sgpr 8 ; CHECK-NEXT: .amdhsa_float_round_mode_32 0 ; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 @@ -76,7 +76,7 @@ ; CHECK-NEXT: ; IMAGE_OP 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 ; CHECK-NEXT: .amdhsa_reserve_vcc 0 -; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 1 ; CHECK-NEXT: .amdhsa_next_free_sgpr 8 ; CHECK-NEXT: .amdhsa_float_round_mode_32 0 ; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0