104 changes: 52 additions & 52 deletions llvm/test/CodeGen/AMDGPU/madak.ll
Original file line number Diff line number Diff line change
Expand Up @@ -220,40 +220,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
;
; GFX9-LABEL: madak_2_use_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc
; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000
; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3
; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4
; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: madak_2_use_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc
; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000
; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5]
; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4
; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-MAD-NEXT: s_endpgm
;
Expand Down Expand Up @@ -282,40 +282,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
;
; GFX940-FMA-LABEL: madak_2_use_f32:
; GFX940-FMA: ; %bb.0:
; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000
; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1
; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000
; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3
; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: madak_2_use_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc
; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000
; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5]
; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4
; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FMA-NEXT: s_endpgm
;
Expand Down Expand Up @@ -398,24 +398,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
;
; GFX9-LABEL: madak_m_inline_imm_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
Expand All @@ -435,24 +435,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
;
; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
; GFX940-FMA: ; %bb.0:
; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3]
; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
Expand Down Expand Up @@ -931,23 +931,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
;
; GFX9-LABEL: s_s_madak_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-MAD-LABEL: s_s_madak_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7
; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3
; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
; GFX11-MAD-LABEL: s_s_madak_f32:
Expand All @@ -964,23 +964,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
;
; GFX940-FMA-LABEL: s_s_madak_f32:
; GFX940-FMA: ; %bb.0:
; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7
; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2
; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3
; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2
; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-FMA-NEXT: s_endpgm
;
; GFX10-FMA-LABEL: s_s_madak_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7
; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3
; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: s_s_madak_f32:
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
; GCN-LABEL: vector_clause:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5]
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GCN-NEXT: s_endpgm
;
; GCN-SCRATCH-LABEL: vector_clause:
Expand Down
160 changes: 80 additions & 80 deletions llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; GFX9-LABEL: ctlz_i64_poison:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: ctlz_i64_poison:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7]
; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2
; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
Expand All @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; GFX9-LABEL: ctlz_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: ctlz_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7]
; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2
; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
Expand All @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; GFX9-LABEL: cttz_i64_poison:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cttz_i64_poison:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7
; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6
; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
Expand All @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
Expand All @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; GFX9-LABEL: cttz_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(5)
Expand All @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cttz_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7
; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6
; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7]
; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_waitcnt vmcnt(6)
Expand All @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/mul_int24.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
;
; GFX9-LABEL: test_smul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i32:
Expand Down Expand Up @@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
Expand Down
52 changes: 26 additions & 26 deletions llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
;
; GFX9-LABEL: test_umul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff
; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%0 = shl i32 %a, 8
Expand Down Expand Up @@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
;
; GFX9-LABEL: test_umulhi24_i32_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff
; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%a.24 = and i32 %a, 16777215
Expand Down Expand Up @@ -663,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
;
; GFX9-LABEL: test_umulhi16_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s6, 0xffff
; GFX9-NEXT: s_and_b32 s1, s7, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5]
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%a.16 = and i32 %a, 65535
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_scalar_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -35,17 +35,17 @@ bb:
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -67,17 +67,17 @@ bb:
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -99,17 +99,17 @@ bb:
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -132,17 +132,17 @@ bb:
define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -163,17 +163,17 @@ bb:
define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
; GCN-NEXT: global_store_dword v3, v0, s[4:5]
; GCN-NEXT: global_store_dword v3, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -194,16 +194,16 @@ bb:
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: ds_read_u16 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
; GCN-NEXT: global_store_dword v2, v0, s[4:5]
; GCN-NEXT: global_store_dword v2, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
Expand All @@ -222,11 +222,11 @@ bb:
define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v3, v1
Expand All @@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1
; GCN-NEXT: global_store_dword v4, v0, s[4:5]
; GCN-NEXT: global_store_dword v4, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand All @@ -261,10 +261,10 @@ bb:
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
; GCN-NEXT: ds_read_u16 v3, v1
Expand All @@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GCN-NEXT: global_store_dword v1, v0, s[4:5]
; GCN-NEXT: global_store_dword v1, v0, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
; SDAG-LABEL: buffers_from_flat_dont_alias:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-NEXT: s_mov_b32 s3, 0
; SDAG-NEXT: s_mov_b32 s2, 16
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; SDAG-NEXT: s_mov_b32 s7, 0
; SDAG-NEXT: s_mov_b32 s6, 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_and_b32 s1, s5, 0xffff
; SDAG-NEXT: s_mov_b32 s0, s4
; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; SDAG-NEXT: s_and_b32 s1, s7, 0xffff
; SDAG-NEXT: s_mov_b32 s0, s6
; SDAG-NEXT: s_and_b32 s5, s1, 0xffff
; SDAG-NEXT: s_mov_b32 s4, s0
; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; SDAG-NEXT: s_and_b32 s5, s3, 0xffff
; SDAG-NEXT: s_mov_b32 s4, s2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1
; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2
; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_from_flat_dont_alias:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: s_mov_b32 s3, 0
; GISEL-NEXT: s_mov_b32 s2, 16
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT: s_mov_b32 s7, 0
; GISEL-NEXT: s_mov_b32 s6, 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_and_b32 s1, s5, 0xffff
; GISEL-NEXT: s_mov_b32 s0, s4
; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GISEL-NEXT: s_and_b32 s1, s7, 0xffff
; GISEL-NEXT: s_mov_b32 s0, s6
; GISEL-NEXT: s_and_b32 s5, s1, 0xffff
; GISEL-NEXT: s_mov_b32 s4, s0
; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GISEL-NEXT: s_and_b32 s5, s3, 0xffff
; GISEL-NEXT: s_mov_b32 s4, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1
; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2
; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3
; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GISEL-NEXT: s_endpgm
%a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
%b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/rotl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX10-LABEL: rotl_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_i32 s0, 32, s7
; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_sub_i32 s3, 32, s3
; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_i32:
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/rotr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
;
; GFX10-LABEL: rotr_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotr_i32:
Expand Down
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_shl_v2i16:
Expand Down Expand Up @@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
;
; GFX10-LABEL: s_shl_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6
; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_shl_v2i16:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/sub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
;
; GFX9-LABEL: s_sub_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s0, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_sub_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: s_sub_i32:
Expand Down
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
Expand All @@ -246,14 +246,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
;
; GFX10-LABEL: s_test_sub_v2i16_kernarg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7
; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_v2i16_kernarg:
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/v_cndmask.ll
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
;
; GFX10-LABEL: v_cnd_nan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s6, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_cmp_eq_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan:
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
;
; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff
; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1
; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1
; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1
; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1
; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
;
; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
Expand Down Expand Up @@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
;
; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
; GISEL-GFX9: ; %bb.0:
; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0
; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff
; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0
; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6
; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7
; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0
; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0
; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0
; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1
; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1
; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3
; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4
; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4
; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2
; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3
; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5
; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5
; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2
; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
Expand Down
54 changes: 27 additions & 27 deletions llvm/test/CodeGen/AMDGPU/wave32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1344,40 +1344,40 @@ exit:
define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
; GFX1032-LABEL: fdiv_f32:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6
; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
; GFX1032-NEXT: v_rcp_f32_e32 v1, v0
; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1
; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6
; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX1032-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX1032-NEXT: global_store_dword v1, v0, s[0:1]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: fdiv_f32:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6
; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2
; GFX1064-NEXT: v_rcp_f32_e32 v1, v0
; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1
; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6
; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2
; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6
; GFX1064-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2
; GFX1064-NEXT: global_store_dword v1, v0, s[0:1]
; GFX1064-NEXT: s_endpgm
entry:
%fdiv = fdiv float %a, %b
Expand Down Expand Up @@ -2138,23 +2138,23 @@ main_body:
define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
; GFX1032-LABEL: test_intr_fcmp_i64:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7|
; GFX1032-NEXT: v_mov_b32_e32 v0, s0
; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_intr_fcmp_i64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7|
; GFX1064-NEXT: v_mov_b32_e32 v0, s0
; GFX1064-NEXT: v_mov_b32_e32 v1, s1
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3|
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s3
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1064-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
Expand Down Expand Up @@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
; GFX1032-LABEL: test_intr_fcmp_i32:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7|
; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
; GFX1032-NEXT: v_mov_b32_e32 v1, s2
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_intr_fcmp_i32:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7|
; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3|
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1064-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
Expand Down