244 changes: 122 additions & 122 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Large diffs are not rendered by default.

108 changes: 72 additions & 36 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11

---
name: fract_f64_neg
Expand All @@ -12,23 +12,41 @@ body: |
bb.1:
liveins: $sgpr0_sgpr1
; CHECK-LABEL: name: fract_f64_neg
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
; GFX10-LABEL: name: fract_f64_neg
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: fract_f64_neg
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
%8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
Expand Down Expand Up @@ -60,23 +78,41 @@ body: |
bb.1:
liveins: $sgpr0_sgpr1
; CHECK-LABEL: name: fract_f64_neg_abs
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
; GFX10-LABEL: name: fract_f64_neg_abs
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: fract_f64_neg_abs
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
%8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
Expand Down
241 changes: 220 additions & 21 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ regBankSelected: true
# GCN: [[PTR:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1

# Immediate offset:
# SICI: S_LOAD_DWORD_IMM [[PTR]], 1, 0
# SI: S_LOAD_DWORD_IMM [[PTR]], 1, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1, 0
# VI: S_LOAD_DWORD_IMM [[PTR]], 4, 0

# Max immediate offset for SI
# SICI: S_LOAD_DWORD_IMM [[PTR]], 255, 0
# SI: S_LOAD_DWORD_IMM [[PTR]], 255, 0
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 255, 0
# VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0

# Immediate overflow for SI
Expand Down Expand Up @@ -196,7 +198,8 @@ body: |
# GCN: %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 44

# VI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 64, 0 :: (dereferenceable invariant load (s32), addrspace 4)
# SICI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0 :: (dereferenceable invariant load (s32), addrspace 4)
# SI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0 :: (dereferenceable invariant load (s32), addrspace 4)
# CI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM_ci %0, 16, 0 :: (dereferenceable invariant load (s32), addrspace 4)

---

Expand Down
121 changes: 66 additions & 55 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,16 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_1:
Expand Down Expand Up @@ -131,16 +131,16 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_2:
Expand Down Expand Up @@ -217,15 +217,16 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_1:
Expand Down Expand Up @@ -303,15 +304,16 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_2:
Expand Down Expand Up @@ -678,14 +680,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
Expand Down Expand Up @@ -753,14 +756,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
Expand Down Expand Up @@ -828,14 +832,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
Expand Down Expand Up @@ -903,14 +908,15 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1090,10 +1096,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_1:
Expand Down Expand Up @@ -1150,10 +1157,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_2:
Expand Down Expand Up @@ -1212,14 +1220,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
Expand Down Expand Up @@ -1281,14 +1289,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
Expand Down Expand Up @@ -1358,17 +1366,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_num:
Expand Down Expand Up @@ -1448,17 +1456,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_den:
Expand Down Expand Up @@ -1518,11 +1526,12 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou
;
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 null, 0
; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_val_undef_val:
Expand Down Expand Up @@ -1566,11 +1575,12 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou
;
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 null, 0
; GFX10-NEXT: v_div_scale_f32 v0, s0, 0x41000000, 0x41000000, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_val_val:
Expand Down Expand Up @@ -1612,11 +1622,12 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
;
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 null, 0
; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_undef_val:
Expand Down Expand Up @@ -1663,10 +1674,10 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_val_undef_val:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s5, 0x405ec000
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a6, s10
; GCN-NEXT: v_accvgpr_write_b32 a7, s11
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
;
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
Expand Down
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,16 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
Expand Down Expand Up @@ -119,16 +119,16 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
Expand Down Expand Up @@ -284,16 +284,16 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
Expand Down Expand Up @@ -336,11 +336,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_read_b32 v1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand Down Expand Up @@ -390,17 +390,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31c16000
; GFX10-NEXT: s_add_u32 s4, s4, s3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
Expand Down Expand Up @@ -355,12 +355,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
Expand Down Expand Up @@ -450,12 +450,12 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
Expand Down
110 changes: 55 additions & 55 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2559,47 +2559,47 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
Expand Down Expand Up @@ -2738,56 +2738,56 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: s_mulk_i32 s3, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_add_u32 s3, s3, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: s_mulk_i32 s1, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_add_u32 s1, s1, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: s_mulk_i32 s4, 0x50
; GFX9-NEXT: s_add_u32 s3, s4, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: s_mulk_i32 s2, 0x50
; GFX9-NEXT: s_add_u32 s1, s2, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
; GFX10-NEXT: s_mulk_i32 s3, 0x50
; GFX10-NEXT: s_mulk_i32 s2, 0x50
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
; GFX10-NEXT: s_mulk_i32 s1, 0x50
; GFX10-NEXT: s_mulk_i32 s0, 0x50
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
Expand Down
196 changes: 98 additions & 98 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Large diffs are not rendered by default.

349 changes: 174 additions & 175 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll

Large diffs are not rendered by default.

168 changes: 84 additions & 84 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
;
; GFX9-LABEL: constant_load_i8_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_byte v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_byte v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in, align 4
store i8 %ld, ptr addrspace(1) %out, align 4
Expand All @@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
;
; GFX9-LABEL: constant_load_i16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_short v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i16, ptr addrspace(4) %in, align 4
store i16 %ld, ptr addrspace(1) %out, align 4
Expand All @@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: sextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_sext_i32_i8 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i8 s2, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_sext_i32_i8 s0, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%sext = sext i8 %load to i32
Expand All @@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: sextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s2, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%sext = sext i16 %load to i32
Expand All @@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: zextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%zext = zext i8 %load to i32
Expand All @@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: zextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%zext = zext i16 %load to i32
Expand All @@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: constant_load_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
store i8 %load, ptr addrspace(1) %out, align 2
Expand All @@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
;
; GFX9-LABEL: constant_load_i16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 2
store i16 %load, ptr addrspace(1) %out, align 2
Expand Down Expand Up @@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: constant_sextload_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
Expand Down Expand Up @@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: constant_zextload_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
Expand Down
183 changes: 92 additions & 91 deletions llvm/test/CodeGen/AMDGPU/add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,26 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX9-LABEL: s_add_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_add_i32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_i32:
Expand Down Expand Up @@ -125,30 +125,30 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_add_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s5, s7
; GFX9-NEXT: s_add_i32 s3, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s4, s6
; GFX10-NEXT: s_add_i32 s3, s5, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
Expand Down Expand Up @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX9-LABEL: s_add_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s7, s15
; GFX9-NEXT: s_add_i32 s3, s6, s14
; GFX9-NEXT: s_add_i32 s0, s7, s15
; GFX9-NEXT: s_add_i32 s1, s6, s14
; GFX9-NEXT: s_add_i32 s6, s11, s19
; GFX9-NEXT: s_add_i32 s7, s10, s18
; GFX9-NEXT: s_add_i32 s9, s9, s17
Expand All @@ -388,23 +388,24 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v8i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s7, s15
; GFX10-NEXT: s_add_i32 s3, s6, s14
; GFX10-NEXT: s_add_i32 s0, s7, s15
; GFX10-NEXT: s_add_i32 s1, s6, s14
; GFX10-NEXT: s_add_i32 s6, s11, s19
; GFX10-NEXT: s_add_i32 s7, s10, s18
; GFX10-NEXT: s_add_i32 s8, s8, s16
Expand All @@ -417,10 +418,10 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v6, s1
; GFX10-NEXT: v_mov_b32_e32 v7, s0
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v8i32:
Expand Down Expand Up @@ -592,11 +593,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s7, s39
; GFX9-NEXT: s_add_i32 s3, s6, s38
; GFX9-NEXT: s_add_i32 s0, s7, s39
; GFX9-NEXT: s_add_i32 s1, s6, s38
; GFX9-NEXT: s_add_i32 s6, s11, s43
; GFX9-NEXT: s_add_i32 s7, s10, s42
; GFX9-NEXT: s_add_i32 s10, s15, s47
Expand All @@ -613,38 +614,38 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9-NEXT: v_mov_b32_e32 v3, s14
; GFX9-NEXT: s_add_i32 s9, s9, s41
; GFX9-NEXT: s_add_i32 s8, s8, s40
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
; GFX9-NEXT: s_add_i32 s5, s5, s37
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: s_add_i32 s4, s4, s36
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v16i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v16, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s7, s39
; GFX10-NEXT: s_add_i32 s3, s6, s38
; GFX10-NEXT: s_add_i32 s0, s7, s39
; GFX10-NEXT: s_add_i32 s1, s6, s38
; GFX10-NEXT: s_add_i32 s6, s11, s43
; GFX10-NEXT: s_add_i32 s7, s10, s42
; GFX10-NEXT: s_add_i32 s10, s15, s47
Expand Down Expand Up @@ -673,12 +674,12 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX10-NEXT: v_mov_b32_e32 v11, s6
; GFX10-NEXT: v_mov_b32_e32 v12, s4
; GFX10-NEXT: v_mov_b32_e32 v13, s5
; GFX10-NEXT: v_mov_b32_e32 v14, s3
; GFX10-NEXT: v_mov_b32_e32 v15, s2
; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v14, s1
; GFX10-NEXT: v_mov_b32_e32 v15, s0
; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:48
; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:32
; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v16i32:
Expand Down Expand Up @@ -812,30 +813,30 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX9-LABEL: v_add_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_i32:
Expand Down Expand Up @@ -917,26 +918,26 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
Expand Down Expand Up @@ -1231,50 +1232,50 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_add_u32 s4, s4, s6
; GFX9-NEXT: s_addc_u32 s5, s5, s7
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: s_addc_u32 s1, s9, s11
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
; GFX9-NEXT: .LBB9_2: ; %if
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
; GFX10-NEXT: s_add_u32 s0, s8, s10
; GFX10-NEXT: s_addc_u32 s1, s9, s11
; GFX10-NEXT: s_cbranch_execnz .LBB9_3
; GFX10-NEXT: .LBB9_2: ; %if
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB9_4:
; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
Expand Down
84 changes: 42 additions & 42 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -184,24 +184,24 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
Expand Down Expand Up @@ -298,27 +298,27 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
Expand Down Expand Up @@ -365,27 +365,27 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
Expand Down Expand Up @@ -431,26 +431,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
Expand Down Expand Up @@ -495,26 +495,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
Expand Down Expand Up @@ -560,26 +560,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
Expand All @@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX10-NEXT: s_cmp_lt_i32 s2, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
Expand Down
1,225 changes: 614 additions & 611 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_lshl_b32 [[A]], [[A]], 1
; SI: s_lshl_b32 [[B]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
Expand Down Expand Up @@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad

; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
Expand Down
198 changes: 106 additions & 92 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Large diffs are not rendered by default.

918 changes: 513 additions & 405 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Large diffs are not rendered by default.

1,100 changes: 568 additions & 532 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Large diffs are not rendered by default.

172 changes: 92 additions & 80 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

Large diffs are not rendered by default.

192 changes: 102 additions & 90 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/build_vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector2:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
Expand Down Expand Up @@ -119,14 +119,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector4:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: v_mov_b32_e32 v2, 7
; GFX940-NEXT: v_mov_b32_e32 v3, 8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
Expand Down Expand Up @@ -176,11 +176,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector_v2i16:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
Expand Down
182 changes: 92 additions & 90 deletions llvm/test/CodeGen/AMDGPU/carryout-selection.ll
Original file line number Diff line number Diff line change
Expand Up @@ -157,26 +157,26 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: sadd64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876
; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: sadd64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
; GFX1010-NEXT: v_mov_b32_e32 v0, s2
; GFX1010-NEXT: v_mov_b32_e32 v1, s3
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876
; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: sadd64ri:
Expand Down Expand Up @@ -255,23 +255,23 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: vadd64rr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64rr:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64rr:
Expand Down Expand Up @@ -346,23 +346,24 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
;
; GFX9-LABEL: vadd64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1010-NEXT: s_mov_b32 null, 0
; GFX1010-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0x1234, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64ri:
Expand Down Expand Up @@ -675,34 +676,34 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s6, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s7, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_addc_u32 s1, s9, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
; GFX1010: ; %bb.0:
; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s6, s4, s6
; GFX1010-NEXT: s_addc_u32 s7, s5, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
; GFX1010-NEXT: v_mov_b32_e32 v1, s7
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_add_u32 s0, s8, s10
; GFX1010-NEXT: s_addc_u32 s1, s9, s11
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: suaddo64:
Expand Down Expand Up @@ -1042,26 +1043,26 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: ssub64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6
; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: ssub64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
; GFX1010-NEXT: v_mov_b32_e32 v0, s2
; GFX1010-NEXT: v_mov_b32_e32 v1, s3
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6
; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: ssub64ri:
Expand Down Expand Up @@ -1140,23 +1141,23 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: vsub64rr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64rr:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64rr:
Expand Down Expand Up @@ -1231,23 +1232,24 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
;
; GFX9-LABEL: vsub64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1010-NEXT: s_mov_b32 null, 0
; GFX1010-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x1234, 0, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64ri:
Expand Down Expand Up @@ -1561,34 +1563,34 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s6, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_subb_u32 s7, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_sub_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_subb_u32 s1, s9, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
; GFX1010: ; %bb.0:
; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s6, s4, s6
; GFX1010-NEXT: s_subb_u32 s7, s5, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
; GFX1010-NEXT: v_mov_b32_e32 v1, s7
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_sub_u32 s0, s8, s10
; GFX1010-NEXT: s_subb_u32 s1, s9, s11
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: susubo64:
Expand Down
Loading