108 changes: 72 additions & 36 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11

---
name: fract_f64_neg
Expand All @@ -12,23 +12,41 @@ body: |
bb.1:
liveins: $sgpr0_sgpr1
; CHECK-LABEL: name: fract_f64_neg
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
; GFX10-LABEL: name: fract_f64_neg
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: fract_f64_neg
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
%8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
Expand Down Expand Up @@ -60,23 +78,41 @@ body: |
bb.1:
liveins: $sgpr0_sgpr1
; CHECK-LABEL: name: fract_f64_neg_abs
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
; GFX10-LABEL: name: fract_f64_neg_abs
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: fract_f64_neg_abs
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
%8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
Expand Down
253 changes: 234 additions & 19 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

Large diffs are not rendered by default.

56 changes: 28 additions & 28 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,16 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_1:
Expand Down Expand Up @@ -133,16 +133,16 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_2:
Expand Down Expand Up @@ -1266,14 +1266,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
Expand Down Expand Up @@ -1337,14 +1337,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
Expand Down Expand Up @@ -1416,17 +1416,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_num:
Expand Down Expand Up @@ -1508,17 +1508,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_den:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
;
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; encoding: [0x01,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
Expand Down
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,16 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
Expand Down Expand Up @@ -120,16 +120,16 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
Expand Down Expand Up @@ -288,16 +288,16 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
Expand Down
110 changes: 55 additions & 55 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2559,47 +2559,47 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
Expand Down Expand Up @@ -2738,56 +2738,56 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: s_mulk_i32 s3, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_add_u32 s3, s3, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
; GFX8-NEXT: s_mulk_i32 s0, 0x50
; GFX8-NEXT: s_mulk_i32 s1, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_add_u32 s1, s1, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: s_mulk_i32 s4, 0x50
; GFX9-NEXT: s_add_u32 s3, s4, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
; GFX9-NEXT: s_mulk_i32 s2, 0x50
; GFX9-NEXT: s_add_u32 s1, s2, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
; GFX10-NEXT: s_mulk_i32 s3, 0x50
; GFX10-NEXT: s_mulk_i32 s2, 0x50
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
; GFX10-NEXT: s_mulk_i32 s1, 0x50
; GFX10-NEXT: s_mulk_i32 s0, 0x50
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
Expand Down
196 changes: 98 additions & 98 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ entry:
; GCN-LABEL: {{^}}smrd6:
; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
; GFX9_10: s_add_u32 s2, s2, -4
; GFX9_10: s_addc_u32 s3, s3, -1
; GFX9_10: s_add_u32 s0, s6, -4
; GFX9_10: s_addc_u32 s1, s7, -1
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
Expand Down
349 changes: 174 additions & 175 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -479,33 +479,33 @@ return:
define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: v8i8_phi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB8_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX906-NEXT: .LBB8_2: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB8_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5]
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9]
; GFX906-NEXT: .LBB8_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -533,31 +533,31 @@ bb.3:
define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: v8i8_multi_block:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1]
; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v1, v3
; GFX906-NEXT: v_mov_b32_e32 v2, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_4
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3]
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_3
; GFX906-NEXT: ; %bb.2: ; %bb.2
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9]
; GFX906-NEXT: .LBB9_3: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: .LBB9_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
168 changes: 84 additions & 84 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
;
; GFX9-LABEL: constant_load_i8_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_byte v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_byte v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in, align 4
store i8 %ld, ptr addrspace(1) %out, align 4
Expand All @@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
;
; GFX9-LABEL: constant_load_i16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_short v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i16, ptr addrspace(4) %in, align 4
store i16 %ld, ptr addrspace(1) %out, align 4
Expand All @@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: sextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_sext_i32_i8 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i8 s2, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_sext_i32_i8 s0, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%sext = sext i8 %load to i32
Expand All @@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: sextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s2, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%sext = sext i16 %load to i32
Expand All @@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: zextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%zext = zext i8 %load to i32
Expand All @@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: zextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%zext = zext i16 %load to i32
Expand All @@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: constant_load_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
store i8 %load, ptr addrspace(1) %out, align 2
Expand All @@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
;
; GFX9-LABEL: constant_load_i16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 2
store i16 %load, ptr addrspace(1) %out, align 2
Expand Down Expand Up @@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: constant_sextload_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
Expand Down Expand Up @@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: constant_zextload_i8_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
Expand Down
116 changes: 58 additions & 58 deletions llvm/test/CodeGen/AMDGPU/add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,26 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX9-LABEL: s_add_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_add_i32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_i32:
Expand Down Expand Up @@ -125,30 +125,30 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_add_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s5, s7
; GFX9-NEXT: s_add_i32 s3, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s2, s4, s6
; GFX10-NEXT: s_add_i32 s3, s5, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
Expand Down Expand Up @@ -813,30 +813,30 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX9-LABEL: v_add_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_i32:
Expand Down Expand Up @@ -922,26 +922,26 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
Expand Down Expand Up @@ -1240,50 +1240,50 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_add_u32 s4, s4, s6
; GFX9-NEXT: s_addc_u32 s5, s5, s7
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: s_addc_u32 s1, s9, s11
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
; GFX9-NEXT: .LBB9_2: ; %if
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
; GFX10-NEXT: s_add_u32 s0, s8, s10
; GFX10-NEXT: s_addc_u32 s1, s9, s11
; GFX10-NEXT: s_cbranch_execnz .LBB9_3
; GFX10-NEXT: .LBB9_2: ; %if
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB9_4:
; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
Expand Down
84 changes: 42 additions & 42 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -186,24 +186,24 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
Expand Down Expand Up @@ -300,27 +300,27 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
Expand Down Expand Up @@ -369,27 +369,27 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
Expand Down Expand Up @@ -437,26 +437,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
Expand Down Expand Up @@ -503,26 +503,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
Expand Down Expand Up @@ -570,26 +570,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
Expand All @@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX10-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
Expand Down
441 changes: 221 additions & 220 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_lshl_b32 [[A]], [[A]], 1
; SI: s_lshl_b32 [[B]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
Expand Down Expand Up @@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad

; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -427,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_mov_b32 null, 0
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
Expand Down Expand Up @@ -1921,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
Expand Down
1,475 changes: 737 additions & 738 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Large diffs are not rendered by default.

253 changes: 127 additions & 126 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1839,111 +1839,112 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[8:9], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s7, s3, s6
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT: s_add_i32 s8, s8, s7
; GFX9-NEXT: s_mul_i32 s6, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_mul_i32 s3, s7, s2
; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
; GFX9-NEXT: s_add_i32 s8, s8, s3
; GFX9-NEXT: s_mul_i32 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_nop 2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s7, s3, s6
; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX1064-NEXT: s_mul_i32 s6, s2, s6
; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: v_mov_b32_e32 v0, s6
; GFX1064-NEXT: s_mul_i32 s3, s7, s2
; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
; GFX1064-NEXT: s_mul_i32 s2, s6, s2
; GFX1064-NEXT: s_add_i32 s8, s8, s3
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s6, s3, s5
; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
; GFX1032-NEXT: s_mul_i32 s5, s2, s5
; GFX1032-NEXT: s_add_i32 s7, s7, s6
; GFX1032-NEXT: v_mov_b32_e32 v0, s5
; GFX1032-NEXT: v_mov_b32_e32 v1, s7
; GFX1032-NEXT: s_mul_i32 s2, s7, s1
; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
; GFX1032-NEXT: s_mul_i32 s1, s6, s1
; GFX1032-NEXT: s_add_i32 s3, s3, s2
; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_uniform:
Expand Down Expand Up @@ -5501,119 +5502,119 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b64 s[8:9], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s7, s3, s6
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX9-NEXT: s_add_i32 s8, s8, s7
; GFX9-NEXT: s_mul_i32 s6, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_mul_i32 s3, s7, s2
; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
; GFX9-NEXT: s_add_i32 s8, s8, s3
; GFX9-NEXT: s_mul_i32 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB13_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s7, s3, s6
; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
; GFX1064-NEXT: s_mul_i32 s6, s2, s6
; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: v_mov_b32_e32 v0, s6
; GFX1064-NEXT: s_mul_i32 s3, s7, s2
; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
; GFX1064-NEXT: s_mul_i32 s2, s6, s2
; GFX1064-NEXT: s_add_i32 s8, s8, s3
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB13_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s6, s3, s5
; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
; GFX1032-NEXT: s_mul_i32 s5, s2, s5
; GFX1032-NEXT: s_add_i32 s7, s7, s6
; GFX1032-NEXT: v_mov_b32_e32 v0, s5
; GFX1032-NEXT: v_mov_b32_e32 v1, s7
; GFX1032-NEXT: s_mul_i32 s2, s7, s1
; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
; GFX1032-NEXT: s_mul_i32 s1, s6, s1
; GFX1032-NEXT: s_add_i32 s3, s3, s2
; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB13_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_uniform:
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -426,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_mov_b32 null, 0
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
Expand Down Expand Up @@ -1504,14 +1505,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -439,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_mov_b32 null, 0
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
Expand Down Expand Up @@ -1674,14 +1675,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
Expand Down
156 changes: 78 additions & 78 deletions llvm/test/CodeGen/AMDGPU/carryout-selection.ll
Original file line number Diff line number Diff line change
Expand Up @@ -157,26 +157,26 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: sadd64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876
; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: sadd64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
; GFX1010-NEXT: v_mov_b32_e32 v0, s2
; GFX1010-NEXT: v_mov_b32_e32 v1, s3
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876
; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: sadd64ri:
Expand Down Expand Up @@ -255,23 +255,23 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: vadd64rr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64rr:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64rr:
Expand Down Expand Up @@ -679,34 +679,34 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s6, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s7, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_addc_u32 s1, s9, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
; GFX1010: ; %bb.0:
; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s6, s4, s6
; GFX1010-NEXT: s_addc_u32 s7, s5, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
; GFX1010-NEXT: v_mov_b32_e32 v1, s7
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_add_u32 s0, s8, s10
; GFX1010-NEXT: s_addc_u32 s1, s9, s11
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: suaddo64:
Expand Down Expand Up @@ -1048,26 +1048,26 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: ssub64ri:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6
; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: ssub64ri:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
; GFX1010-NEXT: v_mov_b32_e32 v0, s2
; GFX1010-NEXT: v_mov_b32_e32 v1, s3
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6
; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: ssub64ri:
Expand Down Expand Up @@ -1146,23 +1146,23 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
;
; GFX9-LABEL: vsub64rr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64rr:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64rr:
Expand Down Expand Up @@ -1571,34 +1571,34 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s6, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_subb_u32 s7, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_sub_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_subb_u32 s1, s9, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
; GFX1010: ; %bb.0:
; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s6, s4, s6
; GFX1010-NEXT: s_subb_u32 s7, s5, s7
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
; GFX1010-NEXT: v_mov_b32_e32 v1, s7
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_sub_u32 s0, s8, s10
; GFX1010-NEXT: s_subb_u32 s1, s9, s11
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: susubo64:
Expand Down
Loading