46 changes: 46 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Unaligned DS access is available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
Expand Down Expand Up @@ -80,6 +81,14 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_b128 v[0:3], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
ret <4 x i32> %load
}
Expand Down Expand Up @@ -145,6 +154,14 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_b32 v2, v2 offset:8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v3i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_b96 v[0:2], v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
ret <3 x i32> %load
}
Expand Down Expand Up @@ -200,6 +217,14 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_lds_v4i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_store_b128 v0, v[1:4]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
}
Expand Down Expand Up @@ -248,6 +273,14 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX10-NEXT: ds_write_b32 v0, v3 offset:8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_lds_v3i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_store_b96 v0, v[1:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
}
Expand Down Expand Up @@ -290,6 +323,19 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)*
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_s_load_constant_v8i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
store <8 x i32> %load, <8 x i32> addrspace(1)* %out
ret void
Expand Down
832 changes: 449 additions & 383 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll

Large diffs are not rendered by default.

745 changes: 432 additions & 313 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s

define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
; GFX9-LABEL: v_mul_v2i16:
Expand Down
311 changes: 205 additions & 106 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll

Large diffs are not rendered by default.

184 changes: 112 additions & 72 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

define float @v_roundeven_f32(float %x) {
; GFX6-LABEL: v_roundeven_f32:
Expand All @@ -30,12 +31,12 @@ define float @v_roundeven_f32(float %x) {
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call float @llvm.roundeven.f32(float %x)
ret float %roundeven
}
Expand Down Expand Up @@ -69,13 +70,13 @@ define <2 x float> @v_roundeven_v2f32(<2 x float> %x) {
; GFX9-NEXT: v_rndne_f32_e32 v1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_rndne_f32_e32 v1, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_v2f32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x)
ret <2 x float> %roundeven
}
Expand Down Expand Up @@ -113,14 +114,14 @@ define <3 x float> @v_roundeven_v3f32(<3 x float> %x) {
; GFX9-NEXT: v_rndne_f32_e32 v2, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v3f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_rndne_f32_e32 v1, v1
; GFX10-NEXT: v_rndne_f32_e32 v2, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_v3f32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1
; GFX10PLUS-NEXT: v_rndne_f32_e32 v2, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x)
ret <3 x float> %roundeven
}
Expand Down Expand Up @@ -162,15 +163,15 @@ define <4 x float> @v_roundeven_v4f32(<4 x float> %x) {
; GFX9-NEXT: v_rndne_f32_e32 v3, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_rndne_f32_e32 v1, v1
; GFX10-NEXT: v_rndne_f32_e32 v2, v2
; GFX10-NEXT: v_rndne_f32_e32 v3, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_v4f32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0
; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1
; GFX10PLUS-NEXT: v_rndne_f32_e32 v2, v2
; GFX10PLUS-NEXT: v_rndne_f32_e32 v3, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
ret <4 x float> %roundeven
}
Expand Down Expand Up @@ -204,12 +205,12 @@ define half @v_roundeven_f16(half %x) {
; GFX9-NEXT: v_rndne_f16_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f16_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f16_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call half @llvm.roundeven.f16(half %x)
ret half %roundeven
}
Expand Down Expand Up @@ -264,6 +265,17 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
ret <2 x half> %roundeven
}
Expand Down Expand Up @@ -331,6 +343,18 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v2f16_fneg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
ret <2 x half> %roundeven
Expand Down Expand Up @@ -408,6 +432,22 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_rndne_f16_e32 v2, v2
; GFX11-NEXT: v_rndne_f16_e32 v3, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
ret <4 x half> %roundeven
}
Expand Down Expand Up @@ -438,12 +478,12 @@ define float @v_roundeven_f32_fabs(float %x) {
; GFX9-NEXT: v_rndne_f32_e64 v0, |v0|
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f32_fabs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e64 v0, |v0|
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f32_fabs:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e64 v0, |v0|
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%roundeven = call float @llvm.roundeven.f32(float %fabs.x)
ret float %roundeven
Expand All @@ -470,10 +510,10 @@ define amdgpu_ps float @s_roundeven_f32(float inreg %x) {
; GFX9-NEXT: v_rndne_f32_e32 v0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_roundeven_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_rndne_f32_e32 v0, s0
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: s_roundeven_f32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
%roundeven = call float @llvm.roundeven.f32(float %x)
ret float %roundeven
}
Expand Down Expand Up @@ -503,12 +543,12 @@ define float @v_roundeven_f32_fneg(float %x) {
; GFX9-NEXT: v_rndne_f32_e64 v0, -v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f32_fneg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f32_e64 v0, -v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f32_fneg:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f32_e64 v0, -v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%roundeven = call float @llvm.roundeven.f32(float %neg.x)
ret float %roundeven
Expand Down Expand Up @@ -548,12 +588,12 @@ define double @v_roundeven_f64(double %x) {
; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call double @llvm.roundeven.f64(double %x)
ret double %roundeven
}
Expand Down Expand Up @@ -593,12 +633,12 @@ define double @v_roundeven_f64_fneg(double %x) {
; GFX9-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_f64_fneg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_f64_fneg:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg double %x
%roundeven = call double @llvm.roundeven.f64(double %neg.x)
ret double %roundeven
Expand Down Expand Up @@ -648,13 +688,13 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
; GFX9-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
; GFX10-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10PLUS-LABEL: v_roundeven_v2f64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
; GFX10PLUS-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
ret <2 x double> %roundeven
}
Expand Down
244 changes: 163 additions & 81 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll

Large diffs are not rendered by default.

789 changes: 431 additions & 358 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Test gfx9+ s_shl[1-4]_add_u32 pattern matching

Expand Down
129 changes: 129 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; FIXME:
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
Expand Down Expand Up @@ -47,6 +48,18 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: ds_write_b128 v4, v[0:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_store_b128 v4, v[0:3]
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out
ret void
}
Expand Down Expand Up @@ -219,6 +232,60 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b8 v1, v4 offset:14
; GFX10-NEXT: ds_write_b8 v1, v5 offset:15
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000
; GFX11-NEXT: s_lshr_b32 s2, s4, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
; GFX11-NEXT: s_lshr_b32 s3, s3, s1
; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000
; GFX11-NEXT: s_bfe_u32 s8, s6, 0x100000
; GFX11-NEXT: s_lshr_b32 s9, s2, s1
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: s_lshr_b32 s5, s6, 16
; GFX11-NEXT: s_lshr_b32 s2, s4, s1
; GFX11-NEXT: s_lshr_b32 s4, s0, s1
; GFX11-NEXT: s_lshr_b32 s0, s8, s1
; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s2
; GFX11-NEXT: v_mov_b32_e32 v8, s4
; GFX11-NEXT: ds_store_b8 v1, v0
; GFX11-NEXT: ds_store_b8 v1, v5 offset:1
; GFX11-NEXT: ds_store_b8 v1, v3 offset:2
; GFX11-NEXT: ds_store_b8 v1, v6 offset:3
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
; GFX11-NEXT: ds_store_b8 v1, v7 offset:5
; GFX11-NEXT: ds_store_b8 v1, v4 offset:6
; GFX11-NEXT: ds_store_b8 v1, v8 offset:7
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_lshr_b32 s2, s7, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v7, s2
; GFX11-NEXT: s_lshr_b32 s0, s5, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s7
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: s_bfe_u32 s0, s7, 0x100000
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_lshr_b32 s0, s2, s1
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: ds_store_b8 v1, v0 offset:8
; GFX11-NEXT: ds_store_b8 v1, v2 offset:9
; GFX11-NEXT: ds_store_b8 v1, v3 offset:10
; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
; GFX11-NEXT: ds_store_b8 v1, v7 offset:14
; GFX11-NEXT: ds_store_b8 v1, v8 offset:15
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
}
Expand Down Expand Up @@ -309,6 +376,31 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b16 v1, v7 offset:10
; GFX10-NEXT: ds_write_b16 v1, v8 offset:14
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s4, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
; GFX11-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: s_lshr_b32 s2, s6, 16
; GFX11-NEXT: s_lshr_b32 s3, s7, 16
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s2
; GFX11-NEXT: v_mov_b32_e32 v8, s3
; GFX11-NEXT: ds_store_b16 v1, v0
; GFX11-NEXT: ds_store_b16 v1, v5 offset:2
; GFX11-NEXT: ds_store_b16 v1, v2 offset:4
; GFX11-NEXT: ds_store_b16 v1, v6 offset:6
; GFX11-NEXT: ds_store_b16 v1, v3 offset:8
; GFX11-NEXT: ds_store_b16 v1, v7 offset:10
; GFX11-NEXT: ds_store_b16 v1, v4 offset:12
; GFX11-NEXT: ds_store_b16 v1, v8 offset:14
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
ret void
}
Expand Down Expand Up @@ -357,6 +449,19 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_mov_b32_e32 v4, s7
; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
; GFX11-NEXT: ds_store_2addr_b32 v1, v3, v4 offset0:2 offset1:3
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
ret void
}
Expand Down Expand Up @@ -403,6 +508,18 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
ret void
}
Expand Down Expand Up @@ -448,6 +565,18 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: ds_write_b128 v4, v[0:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_store_b128 v4, v[0:3]
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
ret void
}
108 changes: 108 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; FIXME:
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
Expand Down Expand Up @@ -44,6 +45,17 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b96 v3, v[0:2]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: ds_store_b96 v3, v[0:2]
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out
ret void
}
Expand Down Expand Up @@ -181,6 +193,46 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b8 v1, v2 offset:10
; GFX10-NEXT: ds_write_b8 v1, v4 offset:11
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000
; GFX11-NEXT: s_lshr_b32 s2, s4, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: s_lshr_b32 s5, s6, 16
; GFX11-NEXT: s_lshr_b32 s3, s3, s1
; GFX11-NEXT: s_bfe_u32 s7, s6, 0x100000
; GFX11-NEXT: s_lshr_b32 s6, s2, s1
; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: s_lshr_b32 s2, s4, s1
; GFX11-NEXT: s_lshr_b32 s4, s0, s1
; GFX11-NEXT: s_lshr_b32 s0, s7, s1
; GFX11-NEXT: s_lshr_b32 s1, s5, s1
; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s2
; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v12, s1
; GFX11-NEXT: ds_store_b8 v1, v0
; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
}
Expand Down Expand Up @@ -256,6 +308,27 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b16 v1, v5 offset:6
; GFX10-NEXT: ds_write_b16 v1, v6 offset:10
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s4, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: s_lshr_b32 s2, s6, 16
; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_mov_b32_e32 v6, s2
; GFX11-NEXT: ds_store_b16 v1, v0
; GFX11-NEXT: ds_store_b16 v1, v3 offset:2
; GFX11-NEXT: ds_store_b16 v1, v2 offset:4
; GFX11-NEXT: ds_store_b16 v1, v5 offset:6
; GFX11-NEXT: ds_store_b16 v1, v4 offset:8
; GFX11-NEXT: ds_store_b16 v1, v6 offset:10
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
ret void
}
Expand Down Expand Up @@ -301,6 +374,18 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX10-NEXT: ds_write_b32 v1, v3 offset:8
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
; GFX11-NEXT: ds_store_b32 v1, v3 offset:8
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
ret void
}
Expand Down Expand Up @@ -346,6 +431,18 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX10-NEXT: ds_write_b32 v1, v3 offset:8
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1
; GFX11-NEXT: ds_store_b32 v1, v3 offset:8
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
ret void
}
Expand Down Expand Up @@ -388,6 +485,17 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: ds_write_b96 v3, v[0:2]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x10
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: ds_store_b96 v3, v[0:2]
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
ret void
}
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) {
; GCN-LABEL: scalar_xnor_i32_one_use:
Expand Down
56 changes: 56 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s

define i32 @zextload_global_i1_to_i32(i1 addrspace(1)* %ptr) {
; GFX9-LABEL: zextload_global_i1_to_i32:
Expand Down Expand Up @@ -159,6 +160,15 @@ define i64 @zextload_global_i1_to_i64(i1 addrspace(1)* %ptr) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i1_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i1, i1 addrspace(1)* %ptr
%ext = zext i1 %load to i64
ret i64 %ext
Expand Down Expand Up @@ -200,6 +210,15 @@ define i64 @zextload_global_i8_to_i64(i8 addrspace(1)* %ptr) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i8_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i8, i8 addrspace(1)* %ptr
%ext = zext i8 %load to i64
ret i64 %ext
Expand Down Expand Up @@ -241,6 +260,15 @@ define i64 @zextload_global_i16_to_i64(i16 addrspace(1)* %ptr) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i16_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i16, i16 addrspace(1)* %ptr
%ext = zext i16 %load to i64
ret i64 %ext
Expand Down Expand Up @@ -282,6 +310,15 @@ define i64 @zextload_global_i32_to_i64(i32 addrspace(1)* %ptr) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i32_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i64
ret i64 %ext
Expand Down Expand Up @@ -327,6 +364,15 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i32_to_i96:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i96
ret i96 %ext
Expand Down Expand Up @@ -376,6 +422,16 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zextload_global_i32_to_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i128
ret i128 %ext
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/add3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s

; ===================================================================================
; V_ADD3_U32
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/add_shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s

; ===================================================================================
; V_ADD_LSHL_U32
Expand Down
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11

define amdgpu_kernel void @test0() {
; GFX9-LABEL: test0:
Expand All @@ -10,6 +11,10 @@ define amdgpu_kernel void @test0() {
; GFX10-LABEL: test0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_endpgm
tail call void @llvm.amdgcn.endpgm()
unreachable
}
Expand All @@ -25,6 +30,12 @@ define void @test1() {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
tail call void @llvm.amdgcn.endpgm()
unreachable
}
Expand Down Expand Up @@ -63,6 +74,23 @@ define amdgpu_kernel void @test2(i32* %p, i32 %x) {
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
; GFX11-NEXT: s_endpgm
%cond = icmp sgt i32 %x, 0
br i1 %cond, label %then, label %else

Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/and_or.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s

; ===================================================================================
; V_AND_OR_B32
Expand Down
653 changes: 653 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Large diffs are not rendered by default.

992 changes: 992 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Large diffs are not rendered by default.

218 changes: 218 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
Expand Down Expand Up @@ -148,6 +150,85 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: .LBB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[10:11], exec
; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT: s_cbranch_execz .LBB0_4
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_mov_b64 s[12:13], exec
; GFX1164-NEXT: s_mov_b64 s[10:11], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.2:
; GFX1164-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_mul_i32 s12, s12, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, s12
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX1164-NEXT: .LBB0_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1164-NEXT: ; %bb.5: ; %if
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_saveexec_b32 s8, s9
; GFX1132-NEXT: s_cbranch_execz .LBB0_4
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_mov_b32 s10, exec_lo
; GFX1132-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.2:
; GFX1132-NEXT: s_bcnt1_i32_b32 s10, s10
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_mul_i32 s10, s10, 5
; GFX1132-NEXT: v_mov_b32_e32 v1, s10
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX1132-NEXT: .LBB0_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6
; GFX1132-NEXT: ; %bb.5: ; %if
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
Expand Down Expand Up @@ -403,6 +484,143 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-NEXT: ; implicit-def: $vgpr4
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s12, v1, 31
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s12
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s12, v1, 15
; GFX1164-NEXT: v_readlane_b32 s13, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s12, 16
; GFX1164-NEXT: s_mov_b64 exec, s[10:11]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
; GFX1164-NEXT: v_readlane_b32 s14, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s13, 32
; GFX1164-NEXT: s_mov_b64 exec, s[10:11]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT: v_writelane_b32 v3, s14, 48
; GFX1164-NEXT: s_mov_b64 exec, s[10:11]
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB1_3
; GFX1164-NEXT: ; %bb.2:
; GFX1164-NEXT: v_mov_b32_e32 v0, s12
; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX1164-NEXT: .LBB1_3:
; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1164-NEXT: .LBB1_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1164-NEXT: ; %bb.5: ; %if
; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0
; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_mov_b32 s8, exec_lo
; GFX1132-NEXT: ; implicit-def: $vgpr4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_mov_b32 s9, s8
; GFX1132-NEXT: s_and_saveexec_b32 s8, s9
; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_readlane_b32 s11, v1, 31
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_readlane_b32 s10, v1, 15
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
; GFX1132-NEXT: v_writelane_b32 v3, s10, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB1_3
; GFX1132-NEXT: ; %bb.2:
; GFX1132-NEXT: v_mov_b32_e32 v0, s11
; GFX1132-NEXT: s_mov_b32 s10, s11
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX1132-NEXT: .LBB1_3:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1132-NEXT: .LBB1_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1132-NEXT: ; %bb.5: ; %if
; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0
; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
Expand Down
526 changes: 526 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

Large diffs are not rendered by default.

602 changes: 602 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions llvm/test/CodeGen/AMDGPU/carryout-selection.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1010,GFX10W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1030,GFX10W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1030,GFX10W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s

; GCN-ISEL-LABEL: name: sadd64rr
; GCN-ISEL-LABEL: body:
Expand Down Expand Up @@ -57,6 +58,9 @@ entry:
; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
;
; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vadd64rr(i64 addrspace(1)* %out, i64 %a) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -89,6 +93,9 @@ entry:
; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0, 0x1234, [[CARRY]]
; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0, 0x1234, [[CARRY]]
;
; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0, 0x1234, [[CARRY]]
define amdgpu_kernel void @vadd64ri(i64 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -134,6 +141,9 @@ define amdgpu_kernel void @suaddo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX10W32: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
;
; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
define amdgpu_kernel void @uaddo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
Expand Down Expand Up @@ -181,6 +191,9 @@ define amdgpu_kernel void @suaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
;
; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vuaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -242,6 +255,9 @@ entry:
; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
;
; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vsub64rr(i64 addrspace(1)* %out, i64 %a) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -274,6 +290,9 @@ entry:
; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0x1234, 0, [[CARRY]]
; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, 0x1234, 0, [[CARRY]]
;
; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, 0x1234, 0, [[CARRY]]
define amdgpu_kernel void @vsub64ri(i64 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -319,6 +338,9 @@ define amdgpu_kernel void @susubo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX10W32: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
;
; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
define amdgpu_kernel void @usubo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
Expand Down Expand Up @@ -366,6 +388,9 @@ define amdgpu_kernel void @susubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
;
; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down
143 changes: 143 additions & 0 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s

define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
Expand All @@ -15,6 +16,10 @@ define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX1010-LABEL: test_kern_empty:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_empty:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: s_endpgm
entry:
ret void
}
Expand Down Expand Up @@ -46,6 +51,14 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_stack:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v0, 0
; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
Expand Down Expand Up @@ -113,6 +126,24 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
entry:
tail call void @ex() #0
ret void
Expand Down Expand Up @@ -188,6 +219,28 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_kern_stack_and_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v1, 0
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm

entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
Expand All @@ -210,6 +263,12 @@ define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_empty:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_endpgm

entry:
ret void
}
Expand Down Expand Up @@ -244,6 +303,15 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v0, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
Expand Down Expand Up @@ -314,6 +382,43 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
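; NOTE(review): the gfx1010 lines from here down to "entry" omit the colon
; after the NEXT suffix, so FileCheck does not treat them as directives. They
; look like a stale copy of the active gfx1010 checks for this function -
; confirm, then remove them or regenerate the checks.
; GFX1010-NEXT s_add_u32 s12, s12, s17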
; GFX1010-NEXT s_mov_b32 s32, 0
; GFX1010-NEXT s_mov_b32 s33, 0
; GFX1010-NEXT s_addc_u32 s13, s13, 0
; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT s_add_u32 s0, s0, s17
; GFX1010-NEXT s_addc_u32 s1, s1, 0
; GFX1010-NEXT s_mov_b32 s12, s14
; GFX1010-NEXT s_mov_b32 s13, s15
; GFX1010-NEXT v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT s_mov_b32 s14, s16
; GFX1010-NEXT s_getpc_b64 s[18:19]
; GFX1010-NEXT s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT s_endpgm
entry:
tail call void @ex() #2
ret void
Expand Down Expand Up @@ -392,6 +497,28 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: v_mov_b32_e32 v1, 0
; GFX1100-NEXT: v_mov_b32_e32 v31, v0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
Expand Down Expand Up @@ -451,6 +578,22 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
;
; GFX1100-LABEL: test_sgpr_offset_kernel:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: scratch_load_b32 v0, off, off offset:8 glc dlc
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: s_movk_i32 s0, 0x1000
; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
; GFX1100-NEXT: s_movk_i32 s0, 0x1000
; GFX1100-NEXT: ;;#ASMSTART
; GFX1100-NEXT: ;;#ASMEND
; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:8 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
Expand Down
204 changes: 204 additions & 0 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s

define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
Expand Down Expand Up @@ -47,6 +49,18 @@ define <2 x half> @chain_hi_to_lo_private() {
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s0, 2
; GFX11-NEXT: scratch_load_u16 v0, off, s0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
%load_lo = load half, half addrspace(5)* %gep_lo
Expand Down Expand Up @@ -97,6 +111,16 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_u16 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(5)* %base_lo
%load_hi = load half, half addrspace(5)* %base_hi
Expand Down Expand Up @@ -145,6 +169,16 @@ define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in)
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_arithmatic:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%arith_lo = fadd half %in, 1.0
%load_hi = load half, half addrspace(5)* %base
Expand Down Expand Up @@ -176,6 +210,17 @@ define <2 x half> @chain_hi_to_lo_group() {
; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_load_u16 v0, v1 offset:2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
%load_lo = load half, half addrspace(3)* %gep_lo
Expand Down Expand Up @@ -207,6 +252,16 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base
; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_u16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_load_u16_d16_hi v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(3)* %base_lo
%load_hi = load half, half addrspace(3)* %base_hi
Expand Down Expand Up @@ -244,6 +299,20 @@ define <2 x half> @chain_hi_to_lo_global() {
; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 2
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
%load_lo = load half, half addrspace(1)* %gep_lo
Expand Down Expand Up @@ -275,6 +344,16 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %bas
; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(1)* %base_lo
%load_hi = load half, half addrspace(1)* %base_hi
Expand Down Expand Up @@ -312,6 +391,20 @@ define <2 x half> @chain_hi_to_lo_flat() {
; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 2
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half* null, i64 1
%load_lo = load half, half* %gep_lo
Expand Down Expand Up @@ -343,6 +436,16 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %ba
; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half* %base_lo
%load_hi = load half, half* %base_hi
Expand Down Expand Up @@ -483,6 +586,31 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR_GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: vload2_private:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4
; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%loc = alloca [3 x i16], align 2, addrspace(5)
%loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
Expand Down Expand Up @@ -534,6 +662,18 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_u16_d16_hi v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%load_lo = load i16, i16 addrspace(3)* %gep_lo
Expand Down Expand Up @@ -568,6 +708,18 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %
; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_load_u16 v1, v0 offset:2
; GFX11-NEXT: ds_load_u16_d16_hi v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
Expand Down Expand Up @@ -625,6 +777,18 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
%load_lo = load i16, i16 addrspace(5)* %gep_lo
Expand Down Expand Up @@ -660,6 +824,19 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
Expand Down Expand Up @@ -698,6 +875,19 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
Expand Down Expand Up @@ -734,6 +924,20 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX11-NEXT: ds_load_u16 v3, v0
; GFX11-NEXT: ds_store_b16 v1, v2
; GFX11-NEXT: ds_load_u16 v0, v0 offset:2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s

; Test that unused lanes in the s_xor result are masked out with v_cndmask.

Expand Down
267 changes: 267 additions & 0 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll

Large diffs are not rendered by default.

549 changes: 549 additions & 0 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

127 changes: 127 additions & 0 deletions llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

declare i32 @llvm.amdgcn.workitem.id.x() #0

Expand Down Expand Up @@ -32,6 +33,14 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:12
; GFX11-NEXT: s_endpgm
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 0, %x.i
Expand Down Expand Up @@ -94,6 +103,23 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_b32 v2, v3 offset:12
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 0, %x.i
Expand Down Expand Up @@ -158,6 +184,23 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: v_not_b32_e32 v0, v0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_b32 v2, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 -1, %x.i
%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
Expand Down Expand Up @@ -190,6 +233,12 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0
; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%shl = shl i32 %x.i, 4
%add = add i32 %shl, 65535
Expand Down Expand Up @@ -224,6 +273,14 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%.neg = mul i32 %x.i, -4
%add = add i32 %.neg, 65535
Expand Down Expand Up @@ -258,6 +315,14 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -292,6 +357,14 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX11-NEXT: ds_store_b8 v0, v1
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -329,6 +402,15 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:456
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -369,6 +451,15 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -408,6 +499,15 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0
; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -474,6 +574,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
; GFX10-NEXT: global_store_dword v[0:1], v5, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_2addr_b32 v2, v3, v4 offset1:1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down Expand Up @@ -514,6 +632,15 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0
; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1034 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1034 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1035 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1035 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1036 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1036 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1100 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1100 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1101 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1101 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1102 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1102 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1103 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1103 %s

; FIXME: With the default attributes the eflags are not accurate for
; xnack and sramecc. Subsequent Target-ID patches will address this.
Expand Down Expand Up @@ -129,6 +133,10 @@
; GFX1034: EF_AMDGPU_MACH_AMDGCN_GFX1034 (0x3E)
; GFX1035: EF_AMDGPU_MACH_AMDGCN_GFX1035 (0x3D)
; GFX1036: EF_AMDGPU_MACH_AMDGCN_GFX1036 (0x45)
; GFX1100: EF_AMDGPU_MACH_AMDGCN_GFX1100 (0x41)
; GFX1101: EF_AMDGPU_MACH_AMDGCN_GFX1101 (0x46)
; GFX1102: EF_AMDGPU_MACH_AMDGCN_GFX1102 (0x47)
; GFX1103: EF_AMDGPU_MACH_AMDGCN_GFX1103 (0x44)
; ALL: ]

define amdgpu_kernel void @elf_header() {
Expand Down
47 changes: 47 additions & 0 deletions llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

define i32 @s_add_co_select_user() {
; GFX7-LABEL: s_add_co_select_user:
Expand Down Expand Up @@ -55,6 +56,25 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_add_co_select_user:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0
; GFX11-NEXT: s_addc_u32 s1, s0, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s2
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%i = load volatile i32, i32 addrspace(4)* null, align 8
%i1 = add i32 %i, %i
Expand Down Expand Up @@ -148,6 +168,33 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_co_br_user:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s1, s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lt_u32 s1, s0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%i1 = add i32 %i, %i
%i2 = icmp ult i32 %i1, %i
Expand Down
Loading