diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll index d7d1a4cc819dbf..fc4c3f710c28fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -3,10 +3,13 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s ; GCN-LABEL: test_local_misaligned_v2: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_write2_b32 +; GCN-DAG: ds_{{read2|load_2addr}}_b32 +; GCN-DAG: ds_{{write2|store_2addr}}_b32 define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -22,12 +25,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; ALIGNED-DAG: ds_read2_b32 -; ALIGNED-DAG: ds_read2_b32 -; ALIGNED-DAG: ds_write2_b32 -; ALIGNED-DAG: ds_write2_b32 -; UNALIGNED-DAG: ds_read2_b64 -; UNALIGNED-DAG: ds_write2_b64 +; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32 +; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32 +; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32 +; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32 +; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64 +; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -47,12 +50,12 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; ALIGNED-DAG: ds_read2_b32 -; ALIGNED-DAG: ds_read_b32 -; ALIGNED-DAG: ds_write2_b32 -; ALIGNED-DAG: ds_write_b32 -; UNALIGNED-DAG: ds_read_b96 -; UNALIGNED-DAG: ds_write_b96 +; ALIGNED-DAG: ds_{{read2|load_2addr}}_b32 +; ALIGNED-DAG: ds_{{read|load}}_b32 +; ALIGNED-DAG: ds_{{write2|store_2addr}}_b32 +; ALIGNED-DAG: ds_{{write|store}}_b32 +; UNALIGNED-DAG: ds_{{read|load}}_b96 +; UNALIGNED-DAG: ds_{{write|store}}_b96 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -70,8 +73,8 @@ bb: } ; GCN-LABEL: test_local_aligned_v2: -; GCN-DAG: ds_read_b64 -; GCN-DAG: ds_write_b64 +; GCN-DAG: ds_{{read|load}}_b64 +; GCN-DAG: ds_{{write|store}}_b64 define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -87,8 +90,8 @@ bb: } ; GCN-LABEL: test_local_aligned_v3: -; GCN-DAG: ds_read_b96 -; GCN-DAG: ds_write_b96 +; GCN-DAG: ds_{{read|load}}_b96 +; GCN-DAG: ds_{{write|store}}_b96 define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -106,14 +109,14 @@ bb: } ; GCN-LABEL: test_local_v4_aligned8: -; ALIGNED-WGP-DAG: ds_read2_b32 -; ALIGNED-WGP-DAG: ds_read2_b32 -; ALIGNED-WGP-DAG: ds_write2_b32 -; ALIGNED-WGP-DAG: ds_write2_b32 -; ALIGNED-CU-DAG: ds_read2_b64 -; ALIGNED-CU-DAG: ds_write2_b64 -; UNALIGNED-DAG: ds_read2_b64 -; UNALIGNED-DAG: ds_write2_b64 +; ALIGNED-WGP-DAG: ds_{{read2|load_2addr}}_b32 +; ALIGNED-WGP-DAG: ds_{{read2|load_2addr}}_b32 +; ALIGNED-WGP-DAG: ds_{{write2|store_2addr}}_b32 +; ALIGNED-WGP-DAG: ds_{{write2|store_2addr}}_b32 +; ALIGNED-CU-DAG: ds_{{read2|load_2addr}}_b64 +; ALIGNED-CU-DAG: ds_{{write2|store_2addr}}_b64 +; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64 +; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 0c2ba16034b37d..7cdaa805749433 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; FIXME: VI or should be unnecessary @@ -18,16 +19,16 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_add_u16_e32 v2, v0, v1 -; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u16_e32 v3, v4, v2 +; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16: @@ -40,10 +41,9 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16: @@ -57,12 +57,27 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -79,40 +94,35 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s6, s[6:7], 0x0 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_lshr_b32 s5, s7, 16 -; VI-NEXT: s_add_i32 s6, s6, s7 -; VI-NEXT: s_add_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xffff -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_load_dword s2, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_add_i32 s2, s2, s0 +; VI-NEXT: s_add_i32 s1, s1, s3 +; VI-NEXT: s_and_b32 s0, s2, 0xffff +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_pk_add_u16 v0, s11, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: @@ -120,15 +130,29 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_add_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, s2, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = add <2 x i16> %a, %b @@ -141,44 +165,54 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_add_i32 s0, s0, s0 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, s4, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_add_u16 v1, s2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, s2, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_add_u16 v1, s2, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_add_self_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, s2, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = add <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -190,46 +224,48 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_add_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s4, s4, s5 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_add_u16 v0, s2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_add_v2i16_kernarg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -241,48 +277,57 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, 0x1c8 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_add_u16_e32 v2, 0x7b, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b +; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -298,48 +343,57 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, 0xfffffc21 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_add_u16_e32 v2, 0xfcb3, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_neg_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -354,47 +408,56 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, -1 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_add_u16_e32 v2, -1, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u16_e32 v4, -1, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_inline_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -413,42 +476,51 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_add_u16_e32 v0, 32, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; VI-NEXT: v_add_u16_e32 v2, 32, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, 32 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -464,47 +536,56 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v2, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, 0x3f80 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s2, 1.0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -528,15 +609,15 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v1, v[0:1] glc +; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: flat_load_dword v3, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_add_u16_e32 v0, v1, v2 -; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u16_e32 v2, v4, v3 +; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: @@ -544,17 +625,16 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: @@ -568,14 +648,32 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -602,17 +700,17 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: flat_load_dword v6, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_add_u16_e32 v0, v4, v2 -; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: v_add_u16_e32 v0, v6, v2 +; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: @@ -626,13 +724,11 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: @@ -646,16 +742,33 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -682,17 +795,17 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] glc +; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v1, v[2:3] glc +; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_add_u16_e32 v0, v0, v1 -; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_u16_e32 v2, v4, v2 +; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: @@ -700,17 +813,16 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: @@ -724,14 +836,32 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -760,8 +890,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_u16_e32 v0, v0, v1 @@ -769,7 +899,7 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: @@ -777,11 +907,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -789,7 +918,7 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: @@ -798,13 +927,11 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -812,8 +939,32 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index dd32f9a35b0190..0c9248a7c0bab0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -1,20 +1,22 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SICI,SICIVI,GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SICI,CIVI,SICIVI,GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,SICIVI,GFX89,GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GFX9-NOT: m0 +; GFX9PLUS-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 @@ -25,19 +27,20 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* % } ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: -; GFX9-NOT: m0 +; GFX9PLUS-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GFX89-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 +; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 +; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32 ; GCN: [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { @@ -49,10 +52,10 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* % } ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset -; GFX9-NOT: m0 +; GFX9PLUS-NOT: m0 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GFX9: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b @@ -65,18 +68,19 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac } ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: -; GFX9-NOT: m0 +; GFX9PLUS-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12 -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48 +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 @@ -86,19 +90,20 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* } ; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: -; GFX9-NOT: m0 +; GFX9PLUS-NOT: m0 ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 ; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 +; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 +; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index fc6751bf5763c3..0cbed25036b3ae 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1,8 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS,GFX11 %s ; Make sure fdiv is promoted to f32. @@ -22,17 +23,17 @@ ; SI: v_div_fixup_f32 ; SI: v_cvt_f16_f32 -; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[LHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[RHS:v[0-9]+]] -; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]] -; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]] +; GFX8PLUS-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]] +; GFX8PLUS-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]] -; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]] -; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] -; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] -; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]] +; GFX8PLUS: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] +; GFX8PLUS: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] +; GFX8PLUS: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half addrspace(1)* %a, @@ -51,11 +52,11 @@ entry: } ; GCN-LABEL: {{^}}v_rcp_f16: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9_10-NOT: [[RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8PLUS-NOT: [[RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -69,11 +70,11 @@ entry: } ; GCN-LABEL: {{^}}v_rcp_f16_abs: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| -; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| +; GFX8PLUS-NOT: [RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -90,12 +91,12 @@ entry: ; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp. ; GCN-LABEL: {{^}}reciprocal_f16_rounded: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{.+}} -; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] -; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] -; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] -; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL16:v[0-9]+]], v{{.+}} +; GFX8PLUS: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] +; GFX8PLUS: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] +; GFX8PLUS: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] +; GFX8PLUS: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -109,11 +110,11 @@ entry: } ; GCN-LABEL: {{^}}v_rcp_f16_afn: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9_10-NOT: [[RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8PLUS-NOT: [[RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -127,11 +128,11 @@ entry: } ; GCN-LABEL: {{^}}v_rcp_f16_neg: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] -; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] +; GFX8PLUS-NOT: [RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -145,11 +146,11 @@ entry: } ; GCN-LABEL: {{^}}v_rsq_f16: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8PLUS-NOT: [RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -164,12 +165,13 @@ entry: } ; GCN-LABEL: {{^}}v_rsq_f16_neg: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9_10-NOT: [[VAL]] -; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]] -; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] -; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL:v[0-9]+]] +; GFX8PLUS-NOT: [[VAL]] +; GFX8PLUS: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX8PLUS-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] +; GFX8PLUS-NOT: [RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -184,13 +186,13 @@ entry: } ; GCN-LABEL: {{^}}v_fdiv_f16_afn: -; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[LHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[RHS:v[0-9]+]] -; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] +; GFX8PLUS: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] +; GFX8PLUS: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -206,13 +208,13 @@ entry: } ; GCN-LABEL: {{^}}v_fdiv_f16_unsafe: -; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[LHS:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[RHS:v[0-9]+]] -; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] +; GFX8PLUS: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] +; GFX8PLUS: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -227,11 +229,11 @@ entry: ret void } -; SI-LABEL: {{^}}div_afn_2_x_pat_f16: +; GCN-LABEL: {{^}}div_afn_2_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} -; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} -; GFX8_9_10: buffer_store_short [[MUL]] +; GFX8PLUS: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.*}}, [[MUL]] define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv afn half %x, 2.0 @@ -239,11 +241,11 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 { ret void } -; SI-LABEL: {{^}}div_afn_k_x_pat_f16: +; GCN-LABEL: {{^}}div_afn_k_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}} -; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} -; GFX8_9_10: buffer_store_short [[MUL]] +; GFX8PLUS: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.*}}, [[MUL]] define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv afn half %x, 10.0 @@ -251,11 +253,11 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 { ret void } -; SI-LABEL: {{^}}div_afn_neg_k_x_pat_f16: +; GCN-LABEL: {{^}}div_afn_neg_k_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}} -; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} -; GFX8_9_10: buffer_store_short [[MUL]] +; GFX8PLUS: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} +; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.*}}, [[MUL]] define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv afn half %x, -10.0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 63339529ad0fbb..22ccf1f412905b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -1,41 +1,42 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI,CIVI-HSA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX10 %s - -; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK-DAG: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], -; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]], -; CHECK: s_waitcnt lgkmcnt(0) -; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]] -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: flat_store_dword v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]] +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s + +; GCN-LABEL: {{^}}store_flat_i32: +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], +; GCN-DAG: s_load_{{dword|b32}} s[[SDATA:[0-9]+]], +; GCN: s_waitcnt lgkmcnt(0) +; GCN-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]] +; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; GCN: flat_store_{{dword|b32}} v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]] define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32* store volatile i32 %x, i32* %fptr, align 4 ret void } -; CHECK-LABEL: {{^}}store_flat_i64: -; CHECK: flat_store_dwordx2 +; GCN-LABEL: {{^}}store_flat_i64: +; GCN: flat_store_{{dwordx2|b64}} define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64* store volatile i64 %x, i64* %fptr, align 8 ret void } -; CHECK-LABEL: {{^}}store_flat_v4i32: -; CHECK: flat_store_dwordx4 +; GCN-LABEL: {{^}}store_flat_v4i32: +; GCN: flat_store_{{dwordx4|b128}} define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>* store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16 ret void } -; CHECK-LABEL: {{^}}store_flat_trunc_i16: -; CHECK: flat_store_short +; GCN-LABEL: {{^}}store_flat_trunc_i16: +; GCN: flat_store_{{short|b16}} define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* %y = trunc i32 %x to i16 @@ -43,8 +44,8 @@ define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) ret void } -; CHECK-LABEL: {{^}}store_flat_trunc_i8: -; CHECK: flat_store_byte +; GCN-LABEL: {{^}}store_flat_trunc_i8: +; GCN: flat_store_{{byte|b8}} define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* %y = trunc i32 %x to i8 @@ -54,8 +55,8 @@ define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) # -; CHECK-LABEL: load_flat_i32: -; CHECK: flat_load_dword +; GCN-LABEL: load_flat_i32: +; GCN: flat_load_{{dword|b32}} define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32* %fload = load volatile i32, i32* %fptr, align 4 @@ -63,8 +64,8 @@ define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 add ret void } -; CHECK-LABEL: load_flat_i64: -; CHECK: flat_load_dwordx2 +; GCN-LABEL: load_flat_i64: +; GCN: flat_load_{{dwordx2|b64}} define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64* %fload = load volatile i64, i64* %fptr, align 8 @@ -72,8 +73,8 @@ define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 add ret void } -; CHECK-LABEL: load_flat_v4i32: -; CHECK: flat_load_dwordx4 +; GCN-LABEL: load_flat_v4i32: +; GCN: flat_load_{{dwordx4|b128}} define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>* %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32 @@ -81,8 +82,8 @@ define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, ret void } -; CHECK-LABEL: sextload_flat_i8: -; CHECK: flat_load_sbyte +; GCN-LABEL: sextload_flat_i8: +; GCN: flat_load_{{sbyte|i8}} define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* %fload = load volatile i8, i8* %fptr, align 4 @@ -91,8 +92,8 @@ define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 a ret void } -; CHECK-LABEL: zextload_flat_i8: -; CHECK: flat_load_ubyte +; GCN-LABEL: zextload_flat_i8: +; GCN: flat_load_{{ubyte|u8}} define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8* %fload = load volatile i8, i8* %fptr, align 4 @@ -101,8 +102,8 @@ define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 a ret void } -; CHECK-LABEL: sextload_flat_i16: -; CHECK: flat_load_sshort +; GCN-LABEL: sextload_flat_i16: +; GCN: flat_load_{{sshort|i16}} define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* %fload = load volatile i16, i16* %fptr, align 4 @@ -111,8 +112,8 @@ define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 ret void } -; CHECK-LABEL: zextload_flat_i16: -; CHECK: flat_load_ushort +; GCN-LABEL: zextload_flat_i16: +; GCN: flat_load_{{ushort|u16}} define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16* %fload = load volatile i16, i16* %fptr, align 4 @@ -121,11 +122,11 @@ define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 ret void } -; CHECK-LABEL: flat_scratch_unaligned_load: -; CHECK: flat_load_ubyte -; CHECK: flat_load_ubyte -; CHECK: flat_load_ubyte -; CHECK: flat_load_ubyte +; GCN-LABEL: flat_scratch_unaligned_load: +; GCN: flat_load_{{ubyte|u8}} +; GCN: flat_load_{{ubyte|u8}} +; GCN: flat_load_{{ubyte|u8}} +; GCN: flat_load_{{ubyte|u8}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* @@ -133,11 +134,11 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { ret void } -; CHECK-LABEL: flat_scratch_unaligned_store: -; CHECK: flat_store_byte -; CHECK: flat_store_byte -; CHECK: flat_store_byte -; CHECK: flat_store_byte +; GCN-LABEL: flat_scratch_unaligned_store: +; GCN: flat_store_{{byte|b8}} +; GCN: flat_store_{{byte|b8}} +; GCN: flat_store_{{byte|b8}} +; GCN: flat_store_{{byte|b8}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast i32 addrspace(5)* %scratch to i32* @@ -145,11 +146,11 @@ define amdgpu_kernel void @flat_scratch_unaligned_store() { ret void } -; CHECK-LABEL: flat_scratch_multidword_load: +; GCN-LABEL: flat_scratch_multidword_load: ; CIVI-HSA: flat_load_dword v ; CIVI-HSA: flat_load_dword v ; GFX9: flat_load_dwordx2 -; GFX10: flat_load_dwordx2 +; GFX10PLUS: flat_load_{{dwordx2|b64}} ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr define amdgpu_kernel void @flat_scratch_multidword_load() { %scratch = alloca <2 x i32>, addrspace(5) @@ -158,11 +159,11 @@ define amdgpu_kernel void @flat_scratch_multidword_load() { ret void } -; CHECK-LABEL: flat_scratch_multidword_store: +; GCN-LABEL: flat_scratch_multidword_store: ; CIVI-HSA: flat_store_dword v ; CIVI-HSA: flat_store_dword v ; GFX9: flat_store_dwordx2 -; GFX10: flat_store_dwordx2 +; GFX10PLUS: flat_store_{{dwordx2|b64}} ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr define amdgpu_kernel void @flat_scratch_multidword_store() { %scratch = alloca <2 x i32>, addrspace(5) @@ -171,7 +172,7 @@ define amdgpu_kernel void @flat_scratch_multidword_store() { ret void } -; CHECK-LABEL: {{^}}store_flat_i8_max_offset: +; GCN-LABEL: {{^}}store_flat_i8_max_offset: ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} ; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}} define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 { @@ -180,15 +181,15 @@ define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 { ret void } -; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1: -; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} +; GCN-LABEL: {{^}}store_flat_i8_max_offset_p1: +; GCN: flat_store_{{byte|b8}} v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{( dlc)?}}{{$}} define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096 store volatile i8 %x, i8* %fptr.offset ret void } -; CHECK-LABEL: {{^}}store_flat_i8_neg_offset: +; GCN-LABEL: {{^}}store_flat_i8_neg_offset: ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}} ; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s @@ -200,27 +201,28 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 { ret void } -; CHECK-LABEL: {{^}}load_flat_i8_max_offset: +; GCN-LABEL: {{^}}load_flat_i8_max_offset: ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}} ; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}} +; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}} define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095 %val = load volatile i8, i8* %fptr.offset ret void } -; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1: +; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1: ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}} +; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}} define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 { %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096 %val = load volatile i8, i8* %fptr.offset ret void } -; CHECK-LABEL: {{^}}load_flat_i8_neg_offset: +; GCN-LABEL: {{^}}load_flat_i8_neg_offset: ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll index fc29b0e6374433..cdde9f3cf9e781 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll @@ -1,9 +1,10 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9_11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9_11 %s ; GCN-LABEL: flat_inst_offset: -; GFX9: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}] offset:4 -; GFX9: flat_store_dword v[{{[0-9:]+}}], v{{[0-9]+}} offset:4 +; GFX9_11: flat_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:4 +; GFX9_11: flat_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}} offset:4 ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} ; GFX10: flat_store_dword v[{{[0-9:]+}}], v{{[0-9]+}}{{$}} define void @flat_inst_offset(i32* nocapture %p) { @@ -15,8 +16,8 @@ define void @flat_inst_offset(i32* nocapture %p) { } ; GCN-LABEL: global_inst_offset: -; GCN: global_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}], off offset:4 -; GCN: global_store_dword v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 +; GCN: global_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}], off offset:4 +; GCN: global_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 define void @global_inst_offset(i32 addrspace(1)* nocapture %p) { %gep = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 1 %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -26,7 +27,7 @@ define void @global_inst_offset(i32 addrspace(1)* nocapture %p) { } ; GCN-LABEL: load_i16_lo: -; GFX9: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} +; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} ; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_i16_lo(i16* %arg, <2 x i16>* %out) { %gep = getelementptr inbounds i16, i16* %arg, i32 4 @@ -38,7 +39,7 @@ define amdgpu_kernel void @load_i16_lo(i16* %arg, <2 x i16>* %out) { } ; GCN-LABEL: load_i16_hi: -; GFX9: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} +; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} ; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_i16_hi(i16* %arg, <2 x i16>* %out) { %gep = getelementptr inbounds i16, i16* %arg, i32 4 @@ -50,7 +51,7 @@ define amdgpu_kernel void @load_i16_hi(i16* %arg, <2 x i16>* %out) { } ; GCN-LABEL: load_half_lo: -; GFX9: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} +; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} ; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_half_lo(half* %arg, <2 x half>* %out) { %gep = getelementptr inbounds half, half* %arg, i32 4 @@ -62,7 +63,7 @@ define amdgpu_kernel void @load_half_lo(half* %arg, <2 x half>* %out) { } ; GCN-LABEL: load_half_hi: -; GFX9: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} +; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}} ; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_half_hi(half* %arg, <2 x half>* %out) { %gep = getelementptr inbounds half, half* %arg, i32 4 @@ -74,7 +75,7 @@ define amdgpu_kernel void @load_half_hi(half* %arg, <2 x half>* %out) { } ; GCN-LABEL: load_float_lo: -; GFX9: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}] offset:16{{$}} +; GFX9_11: flat_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:16{{$}} ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}} define amdgpu_kernel void @load_float_lo(float* %arg, float* %out) { %gep = getelementptr inbounds float, float* %arg, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c9be3952f01a05..8230fbaf8b866d 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,8 +1,10 @@ -; XUN: llc -mtriple=amdgcn-amd-amdhsa -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-DENORM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-FLUSH %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s +; XUN: llc -mtriple=amdgcn-amd-amdhsa -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,VI-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,VI-FLUSH %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't ; make add an instruction if the fadd has more than one use. @@ -44,8 +46,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}} ; SIVI-DAG: v_mac_f32_e64 [[MAD:v[0-9]+]], [[X]], 2.0 ; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}} -; GCN-DAG: buffer_store_dword [[MUL2]] -; GCN-DAG: buffer_store_dword [[MAD]] +; GCN-DAG: buffer_store_{{dword|b32}} [[MUL2]] +; GCN-DAG: buffer_store_{{dword|b32}} [[MAD]] ; GCN: s_endpgm define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 @@ -60,8 +62,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| ; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} ; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}} -; GCN-DAG: buffer_store_dword [[MUL2]] -; GCN-DAG: buffer_store_dword [[MAD]] +; GCN-DAG: buffer_store_{{dword|b32}} [[MUL2]] +; GCN-DAG: buffer_store_{{dword|b32}} [[MAD]] ; GCN: s_endpgm define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 @@ -92,7 +94,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* ; GCN-LABEL: {{^}}fmul_x2_xn2_f32: ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_{{dword|b32}} [[RESULT]] define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 @@ -107,7 +109,7 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, f ; SIVI: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GFX10: v_mul_f32_e64 [[TMP0:v[0-9]+]], 0xc0c00000, [[X:s[0-9]+]] ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_{{dword|b32}} [[RESULT]] define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 @@ -154,8 +156,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]] ; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}} -; GCN-DAG: buffer_store_short [[MUL2]] -; GCN-DAG: buffer_store_short [[MAD]] +; GCN-DAG: buffer_store_{{short|b16}} [[MUL2]] +; GCN-DAG: buffer_store_{{short|b16}} [[MAD]] ; GCN: s_endpgm define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half @@ -176,8 +178,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]] ; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}} -; GCN-DAG: buffer_store_short [[MUL2]] -; GCN-DAG: buffer_store_short [[MAD]] +; GCN-DAG: buffer_store_{{short|b16}} [[MUL2]] +; GCN-DAG: buffer_store_{{short|b16}} [[MAD]] ; GCN: s_endpgm define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half @@ -221,7 +223,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* % ; GCN-LABEL: {{^}}fmul_x2_xn2_f16: ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] -; GCN: buffer_store_short [[RESULT]] +; GCN: buffer_store_{{short|b16}} [[RESULT]] define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half @@ -238,7 +240,7 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext ; SIVI: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GFX10: v_mul_f16_e64 [[TMP0:v[0-9]+]], 0xc600, [[X:s[0-9]+]] ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] -; GCN: buffer_store_short [[RESULT]] +; GCN: buffer_store_{{short|b16}} [[RESULT]] define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 4623129c2e8859..f2a13f6fc22a10 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1,13 +1,18 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s + +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fmuladd.f16(half, half, half) #1 @@ -73,8 +78,8 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -85,8 +90,8 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} +; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -103,8 +108,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half add } ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -115,8 +120,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half add ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -133,8 +138,8 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add } ; GCN-LABEL: {{^}}fadd_a_a_b_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -148,9 +153,9 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, @@ -170,8 +175,8 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, } ; GCN-LABEL: {{^}}fadd_b_a_a_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -185,9 +190,9 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, @@ -207,8 +212,8 @@ define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, } ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] @@ -216,8 +221,8 @@ define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -233,8 +238,8 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -243,10 +248,10 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] -; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -264,8 +269,8 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, } ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -274,10 +279,10 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] -; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -295,14 +300,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -320,9 +325,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}mad_sub_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] @@ -335,7 +340,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -355,9 +360,9 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out } ; GCN-LABEL: {{^}}mad_sub_inv_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] @@ -369,7 +374,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -389,9 +394,9 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}mad_sub_fabs_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| @@ -403,7 +408,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -424,9 +429,9 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| @@ -439,7 +444,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -460,9 +465,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap } ; GCN-LABEL: {{^}}neg_neg_mad_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] @@ -476,9 +481,9 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[REGC]] define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -500,9 +505,9 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}mad_fabs_sub_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] @@ -515,7 +520,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture ; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -536,8 +541,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -551,9 +556,9 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -571,8 +576,8 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add } ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16: -; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] @@ -585,7 +590,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] -; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll index b58a48dfe41539..aa5450047278e4 100644 --- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -1,10 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE64 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE32 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -17,7 +18,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 { ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -30,10 +31,10 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 { ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; WAVE64-NOT: [[FI]] -; WAVE64: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] +; WAVE64: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]] ; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]] -; WAVE32: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -46,7 +47,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 { ; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN-NOT: [[FI]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] +; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]] define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 66fb529326edfa..4cd9940b014827 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,9 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=GCN,CI,ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,VI,ALL %s ; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 < %s -mattr=-flat-for-global | FileCheck --check-prefixes=GCNHSA,ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s ; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GCNHSA,ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment @@ -18,8 +19,8 @@ ; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000 ; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000 -; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen -; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen +; GCNHSA: buffer_store_{{dword|b32}} {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen +; GCNHSA: buffer_load_{{dword|b32}} {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen ; GCNHSA: .amdhsa_kernel large_alloca_compute_shader ; GCNHSA: .amdhsa_group_segment_fixed_size 0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir index 2475304da706d0..7cb311f3bd70d7 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir @@ -1,8 +1,10 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s # GCN-LABEL: name: hazard_lds_branch_buf -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: hazard_lds_branch_buf body: | @@ -17,8 +19,9 @@ body: | ... # GCN-LABEL: name: hazard_buf_branch_lds -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: DS_READ_B32 +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: DS_READ_B32 --- name: hazard_buf_branch_lds body: | @@ -92,8 +95,8 @@ body: | ... # GCN-LABEL: name: hazard_lds_branch_buf_loop -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: DS_READ_B32 +# GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN: DS_READ_B32 # GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: hazard_lds_branch_buf_loop @@ -106,9 +109,10 @@ body: | ... # GCN-LABEL: name: single_hazard_lds_branch_buf -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: single_hazard_lds_branch_buf body: | @@ -159,8 +163,10 @@ body: | ... # GCN-LABEL: name: hazard_lds_branch_vscnt_1_buf -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +# GCN: bb.1: +# GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1 +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: hazard_lds_branch_vscnt_1_buf body: | @@ -194,8 +200,10 @@ body: | ... # GCN-LABEL: name: hazard_lds_branch_vscnt_s0_buf -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN +# GCN: bb.1: +# GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr0, 0 +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN --- name: hazard_lds_branch_vscnt_s0_buf body: | @@ -228,8 +236,9 @@ body: | ... # GCN-LABEL: name: hazard_lds_branch_global -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: GLOBAL_LOAD_DWORD +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: GLOBAL_LOAD_DWORD --- name: hazard_lds_branch_global body: | @@ -244,8 +253,9 @@ body: | ... # GCN-LABEL: name: hazard_lds_branch_scratch -# GCN: S_WAITCNT_VSCNT undef $sgpr_null, 0 -# GCN-NEXT: SCRATCH_LOAD_DWORD +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 +# GCN-NEXT: SCRATCH_LOAD_DWORD --- name: hazard_lds_branch_scratch body: | diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 234b57decf5e1b..b5fceed2c105b9 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -3,10 +3,13 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s ; GCN-LABEL: test_local_misaligned_v2: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_write2_b32 +; GCN-DAG: ds_{{read2|load_2addr}}_b32 +; GCN-DAG: ds_{{write2|store_2addr}}_b32 define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -22,10 +25,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write2_b32 +; GCN-DAG: ds_{{read2|load_2addr}}_b32 +; GCN-DAG: ds_{{read2|load_2addr}}_b32 +; GCN-DAG: ds_{{write2|store_2addr}}_b32 +; GCN-DAG: ds_{{write2|store_2addr}}_b32 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -45,10 +48,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write_b32 +; GCN-DAG: ds_{{read2|load_2addr}}_b32 +; GCN-DAG: ds_{{read|load}}_b32 +; GCN-DAG: ds_{{write2|store_2addr}}_b32 +; GCN-DAG: ds_{{write|store}}_b32 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -66,12 +69,12 @@ bb: } ; GCN-LABEL: test_flat_misaligned_v2: -; VECT-DAG: flat_load_dwordx2 v -; VECT-DAG: flat_store_dwordx2 v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v +; VECT-DAG: flat_load_{{dwordx2|b64}} v +; VECT-DAG: flat_store_{{dwordx2|b64}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -87,16 +90,16 @@ bb: } ; GCN-LABEL: test_flat_misaligned_v4: -; VECT-DAG: flat_load_dwordx4 v -; VECT-DAG: flat_store_dwordx4 v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v +; VECT-DAG: flat_load_{{dwordx4|b128}} v +; VECT-DAG: flat_store_{{dwordx4|b128}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -116,14 +119,14 @@ bb: } ; GCN-LABEL: test_flat_misaligned_v3: -; VECT-DAG: flat_load_dwordx3 v -; VECT-DAG: flat_store_dwordx3 v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_load_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v -; SPLIT-DAG: flat_store_dword v +; VECT-DAG: flat_load_{{dwordx3|b96}} v +; VECT-DAG: flat_store_{{dwordx3|b96}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_load_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v +; SPLIT-DAG: flat_store_{{dword|b32}} v define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -141,8 +144,8 @@ bb: } ; GCN-LABEL: test_local_aligned_v2: -; GCN-DAG: ds_read_b64 -; GCN-DAG: ds_write_b64 +; GCN-DAG: ds_{{read|load}}_b64 +; GCN-DAG: ds_{{write|store}}_b64 define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -158,8 +161,8 @@ bb: } ; GCN-LABEL: test_local_aligned_v3: -; GCN-DAG: ds_read_b96 -; GCN-DAG: ds_write_b96 +; GCN-DAG: ds_{{read|load}}_b96 +; GCN-DAG: ds_{{write|store}}_b96 define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -177,8 +180,8 @@ bb: } ; GCN-LABEL: test_flat_aligned_v2: -; GCN-DAG: flat_load_dwordx2 v -; GCN-DAG: flat_store_dwordx2 v +; GCN-DAG: flat_load_{{dwordx2|b64}} v +; GCN-DAG: flat_store_{{dwordx2|b64}} v define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -194,8 +197,8 @@ bb: } ; GCN-LABEL: test_flat_aligned_v4: -; GCN-DAG: flat_load_dwordx4 v -; GCN-DAG: flat_store_dwordx4 v +; GCN-DAG: flat_load_{{dwordx4|b128}} v +; GCN-DAG: flat_store_{{dwordx4|b128}} v define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,10 +218,10 @@ bb: } ; GCN-LABEL: test_local_v4_aligned8: -; ALIGNED-DAG: ds_read2_b64 -; ALIGNED-DAG: ds_write2_b64 -; UNALIGNED-DAG: ds_read2_b64 -; UNALIGNED-DAG: ds_write2_b64 +; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64 +; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64 +; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64 +; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -238,12 +241,12 @@ bb: } ; GCN-LABEL: test_flat_v4_aligned8: -; VECT-DAG: flat_load_dwordx4 v -; VECT-DAG: flat_store_dwordx4 v -; SPLIT-DAG: flat_load_dwordx2 v -; SPLIT-DAG: flat_load_dwordx2 v -; SPLIT-DAG: flat_store_dwordx2 v -; SPLIT-DAG: flat_store_dwordx2 v +; VECT-DAG: flat_load_{{dwordx4|b128}} v +; VECT-DAG: flat_store_{{dwordx4|b128}} v +; SPLIT-DAG: flat_load_{{dwordx2|b64}} v +; SPLIT-DAG: flat_load_{{dwordx2|b64}} v +; SPLIT-DAG: flat_store_{{dwordx2|b64}} v +; SPLIT-DAG: flat_store_{{dwordx2|b64}} v define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index b4bf563725a6de..2425fbb1896257 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) @@ -25,7 +26,7 @@ entry: ; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp ; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX940: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: {{v_dot2c_f32_f16_e32|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( float addrspace(1)* %r, <2 x half> addrspace(1)* %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index bd30de55ae0c05..f3490f45111574 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -1,15 +1,15 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED,PREGFX10,PREGFX10-UNPACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s - +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,PREGFX10,PREGFX10-UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,PREGFX10,PREGFX10-PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s ; GCN-LABEL: {{^}}tbuffer_store_d16_x: -; GCN-DAG: s_load_dwordx4 -; GCN-DAG: s_load_dword s[[S_LO:[0-9]+]] +; GCN-DAG: s_load_{{dwordx4|b128}} +; GCN-DAG: s_load_{{dword|b32}} s[[S_LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10: tbuffer_store_{{format_d16|d16_format}}_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -17,7 +17,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: -; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, +; GCN: s_load_{{dword|b32}} [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] @@ -25,7 +25,7 @@ main_body: ; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) @@ -33,7 +33,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} @@ -48,7 +48,7 @@ main_body: ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] ; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xyz v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -57,7 +57,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} @@ -72,7 +72,7 @@ main_body: ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 01d3d38e48a71a..c85a21fc85eb17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -1,15 +1,15 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PREGFX10,UNPACKED,PREGFX10-UNPACKED %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PREGFX10,PACKED,PREGFX10-PACKED %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PREGFX10,PACKED,PREGFX10-PACKED %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,PACKED,GFX10-PACKED %s - +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,UNPACKED,PREGFX10-UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,PACKED,PREGFX10-PACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,PACKED,PREGFX10-PACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,PACKED,GFX10-PACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,PACKED,GFX10-PACKED %s ; GCN-LABEL: {{^}}tbuffer_store_d16_x: -; GCN-DAG: s_load_dwordx4 -; GCN-DAG: s_load_dword{{[x0-2]*}} s[[[S_LO:[0-9]+]] +; GCN-DAG: s_load_{{dwordx4|b128}} +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[S_LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX10: tbuffer_store_{{format_d16|d16_format}}_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] idxen define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -17,7 +17,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: -; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN: s_load_{{dwordx2|b64}} s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] @@ -25,7 +25,7 @@ main_body: ; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) @@ -33,7 +33,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} @@ -47,7 +47,7 @@ main_body: ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] ; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] idxen define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -56,7 +56,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_{{dwordx2|b64}} s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} @@ -70,7 +70,7 @@ main_body: ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX10-PACKED: tbuffer_store_{{format_d16|d16_format}}_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_{{.*}}] idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 3bb3f0f2c5eba3..183b26bad7a732 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,17 +1,19 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX10,GFX10-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX10,GFX10-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ; GCN-LABEL: {{^}}fmuladd_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] @@ -25,12 +27,12 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] -; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] -; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10-FLUSH: buffer_store_short [[ADD]] +; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] -; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] -; GFX10-DENORM: buffer_store_short v[[C_F16]], +; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] +; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16( @@ -47,8 +49,8 @@ define amdgpu_kernel void @fmuladd_f16( } ; GCN-LABEL: {{^}}fmuladd_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]] @@ -62,12 +64,12 @@ define amdgpu_kernel void @fmuladd_f16( ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] -; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]] -; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10-FLUSH: buffer_store_short [[ADD]] +; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]] +; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] -; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] -; GFX10-DENORM: buffer_store_short v[[C_F16]], +; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] +; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_a( @@ -82,8 +84,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( } ; GCN-LABEL: {{^}}fmuladd_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]] +; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] ; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]] @@ -97,12 +99,12 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] -; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]] -; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10-FLUSH: buffer_store_short [[ADD]] +; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]] +; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] -; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] -; GFX10-DENORM: buffer_store_short v[[C_F16]], +; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] +; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_b( @@ -129,9 +131,9 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; GFX10PLUS: buffer_load_{{dword|b32}} v[[A_V2_F16:[0-9]+]] +; GFX10PLUS: buffer_load_{{dword|b32}} v[[B_V2_F16:[0-9]+]] +; GFX10PLUS: buffer_load_{{dword|b32}} v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -167,12 +169,12 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] -; GFX10-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]] +; GFX10PLUS-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX10PLUS-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]] -; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] +; GFX10PLUS-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] -; GCN: buffer_store_dword v[[R_V2_F16]] +; GCN: buffer_store_{{dword|b32}} v[[R_V2_F16]] define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index e9b6c7f8d5fb4b..46dc93f18cdafe 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -1,9 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA,GFX10-FMA %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9_10,FMA,GFX940-FMA %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10PLUS-MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -15,10 +16,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]] +; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10PLUS-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -44,16 +45,16 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], -; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], -; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]], +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]], +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]], ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10PLUS-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] -; GFX10-FMA-DAG:v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000 +; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000 ; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] ; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { @@ -81,9 +82,9 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo } ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: -; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; GFX10PLUS-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -108,10 +109,10 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]] +; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]] ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +; GFX10PLUS-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -130,13 +131,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out ; We can't use an SGPR when forming madak ; GCN-LABEL: {{^}}s_v_madak_f32: -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]] ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]] +; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 -; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX10PLUS-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 ; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]] define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -152,13 +153,13 @@ define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float } ; GCN-LABEL: @v_s_madak_f32 -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]] ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]] +; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]] ; GFX6_8_9-NOT: v_madak_f32 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 -; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX10PLUS-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 ; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]] define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -176,8 +177,8 @@ define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 ; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; GFX10-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; GFX10PLUS-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; GFX10PLUS-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 { %mul = fmul float %a, %b @@ -189,11 +190,11 @@ define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, flo ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]] ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 -; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; GFX10PLUS-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 ; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { @@ -216,11 +217,11 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalia ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]] +; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]] ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} -; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 -; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 +; GFX10PLUS-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 +; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 ; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { @@ -244,17 +245,17 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; because the implicit immediate already uses the constant bus. ; On GFX10+ we can use two scalar operands. ; GCN-LABEL: {{^}}madak_constant_bus_violation: -; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] -; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} +; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VGPR:v[0-9]+]] +; GCN: s_load_{{dword|b32}} [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} ; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000 ; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 -; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] -; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 -; GFX10-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GFX10PLUS: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] +; GFX10PLUS-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GFX10PLUS-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] -; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] +; GFX8PLUS: {{flat|global}}_store_{{dword|b32}} v[{{[0-9:]+}}], [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 74a090c5f673bf..1dfc85d2ece60e 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1,8 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,GFX8_9_10,FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,GFX8_9_10,FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX9_10,GFX8_9_10,FUNC %s -; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefixes=EG,FUNC %s +; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefixes=EG,FUNC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,GFX8_9_10,FUNC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,GFX8_9_10,FUNC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX9_10,GFX8_9_10,FUNC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX9_10,GFX8_9_10,FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 @@ -61,8 +62,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_i8: -; GCN: s_load_dword -; GCN: s_load_dword +; GCN: s_load_{{dword|b32}} +; GCN: s_load_{{dword|b32}} ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 @@ -75,9 +76,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], ; FIXME: Why vector and sdwa for last element? ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; GCN-DAG: s_load_dwordx2 -; GCN-DAG: s_load_dword s -; GCN-DAG: s_load_dword s +; GCN-DAG: s_load_{{dwordx2|b64}} +; GCN-DAG: s_load_{{dword|b32}} s +; GCN-DAG: s_load_{{dword|b32}} s ; GCN-NOT: _load_ ; CI: s_min_i32 @@ -107,7 +108,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 } ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: -; GCN: s_load_dwordx4 s +; GCN: s_load_{{dwordx4|b128}} s ; CI: s_ashr_i32 ; CI: s_sext_i32_i16 @@ -350,8 +351,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs ; CI: {{buffer|flat|global}}_load_ubyte ; CI: v_min_u32_e32 -; GFX8_9_10: {{flat|global}}_load_ubyte -; GFX8_9_10: {{flat|global}}_load_ubyte +; GFX8_9_10: {{flat|global}}_load_{{ubyte|u8}} +; GFX8_9_10: {{flat|global}}_load_{{ubyte|u8}} ; GFX8_9: v_min_u16_e32 ; GFX10: v_min_u16 @@ -492,11 +493,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, < ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_load_{{dword|b32}} [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_{{dword|b32}} [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN: s_min_u32 [[MIN:s[0-9]+]], s{{[0-9]}}, s{{[0-9]}} -; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], s{{[0-9]}} -; GCN: buffer_store_dword [[VMIN]] +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: {{flat|global}}_store_{{dword|b32}} v{{.+}}, [[VMIN]] ; EG: MIN_UINT define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 { @@ -512,14 +513,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspac ; Make sure redundant sign_extend_inreg removed. ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} -; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_load_{{dword|b32}} [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} +; GCN-DAG: s_load_{{dword|b32}} [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]] ; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]] ; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; GCN: buffer_store_dword [[VMIN]] +; GCN: {{flat|global}}_store_{{dword|b32}} v{{.+}}, [[VMIN]] ; EG: MIN_INT define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll b/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll index 9ac30e0b8de88a..5977566e2d00ba 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll @@ -1,24 +1,25 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; ; Check that PS is wave64 -; GFX10-LABEL: _amdgpu_ps_main: -; GFX10: s_and_saveexec_b64 +; GCN-LABEL: _amdgpu_ps_main: +; GCN: s_or_b64 exec, exec ; ; Check that VS is wave32 -; GFX10-LABEL: _amdgpu_vs_main: -; GFX10: s_and_saveexec_b32 +; GCN-LABEL: _amdgpu_vs_main: +; GCN: s_or_b32 exec_lo, exec_lo ; ; Check that GS is wave32 -; GFX10-LABEL: _amdgpu_gs_main: -; GFX10: s_and_saveexec_b32 +; GCN-LABEL: _amdgpu_gs_main: +; GCN: s_or_b32 exec_lo, exec_lo ; ; Check that HS is wave32 -; GFX10-LABEL: _amdgpu_hs_main: -; GFX10: s_and_saveexec_b32 +; GCN-LABEL: _amdgpu_hs_main: +; GCN: s_or_b32 exec_lo, exec_lo ; ; Check that CS is wave32 -; GFX10-LABEL: _amdgpu_cs_main: -; GFX10: s_and_saveexec_b32 +; GCN-LABEL: _amdgpu_cs_main: +; GCN: s_or_b32 exec_lo, exec_lo ; ; Check that: ; PS_W32_EN (bit 15) of SPI_PS_IN_CONTROL (0xa1b6) is 0; @@ -27,7 +28,7 @@ ; HS_W32_EN (bit 21) of VGT_SHADER_STAGES_EN (0xa2d5) is 1; ; CS_W32_EN (bit 15) of COMPUTE_DISPATCH_INITIATOR (0x2e00) is 1. ; -; GFX10: .amd_amdgpu_pal_metadata{{.*}},0x2e00,0x8000,{{.*}}0xa1b6,0x1,{{.*}},0xa2d5,0xe00000, +; GCN: .amd_amdgpu_pal_metadata{{.*}},0x2e00,0x8000,{{.*}}0xa1b6,0x1,{{.*}},0xa2d5,0xe00000, define dllexport amdgpu_ps void @_amdgpu_ps_main(float %arg10) #0 { .entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir index a0f970879da0c6..fc1b2c38d6002a 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir @@ -1,8 +1,9 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s -# GFX10-LABEL: name: diffoporder_add -# GFX10: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0 -# GFX10: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0 +# GCN-LABEL: name: diffoporder_add +# GCN: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0 +# GCN: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0 name: diffoporder_add body: | @@ -43,19 +44,29 @@ body: | ... --- -# GFX10-LABEL: name: LowestInMiddle +# GCN-LABEL: name: LowestInMiddle # GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6400 -# GFX10: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] -# GFX10: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_5]] -# GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX11: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200 + +# GCN: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GCN: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_5]] +# GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 # GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 1600, 0 +# GFX11: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0 +# +# GFX11: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400 +# GFX11: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]] +# GFX11: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_7]] +# GFX11: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1 # GFX10: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, +# GFX11: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, # # GFX10: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 11200 # GFX10: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]] # GFX10: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_7]] # GFX10: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1 # GFX10: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, +# GFX11: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, name: LowestInMiddle body: | @@ -101,18 +112,22 @@ body: | ... --- -# GFX10-LABEL: name: NegativeDistance +# GCN-LABEL: name: NegativeDistance # GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 -# GFX10: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] -# GFX10: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_5]] -# GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX11: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240 +# GCN: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GCN: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_5]] +# GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 # GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0 +# GFX11: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0 # GFX10: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0 +# GFX11: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0 # GFX10: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 10240 # GFX10: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]] # GFX10: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_32_xm0_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_CO_U32_e64_7]] # GFX10: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1 # GFX10: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0 +# GFX11: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0 name: NegativeDistance body: | @@ -194,9 +209,9 @@ body: | ... --- -# GFX10-LABEL: name: diffoporder_add_store -# GFX10: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0 -# GFX10: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0 +# GCN-LABEL: name: diffoporder_add_store +# GCN: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0 +# GCN: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0 name: diffoporder_add_store body: | diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 9d8397f640ac32..8ae513783b875a 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,13 +1,15 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,FLATSCR,GFX9-FLATSCR %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,FLATSCR,GFX10-FLATSCR %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,FLATSCR,GFX9-FLATSCR-PAL %s -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,FLATSCR,GFX10-FLATSCR-PAL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR-PAL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR-PAL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 @@ -69,14 +71,17 @@ ; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 ; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 +; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 ; GCN-NOT: s_mov_b32 s0 -; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] -; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] +; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] +; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] +; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128 define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -128,6 +133,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off + define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -174,6 +182,10 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off + define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -196,13 +208,16 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10-NOT: s_mov_b32 s5 +; GFX9PLUS-NOT: s_mov_b32 s5 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -251,6 +266,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -300,7 +318,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-NOT: s_mov_b32 s5 +; GFX9PLUS-NOT: s_mov_b32 s5 ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen @@ -308,6 +326,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -360,6 +381,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/shift-select.ll b/llvm/test/CodeGen/AMDGPU/shift-select.ll index d825d14893433a..3c6a07ab05d080 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-select.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-select.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX6 %s -; RUN: llc -march=amdgcn -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8-10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8-10 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -march=amdgcn -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8PLUS %s ; GCN-LABEL: name: s_shl_i32 ; GCN: S_LSHL_B32 @@ -12,7 +13,7 @@ define amdgpu_kernel void @s_shl_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) ; GCN-LABEL: name: v_shl_i32 ; GFX6: V_LSHL_B32_e32 -; GFX8-10: V_LSHLREV_B32_e32 +; GFX8PLUS: V_LSHLREV_B32_e32 define amdgpu_kernel void @v_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -33,7 +34,7 @@ define amdgpu_kernel void @s_lshr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs ; GCN-LABEL: name: v_lshr_i32 ; GFX6: V_LSHR_B32_e32 -; GFX8-10: V_LSHRREV_B32_e64 +; GFX8PLUS: V_LSHRREV_B32_e64 define amdgpu_kernel void @v_lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -54,7 +55,7 @@ define amdgpu_kernel void @s_ashr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs ; GCN-LABEL: name: v_ashr_i32 ; GFX6: V_ASHR_I32_e32 -; GFX8-10: V_ASHRREV_I32_e64 +; GFX8PLUS: V_ASHRREV_I32_e64 define amdgpu_kernel void @v_ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir index 6735853f5c7438..d69acde0253bb1 100644 --- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir @@ -1,9 +1,10 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s # GCN-LABEL: name: hazard_smem_war -# GCN: S_LOAD_DWORD_IMM -# GCN: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GFX10-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war body: | @@ -64,10 +65,10 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_only_smem -# GCN: S_LOAD_DWORD_IMM -# GCN-NEXT: S_LOAD_DWORD_IMM -# GCN-NEXT: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_LOAD_DWORD_IMM +# GFX10-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war_only_smem body: | @@ -95,10 +96,10 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_only_vmcnt_0 -# GCN: S_LOAD_DWORD_IMM -# GCN-NEXT: S_WAITCNT 3952{{$}} -# GCN-NEXT: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT 3952{{$}} +# GFX10-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war_only_vmcnt_0 body: | @@ -111,10 +112,10 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_only_expcnt_0 -# GCN: S_LOAD_DWORD_IMM -# GCN-NEXT: S_WAITCNT 53007{{$}} -# GCN-NEXT: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT 53007{{$}} +# GFX10-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war_only_expcnt_0 body: | @@ -157,10 +158,10 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_only_waitcnt_lgkmcnt_1 -# GCN: S_LOAD_DWORD_IMM -# GCN-NEXT: S_WAITCNT_LGKMCNT -# GCN-NEXT: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT_LGKMCNT +# GFX10-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war_only_waitcnt_lgkmcnt_1 body: | @@ -173,9 +174,9 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_branch -# GCN: S_LOAD_DWORD_IMM -# GCN: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM +# GFX10: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 --- name: hazard_smem_war_branch body: | @@ -192,14 +193,14 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_cbranch -# GCN: S_AND_B64 -# GCN: S_LOAD_DWORD_IMM -# GCN: S_CBRANCH_VCCZ -# GCN-NOT: $sgpr_null = S_MOV_B32 0 -# GCN: V_CMP_EQ_F32 -# GCN: S_ENDPGM 0 -# GCN: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_AND_B64 +# GCN: S_LOAD_DWORD_IMM +# GCN: S_CBRANCH_VCCZ +# GFX10-NOT: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 +# GCN: S_ENDPGM 0 +# GFX10: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 --- name: hazard_smem_war_cbranch body: | @@ -222,16 +223,16 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_cbranch_carry -# GCN: S_AND_B64 -# GCN: S_LOAD_DWORD_IMM -# GCN: S_CBRANCH_VCCZ -# GCN-NOT: $sgpr_null = S_MOV_B32 0 -# GCN: V_CMP_EQ_F32 -# GCN-NEXT: S_ENDPGM 0 -# GCN-NOT: $sgpr_null = S_MOV_B32 0 -# GCN: V_CMP_EQ_F32 -# GCN: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 +# GCN: S_AND_B64 +# GCN: S_LOAD_DWORD_IMM +# GCN: S_CBRANCH_VCCZ +# GFX10-NOT: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 +# GCN-NEXT: S_ENDPGM 0 +# GFX10-NOT: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 +# GFX10: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 --- name: hazard_smem_war_cbranch_carry body: | @@ -259,9 +260,9 @@ body: | ... # GCN-LABEL: name: hazard_smem_war_backedge -# GCN: $sgpr_null = S_MOV_B32 0 -# GCN-NEXT: V_CMP_EQ_F32 -# GCN: S_LOAD_DWORD_IMM +# GFX10: $sgpr_null = S_MOV_B32 0 +# GCN: V_CMP_EQ_F32 +# GCN: S_LOAD_DWORD_IMM --- name: hazard_smem_war_backedge body: | @@ -278,7 +279,7 @@ body: | # GCN-LABEL: name: hazard_smem_war_impdef # GCN: S_LOAD_DWORD_IMM -# GCN: $sgpr_null = S_MOV_B32 0 +# GFX10: $sgpr_null = S_MOV_B32 0 # GCN-NEXT: V_CMP_EQ_F32 --- name: hazard_smem_war_impdef @@ -292,7 +293,7 @@ body: | # GCN-LABEL: name: hazard_smem_war_readlane # GCN: S_LOAD_DWORD_IMM -# GCN: $sgpr_null = S_MOV_B32 0 +# GFX10: $sgpr_null = S_MOV_B32 0 # GCN-NEXT: V_READLANE_B32 --- name: hazard_smem_war_readlane @@ -306,7 +307,7 @@ body: | # GCN-LABEL: name: hazard_smem_war_readfirstlane # GCN: S_LOAD_DWORD_IMM -# GCN: $sgpr_null = S_MOV_B32 0 +# GFX10: $sgpr_null = S_MOV_B32 0 # GCN-NEXT: V_READFIRSTLANE_B32 --- name: hazard_smem_war_readfirstlane diff --git a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll index bb52ac8cb39e86..631919afe1d707 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-gfx10.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; GCN-LABEL: {{^}}smrd_imm_dlc: -; GCN: s_buffer_load_dword s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0xfa] +; GFX10: s_buffer_load_dword s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0xfa] +; GFX11: s_buffer_load_b32 s0, s[0:3], 0x0 dlc ; encoding: [0x00,0x20,0x20,0xf4,0x00,0x00,0x00,0xf8] define amdgpu_ps float @smrd_imm_dlc(<4 x i32> inreg %desc) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 4) @@ -9,7 +11,8 @@ main_body: } ; GCN-LABEL: {{^}}smrd_sgpr_dlc: -; GCN: s_buffer_load_dword s0, s[0:3], s4 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0x08] +; GFX10: s_buffer_load_dword s0, s[0:3], s4 dlc ; encoding: [0x00,0x40,0x20,0xf4,0x00,0x00,0x00,0x08] +; GFX11: s_buffer_load_b32 s0, s[0:3], s4 dlc ; encoding: [0x00,0x20,0x20,0xf4,0x00,0x00,0x00,0x08] define amdgpu_ps float @smrd_sgpr_dlc(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 4) @@ -17,7 +20,8 @@ main_body: } ; GCN-LABEL: {{^}}smrd_imm_glc_dlc: -; GCN: s_buffer_load_dword s0, s[0:3], 0x0 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0xfa] +; GFX10: s_buffer_load_dword s0, s[0:3], 0x0 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0xfa] +; GFX11: s_buffer_load_b32 s0, s[0:3], 0x0 glc dlc ; encoding: [0x00,0x60,0x20,0xf4,0x00,0x00,0x00,0xf8] define amdgpu_ps float @smrd_imm_glc_dlc(<4 x i32> inreg %desc) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 5) @@ -25,7 +29,8 @@ main_body: } ; GCN-LABEL: {{^}}smrd_sgpr_glc_dlc: -; GCN: s_buffer_load_dword s0, s[0:3], s4 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0x08] +; GFX10: s_buffer_load_dword s0, s[0:3], s4 glc dlc ; encoding: [0x00,0x40,0x21,0xf4,0x00,0x00,0x00,0x08] +; GFX11: s_buffer_load_b32 s0, s[0:3], s4 glc dlc ; encoding: [0x00,0x60,0x20,0xf4,0x00,0x00,0x00,0x08] define amdgpu_ps float @smrd_sgpr_glc_dlc(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 5) diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir index 37b6b0cabe08b7..7a802ed1cac2a6 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir @@ -1,9 +1,10 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo -# GCN: $sgpr0 = S_MOV_B32 $exec_lo -# GCN-NEXT: S_WAITCNT_DEPCTR 65534 -# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 +# GCN: $sgpr0 = S_MOV_B32 $exec_lo +# GFX10-NEXT: S_WAITCNT_DEPCTR 65534 +# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 --- name: hazard_vcmpx_smov_exec_lo body: | @@ -19,9 +20,9 @@ body: | ... # GCN-LABEL: name: hazard_vcmpx_smov_exec -# GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec -# GCN-NEXT: S_WAITCNT_DEPCTR 65534 -# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 +# GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec +# GFX10-NEXT: S_WAITCNT_DEPCTR 65534 +# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 --- name: hazard_vcmpx_smov_exec body: | @@ -145,9 +146,10 @@ body: | ... # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo_depctr_effe -# GCN: $sgpr0 = S_MOV_B32 $exec_lo -# GCN: S_WAITCNT_DEPCTR 65534 -# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 +# GCN: $sgpr0 = S_MOV_B32 $exec_lo +# GCN-NEXT: S_WAITCNT_DEPCTR 61438 +# GFX10-NEXT: S_WAITCNT_DEPCTR 65534 +# GCN-NEXT: V_CMPX_LE_F32_nosdst_e32 --- name: hazard_vcmpx_smov_exec_lo_depctr_effe body: | diff --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir index c15365e576adec..0f4dfa0f8114b1 100644 --- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -1,9 +1,10 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s # GCN-LABEL: name: vmem_write_sgpr -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr body: | @@ -15,9 +16,9 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_exec -# GCN: BUFFER_STORE_DWORD_OFFEN_exact -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_STORE_DWORD_OFFEN_exact +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_exec body: | @@ -30,13 +31,13 @@ body: | $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_write_sgpr_chain -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_MOV_B32 -# GCN-NEXT: S_MOV_B32 -# GCN-NEXT: S_MOV_B32 -# GCN-NEXT: S_MOV_B32 -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_MOV_B32 +# GCN-NEXT: S_MOV_B32 +# GCN-NEXT: S_MOV_B32 +# GCN-NEXT: S_MOV_B32 +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_chain body: | @@ -53,9 +54,9 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_smem_write_sgpr -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_LOAD_DWORD_IMM +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_LOAD_DWORD_IMM --- name: vmem_smem_write_sgpr body: | @@ -67,10 +68,10 @@ body: | $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ... # GCN-LABEL: name: vmem_snop_write_sgpr -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_NOP -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_NOP +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_snop_write_sgpr body: | @@ -113,10 +114,10 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_swait_any_write_sgpr -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_WAITCNT +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_swait_any_write_sgpr body: | @@ -129,9 +130,9 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_exec_impread -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B64 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B64 --- name: vmem_write_exec_impread body: | @@ -143,9 +144,9 @@ body: | $exec = S_MOV_B64 7 ... # GCN-LABEL: name: vmem_write_exec_expread -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B64 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B64 --- name: vmem_write_exec_expread body: | @@ -156,9 +157,9 @@ body: | $exec = S_MOV_B64 7 ... # GCN-LABEL: name: ds_write_m0 -# GCN: DS_READ_B32 -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: DS_READ_B32 +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: ds_write_m0 body: | @@ -170,9 +171,10 @@ body: | $m0 = S_MOV_B32 7 ... # GCN-LABEL: name: vmem_write_sgpr_fall_through -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_fall_through body: | @@ -187,10 +189,11 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_sgpr_branch -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_BRANCH -# GCN: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_BRANCH +# GCN: bb.1: +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch body: | @@ -206,11 +209,11 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_sgpr_branch_around -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_BRANCH -# GCN: bb.2: -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_BRANCH +# GCN: bb.2: +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch_around body: | @@ -230,15 +233,15 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_sgpr_cbranch_around -# GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_CBRANCH -# GCN-NEXT: S_BRANCH -# GCN: bb.1: -# GCN: S_WAITCNT -# GCN: V_ADD_CO_U32 -# GCN: bb.2: -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: BUFFER_LOAD_DWORD_OFFEN +# GCN-NEXT: S_CBRANCH +# GCN-NEXT: S_BRANCH +# GCN: bb.1: +# GCN: S_WAITCNT +# GCN: V_ADD_CO_U32 +# GCN: bb.2: +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_cbranch_around body: | @@ -261,9 +264,9 @@ body: | $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_sgpr_branch_backedge -# GCN: $vgpr0 = IMPLICIT_DEF -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: $vgpr0 = IMPLICIT_DEF +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch_backedge body: | @@ -279,9 +282,9 @@ body: | S_BRANCH %bb.0 ... # GCN-LABEL: name: ds_write_exec -# GCN: DS_WRITE_B32_gfx9 -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: DS_WRITE_B32_gfx9 +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: ds_write_exec body: | @@ -292,9 +295,9 @@ body: | $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_scratch_exec -# GCN: SCRATCH_LOAD_DWORD -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: SCRATCH_LOAD_DWORD +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_scratch_exec body: | @@ -304,9 +307,9 @@ body: | $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_flat_exec -# GCN: FLAT_LOAD_DWORD -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: FLAT_LOAD_DWORD +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_flat_exec body: | @@ -317,9 +320,9 @@ body: | $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_global_exec -# GCN: GLOBAL_LOAD_DWORD -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: GLOBAL_LOAD_DWORD +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_global_exec body: | @@ -330,9 +333,9 @@ body: | $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_global_atomic_exec -# GCN: GLOBAL_ATOMIC_ADD_RTN -# GCN-NEXT: S_WAITCNT_DEPCTR 65507 -# GCN-NEXT: S_MOV_B32 +# GCN: GLOBAL_ATOMIC_ADD_RTN +# GFX10-NEXT: S_WAITCNT_DEPCTR 65507 +# GCN-NEXT: S_MOV_B32 --- name: vmem_global_atomic_exec body: | diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index e78b5355c2d5d8..f1af64eb9b39df 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -1,12 +1,13 @@ ; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9PLUS,GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s ; GCN-LABEL: barrier_vmcnt_global: ; GFX8: flat_load_dword -; GFX9_10: global_load_dword +; GFX9PLUS: global_load_{{dword|b32}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9_10: s_waitcnt vmcnt(0){{$}} +; GFX9PLUS: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) { bb: @@ -27,10 +28,10 @@ bb: ; GCN-LABEL: barrier_vscnt_global: ; GFX8: flat_store_dword -; GFX9_10: global_store_dword +; GFX9PLUS: global_store_{{dword|b32}} ; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX9: s_waitcnt vmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) { bb: @@ -53,10 +54,10 @@ bb: ; GCN-LABEL: barrier_vmcnt_vscnt_global: ; GFX8: flat_load_dword -; GFX9_10: global_load_dword +; GFX9PLUS: global_load_{{dword|b32}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9_10: s_waitcnt vmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX9PLUS: s_waitcnt vmcnt(0){{$}} +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) { bb: @@ -80,7 +81,7 @@ bb: } ; GCN-LABEL: barrier_vmcnt_flat: -; GCN: flat_load_dword +; GCN: flat_load_{{dword|b32}} ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) { @@ -101,10 +102,10 @@ bb: } ; GCN-LABEL: barrier_vscnt_flat: -; GCN: flat_store_dword +; GCN: flat_store_{{dword|b32}} ; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}} +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) { bb: @@ -126,9 +127,9 @@ bb: } ; GCN-LABEL: barrier_vmcnt_vscnt_flat: -; GCN: flat_load_dword +; GCN: flat_load_{{dword|b32}} ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) { bb: @@ -152,11 +153,11 @@ bb: } ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup: -; GCN: flat_load_dword +; GCN: flat_load_{{dword|b32}} ; GFX8_9: s_waitcnt lgkmcnt(0){{$}} ; GFX8_9: s_waitcnt vmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) { bb: @@ -181,10 +182,10 @@ bb: ; GCN-LABEL: load_vmcnt_global: ; GFX8: flat_load_dword -; GFX9_10: global_load_dword +; GFX9PLUS: global_load_{{dword|b32}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX9_10: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_dword +; GFX9PLUS: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: {{global|flat}}_store_{{dword|b32}} define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -200,10 +201,10 @@ bb: } ; GCN-LABEL: load_vmcnt_flat: -; GCN: flat_load_dword +; GCN: flat_load_{{dword|b32}} ; GCN-NOT: vscnt ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: {{global|flat}}_store_dword +; GCN-NEXT: {{global|flat}}_store_{{dword|b32}} define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -219,9 +220,9 @@ bb: } ; GCN-LABEL: store_vscnt_private: -; GCN: buffer_store_dword +; GCN: {{buffer|scratch}}_store_{{dword|b32}} ; GFX8_9: s_waitcnt vmcnt(0) -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_private(i32 addrspace(5)* %p) { store i32 0, i32 addrspace(5)* %p @@ -230,9 +231,9 @@ define void @store_vscnt_private(i32 addrspace(5)* %p) { ; GCN-LABEL: store_vscnt_global: ; GFX8: flat_store_dword -; GFX9_10: global_store_dword +; GFX9PLUS: global_store_{{dword|b32}} ; GFX8_9: s_waitcnt vmcnt(0) -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_global(i32 addrspace(1)* %p) { store i32 0, i32 addrspace(1)* %p @@ -240,10 +241,10 @@ define void @store_vscnt_global(i32 addrspace(1)* %p) { } ; GCN-LABEL: store_vscnt_flat: -; GCN: flat_store_dword +; GCN: flat_store_{{dword|b32}} ; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}} +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_flat(i32* %p) { store i32 0, i32* %p @@ -252,7 +253,7 @@ define void @store_vscnt_flat(i32* %p) { ; GCN-LABEL: function_prologue: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @function_prologue() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir index 80d87c7b7a6a91..d20d7306674af5 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir @@ -1,8 +1,10 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s -# GFX10-LABEL: waitcnt-vscnt -# GFX10: GLOBAL_ATOMIC_ADD_RTN +# GCN-LABEL: waitcnt-vscnt +# GCN: GLOBAL_ATOMIC_ADD_RTN # GFX10-NEXT: S_WAITCNT 49279 +# GFX11-NEXT: S_WAITCNT 64519 --- name: waitcnt-vscnt machineFunctionInfo: