diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index d05424ffe773d..fccee3da6d77e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -1,53 +1,94 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -mcpu=gfx90a -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx90a -mattr=-xnack < %s | llc -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -mcpu=gfx90a -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7 %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx90a -mattr=-xnack < %s | llc -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s target triple = "amdgcn-amd-amdhsa" -; GCN-LABEL: {{^}}use_workitem_id_x: -; GCN: s_waitcnt -; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_x() #1 { +; GFX7-LABEL: use_workitem_id_x: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_x: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_y: -; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_y() #1 { +; GFX7-LABEL: use_workitem_id_y: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_y: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_z: -; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_z() #1 { +; GFX7-LABEL: use_workitem_id_z: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_z: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_xy: -; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xy() #1 { +; GFX7-LABEL: use_workitem_id_xy: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_xy: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %val0, ptr addrspace(1) poison @@ -55,17 +96,34 @@ define void @use_workitem_id_xy() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_xyz: -; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xyz() #1 { +; GFX7-LABEL: use_workitem_id_xyz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_xyz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -75,15 +133,28 @@ define void @use_workitem_id_xyz() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_xz: -; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xz() #1 { +; GFX7-LABEL: use_workitem_id_xz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_xz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val0, ptr addrspace(1) poison @@ -91,15 +162,28 @@ define void @use_workitem_id_xz() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_yz: -; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_yz() #1 { +; GFX7-LABEL: use_workitem_id_yz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: use_workitem_id_yz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val0, ptr addrspace(1) poison @@ -107,229 +191,639 @@ define void @use_workitem_id_yz() #1 { ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: -; GCN: v_mov_b32_e32 v31, v0 -; GCN: s_swappc_b64 -; GCN-NOT: v31 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_x() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: - -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN-NOT: v31 -; PACKED-TID: v_mov_b32_e32 v31, v0 -; UNPACKED-TID: v_lshlrev_b32_e32 v31, 10, v1 -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_y: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_y: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_y() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 1 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: - -; GCN-NOT: v0 -; GCN-NOT: v2 -; GCN-NOT: v31 -; PACKED-TID: v_mov_b32_e32 v31, v0 -; UNPACKED-TID: v_lshlrev_b32_e32 v31, 20, v2 -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_z: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 20, v2 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_z: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_z() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: -; GCN-NOT: v0 -; GCN-NOT: v1 -; PACKED-TID: v_mov_b32_e32 v31, v0 -; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDY]] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_xy: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_xy@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_xy@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_xy: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_xy@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_xy@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_xy() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: -; GCN-NOT: v0 -; GCN-NOT: v2 - -; PACKED-TID: v_mov_b32_e32 v31, v0 -; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]] -; GCN-NOT: v0 -; GCN-NOT: v2 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_xz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_xz@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_xz@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 20, v2 +; GFX7-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_xz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_xz@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_xz@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_xz() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: -; GCN-NOT: v1 -; GCN-NOT: v2 -; PACKED-TID: v_mov_b32_e32 v31, v0 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v31, [[IDY]], [[IDZ]] -; GCN-NOT: v1 -; GCN-NOT: v2 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_yz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_yz@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_yz@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 20, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7-NEXT: v_or_b32_e32 v31, v1, v0 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_yz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_yz@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_yz@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_yz() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN-NOT: v2 - -; PACKED-TID: v_mov_b32_e32 v31, v0 - -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]] -; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN-NOT: v2 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { +; GFX7-LABEL: kern_indirect_use_workitem_id_xyz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, use_workitem_id_xyz@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, use_workitem_id_xyz@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_use_workitem_id_xyz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, use_workitem_id_xyz@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, use_workitem_id_xyz@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @use_workitem_id_xyz() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_x() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_x() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_y() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_y() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_z() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_z() ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: -; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { +; GFX7-LABEL: other_arg_use_workitem_id_x: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: other_arg_use_workitem_id_x: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: -; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { +; GFX7-LABEL: other_arg_use_workitem_id_y: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: other_arg_use_workitem_id_y: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: -; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { +; GFX7-LABEL: other_arg_use_workitem_id_z: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: other_arg_use_workitem_id_z: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } - -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: - -; GCN: v_mov_b32_e32 v31, v0 -; GCN: v_mov_b32_e32 v0, 0x22b -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { +; GCN-LABEL: kern_indirect_other_arg_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x22b +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @other_arg_use_workitem_id_x(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 - -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: - -; UNPACKED-TID: v_lshlrev_b32_e32 v31, 10, v1 -; PACKED-TID: v_mov_b32_e32 v31, v0 -; GCN-NOT: v1 -; GCN: v_mov_b32_e32 v0, 0x22b -; GCN-NOT: v1 -; GCN: s_swappc_b64 -; GCN-NOT: v0 - -; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { +; GFX7-LABEL: kern_indirect_other_arg_use_workitem_id_y: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_y@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_y@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x22b +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_other_arg_use_workitem_id_y: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_y@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_y@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x22b +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @other_arg_use_workitem_id_y(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 1 -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: - -; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v31, 20, v2 -; PACKED-TID-DAG: v_mov_b32_e32 v31, v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { +; GFX7-LABEL: kern_indirect_other_arg_use_workitem_id_z: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_z@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_z@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 20, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x22b +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_indirect_other_arg_use_workitem_id_z: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_z@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_z@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x22b +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @other_arg_use_workitem_id_z(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN-DAG: v_and_b32_e32 v31, 0x3ff, v31 -; GCN-DAG: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x( +; GFX7-LABEL: too_many_args_use_workitem_id_x: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: too_many_args_use_workitem_id_x: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX90A-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v31, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v3, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v4, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v5, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v6, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v7, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v8, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v9, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v10, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v11, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v12, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v13, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v14, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v15, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v16, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v17, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v18, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v19, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v20, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v21, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v22, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v23, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v24, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v25, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v26, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v27, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v28, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v29, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v30, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -376,15 +870,53 @@ define void @too_many_args_use_workitem_id_x( ret void } -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: - -; GCN: s_mov_b32 s32, 0 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 v31, v0 -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x140 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -396,15 +928,136 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { i32 290, i32 300, i32 310, i32 320) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 -; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: -; GCN-NOT: v31 -; GCN: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NOT: v31 -; GCN: s_swappc_b64 -; GCN-NOT: v31 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { +; GFX7-LABEL: func_call_too_many_args_use_workitem_id_x: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s4, 2 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, 0x140 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX7-NEXT: v_mov_b32_e32 v0, 10 +; GFX7-NEXT: v_mov_b32_e32 v1, 20 +; GFX7-NEXT: v_mov_b32_e32 v2, 30 +; GFX7-NEXT: v_mov_b32_e32 v3, 40 +; GFX7-NEXT: v_mov_b32_e32 v4, 50 +; GFX7-NEXT: v_mov_b32_e32 v5, 60 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX7-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX7-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX7-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX7-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX7-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX7-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX7-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX7-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX7-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX7-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX7-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX7-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX7-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX7-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX7-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX7-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX7-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX7-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX7-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX7-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX7-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX7-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: func_call_too_many_args_use_workitem_id_x: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, s33 +; GFX90A-NEXT: s_mov_b32 s33, s32 +; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_addk_i32 s32, 0x400 +; GFX90A-NEXT: v_writelane_b32 v40, s4, 2 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x140 +; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX90A-NEXT: v_mov_b32_e32 v0, 10 +; GFX90A-NEXT: v_mov_b32_e32 v1, 20 +; GFX90A-NEXT: v_mov_b32_e32 v2, 30 +; GFX90A-NEXT: v_mov_b32_e32 v3, 40 +; GFX90A-NEXT: v_mov_b32_e32 v4, 50 +; GFX90A-NEXT: v_mov_b32_e32 v5, 60 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX90A-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX90A-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX90A-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: s_mov_b32 s32, s33 +; GFX90A-NEXT: v_readlane_b32 s4, v40, 2 +; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_mov_b32 s33, s4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] store volatile i32 %arg0, ptr addrspace(1) poison call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -419,19 +1072,38 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { } ; Requires loading and storing to stack slot. -; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: -; GCN-DAG: s_addk_i32 s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword [[TMP_REG:v[0-9]+]], off, s[0:3], s33{{$}} - -; GCN: buffer_store_dword [[TMP_REG]], off, s[0:3], s32{{$}} - -; GCN: s_swappc_b64 - -; GCN: s_mov_b32 s32, s33 -; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( +; GCN-LABEL: too_many_args_call_too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -447,16 +1119,156 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; stack layout: ; frame[0] = stack passed arg23 ; frame[1] = byval arg32 - -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GCN-DAG: v_and_b32_e32 v31, 0x3ff, v31 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 -; GCN-DAG: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[LOAD_ARG31]] -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( +; GFX7-LABEL: too_many_args_use_workitem_id_x_byval: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: too_many_args_use_workitem_id_x_byval: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX90A-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v31, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v3, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v4, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v5, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v6, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v7, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v8, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v9, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v10, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v11, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v12, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v13, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v14, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v15, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v16, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v17, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v18, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v19, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v20, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v21, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v22, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v23, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v24, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v25, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v26, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v27, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v28, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v29, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v30, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -505,25 +1317,60 @@ define void @too_many_args_use_workitem_id_x_byval( ; sp[0] = stack passed %arg31 ; sp[1] = byval - -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: - ; Local stack object initialize. Offset 0 is the emergency spill slot. -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: s_movk_i32 s32, 0x400 -; GCN: buffer_store_dword [[K]], off, s[0:3], 0 - -; Pass %arg31 on stack -; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} -; GCN: buffer_store_dword [[K1:v[0-9]+]], off, s[0:3], s32{{$}} - -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} -; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_x_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x140 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, ptr addrspace(5) %alloca call void @too_many_args_use_workitem_id_x_byval( @@ -538,16 +1385,76 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 ptr addrspace(5) byval(i32) %alloca) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 -; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} -; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], -; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { +; GCN-LABEL: func_call_too_many_args_use_workitem_id_x_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x140 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, ptr addrspace(5) %alloca call void @too_many_args_use_workitem_id_x_byval( @@ -563,28 +1470,164 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ret void } -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GFX90A: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} -; GFX90A: v_and_b32_e32 [[ID_X:v[0-9]+]], 0x3ff, v31 -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_X]], off{{$}} -; GFX90A: v_bfe_u32 [[ID_Y:v[0-9]+]], v31, 10, 10 -; GFX90A: v_bfe_u32 [[ID_Z:v[0-9]+]], v31, 20, 10 -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Y]], off{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Z]], off{{$}} - -; GFX7: v_and_b32_e32 v32, 0x3ff, v31 -; GFX7: v_bfe_u32 v32, v31, 10, 10 -; GCN7: v_bfe_u32 v31, v31, 20, 10 -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}} -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31{{$}} -; GFX7: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} - -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] - -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( +; GFX7-LABEL: too_many_args_use_workitem_id_xyz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v32, v31, 10, 10 +; GFX7-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: too_many_args_use_workitem_id_xyz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX90A-NEXT: v_and_b32_e32 v33, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v33, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v33, v31, 10, 10 +; GFX90A-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v33, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v31, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v3, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v4, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v5, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v6, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v7, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v8, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v9, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v10, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v11, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v12, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v13, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v14, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v15, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v16, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v17, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v18, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v19, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v20, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v21, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v22, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v23, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v24, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v25, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v26, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v27, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v28, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v29, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v30, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -636,24 +1679,103 @@ define void @too_many_args_use_workitem_id_xyz( } ; frame[0] = ID { Z, Y, X } - -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: - -; GCN-DAG: s_mov_b32 s32, 0 - -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1 -; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2 -; PACKED-TID-NOT: v0 -; PACKED-TID-NOT: v1 -; PACKED-TID-NOT: v2 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 -; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { +; GFX7-LABEL: kern_call_too_many_args_use_workitem_id_xyz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_xyz@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_xyz@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x140 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GFX7-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, 10 +; GFX7-NEXT: v_mov_b32_e32 v1, 20 +; GFX7-NEXT: v_mov_b32_e32 v2, 30 +; GFX7-NEXT: v_mov_b32_e32 v3, 40 +; GFX7-NEXT: v_mov_b32_e32 v4, 50 +; GFX7-NEXT: v_mov_b32_e32 v5, 60 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX7-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX7-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX7-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX7-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX7-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX7-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX7-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX7-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX7-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX7-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX7-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX7-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX7-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX7-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX7-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX7-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX7-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX7-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX7-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX7-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX7-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX7-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_call_too_many_args_use_workitem_id_xyz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_xyz@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_xyz@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x140 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 10 +; GFX90A-NEXT: v_mov_b32_e32 v1, 20 +; GFX90A-NEXT: v_mov_b32_e32 v2, 30 +; GFX90A-NEXT: v_mov_b32_e32 v3, 40 +; GFX90A-NEXT: v_mov_b32_e32 v4, 50 +; GFX90A-NEXT: v_mov_b32_e32 v5, 60 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX90A-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX90A-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX90A-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @too_many_args_use_workitem_id_xyz( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -665,24 +1787,163 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { i32 290, i32 300, i32 310, i32 320) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 ; workitem ID X in register, yz on stack ; v31 = workitem ID X ; frame[0] = workitem { Z, Y, X } - -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: {{flat|global}}_store_dword v[0:1], [[IDX]] -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]] -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] - -; GCN-COUNT-31: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}} -; GCN-NEXT: s_waitcnt -; GCN: s_setpc_b64 -; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( +; GFX7-LABEL: too_many_args_use_workitem_id_x_stack_yz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GFX7-NEXT: flat_store_dword v[0:1], v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v32, v31, 10, 10 +; GFX7-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GFX7-NEXT: flat_store_dword v[0:1], v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: too_many_args_use_workitem_id_x_stack_yz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_bfe_u32 v32, v31, 10, 10 +; GFX90A-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v31, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v3, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v4, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v5, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v6, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v7, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v8, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v9, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v10, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v11, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v12, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v13, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v14, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v15, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v16, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v17, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v18, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v19, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v20, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v21, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v22, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v23, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v24, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v25, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v26, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v27, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v28, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v29, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dword v[0:1], v30, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -731,21 +1992,101 @@ define void @too_many_args_use_workitem_id_x_stack_yz( ret void } +; GCN: ScratchSize: 0 -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: - -; GCN-NOT: v0 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1 -; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1 -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2 -; PACKED-TID: v_mov_b32_e32 v31, v0 - -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { +; GFX7-LABEL: kern_call_too_many_args_use_workitem_id_x_stack_yz: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_add_u32 s0, s0, s5 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, 10 +; GFX7-NEXT: v_mov_b32_e32 v1, 20 +; GFX7-NEXT: v_mov_b32_e32 v2, 30 +; GFX7-NEXT: v_mov_b32_e32 v3, 40 +; GFX7-NEXT: v_mov_b32_e32 v4, 50 +; GFX7-NEXT: v_mov_b32_e32 v5, 60 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX7-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX7-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX7-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX7-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX7-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX7-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX7-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX7-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX7-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX7-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX7-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX7-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX7-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX7-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX7-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX7-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX7-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX7-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX7-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX7-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX7-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX7-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX7-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX7-NEXT: s_mov_b32 s32, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_endpgm +; +; GFX90A-LABEL: kern_call_too_many_args_use_workitem_id_x_stack_yz: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_add_u32 s0, s0, s5 +; GFX90A-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 10 +; GFX90A-NEXT: v_mov_b32_e32 v1, 20 +; GFX90A-NEXT: v_mov_b32_e32 v2, 30 +; GFX90A-NEXT: v_mov_b32_e32 v3, 40 +; GFX90A-NEXT: v_mov_b32_e32 v4, 50 +; GFX90A-NEXT: v_mov_b32_e32 v5, 60 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x46 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x5a +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x64 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x6e +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x78 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x82 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x8c +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x96 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0xa0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0xaa +; GFX90A-NEXT: v_mov_b32_e32 v17, 0xb4 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0xbe +; GFX90A-NEXT: v_mov_b32_e32 v19, 0xc8 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0xd2 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0xdc +; GFX90A-NEXT: v_mov_b32_e32 v22, 0xe6 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0xf0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0xfa +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x104 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x10e +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x118 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x122 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x12c +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x136 +; GFX90A-NEXT: s_mov_b32 s32, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_endpgm call void @too_many_args_use_workitem_id_x_stack_yz( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -757,6 +2098,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() i32 290, i32 300, i32 310) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index b671d68a4b75b..bb2f06bfe83f8 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,53 +1,58 @@ -; RUN: opt -mcpu=kaveri -passes=amdgpu-attributor < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mcpu=kaveri -passes=amdgpu-attributor < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN %s target triple = "amdgcn-amd-amdhsa" -; GCN-LABEL: {{^}}use_workitem_id_x: -; GCN: s_waitcnt -; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_x() #1 { +; GCN-LABEL: use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_y: -; GCN: s_waitcnt -; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_y() #1 { +; GCN-LABEL: use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_z: -; GCN: s_waitcnt -; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_z() #1 { +; GCN-LABEL: use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_xy: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 - -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xy() #1 { +; GCN-LABEL: use_workitem_id_xy: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %val0, ptr addrspace(1) poison @@ -55,20 +60,20 @@ define void @use_workitem_id_xy() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_xyz: -; GCN: s_waitcnt - -; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 - - -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xyz() #1 { +; GCN-LABEL: use_workitem_id_xyz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -78,16 +83,17 @@ define void @use_workitem_id_xyz() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_xz: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 - -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xz() #1 { +; GCN-LABEL: use_workitem_id_xz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val0, ptr addrspace(1) poison @@ -95,16 +101,17 @@ define void @use_workitem_id_xz() #1 { ret void } -; GCN-LABEL: {{^}}use_workitem_id_yz: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 - -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_yz() #1 { +; GCN-LABEL: use_workitem_id_yz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %val0, ptr addrspace(1) poison @@ -112,229 +119,423 @@ define void @use_workitem_id_yz() #1 { ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v31 -; FIXEDABI: v_mov_b32_e32 v31, v0{{$}} -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v31 - -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_x() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_lshlrev_b32_e32 v31, 10, v1 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 - -; GCN: s_swappc_b64 -; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_y() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 1 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI: v_lshlrev_b32_e32 v31, 20, v2 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 - -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 20, v2 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_z() ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 - -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_xy: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_xy@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_xy@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v1 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_xy() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v0, v1 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 - -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_xz: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_xz@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_xz@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 20, v2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v1 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_xz() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI:v_lshlrev_b32_e32 v0, 20, v2 -; FIXEDABI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-NEXT: v_or_b32_e32 v31, v1, v0 -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 - -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_yz: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_yz@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_yz@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v31, v1, v0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_yz() ret void } -; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] - -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { +; GCN-LABEL: kern_indirect_use_workitem_id_xyz: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_xyz@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_xyz@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @use_workitem_id_xyz() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_x() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_x() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_y() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_y@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_y@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_y() ret void } -; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z: -; GCN-NOT: v0 -; GCN: s_swappc_b64 -; GCN-NOT: v0 define void @func_indirect_use_workitem_id_z() #1 { +; GCN-LABEL: func_indirect_use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workitem_id_z@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workitem_id_z@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @use_workitem_id_z() ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 - -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { +; GCN-LABEL: other_arg_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { +; GCN-LABEL: other_arg_use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: -; GCN: s_waitcnt -; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { +; GCN-LABEL: other_arg_use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v0, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %arg0, ptr addrspace(1) poison store volatile i32 %val, ptr addrspace(1) poison ret void } - -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: - -; FIXEDABI-NOT: v0 -; FIXEDABI: v_mov_b32_e32 v31, v0 -; FIXEDABI: v_mov_b32_e32 v0, 0x22b - -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { +; GCN-LABEL: kern_indirect_other_arg_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x22b +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @other_arg_use_workitem_id_x(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 - -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_lshlrev_b32_e32 v31, 10, v1 -; FIXEDABI: v_mov_b32_e32 v0, 0x22b - -; GCN: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { +; GCN-LABEL: kern_indirect_other_arg_use_workitem_id_y: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_y@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_y@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x22b +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @other_arg_use_workitem_id_y(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 1 -; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_lshlrev_b32_e32 v31, 20, v2 -; FIXEDABI: v_mov_b32_e32 v0, 0x22b - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { +; GCN-LABEL: kern_indirect_other_arg_use_workitem_id_z: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, other_arg_use_workitem_id_z@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, other_arg_use_workitem_id_z@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 20, v2 +; GCN-NEXT: v_mov_b32_e32 v0, 0x22b +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @other_arg_use_workitem_id_z(i32 555) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 -; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} define void @too_many_args_use_workitem_id_x( +; GCN-LABEL: too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -381,20 +582,53 @@ define void @too_many_args_use_workitem_id_x( ret void } -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI-DAG: s_mov_b32 s32, 0 -; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; FIXEDABI-DAG: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; FIXEDABI-DAG: v_mov_b32_e32 v31, v0 - -; FIXEDABI: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x140 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -406,18 +640,72 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { i32 290, i32 300, i32 310, i32 320) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 0 -; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: - -; Touching the workitem id register is not necessary. -; FIXEDABI-NOT: v31 -; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} -; FIXEDABI-NOT: v31 -; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; FIXEDABI-NOT: v31 - -; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { +; GCN-LABEL: func_call_too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x140 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] store volatile i32 %arg0, ptr addrspace(1) poison call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -432,19 +720,38 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { } ; Requires loading and storing to stack slot. -; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: -; GCN-DAG: s_addk_i32 s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}} - -; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} - -; GCN: s_swappc_b64 - -; GCN: s_mov_b32 s32, s33 -; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( +; GCN-LABEL: too_many_args_call_too_many_args_use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s33 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -461,17 +768,81 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; frame[0] = byval arg32 ; frame[1] = stack passed workitem ID x ; frame[2] = VGPR spill slot - -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: - -; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 -; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 - -; FIXEDABI: buffer_load_dword v31, off, s[0:3], s32{{$}} -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} -; FIXEDABI: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( +; GCN-LABEL: too_many_args_use_workitem_id_x_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -522,27 +893,59 @@ define void @too_many_args_use_workitem_id_x_byval( ; sp[0] = byval ; sp[1] = ?? ; sp[2] = stack passed workitem ID x - -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: - -; FIXEDABI-NOT: v0 -; FIXEDABI-NOT: v1 -; FIXEDABI-NOT: v2 -; FIXEDABI: v_mov_b32_e32 v31, v0 -; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 -; FIXEDABI: s_movk_i32 s32, 0x400{{$}} -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 - -; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} - -; FIXME: Why this reload? -; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0{{$}} - -; FIXEDABI-NOT: s32 -; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 -; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_x_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x140 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, ptr addrspace(5) %alloca call void @too_many_args_use_workitem_id_x_byval( @@ -558,19 +961,74 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 ret void } -; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: -; FIXED-ABI-NOT: v31 -; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} -; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} -; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} - -; FIXED-ABI-NOT: v31 -; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} -; FIXED-ABI-NOT: v31 -; FIXEDABI: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { +; GCN-LABEL: func_call_too_many_args_use_workitem_id_x_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, 0x3e7 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0x140 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_byval@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_byval@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, ptr addrspace(5) %alloca call void @too_many_args_use_workitem_id_x_byval( @@ -586,17 +1044,85 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ret void } -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 -; FIXEDABI-NOT: buffer_load_dword -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] -; FIXEDABI-NOT: buffer_load_dword -; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 -; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 -; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] - define void @too_many_args_use_workitem_id_xyz( +; GCN-LABEL: too_many_args_use_workitem_id_xyz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v32, v31, 10, 10 +; GCN-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -647,22 +1173,56 @@ define void @too_many_args_use_workitem_id_xyz( ret void } -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 -; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 -; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 -; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] - -; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} -; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] - -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_xyz: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_xyz@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_xyz@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0x140 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @too_many_args_use_workitem_id_xyz( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -674,21 +1234,87 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { i32 290, i32 300, i32 310, i32 320) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 ; Var abi: workitem ID X in register, yz on stack ; v31 = workitem ID X ; frame[0] = workitem { Z, Y, X } - -; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 -; GCN-DAG: {{flat|global}}_store_dword v[0:1], [[IDX]] -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]] -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 -; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] -; GCN: s_setpc_b64 -; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( +; GCN-LABEL: too_many_args_use_workitem_id_x_stack_yz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GCN-NEXT: flat_store_dword v[0:1], v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_u32 v32, v31, 10, 10 +; GCN-NEXT: v_bfe_u32 v31, v31, 20, 10 +; GCN-NEXT: flat_store_dword v[0:1], v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, @@ -737,20 +1363,56 @@ define void @too_many_args_use_workitem_id_x_stack_yz( ret void } +; GCN: ScratchSize: 0 -; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: - -; GCN-NOT: v0 -; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-DAG: v_or_b32_e32 v0, v0, v1 -; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-DAG: v_or_b32_e32 v31, v0, v2 - -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 - -; GCN: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { +; GCN-LABEL: kern_call_too_many_args_use_workitem_id_x_stack_yz: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s0, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, too_many_args_use_workitem_id_x_stack_yz@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, 10 +; GCN-NEXT: v_mov_b32_e32 v1, 20 +; GCN-NEXT: v_mov_b32_e32 v2, 30 +; GCN-NEXT: v_mov_b32_e32 v3, 40 +; GCN-NEXT: v_mov_b32_e32 v4, 50 +; GCN-NEXT: v_mov_b32_e32 v5, 60 +; GCN-NEXT: v_mov_b32_e32 v6, 0x46 +; GCN-NEXT: v_mov_b32_e32 v7, 0x50 +; GCN-NEXT: v_mov_b32_e32 v8, 0x5a +; GCN-NEXT: v_mov_b32_e32 v9, 0x64 +; GCN-NEXT: v_mov_b32_e32 v10, 0x6e +; GCN-NEXT: v_mov_b32_e32 v11, 0x78 +; GCN-NEXT: v_mov_b32_e32 v12, 0x82 +; GCN-NEXT: v_mov_b32_e32 v13, 0x8c +; GCN-NEXT: v_mov_b32_e32 v14, 0x96 +; GCN-NEXT: v_mov_b32_e32 v15, 0xa0 +; GCN-NEXT: v_mov_b32_e32 v16, 0xaa +; GCN-NEXT: v_mov_b32_e32 v17, 0xb4 +; GCN-NEXT: v_mov_b32_e32 v18, 0xbe +; GCN-NEXT: v_mov_b32_e32 v19, 0xc8 +; GCN-NEXT: v_mov_b32_e32 v20, 0xd2 +; GCN-NEXT: v_mov_b32_e32 v21, 0xdc +; GCN-NEXT: v_mov_b32_e32 v22, 0xe6 +; GCN-NEXT: v_mov_b32_e32 v23, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v24, 0xfa +; GCN-NEXT: v_mov_b32_e32 v25, 0x104 +; GCN-NEXT: v_mov_b32_e32 v26, 0x10e +; GCN-NEXT: v_mov_b32_e32 v27, 0x118 +; GCN-NEXT: v_mov_b32_e32 v28, 0x122 +; GCN-NEXT: v_mov_b32_e32 v29, 0x12c +; GCN-NEXT: v_mov_b32_e32 v30, 0x136 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @too_many_args_use_workitem_id_x_stack_yz( i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70, i32 80, @@ -762,30 +1424,61 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() i32 290, i32 300, i32 310) ret void } +; GCN: .amdhsa_system_vgpr_workitem_id 2 declare hidden void @extern_hint(i32) #2 ; Workitem IDs should not be passed due to the attribute -; GCN-LABEL: {{^}}kern_call_no_workitem_id_hints: -; GCN-NOT: v30 -; GCN-NOT: v31 -; GCN: v_mov_b32_e32 v0, 9 -; GCN-NOT: v0 -; GCN-NOT: v31 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_no_workitem_id_hints() #2 { +; GCN-LABEL: kern_call_no_workitem_id_hints: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, extern_hint@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, extern_hint@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm call void @extern_hint(i32 9) ret void } -; GCN-LABEL: {{^}}func_call_no_workitem_id_hints: -; GCN-NOT: v30 -; GCN-NOT: v31 -; GCN: v_mov_b32_e32 v0, 9 -; GCN-NOT: v0 -; GCN-NOT: v31 -; GCN: s_swappc_b64 define void @func_call_no_workitem_id_hints() #2 { +; GCN-LABEL: func_call_no_workitem_id_hints: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, extern_hint@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, extern_hint@rel32@hi+12 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @extern_hint(i32 9) ret void } @@ -794,14 +1487,24 @@ declare hidden void @extern_nohint(i32) ; Check that the hint is respected on the callsite, not the function ; declaration -; GCN-LABEL: {{^}}kern_callsite_workitem_id_hints: -; GCN-NOT: v30 -; GCN-NOT: v31 -; GCN: v_mov_b32_e32 v0, 9 -; GCN-NOT: v0 -; GCN-NOT: v31 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_callsite_workitem_id_hints() #2 { +; GCN-LABEL: kern_callsite_workitem_id_hints: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, extern_nohint@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, extern_nohint@rel32@hi+12 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm call void @extern_nohint(i32 9) #2 ret void }