diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ccdbd3216e260..2cf804e3348e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -681,13 +681,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. - getActionDefinitionsBuilder({G_ADD, G_SUB}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElementsStrict(0, S16, 2) - .scalarize(0) - .minScalar(0, S16) - .widenScalarToNextMultipleOf(0, 32) - .maxScalar(0, S32); + if (ST.hasScalarAddSub64()) { + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({S64, S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + } else { + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + } getActionDefinitionsBuilder(G_MUL) .legalFor({S32, S16, V2S16}) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 94b9e49b765a6..2eed024af60ae 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -677,6 +677,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return AddNoCarryInsts; } + bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } + bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3626715acb3c1..85cc3cfec19cd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4555,40 +4555,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + // For targets older than GFX12, we emit a sequence of 32-bit operations. + // For GFX12, we emit s_add_u64 and s_sub_u64. const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); - - unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); + if (Subtarget->hasScalarAddSub64()) { + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; + BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) + .addReg(Src0.getReg()) + .addReg(Src1.getReg()); + } else { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .add(Src0Sub0) + .add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .add(Src0Sub1) + .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + } MI.eraseFromParent(); return BB; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll new file mode 100644 index 0000000000000..b850c37c4a281 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s + +define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +; GFX11-LABEL: s_add_u64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s6, s0 +; GFX11-NEXT: s_addc_u32 s1, s7, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_u64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +; GCN-LABEL: v_add_u64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + %add = add i64 %a, %b + store i64 %add, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +; GFX11-LABEL: s_sub_u64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s0, s6, s0 +; GFX11-NEXT: s_subb_u32 s1, s7, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_sub_u64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %sub = sub i64 %a, %b + store i64 %sub, i64 addrspace(1)* %out + ret void +} + +define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +; GCN-LABEL: v_sub_u64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4 +; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + %sub = sub i64 %a, %b + store i64 %sub, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 4b6891e7aa20d..34a676bffcfe3 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -1,14 +1,89 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,FUNC %s - -; FUNC-LABEL: {{^}}s_add_i32: -; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}} -; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]] -; GCN: buffer_store_{{dword|b32}} v[[V_REG]], +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s + define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GFX6-LABEL: s_add_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_add_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s0, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_add_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_add_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_co_i32 s2, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in %b = load i32, ptr addrspace(1) %b_ptr @@ -17,10 +92,96 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}s_add_v2i32: -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX6-LABEL: s_add_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s5, s7 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_add_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s0, s5, s7 +; GFX8-NEXT: s_add_i32 s1, s4, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_add_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s5, s7 +; GFX9-NEXT: s_add_i32 s3, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_add_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s2, s4, s6 +; GFX10-NEXT: s_add_i32 s3, s5, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s4, s6 +; GFX11-NEXT: s_add_i32 s3, s5, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_co_i32 s2, s4, s6 +; GFX12-NEXT: s_add_co_i32 s3, s5, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -29,12 +190,118 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}s_add_v4i32: -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX6-LABEL: s_add_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s3, s3, s7 +; GFX6-NEXT: s_add_i32 s2, s2, s6 +; GFX6-NEXT: s_add_i32 s1, s1, s5 +; GFX6-NEXT: s_add_i32 s0, s0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_add_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s3, s3, s7 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_add_i32 s1, s1, s5 +; GFX8-NEXT: s_add_i32 s0, s0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_add_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_add_i32 s1, s1, s5 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_add_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s3, s3, s7 +; GFX10-NEXT: s_add_i32 s2, s2, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: s_add_i32 s2, s2, s6 +; GFX11-NEXT: s_add_i32 s0, s0, s4 +; GFX11-NEXT: s_add_i32 s1, s1, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: s_add_co_i32 s2, s2, s6 +; GFX12-NEXT: s_add_co_i32 s0, s0, s4 +; GFX12-NEXT: s_add_co_i32 s1, s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -43,53 +310,565 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}s_add_v8i32: -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { +; GFX6-LABEL: s_add_v8i32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s11, s11, s19 +; GFX6-NEXT: s_add_i32 s10, s10, s18 +; GFX6-NEXT: s_add_i32 s9, s9, s17 +; GFX6-NEXT: s_add_i32 s8, s8, s16 +; GFX6-NEXT: s_add_i32 s7, s7, s15 +; GFX6-NEXT: s_add_i32 s6, s6, s14 +; GFX6-NEXT: s_add_i32 s5, s5, s13 +; GFX6-NEXT: s_add_i32 s4, s4, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_add_v8i32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s7, s7, s15 +; GFX8-NEXT: s_add_i32 s6, s6, s14 +; GFX8-NEXT: s_add_i32 s5, s5, s13 +; GFX8-NEXT: s_add_i32 s4, s4, s12 +; GFX8-NEXT: s_add_i32 s2, s11, s19 +; GFX8-NEXT: s_add_i32 s3, s10, s18 +; GFX8-NEXT: s_add_i32 s9, s9, s17 +; GFX8-NEXT: s_add_i32 s8, s8, s16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_add_v8i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s7, s15 +; GFX9-NEXT: s_add_i32 s3, s6, s14 +; GFX9-NEXT: s_add_i32 s6, s11, s19 +; GFX9-NEXT: s_add_i32 s7, s10, s18 +; GFX9-NEXT: s_add_i32 s9, s9, s17 +; GFX9-NEXT: s_add_i32 s8, s8, s16 +; GFX9-NEXT: s_add_i32 s5, s5, s13 +; GFX9-NEXT: s_add_i32 s4, s4, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_add_v8i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s2, s7, s15 +; GFX10-NEXT: s_add_i32 s3, s6, s14 +; GFX10-NEXT: s_add_i32 s6, s11, s19 +; GFX10-NEXT: s_add_i32 s7, s10, s18 +; GFX10-NEXT: s_add_i32 s8, s8, s16 +; GFX10-NEXT: s_add_i32 s9, s9, s17 +; GFX10-NEXT: s_add_i32 s5, s5, s13 +; GFX10-NEXT: s_add_i32 s4, s4, s12 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_v8i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s7, s15 +; GFX11-NEXT: s_add_i32 s3, s6, s14 +; GFX11-NEXT: s_add_i32 s6, s11, s19 +; GFX11-NEXT: s_add_i32 s7, s10, s18 +; GFX11-NEXT: s_add_i32 s8, s8, s16 +; GFX11-NEXT: s_add_i32 s9, s9, s17 +; GFX11-NEXT: s_add_i32 s5, s5, s13 +; GFX11-NEXT: s_add_i32 s4, s4, s12 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v6, s3 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_v8i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_co_i32 s2, s7, s15 +; GFX12-NEXT: s_add_co_i32 s3, s6, s14 +; GFX12-NEXT: s_add_co_i32 s6, s11, s19 +; GFX12-NEXT: s_add_co_i32 s7, s10, s18 +; GFX12-NEXT: s_add_co_i32 s8, s8, s16 +; GFX12-NEXT: s_add_co_i32 s9, s9, s17 +; GFX12-NEXT: s_add_co_i32 s5, s5, s13 +; GFX12-NEXT: s_add_co_i32 s4, s4, s12 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}s_add_v16i32: -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 -; GCN: s_add_i32 define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) { +; GFX6-LABEL: s_add_v16i32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; GFX6-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x29 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s19, s19, s51 +; GFX6-NEXT: s_add_i32 s18, s18, s50 +; GFX6-NEXT: s_add_i32 s17, s17, s49 +; GFX6-NEXT: s_add_i32 s16, s16, s48 +; GFX6-NEXT: s_add_i32 s15, s15, s47 +; GFX6-NEXT: s_add_i32 s14, s14, s46 +; GFX6-NEXT: s_add_i32 s13, s13, s45 +; GFX6-NEXT: s_add_i32 s12, s12, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_add_i32 s11, s11, s43 +; GFX6-NEXT: s_add_i32 s10, s10, s42 +; GFX6-NEXT: s_add_i32 s9, s9, s41 +; GFX6-NEXT: s_add_i32 s8, s8, s40 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_add_i32 s7, s7, s39 +; GFX6-NEXT: s_add_i32 s6, s6, s38 +; GFX6-NEXT: s_add_i32 s5, s5, s37 +; GFX6-NEXT: s_add_i32 s4, s4, s36 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_add_v16i32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_i32 s7, s7, s39 +; GFX8-NEXT: s_add_i32 s6, s6, s38 +; GFX8-NEXT: s_add_i32 s5, s5, s37 +; GFX8-NEXT: s_add_i32 s4, s4, s36 +; GFX8-NEXT: s_add_i32 s11, s11, s43 +; GFX8-NEXT: s_add_i32 s10, s10, s42 +; GFX8-NEXT: s_add_i32 s9, s9, s41 +; GFX8-NEXT: s_add_i32 s8, s8, s40 +; GFX8-NEXT: s_add_i32 s15, s15, s47 +; GFX8-NEXT: s_add_i32 s14, s14, s46 +; GFX8-NEXT: s_add_i32 s13, s13, s45 +; GFX8-NEXT: s_add_i32 s12, s12, s44 +; GFX8-NEXT: s_add_i32 s2, s19, s51 +; GFX8-NEXT: s_add_i32 s3, s18, s50 +; GFX8-NEXT: s_add_i32 s17, s17, s49 +; GFX8-NEXT: s_add_i32 s16, s16, s48 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_add_v16i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s7, s39 +; GFX9-NEXT: s_add_i32 s3, s6, s38 +; GFX9-NEXT: s_add_i32 s6, s11, s43 +; GFX9-NEXT: s_add_i32 s7, s10, s42 +; GFX9-NEXT: s_add_i32 s10, s15, s47 +; GFX9-NEXT: s_add_i32 s11, s14, s46 +; GFX9-NEXT: s_add_i32 s14, s19, s51 +; GFX9-NEXT: s_add_i32 s15, s18, s50 +; GFX9-NEXT: s_add_i32 s17, s17, s49 +; GFX9-NEXT: s_add_i32 s16, s16, s48 +; GFX9-NEXT: s_add_i32 s13, s13, s45 +; GFX9-NEXT: s_add_i32 s12, s12, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: s_add_i32 s9, s9, s41 +; GFX9-NEXT: s_add_i32 s8, s8, s40 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_add_i32 s5, s5, s37 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: s_add_i32 s4, s4, s36 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_add_v16i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_i32 s2, s7, s39 +; GFX10-NEXT: s_add_i32 s3, s6, s38 +; GFX10-NEXT: s_add_i32 s6, s11, s43 +; GFX10-NEXT: s_add_i32 s7, s10, s42 +; GFX10-NEXT: s_add_i32 s10, s15, s47 +; GFX10-NEXT: s_add_i32 s11, s14, s46 +; GFX10-NEXT: s_add_i32 s14, s19, s51 +; GFX10-NEXT: s_add_i32 s15, s18, s50 +; GFX10-NEXT: s_add_i32 s16, s16, s48 +; GFX10-NEXT: s_add_i32 s17, s17, s49 +; GFX10-NEXT: s_add_i32 s13, s13, s45 +; GFX10-NEXT: s_add_i32 s12, s12, s44 +; GFX10-NEXT: s_add_i32 s9, s9, s41 +; GFX10-NEXT: s_add_i32 s8, s8, s40 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_mov_b32_e32 v1, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s15 +; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: s_add_i32 s5, s5, s37 +; GFX10-NEXT: s_add_i32 s4, s4, s36 +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s11 +; GFX10-NEXT: v_mov_b32_e32 v7, s10 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s9 +; GFX10-NEXT: v_mov_b32_e32 v10, s7 +; GFX10-NEXT: v_mov_b32_e32 v11, s6 +; GFX10-NEXT: v_mov_b32_e32 v12, s4 +; GFX10-NEXT: v_mov_b32_e32 v13, s5 +; GFX10-NEXT: v_mov_b32_e32 v14, s3 +; GFX10-NEXT: v_mov_b32_e32 v15, s2 +; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 +; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32 +; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_v16i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s7, s39 +; GFX11-NEXT: s_add_i32 s3, s6, s38 +; GFX11-NEXT: s_add_i32 s6, s11, s43 +; GFX11-NEXT: s_add_i32 s7, s10, s42 +; GFX11-NEXT: s_add_i32 s10, s15, s47 +; GFX11-NEXT: s_add_i32 s11, s14, s46 +; GFX11-NEXT: s_add_i32 s14, s19, s51 +; GFX11-NEXT: s_add_i32 s15, s18, s50 +; GFX11-NEXT: s_add_i32 s16, s16, s48 +; GFX11-NEXT: s_add_i32 s17, s17, s49 +; GFX11-NEXT: s_add_i32 s13, s13, s45 +; GFX11-NEXT: s_add_i32 s12, s12, s44 +; GFX11-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: s_add_i32 s9, s9, s41 +; GFX11-NEXT: s_add_i32 s8, s8, s40 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14 +; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: s_add_i32 s5, s5, s37 +; GFX11-NEXT: s_add_i32 s4, s4, s36 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10 +; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX11-NEXT: v_mov_b32_e32 v14, s3 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_add_v16i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_co_i32 s2, s7, s39 +; GFX12-NEXT: s_add_co_i32 s3, s6, s38 +; GFX12-NEXT: s_add_co_i32 s6, s11, s43 +; GFX12-NEXT: s_add_co_i32 s7, s10, s42 +; GFX12-NEXT: s_add_co_i32 s10, s15, s47 +; GFX12-NEXT: s_add_co_i32 s11, s14, s46 +; GFX12-NEXT: s_add_co_i32 s14, s19, s51 +; GFX12-NEXT: s_add_co_i32 s15, s18, s50 +; GFX12-NEXT: s_add_co_i32 s16, s16, s48 +; GFX12-NEXT: s_add_co_i32 s17, s17, s49 +; GFX12-NEXT: s_add_co_i32 s13, s13, s45 +; GFX12-NEXT: s_add_co_i32 s12, s12, s44 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 +; GFX12-NEXT: s_add_co_i32 s9, s9, s41 +; GFX12-NEXT: s_add_co_i32 s8, s8, s40 +; GFX12-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13 +; GFX12-NEXT: s_add_co_i32 s5, s5, s37 +; GFX12-NEXT: s_add_co_i32 s4, s4, s36 +; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %0 = add <16 x i32> %a, %b store <16 x i32> %0, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}v_add_i32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[B:v[0-9]+]] -; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]] -; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]] -; GFX10: v_add_nc_u32_e32 v{{[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GFX6-LABEL: v_add_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: v_add_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_load_dword v2, v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_add_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_add_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_add_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_add_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1 @@ -100,12 +879,91 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ret void } -; FUNC-LABEL: {{^}}v_add_imm_i32: -; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[A:v[0-9]+]] -; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]] -; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]] -; GFX10: v_add_nc_u32_e32 v{{[0-9]+}}, 0x7b, [[A]] define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GFX6-LABEL: v_add_imm_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: v_add_imm_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7b, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_add_imm_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_add_imm_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_add_imm_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_add_imm_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1 @@ -115,10 +973,92 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; FUNC-LABEL: {{^}}add64: -; GCN: s_add_u32 -; GCN: s_addc_u32 define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; GFX6-LABEL: add64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_add_u32 s4, s6, s8 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_addc_u32 s5, s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: add64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add64: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s6, s0 +; GFX11-NEXT: s_addc_u32 s1, s7, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %add = add i64 %a, %b store i64 %add, ptr addrspace(1) %out @@ -129,10 +1069,104 @@ entry: ; use VCC. The test is designed so that %a will be stored in an SGPR and ; %0 will be stored in a VGPR, so the comiler will be forced to copy %a ; to a VGPR before doing the add. - -; FUNC-LABEL: {{^}}add64_sgpr_vgpr: -; GCN-NOT: v_addc_u32_e32 s define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) { +; GFX6-LABEL: add64_sgpr_vgpr: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_u32 s0, s2, s8 +; GFX6-NEXT: s_addc_u32 s1, s3, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: add64_sgpr_vgpr: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s2, s4 +; GFX8-NEXT: s_addc_u32 s1, s3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add64_sgpr_vgpr: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add64_sgpr_vgpr: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s6, s0 +; GFX10-NEXT: s_addc_u32 s1, s7, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add64_sgpr_vgpr: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_addc_u32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add64_sgpr_vgpr: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %0 = load i64, ptr addrspace(1) %in %1 = add i64 %a, %0 @@ -141,10 +1175,160 @@ entry: } ; Test i64 add inside a branch. -; FUNC-LABEL: {{^}}add64_in_branch: -; GCN: s_add_u32 -; GCN: s_addc_u32 define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { +; GFX6-LABEL: add64_in_branch: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[10:11] +; GFX6-NEXT: s_cbranch_vccz .LBB9_4 +; GFX6-NEXT: ; %bb.1: ; %else +; GFX6-NEXT: s_add_u32 s4, s4, s6 +; GFX6-NEXT: s_addc_u32 s5, s5, s7 +; GFX6-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GFX6-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX6-NEXT: .LBB9_2: ; %if +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX6-NEXT: .LBB9_3: ; %endif +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; GFX6-NEXT: .LBB9_4: +; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX6-NEXT: s_branch .LBB9_2 +; +; GFX8-LABEL: add64_in_branch: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX8-NEXT: ; %bb.1: ; %else +; GFX8-NEXT: s_add_u32 s4, s4, s6 +; GFX8-NEXT: s_addc_u32 s5, s5, s7 +; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GFX8-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX8-NEXT: .LBB9_2: ; %if +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8-NEXT: .LBB9_3: ; %endif +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; GFX8-NEXT: .LBB9_4: +; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX8-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: add64_in_branch: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: s_add_u32 s4, s4, s6 +; GFX9-NEXT: s_addc_u32 s5, s5, s7 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GFX9-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %if +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: .LBB9_3: ; %endif +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX10-LABEL: add64_in_branch: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: s_add_u32 s4, s4, s6 +; GFX10-NEXT: s_addc_u32 s5, s5, s7 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_cbranch_execnz .LBB9_3 +; GFX10-NEXT: .LBB9_2: ; %if +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: .LBB9_3: ; %endif +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; GFX10-NEXT: .LBB9_4: +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: add64_in_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: s_add_u32 s4, s4, s6 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_execnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %if +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 +; GFX11-NEXT: .LBB9_3: ; %endif +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB9_2 +; +; GFX12-LABEL: add64_in_branch: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX12-NEXT: ; %bb.1: ; %else +; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_cbranch_execnz .LBB9_3 +; GFX12-NEXT: .LBB9_2: ; %if +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 +; GFX12-NEXT: .LBB9_3: ; %endif +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB9_4: +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX12-NEXT: s_branch .LBB9_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -165,17 +1349,80 @@ endif: ; Make sure the VOP3 form of add is initially selected. Otherwise pair ; of opies from/to VCC would be necessary - -; GCN-LABEL: {{^}}add_select_vop3: -; SI: v_add_i32_e64 v0, s[0:1], s0, v0 -; VI: v_add_u32_e64 v0, s[0:1], s0, v0 -; GFX9: v_add_u32_e32 v0, s0, v0 -; GFX10: v_add_nc_u32_e32 v0, s0, v0 - -; GCN: ; def vcc -; GCN: ds_{{write|store}}_b32 -; GCN: ; use vcc define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) { +; GFX6-LABEL: add_select_vop3: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; def vcc +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ds_write_b32 v0, v0 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; use vcc +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: add_select_vop3: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; def vcc +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; use vcc +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_select_vop3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def vcc +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use vcc +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_select_vop3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def vcc +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ds_write_b32 v0, v0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use vcc +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_select_vop3: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def vcc +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use vcc +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_select_vop3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; def vcc +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: ds_store_b32 v0, v0 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; use vcc +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"() %sub = add i32 %v, %s store i32 %sub, ptr addrspace(3) undef diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll new file mode 100644 index 0000000000000..3b57252741c20 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: add_reg_imm + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]] + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %lhs = load volatile i64, ptr addrspace(1) %ptr + %res = add i64 %lhs, 123456789123456789 + store i64 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: add_reg_reg + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]] + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %lhs = load volatile i64, ptr addrspace(1) %ptr + %rhs = load volatile i64, ptr addrspace(1) %ptr + %res = add i64 %lhs, %rhs + store i64 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: sub_reg_imm + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -28744524 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1395630315 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]] + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %lhs = load volatile i64, ptr addrspace(1) %ptr + %res = sub i64 %lhs, 123456789123456789 + store i64 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: sub_imm_reg + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE]], [[GLOBAL_LOAD_DWORDX2_SADDR]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]] + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %rhs = load volatile i64, ptr addrspace(1) %ptr + %res = sub i64 123456789123456789, %rhs + store i64 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) { + ; CHECK-LABEL: name: sub_reg_reg + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]] + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %lhs = load volatile i64, ptr addrspace(1) %ptr + %rhs = load volatile i64, ptr addrspace(1) %ptr + %res = sub i64 %lhs, %rhs + store i64 %res, ptr addrspace(1) %ptr + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index bee986f4d9112..098a4cbb36ede 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable @@ -39,6 +40,18 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: s_sub_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = sub i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void @@ -79,6 +92,20 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: s_sub_imm_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = sub i32 1234, %a store i32 %result, ptr addrspace(1) %out ret void @@ -127,6 +154,19 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in %b = load i32, ptr addrspace(1) %b_ptr @@ -178,6 +218,19 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_imm_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in %result = sub i32 123, %a store i32 %result, ptr addrspace(1) %out @@ -230,6 +283,20 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3 +; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2 +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -297,6 +364,24 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 +; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2 +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 +; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -360,6 +445,22 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 th:TH_LOAD_RT_NT +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0 +; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1 @@ -425,6 +526,20 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_v2i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1 @@ -501,6 +616,21 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_sub_v4i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3 +; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2 +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1 @@ -552,6 +682,20 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: s_sub_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 ret void @@ -617,6 +761,25 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: v_sub_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid @@ -693,6 +856,27 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: v_test_sub_v2i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -801,6 +985,36 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 ; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: v_test_sub_v4i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] +; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1] +; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16 +; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16 +; GFX12-NEXT: s_waitcnt vmcnt(2) +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo +; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v9, v13, vcc_lo +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -852,6 +1066,18 @@ define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) { ; GFX9-NEXT: ; use vcc ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: sub_select_vop3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; def vcc +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: ds_store_b32 v0, v0 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; use vcc +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"() %sub = sub i32 %v, %s store i32 %sub, ptr addrspace(3) undef