Skip to content

Commit

Permalink
[AMDGPU] CodeGen for GFX12 64-bit scalar add/sub (#75070)
Browse files Browse the repository at this point in the history
  • Loading branch information
jayfoad committed Dec 12, 2023
1 parent 671fa91 commit 8005ee6
Show file tree
Hide file tree
Showing 7 changed files with 1,833 additions and 112 deletions.
24 changes: 17 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -681,13 +681,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,

if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
if (ST.hasScalarAddSub64()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S64, S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
.legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
}

getActionDefinitionsBuilder(G_MUL)
.legalFor({S32, S16, V2S16})
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return AddNoCarryInsts;
}

bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

bool hasUnpackedD16VMem() const {
return HasUnpackedD16VMem;
}
Expand Down
67 changes: 39 additions & 28 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4555,40 +4555,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
// For targets older than GFX12, we emit a sequence of 32-bit operations.
// For GFX12, we emit s_add_u64 and s_sub_u64.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();

MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);

Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
if (Subtarget->hasScalarAddSub64()) {
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
.addReg(Src0.getReg())
.addReg(Src1.getReg());
} else {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();

Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
.add(Src0Sub0)
.add(Src1Sub0);
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
.add(Src0Sub1)
.add(Src1Sub1);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
}
MI.eraseFromParent();
return BB;
}
Expand Down
107 changes: 107 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s

define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_add_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s0, s6, s0
; GFX11-NEXT: s_addc_u32 s1, s7, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, i64 addrspace(1)* %out
ret void
}

define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GCN-LABEL: v_add_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, i64 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_sub_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s0, s6, s0
; GFX11-NEXT: s_subb_u32 s1, s7, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_sub_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, i64 addrspace(1)* %out
ret void
}

define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GCN-LABEL: v_sub_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, i64 addrspace(1)* %out
ret void
}
Loading

0 comments on commit 8005ee6

Please sign in to comment.