Skip to content

Commit

Permalink
[PowerPC] Accumulator/Unprimed Accumulator register copy, spill and restore
Browse files Browse the repository at this point in the history

This patch adds support for accumulator/unprimed accumulator
register copy, spill and restore for MMA.

Authored By: Baptiste Saleil

Reviewed By: #powerpc, bsaleil, amyk

Differential Revision: https://reviews.llvm.org/D90616
  • Loading branch information
Baptiste Saleil authored and Ahsan Saghir committed Nov 11, 2020
1 parent c8a0e27 commit 37c4ac8
Show file tree
Hide file tree
Showing 7 changed files with 794 additions and 15 deletions.
47 changes: 44 additions & 3 deletions llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
Expand Up @@ -1361,7 +1361,33 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::CROR;
else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
Opc = PPC::EVOR;
else
else if ((PPC::ACCRCRegClass.contains(DestReg) ||
PPC::UACCRCRegClass.contains(DestReg)) &&
(PPC::ACCRCRegClass.contains(SrcReg) ||
PPC::UACCRCRegClass.contains(SrcReg))) {
// If primed, de-prime the source register, copy the individual registers
// and prime the destination if needed. The vector subregisters are
// vs[(u)acc * 4] - vs[(u)acc * 4 + 3]. If the copy is not a kill and the
// source is primed, we need to re-prime it after the copy as well.
PPCRegisterInfo::emitAccCopyInfo(MBB, DestReg, SrcReg);
bool DestPrimed = PPC::ACCRCRegClass.contains(DestReg);
bool SrcPrimed = PPC::ACCRCRegClass.contains(SrcReg);
MCRegister VSLSrcReg =
PPC::VSL0 + (SrcReg - (SrcPrimed ? PPC::ACC0 : PPC::UACC0)) * 4;
MCRegister VSLDestReg =
PPC::VSL0 + (DestReg - (DestPrimed ? PPC::ACC0 : PPC::UACC0)) * 4;
if (SrcPrimed)
BuildMI(MBB, I, DL, get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
for (unsigned Idx = 0; Idx < 4; Idx++)
BuildMI(MBB, I, DL, get(PPC::XXLOR), VSLDestReg + Idx)
.addReg(VSLSrcReg + Idx)
.addReg(VSLSrcReg + Idx, getKillRegState(KillSrc));
if (DestPrimed)
BuildMI(MBB, I, DL, get(PPC::XXMTACC), DestReg).addReg(DestReg);
if (SrcPrimed && !KillSrc)
BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
return;
} else
llvm_unreachable("Impossible reg-to-reg copy");

const MCInstrDesc &MCID = get(Opc);
Expand All @@ -1372,7 +1398,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}

static unsigned getSpillIndex(const TargetRegisterClass *RC) {
unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const {
int OpcodeIndex = 0;

if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
Expand Down Expand Up @@ -1401,6 +1427,18 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) {
OpcodeIndex = SOK_VectorFloat4Spill;
} else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SpillToVSR;
} else if (PPC::ACCRCRegClass.hasSubClassEq(RC)) {
assert(Subtarget.pairedVectorMemops() &&
"Register unexpected when paired memops are disabled.");
OpcodeIndex = SOK_AccumulatorSpill;
} else if (PPC::UACCRCRegClass.hasSubClassEq(RC)) {
assert(Subtarget.pairedVectorMemops() &&
"Register unexpected when paired memops are disabled.");
OpcodeIndex = SOK_UAccumulatorSpill;
} else if (PPC::VSRpRCRegClass.hasSubClassEq(RC)) {
assert(Subtarget.pairedVectorMemops() &&
"Register unexpected when paired memops are disabled.");
OpcodeIndex = SOK_PairedVecSpill;
} else {
llvm_unreachable("Unknown regclass!");
}
Expand Down Expand Up @@ -2799,7 +2837,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
}

unsigned PPCInstrInfo::getSpillTarget() const {
  // Select which row of the spill-opcode tables applies to this subtarget.
  // Power10 introduces paired-vector and accumulator spills; since MMA
  // implies paired vector memops, either feature identifies the P10 variant.
  if (Subtarget.isISA3_1() || Subtarget.pairedVectorMemops())
    return 2;
  return Subtarget.hasP9Vector() ? 1 : 0;
}

const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
Expand Down
37 changes: 29 additions & 8 deletions llvm/lib/Target/PowerPC/PPCInstrInfo.h
Expand Up @@ -123,52 +123,72 @@ enum SpillOpcodeKey {
SOK_VectorFloat8Spill,
SOK_VectorFloat4Spill,
SOK_SpillToVSR,
SOK_PairedVecSpill,
SOK_AccumulatorSpill,
SOK_UAccumulatorSpill,
SOK_SPESpill,
SOK_LastOpcodeSpill // This must be last on the enum.
};

// Define list of load and store spill opcodes.
#define NoInstr PPC::INSTRUCTION_LIST_END
#define Pwr8LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \
PPC::SPILLTOVSR_LD, PPC::EVLDD \
PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD \
}

#define Pwr9LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
PPC::DFLOADf32, PPC::SPILLTOVSR_LD \
PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr \
}

#define Pwr10LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \
PPC::RESTORE_UACC, NoInstr \
}

#define Pwr8StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \
PPC::SPILLTOVSR_ST, PPC::EVSTDD \
PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD \
}

#define Pwr9StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
PPC::SPILLTOVSR_ST \
PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr \
}

#define Pwr10StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \
NoInstr \
}

// Initialize arrays for load and store spill opcodes on supported subtargets.
#define StoreOpcodesForSpill \
{ Pwr8StoreOpcodes, Pwr9StoreOpcodes }
{ Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes }
#define LoadOpcodesForSpill \
{ Pwr8LoadOpcodes, Pwr9LoadOpcodes }
{ Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes }

class PPCSubtarget;
class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
const unsigned StoreSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
const unsigned StoreSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
StoreOpcodesForSpill;
const unsigned LoadSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
const unsigned LoadSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
LoadOpcodesForSpill;

void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
Expand Down Expand Up @@ -226,6 +246,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
unsigned getSpillTarget() const;
const unsigned *getStoreOpcodesForSpillArray() const;
const unsigned *getLoadOpcodesForSpillArray() const;
unsigned getSpillIndex(const TargetRegisterClass *RC) const;
int16_t getFMAOpIdxInfo(unsigned Opcode) const;
void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/PowerPC/PPCInstrPrefix.td
Expand Up @@ -1294,6 +1294,18 @@ let Predicates = [MMA] in {
XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
"xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
let mayStore = 1 in {
def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
"#SPILL_ACC", []>;
def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst),
"#SPILL_UACC", []>;
}
let mayLoad = 1, hasSideEffects = 0 in {
def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src),
"#RESTORE_ACC", []>;
def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src),
"#RESTORE_UACC", []>;
}
}

let Predicates = [MMA, PrefixInstrs] in {
Expand Down
124 changes: 124 additions & 0 deletions llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
Expand Up @@ -75,6 +75,21 @@ MaxCRBitSpillDist("ppc-max-crbit-spill-dist",
"spill on ppc"),
cl::Hidden, cl::init(100));

// Copies/moves of physical accumulators are expensive operations
// that should be avoided whenever possible. MMA instructions are
// meant to be used in performance-sensitive computational kernels.
// This option is provided, at least for the time being, to give the
// user a tool to detect this expensive operation and either rework
// their code or report a compiler bug if that turns out to be the
// cause.
#ifndef NDEBUG
static cl::opt<bool>
ReportAccMoves("ppc-report-acc-moves",
cl::desc("Emit information about accumulator register spills "
"and copies"),
cl::Hidden, cl::init(false));
#endif

static unsigned offsetMinAlignForOpcode(unsigned OpC);

PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
Expand Down Expand Up @@ -936,6 +951,109 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}

void PPCRegisterInfo::emitAccCopyInfo(MachineBasicBlock &MBB,
                                      MCRegister DestReg, MCRegister SrcReg) {
#ifndef NDEBUG
  // Diagnostic aid gated by -ppc-report-acc-moves: describe each side of the
  // copy as primed ("acc") or unprimed ("uacc") and dump the block so the
  // expensive accumulator copy can be tracked down. No-op in release builds.
  if (!ReportAccMoves)
    return;
  std::string Dest = PPC::ACCRCRegClass.contains(DestReg) ? "acc" : "uacc";
  std::string Src = PPC::ACCRCRegClass.contains(SrcReg) ? "acc" : "uacc";
  dbgs() << "Emitting copy from " << Src << " to " << Dest << ":\n";
  MBB.dump();
#endif
}

static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
                                    bool IsRestore) {
#ifndef NDEBUG
  // Diagnostic aid gated by -ppc-report-acc-moves: report whether a primed
  // ("acc") or unprimed ("uacc") accumulator is being spilled or restored,
  // then dump the block. No-op in release builds.
  if (!ReportAccMoves)
    return;
  dbgs() << "Emitting " << (IsPrimed ? "acc" : "uacc") << " register "
         << (IsRestore ? "restore" : "spill") << ":\n";
  MBB.dump();
#endif
}

/// lowerACCSpilling - Generate the code for spilling the accumulator register.
/// Similarly to other spills/reloads that use pseudo-ops, we do not actually
/// eliminate the FrameIndex here nor compute the stack offset. We simply
/// create a real instruction with an FI and rely on eliminateFrameIndex to
/// handle the FI elimination.
void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
                                       unsigned FrameIndex) const {
  MachineInstr &MI = *II; // SPILL_ACC <SrcReg>, <offset>
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  Register SrcReg = MI.getOperand(0).getReg();
  bool IsKilled = MI.getOperand(0).isKill();

  // Membership in ACCRC means the accumulator is primed (UACCRC otherwise).
  // Map the accumulator to the first of the two consecutive VSR pair
  // registers that cover it (Reg and Reg + 1 are stored below).
  bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
  Register Reg =
      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
  bool IsLittleEndian = Subtarget.isLittleEndian();

  emitAccSpillRestoreInfo(MBB, IsPrimed, false);

  // De-prime the register being spilled, create two stores for the pair
  // subregisters accounting for endianness and then re-prime the register if
  // it isn't killed. This uses the Offset parameter to addFrameReference() to
  // adjust the offset of the store that is within the 64-byte stack slot.
  // Note the offsets (0 and 32) of the two pair stores are swapped between
  // little- and big-endian targets.
  if (IsPrimed)
    BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
                        .addReg(Reg, getKillRegState(IsKilled)),
                    FrameIndex, IsLittleEndian ? 32 : 0);
  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
                        .addReg(Reg + 1, getKillRegState(IsKilled)),
                    FrameIndex, IsLittleEndian ? 0 : 32);
  if (IsPrimed && !IsKilled)
    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);

  // Discard the pseudo instruction.
  MBB.erase(II);
}

/// lowerACCRestore - Generate the code to restore the accumulator register.
/// As with lowerACCSpilling, the FrameIndex is not eliminated here; real
/// load instructions carrying the FI are emitted and eliminateFrameIndex
/// resolves them later.
void PPCRegisterInfo::lowerACCRestore(MachineBasicBlock::iterator II,
                                      unsigned FrameIndex) const {
  MachineInstr &MI = *II; // <DestReg> = RESTORE_ACC <offset>
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  Register DestReg = MI.getOperand(0).getReg();
  assert(MI.definesRegister(DestReg) &&
         "RESTORE_ACC does not define its destination");

  // Membership in ACCRC means the accumulator is primed (UACCRC otherwise).
  // Map the accumulator to the first of the two consecutive VSR pair
  // registers that cover it (Reg and Reg + 1 are loaded below).
  bool IsPrimed = PPC::ACCRCRegClass.contains(DestReg);
  Register Reg =
      PPC::VSRp0 + (DestReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
  bool IsLittleEndian = Subtarget.isLittleEndian();

  emitAccSpillRestoreInfo(MBB, IsPrimed, true);

  // Create two loads for the pair subregisters accounting for endianness and
  // then prime the accumulator register being restored. The offsets (0 and
  // 32) within the 64-byte slot mirror those used by lowerACCSpilling.
  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg),
                    FrameIndex, IsLittleEndian ? 32 : 0);
  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg + 1),
                    FrameIndex, IsLittleEndian ? 0 : 32);
  if (IsPrimed)
    BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), DestReg).addReg(DestReg);

  // Discard the pseudo instruction.
  MBB.erase(II);
}

bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
Register Reg, int &FrameIdx) const {
// For the nonvolatile condition registers (CR2, CR3, CR4) return true to
Expand Down Expand Up @@ -1067,6 +1185,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else if (OpC == PPC::RESTORE_CRBIT) {
lowerCRBitRestore(II, FrameIndex);
return;
} else if (OpC == PPC::SPILL_ACC || OpC == PPC::SPILL_UACC) {
lowerACCSpilling(II, FrameIndex);
return;
} else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) {
lowerACCRestore(II, FrameIndex);
return;
}

// Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/PowerPC/PPCRegisterInfo.h
Expand Up @@ -120,6 +120,14 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
void lowerCRBitRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;

void lowerACCSpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
void lowerACCRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;

static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg,
MCRegister SrcReg);

bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
int &FrameIdx) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
Expand Down

0 comments on commit 37c4ac8

Please sign in to comment.