[X86] Bring back the MOV64r0 pseudo instruction
This patch brings back the MOV64r0 pseudo instruction for zeroing a 64-bit register. This replaces the SUBREG_TO_REG + MOV32r0 sequence we use today. Post register allocation, we will rewrite the MOV64r0 to a 32-bit xor with an implicit def of the 64-bit register, similar to what we do for the various XMM/YMM/ZMM zeroing pseudos.
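For illustration, here is a minimal sketch of the intended post-RA expansion (the register assignment is hypothetical, not taken from this patch's tests):

    # Pseudo after register allocation, assuming it was assigned %rax:
    #   %rax = MOV64r0
    # expandPostRAPseudo rewrites it as a 32-bit xor; the 32-bit write
    # zeroes the upper 32 bits of %rax, and an implicit-def of %rax is
    # added so the full 64-bit register is still seen as defined:
    xorl %eax, %eax        # implicit-def %rax, defines EFLAGS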

My main motivation is to enable the spill optimization in foldMemoryOperandImpl, as we were seeing code that repeatedly did "xor eax, eax; store eax" to spill several registers, emitting a new xor for each store. With this optimization enabled, we get a store of a 0 immediate instead of an xor, though I admit the ideal solution would be a single xor shared across the multiple spills. I don't believe we have a test case that shows this optimization here; I'll see if I can reduce one from the code we're looking at.
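As a rough sketch of the pattern in question (registers and stack offsets are illustrative only):

    # Before: a fresh xor is materialized for every spill of the zero value.
    xorl %eax, %eax
    movq %rax, 8(%rsp)
    xorl %eax, %eax
    movq %rax, 16(%rsp)
    # After folding MOV64r0 into the store (as MOV64mi32), each spill becomes
    # a store of an immediate zero:
    movq $0, 8(%rsp)
    movq $0, 16(%rsp)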

There are definitely some other machine CSE (and maybe other pass) behavior changes exposed by this patch, so it seems there may be additional deficiencies in SUBREG_TO_REG handling.

Differential Revision: https://reviews.llvm.org/D52757

llvm-svn: 345165
topperc committed Oct 24, 2018
1 parent 2cce074 commit 2417273
Showing 17 changed files with 523 additions and 472 deletions.
32 changes: 12 additions & 20 deletions llvm/lib/Target/X86/X86FastISel.cpp
@@ -1916,8 +1916,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
   { &X86::GR64RegClass, X86::RAX, X86::RDX, {
       { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
       { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
-      { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
-      { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+      { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv
+      { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem
     }
   }, // i64
 };
@@ -1964,26 +1964,22 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(OpEntry.OpSignExtend));
   else {
-    unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+    unsigned ZeroReg = createResultReg(VT == MVT::i64 ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(X86::MOV32r0), Zero32);
+            TII.get(OpEntry.OpSignExtend), ZeroReg);

     // Copy the zero into the appropriate sub/super/identical physical
     // register. Unfortunately the operations needed are not uniform enough
     // to fit neatly into the table above.
-    if (VT == MVT::i16) {
+    if (VT == MVT::i16)
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(Copy), TypeEntry.HighInReg)
-        .addReg(Zero32, 0, X86::sub_16bit);
-    } else if (VT == MVT::i32) {
+        .addReg(ZeroReg, 0, X86::sub_16bit);
+    else
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(Copy), TypeEntry.HighInReg)
-        .addReg(Zero32);
-    } else if (VT == MVT::i64) {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
-        .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
-    }
+        .addReg(ZeroReg);
   }
 }
 // Generate the DIV/IDIV instruction.
@@ -3708,6 +3704,9 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {

   uint64_t Imm = CI->getZExtValue();
   if (Imm == 0) {
+    if (VT.SimpleTy == MVT::i64)
+      return fastEmitInst_(X86::MOV64r0, &X86::GR64RegClass);
+
     unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
@@ -3720,13 +3719,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
                                           X86::sub_16bit);
     case MVT::i32:
       return SrcReg;
-    case MVT::i64: {
-      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
-        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
-      return ResultReg;
-    }
     }
   }

13 changes: 4 additions & 9 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3569,7 +3569,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
             SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+        unsigned ClrOpc = NVT.SimpleTy == MVT::i64 ? X86::MOV64r0
+                                                   : X86::MOV32r0;
+        MVT ClrVT = NVT.SimpleTy == MVT::i64 ? MVT::i64 : MVT::i32;
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(ClrOpc, dl, ClrVT), 0);
         switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
@@ -3580,15 +3583,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                       0);
           break;
         case MVT::i32:
-          break;
         case MVT::i64:
-          ClrNode =
-              SDValue(CurDAG->getMachineNode(
-                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
-                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
-                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
-                                                    MVT::i32)),
-                      0);
           break;
         default:
           llvm_unreachable("Unexpected division source");
6 changes: 4 additions & 2 deletions llvm/lib/Target/X86/X86InstrCompiler.td
@@ -275,16 +275,18 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 10 in
+    isPseudo = 1, AddedComplexity = 10 in {
 def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                 [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
+def MOV64r0 : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                [(set GR64:$dst, 0)]>, Sched<[WriteZero]>;
+}

 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
 let AddedComplexity = 10 in {
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
 }

let Predicates = [OptForSize, Not64BitMode],
23 changes: 19 additions & 4 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -683,8 +683,10 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
   if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
     // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
     // effects.
+    unsigned NewOpc = X86::MOV32ri;
     int Value;
     switch (Orig.getOpcode()) {
+    case X86::MOV64r0:  NewOpc = X86::MOV32ri64; Value = 0; break;
     case X86::MOV32r0:  Value = 0; break;
     case X86::MOV32r1:  Value = 1; break;
     case X86::MOV32r_1: Value = -1; break;
@@ -693,7 +695,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
     }

     const DebugLoc &DL = Orig.getDebugLoc();
-    BuildMI(MBB, I, DL, get(X86::MOV32ri))
+    BuildMI(MBB, I, DL, get(NewOpc))
         .add(Orig.getOperand(0))
         .addImm(Value);
   } else {
@@ -3750,7 +3752,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
     // MOV32r0 etc. are implemented with xor which clobbers condition code.
     // They are safe to move up, if the definition to EFLAGS is dead and
     // earlier instructions do not read or write EFLAGS.
-    if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+    if (!Movr0Inst &&
+        (Instr.getOpcode() == X86::MOV32r0 ||
+         Instr.getOpcode() == X86::MOV64r0) &&
         Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
       Movr0Inst = &Instr;
       continue;
@@ -4155,6 +4159,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV64r0: {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    unsigned Reg = MIB->getOperand(0).getReg();
+    unsigned Reg32 = TRI->getSubReg(Reg, X86::sub_32bit);
+    MIB->getOperand(0).setReg(Reg32);
+    Expand2AddrUndef(MIB, get(X86::XOR32rr));
+    MIB.addReg(Reg, RegState::ImplicitDefine);
+    return true;
+  }
   case X86::MOV32r1:
     return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
   case X86::MOV32r_1:
@@ -4898,8 +4911,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       isTwoAddrFold = true;
   } else {
     if (OpNum == 0) {
-      if (MI.getOpcode() == X86::MOV32r0) {
-        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+      if (MI.getOpcode() == X86::MOV32r0 || MI.getOpcode() == X86::MOV64r0) {
+        unsigned NewOpc = MI.getOpcode() == X86::MOV64r0 ? X86::MOV64mi32
+                                                         : X86::MOV32mi;
+        NewMI = MakeM0Inst(*this, NewOpc, MOs, InsertPt, MI);
         if (NewMI)
           return NewMI;
       }
10 changes: 2 additions & 8 deletions llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -487,20 +487,14 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
   // Otherwise, just build the predicate state itself by zeroing a register
   // as we don't need any initial state.
   PS->InitialReg = MRI->createVirtualRegister(PS->RC);
-  unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
-  auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
-                       PredStateSubReg);
+  auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64r0),
+                       PS->InitialReg);
   ++NumInstsInserted;
   MachineOperand *ZeroEFLAGSDefOp =
       ZeroI->findRegisterDefOperand(X86::EFLAGS);
   assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
          "Must have an implicit def of EFLAGS!");
   ZeroEFLAGSDefOp->setIsDead(true);
-  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
-          PS->InitialReg)
-      .addImm(0)
-      .addReg(PredStateSubReg)
-      .addImm(X86::sub_32bit);
 }

// We're going to need to trace predicate state throughout the function's
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/GlobalISel/constant.ll
@@ -54,7 +54,7 @@ define i64 @const_i64_i32() {
 define void @main(i32 ** %data) {
 ; ALL-LABEL: main:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movq $0, %rax
+; ALL-NEXT:    xorl %eax, %eax
 ; ALL-NEXT:    movq %rax, (%rdi)
 ; ALL-NEXT:    retq
   store i32* null, i32** %data, align 8