[X86] Bring back the MOV64r0 pseudo instruction
This patch brings back the MOV64r0 pseudo instruction for zeroing a 64-bit register. This replaces the SUBREG_TO_REG + MOV32r0 sequence we use today. Post register allocation, we will rewrite the MOV64r0 to a 32-bit xor with an implicit def of the 64-bit register, similar to what we do for the various XMM/YMM/ZMM zeroing pseudos.
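For illustration, here is a minimal sketch of the intended post-RA expansion (the register assignment is hypothetical, not taken from this patch's tests):

    # Pseudo after register allocation, assuming it was assigned %rax:
    #   %rax = MOV64r0
    # expandPostRAPseudo rewrites it as a 32-bit xor; the 32-bit write
    # zeroes the upper 32 bits of %rax, and an implicit-def of %rax is
    # added so the full 64-bit register is still seen as defined:
    xorl %eax, %eax        # implicit-def %rax, defines EFLAGS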

My main motivation is to enable the spill optimization in foldMemoryOperandImpl, as we were seeing code that repeatedly did "xor eax, eax; store eax" to spill several registers, emitting a new xor for each store. With this optimization enabled, we get a store of a 0 immediate instead of an xor, though I admit the ideal solution would be a single xor shared across the multiple spills. I don't believe we have a test case that shows this optimization here; I'll see if I can reduce one from the code we're looking at.
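As a rough sketch of the pattern in question (registers and stack offsets are illustrative only):

    # Before: a fresh xor is materialized for every spill of the zero value.
    xorl %eax, %eax
    movq %rax, 8(%rsp)
    xorl %eax, %eax
    movq %rax, 16(%rsp)
    # After folding MOV64r0 into the store (as MOV64mi32), each spill becomes
    # a store of an immediate zero:
    movq $0, 8(%rsp)
    movq $0, 16(%rsp)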

There are definitely some other machine CSE (and maybe other pass) behavior changes exposed by this patch, so it seems there may be additional deficiencies in SUBREG_TO_REG handling.

Differential Revision: https://reviews.llvm.org/D52757

llvm-svn: 345165
topperc committed Oct 24, 2018
1 parent 2cce074 commit 2417273
Showing 17 changed files with 523 additions and 472 deletions.
32 changes: 12 additions & 20 deletions llvm/lib/Target/X86/X86FastISel.cpp
@@ -1916,8 +1916,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
   { &X86::GR64RegClass, X86::RAX, X86::RDX, {
       { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
       { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
-      { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
-      { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
+      { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv
+      { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem
     }
   }, // i64
 };
@@ -1964,26 +1964,22 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(OpEntry.OpSignExtend));
   else {
-    unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+    unsigned ZeroReg = createResultReg(VT == MVT::i64 ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(X86::MOV32r0), Zero32);
+            TII.get(OpEntry.OpSignExtend), ZeroReg);

     // Copy the zero into the appropriate sub/super/identical physical
     // register. Unfortunately the operations needed are not uniform enough
     // to fit neatly into the table above.
-    if (VT == MVT::i16) {
+    if (VT == MVT::i16)
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(Copy), TypeEntry.HighInReg)
-        .addReg(Zero32, 0, X86::sub_16bit);
-    } else if (VT == MVT::i32) {
+        .addReg(ZeroReg, 0, X86::sub_16bit);
+    else
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(Copy), TypeEntry.HighInReg)
-        .addReg(Zero32);
-    } else if (VT == MVT::i64) {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
-        .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
-    }
+        .addReg(ZeroReg);
   }
 }
 // Generate the DIV/IDIV instruction.
@@ -3708,6 +3704,9 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {

   uint64_t Imm = CI->getZExtValue();
   if (Imm == 0) {
+    if (VT.SimpleTy == MVT::i64)
+      return fastEmitInst_(X86::MOV64r0, &X86::GR64RegClass);
+
     unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
@@ -3720,13 +3719,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
                                           X86::sub_16bit);
     case MVT::i32:
       return SrcReg;
-    case MVT::i64: {
-      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
-        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
-      return ResultReg;
-    }
     }
   }

13 changes: 4 additions & 9 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3569,7 +3569,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
             SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+        unsigned ClrOpc = NVT.SimpleTy == MVT::i64 ? X86::MOV64r0
+                                                   : X86::MOV32r0;
+        MVT ClrVT = NVT.SimpleTy == MVT::i64 ? MVT::i64 : MVT::i32;
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(ClrOpc, dl, ClrVT), 0);
         switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
@@ -3580,15 +3583,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
                       0);
           break;
         case MVT::i32:
-          break;
         case MVT::i64:
-          ClrNode =
-              SDValue(CurDAG->getMachineNode(
-                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
-                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
-                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
-                                                    MVT::i32)),
-                      0);
           break;
         default:
           llvm_unreachable("Unexpected division source");
6 changes: 4 additions & 2 deletions llvm/lib/Target/X86/X86InstrCompiler.td
@@ -275,16 +275,18 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 10 in
+    isPseudo = 1, AddedComplexity = 10 in {
 def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                 [(set GR32:$dst, 0)]>, Sched<[WriteZero]>;
+def MOV64r0 : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                [(set GR64:$dst, 0)]>, Sched<[WriteZero]>;
+}

 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
 let AddedComplexity = 10 in {
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
 }

let Predicates = [OptForSize, Not64BitMode],
23 changes: 19 additions & 4 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -683,8 +683,10 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
   if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
     // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
     // effects.
+    unsigned NewOpc = X86::MOV32ri;
     int Value;
     switch (Orig.getOpcode()) {
+    case X86::MOV64r0:  NewOpc = X86::MOV32ri64; Value = 0; break;
     case X86::MOV32r0:  Value = 0; break;
     case X86::MOV32r1:  Value = 1; break;
     case X86::MOV32r_1: Value = -1; break;
@@ -693,7 +695,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
     }

     const DebugLoc &DL = Orig.getDebugLoc();
-    BuildMI(MBB, I, DL, get(X86::MOV32ri))
+    BuildMI(MBB, I, DL, get(NewOpc))
         .add(Orig.getOperand(0))
         .addImm(Value);
   } else {
@@ -3750,7 +3752,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
     // MOV32r0 etc. are implemented with xor which clobbers condition code.
     // They are safe to move up, if the definition to EFLAGS is dead and
     // earlier instructions do not read or write EFLAGS.
-    if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+    if (!Movr0Inst &&
+        (Instr.getOpcode() == X86::MOV32r0 ||
+         Instr.getOpcode() == X86::MOV64r0) &&
         Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
       Movr0Inst = &Instr;
       continue;
@@ -4155,6 +4159,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV64r0: {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    unsigned Reg = MIB->getOperand(0).getReg();
+    unsigned Reg32 = TRI->getSubReg(Reg, X86::sub_32bit);
+    MIB->getOperand(0).setReg(Reg32);
+    Expand2AddrUndef(MIB, get(X86::XOR32rr));
+    MIB.addReg(Reg, RegState::ImplicitDefine);
+    return true;
+  }
   case X86::MOV32r1:
     return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
   case X86::MOV32r_1:
@@ -4898,8 +4911,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       isTwoAddrFold = true;
   } else {
     if (OpNum == 0) {
-      if (MI.getOpcode() == X86::MOV32r0) {
-        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+      if (MI.getOpcode() == X86::MOV32r0 || MI.getOpcode() == X86::MOV64r0) {
+        unsigned NewOpc = MI.getOpcode() == X86::MOV64r0 ? X86::MOV64mi32
+                                                         : X86::MOV32mi;
+        NewMI = MakeM0Inst(*this, NewOpc, MOs, InsertPt, MI);
         if (NewMI)
           return NewMI;
       }
10 changes: 2 additions & 8 deletions llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -487,20 +487,14 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
   // Otherwise, just build the predicate state itself by zeroing a register
   // as we don't need any initial state.
   PS->InitialReg = MRI->createVirtualRegister(PS->RC);
-  unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
-  auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
-                       PredStateSubReg);
+  auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64r0),
+                       PS->InitialReg);
   ++NumInstsInserted;
   MachineOperand *ZeroEFLAGSDefOp =
       ZeroI->findRegisterDefOperand(X86::EFLAGS);
   assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
          "Must have an implicit def of EFLAGS!");
   ZeroEFLAGSDefOp->setIsDead(true);
-  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
-          PS->InitialReg)
-      .addImm(0)
-      .addReg(PredStateSubReg)
-      .addImm(X86::sub_32bit);
 }

// We're going to need to trace predicate state throughout the function's
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/GlobalISel/constant.ll
@@ -54,7 +54,7 @@ define i64 @const_i64_i32() {
 define void @main(i32 ** %data) {
 ; ALL-LABEL: main:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movq $0, %rax
+; ALL-NEXT:    xorl %eax, %eax
 ; ALL-NEXT:    movq %rax, (%rdi)
 ; ALL-NEXT:    retq
   store i32* null, i32** %data, align 8