Skip to content

Commit

Permalink
[X86] Replace (most) X86ISD::SHLD/SHRD usage with ISD::FSHL/FSHR gene…
Browse files Browse the repository at this point in the history
…ric opcodes (PR39467)

For i32 and i64 cases, X86ISD::SHLD/SHRD are close enough to ISD::FSHL/FSHR that we can use them directly, we just need to account for the operand commutation for SHRD.

The i16 SHLD/SHRD case is annoying as the shift amount is modulo-32 (vs funnel shift modulo-16), so I've added X86ISD::FSHL/FSHR equivalents, which matches the generic implementation in all other terms.

Something I'm slightly concerned with is that ISD::FSHL/FSHR legality is controlled by the Subtarget.isSHLDSlow() feature flag - we don't normally use non-ISA features for this but it allows the DAG combines to continue to operate after legalization in a lot more cases.

The X86 *bits.ll changes are all affected by the same issue - we now have a "FSHR(-1,-1,amt) -> ROTR(-1,amt) -> (-1)" simplification that reduces the dependencies enough for the branch fall through code to mess up.

Differential Revision: https://reviews.llvm.org/D75748
  • Loading branch information
RKSimon committed Mar 11, 2020
1 parent 6d5603e commit b3b4727
Show file tree
Hide file tree
Showing 13 changed files with 731 additions and 808 deletions.
22 changes: 12 additions & 10 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -207,10 +207,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,

// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
}

if (!Subtarget.useSoftFloat()) {
Expand Down Expand Up @@ -18860,16 +18863,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();

if (IsFSHR)
std::swap(Op0, Op1);

// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
if (VT == MVT::i16)
if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
}

unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
return Op;
}

// Try to use a packed vector operation to handle i64 on 32-bit targets when
Expand Down Expand Up @@ -29963,8 +29965,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
NODE_NAME_CASE(BSF)
NODE_NAME_CASE(BSR)
NODE_NAME_CASE(SHLD)
NODE_NAME_CASE(SHRD)
NODE_NAME_CASE(FSHL)
NODE_NAME_CASE(FSHR)
NODE_NAME_CASE(FAND)
NODE_NAME_CASE(FANDN)
NODE_NAME_CASE(FOR)
Expand Down
10 changes: 6 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -33,10 +33,12 @@ namespace llvm {
/// Bit scan reverse.
BSR,

/// Double shift instructions. These correspond to
/// X86::SHLDxx and X86::SHRDxx instructions.
SHLD,
SHRD,
/// X86 funnel/double shift i16 instructions. These correspond to
/// X86::SHLDW and X86::SHRDW instructions which have different amt
/// modulo rules to generic funnel shifts.
/// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
FSHL,
FSHR,

/// Bitwise logical AND of floating point values. This corresponds
/// to X86::ANDPS or X86::ANDPD.
Expand Down
33 changes: 18 additions & 15 deletions llvm/lib/Target/X86/X86InstrCompiler.td
Expand Up @@ -1782,21 +1782,24 @@ multiclass MaskedRotateAmountPats<SDNode frag, string name> {
defm : MaskedRotateAmountPats<rotl, "ROL">;
defm : MaskedRotateAmountPats<rotr, "ROR">;

// Double shift amount is implicitly masked.
multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 31)) ==> (shift x, y)
def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;

// (shift x (and y, 63)) ==> (shift x, y)
def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)),
(!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
}

defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
// Double "funnel" shift amount is implicitly masked.
// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32)
def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)),
(SHLD16rrCL GR16:$src1, GR16:$src2)>;
def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)),
(SHRD16rrCL GR16:$src1, GR16:$src2)>;

// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y)
def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)),
(SHLD32rrCL GR32:$src1, GR32:$src2)>;
def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)),
(SHRD32rrCL GR32:$src1, GR32:$src2)>;

// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y)
def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)),
(SHLD64rrCL GR64:$src1, GR64:$src2)>;
def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)),
(SHRD64rrCL GR64:$src1, GR64:$src2)>;

let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InstrInfo.td
Expand Up @@ -143,8 +143,8 @@ def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,

def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;

def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
def X86fcmp : SDNode<"X86ISD::FCMP", SDTX86FCmp>;
Expand Down
70 changes: 35 additions & 35 deletions llvm/lib/Target/X86/X86InstrShiftRotate.td
Expand Up @@ -661,32 +661,32 @@ let Uses = [CL], SchedRW = [WriteSHDrrcl] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
[(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
[(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>,
TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
[(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>,
TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
[(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>,
TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
[(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
[(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>,
TB;
} // SchedRW

Expand All @@ -695,42 +695,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
[(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
[(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
[(set GR32:$dst, (fshl GR32:$src1, GR32:$src2,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
[(set GR32:$dst, (fshr GR32:$src2, GR32:$src1,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
[(set GR64:$dst, (fshl GR64:$src1, GR64:$src2,
(i8 imm:$src3)))]>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
[(set GR64:$dst, (fshr GR64:$src2, GR64:$src1,
(i8 imm:$src3)))]>,
TB;
} // SchedRW
Expand All @@ -739,70 +739,70 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
let Uses = [CL], SchedRW = [WriteSHDmrcl] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)]>, TB, OpSize16;
[(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)]>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
addr:$dst)]>, TB, OpSize16;
[(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL),
addr:$dst)]>, TB, OpSize16;

def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
[(store (fshl (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)]>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)]>, TB, OpSize32;
[(store (fshr GR32:$src2, (loadi32 addr:$dst), CL),
addr:$dst)]>, TB, OpSize32;

def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)]>, TB;
[(store (fshl (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
addr:$dst)]>, TB;
[(store (fshr GR64:$src2, (loadi64 addr:$dst), CL),
addr:$dst)]>, TB;
} // SchedRW

let SchedRW = [WriteSHDmri] in {
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (X86fshl (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (X86fshr GR16:$src2, (loadi16 addr:$dst),
(i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;

def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (fshl (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (fshr GR32:$src2, (loadi32 addr:$dst),
(i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;

def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shld (loadi64 addr:$dst), GR64:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (fshl (loadi64 addr:$dst), GR64:$src2,
(i8 imm:$src3)), addr:$dst)]>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
(i8 imm:$src3)), addr:$dst)]>,
[(store (fshr GR64:$src2, (loadi64 addr:$dst),
(i8 imm:$src3)), addr:$dst)]>,
TB;
} // SchedRW

Expand Down

0 comments on commit b3b4727

Please sign in to comment.