Skip to content

Commit

Permalink
[X86] Add separate intrinsics for scalar FMA4 instructions.
Browse files Browse the repository at this point in the history
Summary:
These instructions zero the non-scalar part of the lower 128-bits which makes them different than the FMA3 instructions which pass through the non-scalar part of the lower 128-bits.

I've only added fmadd because we should be able to derive all other variants using operand negation in the intrinsic header like we do for AVX512.

I think there are still some missed negate folding opportunities with the FMA4 instructions in light of this behavior difference that I hadn't noticed before.

I've split the tests so that we can use different intrinsics for scalar testing between the two. I just copied the tests split the RUN lines and changed out the scalar intrinsics.

fma4-fneg-combine.ll is a new test to make sure we negate the fma4 intrinsics correctly though there are a couple TODOs in it.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D39851

llvm-svn: 318984
  • Loading branch information
topperc committed Nov 25, 2017
1 parent ea37e20 commit e485631
Show file tree
Hide file tree
Showing 18 changed files with 1,146 additions and 794 deletions.
2 changes: 1 addition & 1 deletion llvm/include/llvm/CodeGen/ISDOpcodes.h
Expand Up @@ -838,7 +838,7 @@ namespace ISD {
/// which do not reference a specific memory location should be less than
/// this value. Those that do must not be less than this value, and can
/// be used with SelectionDAG::getMemIntrinsicNode.
static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+300;
static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+400;

//===--------------------------------------------------------------------===//
/// MemIndexedMode enum - This enum defines the load / store indexed
Expand Down
8 changes: 8 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsX86.td
Expand Up @@ -2116,6 +2116,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
def int_x86_fma_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -25169,6 +25169,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
Expand Down Expand Up @@ -35724,6 +35728,13 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
}
} else if (N->getOpcode() == X86ISD::FMADD4S) {
switch (NewOpcode) {
case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
}
} else {
llvm_unreachable("Unexpected opcode!");
}
Expand Down Expand Up @@ -37092,6 +37103,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMADDS3_RND:
case X86ISD::FMADDS1:
case X86ISD::FMADDS3:
case X86ISD::FMADD4S:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Expand Up @@ -505,6 +505,9 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,

// FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,

// Scalar intrinsic FMA.
FMADDS1, FMADDS3,
FNMADDS1, FNMADDS3,
Expand Down
44 changes: 22 additions & 22 deletions llvm/lib/Target/X86/X86InstrFMA.td
Expand Up @@ -253,18 +253,18 @@ let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC> {
def r_Int : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;

let mayLoad = 1 in
def m_Int : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, memopr:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, memopr:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[]>;
}

// The FMA 213 form is created for lowering of scalar FMA intrinscis
Expand Down Expand Up @@ -385,28 +385,28 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
let isCodeGenOnly = 1 in {
def rr_Int : FMA4S<opc, MRMSrcRegOp4, (outs VR128:$dst),
def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
VEX_LIG;
def rm_Int : FMA4S<opc, MRMSrcMemOp4, (outs VR128:$dst),
def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
def mr_Int : FMA4S<opc, MRMSrcMem, (outs VR128:$dst),
def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
VEX_LIG;
let hasSideEffects = 0 in
def rr_Int_REV : FMA4S<opc, MRMSrcReg, (outs VR128:$dst),
def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
Expand Down Expand Up @@ -476,18 +476,18 @@ let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
X86Fmadds1>;
X86Fmadd4s>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
X86Fmsubs1>;
X86Fmsub4s>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86Fnmadd, loadf32>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
X86Fnmadds1>;
X86Fnmadd4s>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86Fnmsub, loadf32>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
X86Fnmsubs1>;
X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
Expand All @@ -507,18 +507,18 @@ let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
X86Fmadds1>;
X86Fmadd4s>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
X86Fmsubs1>;
X86Fmsub4s>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86Fnmadd, loadf64>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
X86Fnmadds1>;
X86Fnmadd4s>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86Fnmsub, loadf64>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
X86Fnmsubs1>;
X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Target/X86/X86InstrFormats.td
Expand Up @@ -862,9 +862,13 @@ class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;

Expand All @@ -877,6 +881,10 @@ class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4]>;

// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
Expand Up @@ -506,6 +506,12 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;

// Scalar FMA4 intrinsics which zero the non-scalar bits.
def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;

// Scalar FMA intrinsics with passthru bits in operand 1.
def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/X86/X86InstrInfo.td
Expand Up @@ -850,6 +850,7 @@ def NoVLX_Or_NoVPCLMULQDQ :
def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
def NoTBM : Predicate<"!Subtarget->hasTBM()">;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Expand Up @@ -1593,6 +1593,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86Subtarget.h
Expand Up @@ -482,7 +482,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
// Prefer FMA4 to FMA - its better for commutation/memory folding and
// has equal or better performance on all supported targets.
bool hasFMA() const { return HasFMA && !HasFMA4; }
bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
Expand Down

0 comments on commit e485631

Please sign in to comment.