From 1579f571b3d1d7ea4b704c9e42c15deac8c131cd Mon Sep 17 00:00:00 2001
From: Leonard Chan
Date: Tue, 21 May 2019 19:17:19 +0000
Subject: [PATCH] [Intrinsic] Signed Fixed Point Saturation Multiplication
 Intrinsic

Add an intrinsic that takes 2 signed integers with the scale of them provided
as the third argument and performs fixed point multiplication on them. The
result is saturated and clamped between the largest and smallest representable
values of the first 2 operands.

This is a part of implementing fixed point arithmetic in clang where some of
the more complex operations will be implemented as intrinsics.

Differential Revision: https://reviews.llvm.org/D55720

llvm-svn: 361289
---
 llvm/docs/LangRef.rst | 95 +++
 llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 +
 llvm/include/llvm/CodeGen/TargetLowering.h | 1 +
 llvm/include/llvm/IR/Intrinsics.td | 6 +
 .../include/llvm/Target/TargetSelectionDAG.td | 1 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +
 .../SelectionDAG/LegalizeIntegerTypes.cpp | 142 +++-
 .../SelectionDAG/LegalizeVectorOps.cpp | 1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp | 2 +
 .../SelectionDAG/SelectionDAGBuilder.cpp | 8 +
 .../SelectionDAG/SelectionDAGDumper.cpp | 2 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp | 56 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp | 1 +
 llvm/lib/IR/Verifier.cpp | 13 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll | 739 ++++++++++++++++++
 .../CodeGen/X86/smul_fix_sat_constants.ll | 101 +++
 16 files changed, 1150 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/smul_fix_sat.ll
 create mode 100644 llvm/test/CodeGen/X86/smul_fix_sat_constants.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 3f35752b450fd..503b89892071a 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -13278,6 +13278,31 @@ are useful for representing fractional values to a specific precision. The
following intrinsics perform fixed point arithmetic operations on 2 operands
of the same scale, specified as the third argument.

+The `llvm.*mul.fix` family of intrinsic functions represents a multiplication
+of fixed point numbers through scaled integers. Therefore, fixed point
+multiplication can be represented as
+
+::
+ %result = call i4 @llvm.smul.fix.i4(i4 %a, i4 %b, i32 %scale)
+ =>
+ %a2 = sext i4 %a to i8
+ %b2 = sext i4 %b to i8
+ %mul = mul nsw i8 %a2, %b2
+ %scale2 = trunc i32 %scale to i8
+ %r = ashr i8 %mul, %scale2 ; this is for a target rounding down towards negative infinity
+ %result = trunc i8 %r to i4
+
+For each of these functions, if the result cannot be represented exactly with
+the provided scale, the result is rounded. The rounding direction is left
+unspecified, since the preferred rounding may vary for different targets; a
+target may instead specify its rounding through a target hook. Different
+pipelines should legalize or optimize this using the rounding specified by
+this hook if it is provided. Operations like constant folding, instruction
+combining, KnownBits, and ValueTracking should also use this hook, if
+provided, and not assume the direction of rounding. A rounded result must
+always be within one unit of precision from the true result. That is, the
+error between the returned result and the true result must be less than
+1/2^(scale).
+
'``llvm.smul.fix.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13398,6 +13423,76 @@ Examples
      %res = call i4 @llvm.umul.fix.i4(i4 15, i4 1, i32 1)  ; %res = 7 (or 8) (7.5 x 0.5 = 3.75)

+'``llvm.smul.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.smul.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+ declare i16 @llvm.smul.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+ declare i32 @llvm.smul.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+ declare i64 @llvm.smul.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+ declare <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview
+"""""""""
+
+The '``llvm.smul.fix.sat``' family of intrinsic functions performs signed,
+saturating fixed point multiplication on 2 arguments of the same scale.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo signed fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs fixed point multiplication on the 2 arguments of a
+specified scale. The result will also be returned in the same scale specified
+in the third argument.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest signed value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest signed value representable by this bit width.
+
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 2, i32 0) ; %res = 6 (2 x 3 = 6)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 2, i32 1) ; %res = 3 (1.5 x 1 = 1.5)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 -2, i32 1) ; %res = -3 (1.5 x -1 = -1.5)
+
+ ; The result in the following could be rounded up to -2 or down to -2.5
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 3, i4 -3, i32 1) ; %res = -5 (or -4) (1.5 x -1.5 = -2.25)
+
+ ; Saturation
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 7, i4 2, i32 0) ; %res = 7 (7 x 2 = 14 -> clamped to 7)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 7, i4 7, i32 2) ; %res = 7 (1.75 x 1.75 = 3.0625 -> clamped to 1.75)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 -8, i4 7, i32 2) ; %res = -8 (-2 x 1.75 = -3.5 -> clamped to -2)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 -8, i4 -8, i32 2) ; %res = 7 (-2 x -2 = 4 -> clamped to 1.75)
+
+ ; Scale can affect the saturation result
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 0) ; %res = 7 (2 x 4 -> clamped to 7)
+ %res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 1) ; %res = 4 (1 x 2 = 2)
+
+
Specialised Arithmetic Intrinsics
---------------------------------
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 0f798b5d20c33..9b765299b10c5 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -278,6 +278,11 @@ namespace ISD {
/// multiplication on 2 integers.
SMULFIX, UMULFIX,

+ /// Same as the corresponding unsaturated fixed point instructions, but the
+ /// result is clamped between the min and max values representable by the
+ /// bits of the first 2 operands.
+ SMULFIXSAT,
+
/// Simple binary floating point operators.
FADD, FSUB, FMUL, FDIV, FREM, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index b65612cb25b49..7f4b2bad803de 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -855,6 +855,7 @@ class TargetLoweringBase { default: llvm_unreachable("Unexpected fixed point operation."); case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: Supported = isSupportedFixedPointOperation(Op, VT, Scale); break; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 2957478ef5baa..b329d5c3eb88f 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -874,6 +874,12 @@ def int_umul_fix : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>; +//===------------------- Fixed Point Saturation Arithmetic Intrinsics ----------------===// +// +def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative, ImmArg<2>]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 6b1ef477bfa14..28a2eb0727a69 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -391,6 +391,7 @@ def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>; def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>; def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; +def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>; def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 63d407cfd5f07..52ae1e01a9ef8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1140,6 +1140,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { break; } case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), @@ -3334,6 +3335,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(TLI.expandAddSubSat(Node, DAG)); break; case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: Results.push_back(TLI.expandFixedPointMul(Node, DAG)); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 0930b63eecd43..357654fb1af85 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -149,6 +149,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SSUBSAT: case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break; case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: Res = PromoteIntRes_MULFIX(N); break; case ISD::ABS: Res = PromoteIntRes_ABS(N); break; @@ -670,14 +671,35 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { // Can just promote the operands then continue with operation. 
SDLoc dl(N);
SDValue Op1Promoted, Op2Promoted;
- if (N->getOpcode() == ISD::SMULFIX) {
+ bool Signed =
+ N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT;
+ if (Signed) {
Op1Promoted = SExtPromotedInteger(N->getOperand(0));
Op2Promoted = SExtPromotedInteger(N->getOperand(1));
} else {
Op1Promoted = ZExtPromotedInteger(N->getOperand(0));
Op2Promoted = ZExtPromotedInteger(N->getOperand(1));
}
+ EVT OldType = N->getOperand(0).getValueType();
EVT PromotedType = Op1Promoted.getValueType();
+ unsigned DiffSize =
+ PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits();
+
+ bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+ if (Saturating) {
+ // Promoting the operand and result values changes the saturation width,
+ // which extends the range of values that we clamp to on saturation. This
+ // can be resolved by shifting one of the operands left by the same amount,
+ // which also shifts the result we compare against, and then shifting back.
+ EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+ Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+ DAG.getConstant(DiffSize, dl, ShiftTy));
+ SDValue Result = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+ Op2Promoted, N->getOperand(2));
+ unsigned ShiftOp = Signed ? ISD::SRA : ISD::SRL;
+ return DAG.getNode(ShiftOp, dl, PromotedType, Result,
+ DAG.getConstant(DiffSize, dl, ShiftTy));
+ }
return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
Op2Promoted, N->getOperand(2));
}
@@ -1125,6 +1147,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break;
case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
@@ -1688,7 +1711,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
+
case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
case ISD::VECREDUCE_ADD:
@@ -2712,19 +2737,40 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo,
SplitInteger(Result, Lo, Hi);
}

+/// This performs an expansion of the integer result for a fixed point
+/// multiplication. The default expansion performs rounding down towards
+/// negative infinity, though targets that care about rounding should specify
+/// a target hook for rounding and provide their own expansion or lowering of
+/// fixed point multiplication to be consistent with rounding.
void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
SDValue &Hi) {
- assert(
- (N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::UMULFIX) &&
- "Expected operand to be signed or unsigned fixed point multiplication");
-
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ unsigned VTSize = VT.getScalarSizeInBits();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
uint64_t Scale = N->getConstantOperandVal(2);
+ bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+ EVT BoolVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
if (!Scale) {
- SDValue Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ SDValue Result;
+ if (!Saturating) {
+ Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ } else {
+ Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+ Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ }
SplitInteger(Result, Lo, Hi);
return;
}
@@ -2735,7 +2781,8 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
GetExpandedInteger(RHS, RL, RH);
SmallVector<SDValue, 4> Result;
- bool Signed = N->getOpcode() == ISD::SMULFIX;
+ bool Signed = (N->getOpcode() == ISD::SMULFIX ||
+ N->getOpcode() == ISD::SMULFIXSAT);
unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
@@ -2744,8 +2791,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
return;
}
- unsigned VTSize = VT.getScalarSizeInBits();
unsigned NVTSize = NVT.getScalarSizeInBits();
+ assert((VTSize == NVTSize * 2) && "Expected the new value type to be half "
+ "the size of the current value type");
EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
// Shift whole amount by scale.
SDValue ResultHL = Result[2];
SDValue ResultHH = Result[3];
+ SDValue SatMax, SatMin;
+ SDValue NVTZero = DAG.getConstant(0, dl, NVT);
+ SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
+ EVT BoolNVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), NVT);
+
// After getting the multiplication result in 4 parts, we need to perform a
// shift right by the amount of the scale to get the result in that scale.
// Let's say we multiply 2 64 bit numbers. The resulting value can be held in
@@ -2782,11 +2836,60 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
+
+ // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the
+ // highest bit of HH determines saturation direction in the event of
+ // saturation.
+ // The number of overflow bits we can check is VTSize - Scale + 1 (we
+ // include the sign bit). If these top bits are > 0, then we overflowed past
+ // the max value. If these top bits are < -1, then we overflowed past the
+ // min value. Otherwise, we did not overflow.
+ if (Saturating) {
+ unsigned OverflowBits = VTSize - Scale + 1;
+ assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
+ "Extent of overflow bits must start within HL");
+ SDValue HLHiMask = DAG.getConstant(
+ APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT);
+ SDValue HLLoMask = DAG.getConstant(
+ APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
+
+ // HH > 0 or HH == 0 && HL > HLLoMask
+ SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLPos =
+ DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLPos));
+
+ // HH < -1 or HH == -1 && HL < HLHiMask
+ SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLNeg =
+ DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg));
+ }
} else if (Scale == NVTSize) {
// If the scales are equal, Lo and Hi are ResultLH and ResultHL,
// respectively. Avoid shifting to prevent undefined behavior.
Lo = ResultLH;
Hi = ResultHL;
+
+ // We overflow max if HH > 0 or HH == 0 && HL sign is negative.
+ // We overflow min if HH < -1 or HH == -1 && HL sign is 0.
+ if (Saturating) {
+ SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg));
+
+ SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGT);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos));
+ }
} else if (Scale < VTSize) {
// If the scale is instead less than the old VT size, but greater than or
// equal to the expanded VT size, the first part of the result (ResultLL) is
@@ -2801,6 +2904,19 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
+
+ // This is similar to the case when we saturate if Scale < NVTSize, but we
+ // only need to check HH.
+ if (Saturating) { + unsigned OverflowBits = VTSize - Scale + 1; + SDValue HHHiMask = DAG.getConstant( + APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT); + SDValue HHLoMask = DAG.getConstant( + APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT); + + SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT); + SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT); + } } else if (Scale == VTSize) { assert( !Signed && @@ -2812,6 +2928,16 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, llvm_unreachable("Expected the scale to be less than or equal to the width " "of the operands"); } + + if (Saturating) { + APInt LHMax = APInt::getSignedMaxValue(NVTSize); + APInt LLMax = APInt::getAllOnesValue(NVTSize); + APInt LHMin = APInt::getSignedMinValue(NVTSize); + Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LHMax, dl, NVT), Hi); + Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi); + Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LLMax, dl, NVT), Lo); + Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo); + } } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ad2e398dff10b..f77ccd994da13 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -438,6 +438,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: { unsigned Scale = Node->getConstantOperandVal(2); Action = TLI.getFixedPointOperationAction(Node->getOpcode(), diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index add97ec105705..8570f57616e4f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -183,6 +183,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { R = ScalarizeVecRes_OverflowOp(N, ResNo); break; case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: R = ScalarizeVecRes_MULFIX(N); break; @@ -971,6 +972,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_OverflowOp(N, ResNo, Lo, Hi); break; case ISD::SMULFIX: + case ISD::SMULFIXSAT: case ISD::UMULFIX: SplitVecRes_MULFIX(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 31410f208ace6..5ac9d796f7823 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6298,6 +6298,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Op1.getValueType(), Op1, Op2, Op3)); return; } + case Intrinsic::smul_fix_sat: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + setValue(&I, DAG.getNode(ISD::SMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2, + Op3)); + return; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index cbef6cc24f7fd..2841633657830 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -301,7 +301,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UADDSAT: return "uaddsat"; case ISD::SSUBSAT: return "ssubsat"; case ISD::USUBSAT: return "usubsat"; + case ISD::SMULFIX: return "smulfix"; + case ISD::SMULFIXSAT: return "smulfixsat"; case ISD::UMULFIX: return "umulfix"; // Conversion operators. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f07180a2faaa6..ac45f4e08a75f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5695,25 +5695,42 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const { SDValue TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { assert((Node->getOpcode() == ISD::SMULFIX || - Node->getOpcode() == ISD::UMULFIX) && - "Expected opcode to be SMULFIX or UMULFIX."); + Node->getOpcode() == ISD::UMULFIX || + Node->getOpcode() == ISD::SMULFIXSAT) && + "Expected a fixed point multiplication opcode"); SDLoc dl(Node); SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); unsigned Scale = Node->getConstantOperandVal(2); + bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT; + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + unsigned VTSize = VT.getScalarSizeInBits(); - // [us]mul.fix(a, b, 0) -> mul(a, b) if (!Scale) { - if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT)) - return SDValue(); - return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + // [us]mul.fix(a, b, 0) -> mul(a, b) + if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) { + return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS); + } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) { + SDValue Result = + DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS); + SDValue Product = Result.getValue(0); + SDValue Overflow = Result.getValue(1); + SDValue Zero = DAG.getConstant(0, dl, VT); + + APInt MinVal = APInt::getSignedMinValue(VTSize); + APInt MaxVal = APInt::getSignedMaxValue(VTSize); + SDValue SatMin = DAG.getConstant(MinVal, dl, VT); + SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); + SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT); + Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin); + return DAG.getSelect(dl, VT, Overflow, Result, Product); + } } - unsigned VTSize = VT.getScalarSizeInBits(); - bool Signed = Node->getOpcode() == ISD::SMULFIX; - + bool Signed = + Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT; assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) && "Expected scale to be less than the number of bits if signed or at " "most the number of bits if unsigned."); @@ -5746,8 +5763,25 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const { // are scaled. The result is given to us in 2 halves, so we only want part of // both in the result. 
EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
- return DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
- DAG.getConstant(Scale, dl, ShiftTy));
+ SDValue Result = DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
+ DAG.getConstant(Scale, dl, ShiftTy));
+ if (!Saturating)
+ return Result;
+
+ unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
+ SDValue HiMask =
+ DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
+ SDValue LoMask = DAG.getConstant(
+ APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+
+ Result = DAG.getSelectCC(dl, Hi, LoMask,
+ DAG.getConstant(MaxVal, dl, VT), Result,
+ ISD::SETGT);
+ return DAG.getSelectCC(dl, Hi, HiMask,
+ DAG.getConstant(MinVal, dl, VT), Result,
+ ISD::SETLT);
}
void TargetLowering::expandUADDSUBO(
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index eeb8aa1374f53..32f97f7e2aacb 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -623,6 +623,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SSUBSAT, VT, Expand);
setOperationAction(ISD::USUBSAT, VT, Expand);
setOperationAction(ISD::SMULFIX, VT, Expand);
+ setOperationAction(ISD::SMULFIXSAT, VT, Expand);
setOperationAction(ISD::UMULFIX, VT, Expand);
// Overflow operations default to expand
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 67d43ce77402a..fc8d210e67ad9 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4595,27 +4595,28 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
case Intrinsic::smul_fix:
+ case Intrinsic::smul_fix_sat:
case Intrinsic::umul_fix: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Assert(Op1->getType()->isIntOrIntVectorTy(),
- "first operand of [us]mul_fix must be an int type or vector "
+ "first operand of [us]mul_fix[_sat] must be an int type or vector "
"of ints");
Assert(Op2->getType()->isIntOrIntVectorTy(),
- "second operand of [us]mul_fix must be an int type or vector "
+ "second operand of [us]mul_fix[_sat] must be an int type or vector "
"of ints");
auto *Op3 = cast<ConstantInt>(Call.getArgOperand(2));
Assert(Op3->getType()->getBitWidth() <= 32,
- "third argument of [us]mul_fix must fit within 32 bits");
+ "third argument of [us]mul_fix[_sat] must fit within 32 bits");
- if (ID == Intrinsic::smul_fix) {
+ if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat) {
Assert(
Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
- "the scale of smul_fix must be less than the width of the operands");
+ "the scale of smul_fix[_sat] must be less than the width of the operands");
} else {
Assert(Op3->getZExtValue() <= Op1->getType()->getScalarSizeInBits(),
- "the scale of umul_fix must be less than or equal to the width of "
+ "the scale of umul_fix[_sat] must be less than or equal to the width of "
"the operands");
}
break;
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
new file mode 100644
index 0000000000000..44603703c5168
--- /dev/null
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -0,0 +1,739 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.smul.fix.sat.i4 (i4, i4, i32)
+declare
i32 @llvm.smul.fix.sat.i32 (i32, i32, i32) +declare i64 @llvm.smul.fix.sat.i64 (i64, i64, i32) +declare <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32) + +define i32 @func(i32 %x, i32 %y) nounwind { +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movslq %esi, %rax +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shrdl $2, %eax, %ecx +; X64-NEXT: cmpl $1, %eax +; X64-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; X64-NEXT: cmovlel %ecx, %edx +; X64-NEXT: cmpl $-2, %eax +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: cmovgel %edx, %eax +; X64-NEXT: retq +; +; X86-LABEL: func: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: shrdl $2, %edx, %eax +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: cmpl $-2, %edx +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 2); + ret i32 %tmp; +} + +define i64 @func2(i64 %x, i64 %y) nounwind { +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $2, %rdx, %rax +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: cmpq $-2, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func2: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnsl %ecx, %edi +; X86-NEXT: cmovnsl %edx, %esi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sbbl $0, %ebp +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %edi, %ebp +; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: setg %bh +; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: seta %bl +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shldl $30, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $30, %esi, %eax +; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: orb %bh, %bl +; X86-NEXT: testb %bl, %bl +; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: 
movl $-1, %esi +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: cmpl $-1, %ebp +; X86-NEXT: setl %bl +; X86-NEXT: sete %bh +; X86-NEXT: cmpl $-2, %ecx +; X86-NEXT: setb %cl +; X86-NEXT: andb %bh, %cl +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: orb %bl, %cl +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 2); + ret i64 %tmp; +} + +define i4 @func3(i4 %x, i4 %y) nounwind { +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: shlb $4, %sil +; X64-NEXT: sarb $4, %sil +; X64-NEXT: shlb $4, %dil +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: shrb $2, %al +; X64-NEXT: shrl $8, %ecx +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: shlb $6, %dl +; X64-NEXT: orb %al, %dl +; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: cmpb $1, %cl +; X64-NEXT: movl $127, %edx +; X64-NEXT: cmovlel %eax, %edx +; X64-NEXT: cmpb $-2, %cl +; X64-NEXT: movl $128, %eax +; X64-NEXT: cmovgel %edx, %eax +; X64-NEXT: sarb $4, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func3: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: shlb $4, %al +; X86-NEXT: sarb $4, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: movb %ah, %cl +; X86-NEXT: shlb $6, %cl +; X86-NEXT: shrb $2, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: cmpb $1, %ah +; X86-NEXT: movl $127, %edx +; X86-NEXT: cmovlel %ecx, %edx +; X86-NEXT: cmpb $-2, %ah +; X86-NEXT: movl $128, %eax +; X86-NEXT: cmovgel %edx, %eax +; X86-NEXT: sarb $4, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 2); + ret i4 %tmp; +} + +define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec: +; X64: # %bb.0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: cltq +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: movslq %ecx, %rdx +; X64-NEXT: imulq %rax, %rdx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: shrdl $2, %ecx, %edx +; X64-NEXT: cmpl $1, %ecx +; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmovgl %eax, %edx +; X64-NEXT: cmpl $-2, %ecx +; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X64-NEXT: cmovll %ecx, %edx +; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X64-NEXT: movd %xmm3, %edx +; X64-NEXT: movslq %edx, %rdx +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-NEXT: movd %xmm3, %esi +; X64-NEXT: movslq %esi, %rsi +; X64-NEXT: imulq %rdx, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: shrdl $2, %edx, %esi +; X64-NEXT: cmpl $1, %edx +; X64-NEXT: cmovgl %eax, %esi +; X64-NEXT: cmpl $-2, %edx +; X64-NEXT: cmovll %ecx, %esi +; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movd %xmm1, %edx +; X64-NEXT: movslq %edx, %rdx +; X64-NEXT: movd %xmm0, %esi +; X64-NEXT: movslq %esi, %rsi +; X64-NEXT: imulq %rdx, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq $32, %rdx +; 
X64-NEXT: shrdl $2, %edx, %esi +; X64-NEXT: cmpl $1, %edx +; X64-NEXT: cmovgl %eax, %esi +; X64-NEXT: cmpl $-2, %edx +; X64-NEXT: cmovll %ecx, %esi +; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm1, %edx +; X64-NEXT: movslq %edx, %rdx +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm0, %esi +; X64-NEXT: movslq %esi, %rsi +; X64-NEXT: imulq %rdx, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: shrdl $2, %edx, %esi +; X64-NEXT: cmpl $1, %edx +; X64-NEXT: cmovgl %eax, %esi +; X64-NEXT: cmpl $-2, %edx +; X64-NEXT: cmovll %ecx, %esi +; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: vec: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrdl $2, %edx, %ecx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; X86-NEXT: cmovgl %ebp, %ecx +; X86-NEXT: cmpl $-2, %edx +; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shrdl $2, %edx, %edi +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovgl %ebp, %edi +; X86-NEXT: cmpl $-2, %edx +; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shrdl $2, %edx, %ebx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovgl %ebp, %ebx +; X86-NEXT: cmpl $-2, %edx +; X86-NEXT: cmovll %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: shrdl $2, %edx, %eax +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovgl %ebp, %eax +; X86-NEXT: cmpl $-2, %edx +; X86-NEXT: cmovll %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl %ebx, 8(%edx) +; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2); + ret <4 x i32> %tmp; +} + +; These result in regular integer multiplication +define i32 @func4(i32 %x, i32 %y) nounwind { +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: imull %esi, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: setns %al +; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: imull %esi, %edi +; X64-NEXT: cmovnol %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: func4: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull %edx, %esi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setns %cl +; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: imull %edx, %eax +; X86-NEXT: cmovol %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 0); + ret i32 %tmp; +} + +define i64 @func5(i64 %x, i64 %y) { +; 
X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testq %rax, %rax +; X64-NEXT: setns %cl +; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: imulq %rsi, %rdi +; X64-NEXT: cmovnoq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: func5: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $0, (%esp) +; X86-NEXT: movl %esp, %edi +; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %edx +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %ecx +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll __mulodi4 +; X86-NEXT: addl $20, %esp +; X86-NEXT: .cfi_adjust_cfa_offset -20 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setns %cl +; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: cmpl $0, (%esp) +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: addl $4, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 0); + ret i64 %tmp; +} + +define i4 @func6(i4 %x, i4 %y) nounwind { +; X64-LABEL: func6: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shlb $4, %sil +; X64-NEXT: sarb $4, %sil +; X64-NEXT: shlb $4, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: imulb %sil +; X64-NEXT: seto %cl +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: testb %al, %al +; X64-NEXT: setns %dl +; X64-NEXT: addl $127, %edx +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: sarb $4, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: func6: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: sarb $4, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: shlb $4, %al +; X86-NEXT: imulb %cl +; X86-NEXT: seto %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testb %al, %al +; X86-NEXT: setns %cl +; X86-NEXT: addl $127, %ecx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: testb %dl, %dl +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: sarb $4, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %tmp = call i4 @llvm.smul.fix.sat.i4(i4 %x, i4 %y, i32 0); + ret i4 %tmp; +} + +define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { +; X64-LABEL: vec2: +; X64: # %bb.0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; X64-NEXT: movd %xmm2, %r8d +; X64-NEXT: movl %r8d, %edx +; X64-NEXT: imull %ecx, %edx +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: testl %edx, %edx +; X64-NEXT: setns %sil +; X64-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; X64-NEXT: imull %ecx, %r8d +; X64-NEXT: 
cmovol %esi, %r8d +; X64-NEXT: movd %xmm1, %edx +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: movl %ecx, %esi +; X64-NEXT: imull %edx, %esi +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: testl %esi, %esi +; X64-NEXT: setns %dil +; X64-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; X64-NEXT: imull %edx, %ecx +; X64-NEXT: cmovol %edi, %ecx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movl %esi, %edi +; X64-NEXT: imull %edx, %edi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: setns %al +; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: imull %edx, %esi +; X64-NEXT: cmovol %eax, %esi +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; X64-NEXT: movd %xmm1, %r9d +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: movd %xmm0, %edx +; X64-NEXT: movl %edx, %edi +; X64-NEXT: imull %r9d, %edi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: setns %al +; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: imull %r9d, %edx +; X64-NEXT: cmovol %eax, %edx +; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movd %r8d, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq +; +; X86-LABEL: vec2: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: imull %edx, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setns %al +; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovol %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: imull %esi, %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testl %edi, %edi +; X86-NEXT: setns %al +; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: imull %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovol %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: setns %al +; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: imull %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovol %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: setns %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovol %ebx, %edi +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0); + ret <4 x i32> %tmp; +} + +define i64 @func7(i64 %x, i64 %y) nounwind { +; X64-LABEL: func7: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $32, 
%rdx, %rax +; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000 +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func7: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebp, %esi +; X86-NEXT: cmovnsl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %esi, %edi +; X86-NEXT: cmovnsl %ecx, %edx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setg %cl +; X86-NEXT: sets %ch +; X86-NEXT: testl %edi, %edi +; X86-NEXT: setg %bl +; X86-NEXT: sete %bh +; X86-NEXT: andb %ch, %bh +; X86-NEXT: orb %bl, %bh +; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: cmpl $-1, %edi +; X86-NEXT: setl %ch +; X86-NEXT: sete %bl +; X86-NEXT: andb %cl, %bl +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: orb %ch, %bl +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 32); + ret i64 %tmp; +} + +define i64 @func8(i64 %x, i64 %y) nounwind { +; X64-LABEL: func8: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %rsi +; X64-NEXT: shrdq $63, %rdx, %rax +; X64-NEXT: movabsq $4611686018427387903, %rcx # imm = 0x3FFFFFFFFFFFFFFF +; X64-NEXT: cmpq %rcx, %rdx +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movabsq $-4611686018427387904, %rcx # imm = 0xC000000000000000 +; X64-NEXT: cmpq %rcx, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq +; +; X86-LABEL: func8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; 
X86-NEXT: imull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnsl %ebx, %esi
+; X86-NEXT: cmovnsl %edx, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnsl %esi, %ebx
+; X86-NEXT: cmovnsl %ecx, %edi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: shrdl $31, %edi, %eax
+; X86-NEXT: cmpl $1073741823, %ebx # imm = 0x3FFFFFFF
+; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT: cmovgl %ecx, %edx
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovgl %ecx, %eax
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl $-1073741824, %ebx # imm = 0xC0000000
+; X86-NEXT: cmovll %ecx, %eax
+; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT: cmovll %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 63);
+ ret i64 %tmp;
+}
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
new file mode 100644
index 0000000000000..53c2074de1ffb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+
+; Verify expansion by using constant values. We just want to cover all the paths laid out by ExpandIntRes_MULFIX.
+ +declare i4 @llvm.smul.fix.sat.i4 (i4, i4, i32) +declare i32 @llvm.smul.fix.sat.i32 (i32, i32, i32) +declare i64 @llvm.smul.fix.sat.i64 (i64, i64, i32) +declare <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32) +declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) + +define i64 @func() nounwind { +; X64-LABEL: func: +; X64: # %bb.0: +; X64-NEXT: movl $2, %ecx +; X64-NEXT: movl $3, %eax +; X64-NEXT: imulq %rcx +; X64-NEXT: shrdq $2, %rdx, %rax +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: cmpq $-2, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 3, i64 2, i32 2); + ret i64 %tmp; +} + +define i64 @func2() nounwind { +; X64-LABEL: func2: +; X64: # %bb.0: +; X64-NEXT: movl $3, %eax +; X64-NEXT: imulq $2, %rax, %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: testq %rcx, %rcx +; X64-NEXT: setns %dl +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq $2, %rax, %rax +; X64-NEXT: cmovoq %rcx, %rax +; X64-NEXT: retq + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 3, i64 2, i32 0); + ret i64 %tmp; +} + +define i64 @func3() nounwind { +; X64-LABEL: func3: +; X64: # %bb.0: +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: movl $2, %edx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: imulq %rdx +; X64-NEXT: shrdq $2, %rdx, %rax +; X64-NEXT: cmpq $1, %rdx +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: cmpq $-2, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2); + ret i64 %tmp; +} + +define i64 @func4() nounwind { +; X64-LABEL: func4: +; X64: # %bb.0: +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: movl $2, %edx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: imulq %rdx +; X64-NEXT: shrdq $32, %rdx, %rax +; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000 +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32); + ret i64 %tmp; +} + +define i64 @func5() nounwind { +; X64-LABEL: func5: +; X64: # %bb.0: +; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: movl $2, %edx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: imulq %rdx +; X64-NEXT: shrdq $63, %rdx, %rax +; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF +; X64-NEXT: cmpq %rsi, %rdx +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movabsq $-4611686018427387904, %rcx # imm = 0xC000000000000000 +; X64-NEXT: cmpq %rcx, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: retq + %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63); + ret i64 %tmp; +}
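
Editorial note (not part of the patch): the saturating semantics documented in the LangRef.rst hunk above can be sanity-checked with a small host-side reference model. The sketch below is an illustration only; the function name SMulFixSat, the plain 64-bit arithmetic, and the main() driver are assumptions made for this note and do not correspond to any LLVM API. It hard-codes the round-towards-negative-infinity behaviour that the default expansion described in this patch uses.

// Hypothetical reference model of llvm.smul.fix.sat.iN for small N (N <= 32):
// multiply, arithmetic-shift right by the scale (rounding towards negative
// infinity, like the default expansion), then clamp to the signed N-bit range.
#include <cassert>
#include <cstdint>
#include <cstdio>

static int64_t SMulFixSat(int64_t A, int64_t B, unsigned Scale, unsigned Bits) {
  assert(Bits >= 2 && Bits <= 32 && Scale < Bits && "keep the product in 64 bits");
  int64_t Prod = (A * B) >> Scale;               // arithmetic shift rounds down
  int64_t Max = (int64_t(1) << (Bits - 1)) - 1;  // e.g.  7 for i4
  int64_t Min = -(int64_t(1) << (Bits - 1));     // e.g. -8 for i4
  return Prod > Max ? Max : (Prod < Min ? Min : Prod);
}

int main() {
  // Mirrors some of the i4 examples from the LangRef section of this patch.
  printf("%lld\n", (long long)SMulFixSat(3, 2, 0, 4));   // 6
  printf("%lld\n", (long long)SMulFixSat(3, 2, 1, 4));   // 3  (1.5 x 1 = 1.5)
  printf("%lld\n", (long long)SMulFixSat(7, 2, 0, 4));   // 7  (14 clamped to 7)
  printf("%lld\n", (long long)SMulFixSat(-8, -8, 2, 4)); // 7  (16 clamped to 7)
  return 0;
}

Under these assumptions the model reproduces the documented results, including the saturating cases, but it does not model target-specific rounding hooks or the vector forms of the intrinsic.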