diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 384442e6a9359..26c7be4789145 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4572,6 +4572,12 @@ def HLSLLerp : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLMad : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_mad"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + // Builtins for XRay. def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 29baf542f6cbf..25c4c58ad4ae4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14146,7 +14146,8 @@ class Sema final { bool SemaBuiltinVectorMath(CallExpr *TheCall, QualType &Res); bool SemaBuiltinVectorToScalarMath(CallExpr *TheCall); bool SemaBuiltinElementwiseMath(CallExpr *TheCall); - bool SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall); + bool SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall, + bool CheckForFloatArgs = true); bool PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall); bool PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9ee51ca7142c7..191f6d5a9fab5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18044,6 +18044,25 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, /*ReturnType*/ Op0->getType(), Intrinsic::dx_frac, ArrayRef{Op0}, nullptr, "dx.frac"); } + case Builtin::BI__builtin_hlsl_mad: { + Value *M = EmitScalarExpr(E->getArg(0)); + Value *A = EmitScalarExpr(E->getArg(1)); + Value *B = EmitScalarExpr(E->getArg(2)); + if (E->getArg(0)->getType()->hasFloatingRepresentation()) { + return Builder.CreateIntrinsic( + /*ReturnType*/ M->getType(), Intrinsic::fmuladd, + ArrayRef{M, A, B}, nullptr, "dx.fmad"); + } + if (E->getArg(0)->getType()->hasSignedIntegerRepresentation()) { + return Builder.CreateIntrinsic( + /*ReturnType*/ M->getType(), Intrinsic::dx_imad, + ArrayRef{M, A, B}, nullptr, "dx.imad"); + } + assert(E->getArg(0)->getType()->hasUnsignedIntegerRepresentation()); + return Builder.CreateIntrinsic( + /*ReturnType*/ M->getType(), Intrinsic::dx_umad, + ArrayRef{M, A, B}, nullptr, "dx.umad"); + } } return nullptr; } diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 5180530363889..b5bef78fae72f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -511,6 +511,111 @@ double3 log2(double3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_log2) double4 log2(double4); +//===----------------------------------------------------------------------===// +// mad builtins +//===----------------------------------------------------------------------===// + +/// \fn T mad(T M, T A, T B) +/// \brief The result of \a M * \a A + \a B. +/// \param M The multiplication value. +/// \param A The first addition value. +/// \param B The second addition value. 
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +half mad(half, half, half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +half2 mad(half2, half2, half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +half3 mad(half3, half3, half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +half4 mad(half4, half4, half4); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int16_t mad(int16_t, int16_t, int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int16_t2 mad(int16_t2, int16_t2, int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int16_t3 mad(int16_t3, int16_t3, int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int16_t4 mad(int16_t4, int16_t4, int16_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint16_t mad(uint16_t, uint16_t, uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint16_t2 mad(uint16_t2, uint16_t2, uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint16_t3 mad(uint16_t3, uint16_t3, uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint16_t4 mad(uint16_t4, uint16_t4, uint16_t4); +#endif + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int mad(int, int, int); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int2 mad(int2, int2, int2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int3 mad(int3, int3, int3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int4 mad(int4, int4, int4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint mad(uint, uint, uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint2 mad(uint2, uint2, uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint3 mad(uint3, uint3, uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint4 mad(uint4, uint4, uint4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int64_t mad(int64_t, int64_t, int64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int64_t2 mad(int64_t2, int64_t2, int64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int64_t3 mad(int64_t3, int64_t3, int64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +int64_t4 mad(int64_t4, int64_t4, int64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint64_t mad(uint64_t, uint64_t, uint64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint64_t2 mad(uint64_t2, uint64_t2, uint64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint64_t3 mad(uint64_t3, uint64_t3, uint64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +uint64_t4 mad(uint64_t4, uint64_t4, uint64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +float mad(float, float, float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +float2 mad(float2, float2, float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +float3 mad(float3, float3, float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +float4 mad(float4, float4, float4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +double mad(double, double, double); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +double2 mad(double2, double2, double2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +double3 mad(double3, double3, double3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_mad) +double4 mad(double4, double4, double4); + //===----------------------------------------------------------------------===// // max builtins 
//===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 2cda1d08784ec..85477061d1909 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5298,6 +5298,14 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_mad: { + if (checkArgCount(*this, TheCall, 3)) + return true; + if (CheckVectorElementCallArgs(this, TheCall)) + return true; + if (SemaBuiltinElementwiseTernaryMath(TheCall, /*CheckForFloatArgs*/ false)) + return true; + } } return false; } @@ -19798,7 +19806,8 @@ bool Sema::SemaBuiltinVectorMath(CallExpr *TheCall, QualType &Res) { return false; } -bool Sema::SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall) { +bool Sema::SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall, + bool CheckForFloatArgs) { if (checkArgCount(*this, TheCall, 3)) return true; @@ -19810,11 +19819,20 @@ bool Sema::SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall) { Args[I] = Converted.get(); } - int ArgOrdinal = 1; - for (Expr *Arg : Args) { - if (checkFPMathBuiltinElementType(*this, Arg->getBeginLoc(), Arg->getType(), + if (CheckForFloatArgs) { + int ArgOrdinal = 1; + for (Expr *Arg : Args) { + if (checkFPMathBuiltinElementType(*this, Arg->getBeginLoc(), + Arg->getType(), ArgOrdinal++)) + return true; + } + } else { + int ArgOrdinal = 1; + for (Expr *Arg : Args) { + if (checkMathBuiltinElementType(*this, Arg->getBeginLoc(), Arg->getType(), ArgOrdinal++)) - return true; + return true; + } } for (int I = 1; I < 3; ++I) { diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl new file mode 100644 index 0000000000000..749eac6d64736 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl @@ -0,0 +1,191 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +#ifdef __HLSL_ENABLE_16_BIT +// NATIVE_HALF: %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2) +// NATIVE_HALF: ret i16 %dx.umad +uint16_t test_mad_uint16_t(uint16_t p0, uint16_t p1, uint16_t p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.umad = call <2 x i16> @llvm.dx.umad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) +// NATIVE_HALF: ret <2 x i16> %dx.umad +uint16_t2 test_mad_uint16_t2(uint16_t2 p0, uint16_t2 p1, uint16_t2 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.umad = call <3 x i16> @llvm.dx.umad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) +// NATIVE_HALF: ret <3 x i16> %dx.umad +uint16_t3 test_mad_uint16_t3(uint16_t3 p0, uint16_t3 p1, uint16_t3 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.umad = call <4 x i16> @llvm.dx.umad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) +// NATIVE_HALF: ret <4 x i16> %dx.umad +uint16_t4 test_mad_uint16_t4(uint16_t4 p0, uint16_t4 p1, uint16_t4 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.imad = call i16 @llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2) +// NATIVE_HALF: ret i16 %dx.imad +int16_t test_mad_int16_t(int16_t p0, int16_t p1, int16_t p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.imad = call <2 x i16> 
@llvm.dx.imad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) +// NATIVE_HALF: ret <2 x i16> %dx.imad +int16_t2 test_mad_int16_t2(int16_t2 p0, int16_t2 p1, int16_t2 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.imad = call <3 x i16> @llvm.dx.imad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) +// NATIVE_HALF: ret <3 x i16> %dx.imad +int16_t3 test_mad_int16_t3(int16_t3 p0, int16_t3 p1, int16_t3 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.imad = call <4 x i16> @llvm.dx.imad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) +// NATIVE_HALF: ret <4 x i16> %dx.imad +int16_t4 test_mad_int16_t4(int16_t4 p0, int16_t4 p1, int16_t4 p2) { return mad(p0, p1, p2); } +#endif // __HLSL_ENABLE_16_BIT + +// NATIVE_HALF: %dx.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2) +// NATIVE_HALF: ret half %dx.fmad +// NO_HALF: %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) +// NO_HALF: ret float %dx.fmad +half test_mad_half(half p0, half p1, half p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2) +// NATIVE_HALF: ret <2 x half> %dx.fmad +// NO_HALF: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) +// NO_HALF: ret <2 x float> %dx.fmad +half2 test_mad_half2(half2 p0, half2 p1, half2 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.fmad = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2) +// NATIVE_HALF: ret <3 x half> %dx.fmad +// NO_HALF: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) +// NO_HALF: ret <3 x float> %dx.fmad +half3 test_mad_half3(half3 p0, half3 p1, half3 p2) { return mad(p0, p1, p2); } + +// NATIVE_HALF: %dx.fmad = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2) +// NATIVE_HALF: ret <4 x half> %dx.fmad +// NO_HALF: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) +// NO_HALF: ret <4 x float> %dx.fmad +half4 test_mad_half4(half4 p0, half4 p1, half4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) +// CHECK: ret float %dx.fmad +float test_mad_float(float p0, float p1, float p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) +// CHECK: ret <2 x float> %dx.fmad +float2 test_mad_float2(float2 p0, float2 p1, float2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) +// CHECK: ret <3 x float> %dx.fmad +float3 test_mad_float3(float3 p0, float3 p1, float3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) +// CHECK: ret <4 x float> %dx.fmad +float4 test_mad_float4(float4 p0, float4 p1, float4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2) +// CHECK: ret double %dx.fmad +double test_mad_double(double p0, double p1, double p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2) +// CHECK: ret <2 x double> %dx.fmad +double2 test_mad_double2(double2 p0, double2 p1, double2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <3 x double> @llvm.fmuladd.v3f64(<3 x 
double> %0, <3 x double> %1, <3 x double> %2) +// CHECK: ret <3 x double> %dx.fmad +double3 test_mad_double3(double3 p0, double3 p1, double3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2) +// CHECK: ret <4 x double> %dx.fmad +double4 test_mad_double4(double4 p0, double4 p1, double4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2) +// CHECK: ret i32 %dx.imad +int test_mad_int(int p0, int p1, int p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <2 x i32> @llvm.dx.imad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) +// CHECK: ret <2 x i32> %dx.imad +int2 test_mad_int2(int2 p0, int2 p1, int2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <3 x i32> @llvm.dx.imad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) +// CHECK: ret <3 x i32> %dx.imad +int3 test_mad_int3(int3 p0, int3 p1, int3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <4 x i32> @llvm.dx.imad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) +// CHECK: ret <4 x i32> %dx.imad +int4 test_mad_int4(int4 p0, int4 p1, int4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2) +// CHECK: ret i64 %dx.imad +int64_t test_mad_int64_t(int64_t p0, int64_t p1, int64_t p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <2 x i64> @llvm.dx.imad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) +// CHECK: ret <2 x i64> %dx.imad +int64_t2 test_mad_int64_t2(int64_t2 p0, int64_t2 p1, int64_t2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <3 x i64> @llvm.dx.imad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) +// CHECK: ret <3 x i64> %dx.imad +int64_t3 test_mad_int64_t3(int64_t3 p0, int64_t3 p1, int64_t3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.imad = call <4 x i64> @llvm.dx.imad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) +// CHECK: ret <4 x i64> %dx.imad +int64_t4 test_mad_int64_t4(int64_t4 p0, int64_t4 p1, int64_t4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2) +// CHECK: ret i32 %dx.umad +uint test_mad_uint(uint p0, uint p1, uint p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <2 x i32> @llvm.dx.umad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) +// CHECK: ret <2 x i32> %dx.umad +uint2 test_mad_uint2(uint2 p0, uint2 p1, uint2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <3 x i32> @llvm.dx.umad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) +// CHECK: ret <3 x i32> %dx.umad +uint3 test_mad_uint3(uint3 p0, uint3 p1, uint3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <4 x i32> @llvm.dx.umad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) +// CHECK: ret <4 x i32> %dx.umad +uint4 test_mad_uint4(uint4 p0, uint4 p1, uint4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2) +// CHECK: ret i64 %dx.umad +uint64_t test_mad_uint64_t(uint64_t p0, uint64_t p1, uint64_t p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <2 x i64> @llvm.dx.umad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) +// CHECK: ret <2 x i64> %dx.umad +uint64_t2 test_mad_uint64_t2(uint64_t2 p0, uint64_t2 p1, uint64_t2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <3 x i64> @llvm.dx.umad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) +// CHECK: ret <3 x i64> %dx.umad +uint64_t3 test_mad_uint64_t3(uint64_t3 p0, uint64_t3 p1, 
uint64_t3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.umad = call <4 x i64> @llvm.dx.umad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) +// CHECK: ret <4 x i64> %dx.umad +uint64_t4 test_mad_uint64_t4(uint64_t4 p0, uint64_t4 p1, uint64_t4 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2) +// CHECK: ret <2 x float> %dx.fmad +float2 test_mad_float2_splat(float p0, float2 p1, float2 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2) +// CHECK: ret <3 x float> %dx.fmad +float3 test_mad_float3_splat(float p0, float3 p1, float3 p2) { return mad(p0, p1, p2); } + +// CHECK: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2) +// CHECK: ret <4 x float> %dx.fmad +float4 test_mad_float4_splat(float p0, float4 p1, float4 p2) { return mad(p0, p1, p2); } + +// CHECK: %conv = sitofp i32 %2 to float +// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0 +// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer +// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat) +// CHECK: ret <2 x float> %dx.fmad +float2 test_mad_float2_int_splat(float2 p0, float2 p1, int p2) { + return mad(p0, p1, p2); +} + +// CHECK: %conv = sitofp i32 %2 to float +// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0 +// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer +// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat) +// CHECK: ret <3 x float> %dx.fmad +float3 test_mad_float3_int_splat(float3 p0, float3 p1, int p2) { + return mad(p0, p1, p2); +} diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl new file mode 100644 index 0000000000000..0b6843591455b --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected + +float2 test_no_second_arg(float2 p0) { + return __builtin_hlsl_mad(p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 1}} +} + +float2 test_no_third_arg(float2 p0) { + return __builtin_hlsl_mad(p0, p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_mad(p0, p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} +} + +float2 test_mad_no_second_arg(float2 p0) { + return mad(p0); + // expected-error@-1 {{no matching function for call to 'mad'}} +} + +float2 test_mad_vector_size_mismatch(float3 p0, float2 p1) { + return mad(p0, p0, p1); + // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}} +} + +float2 test_mad_builtin_vector_size_mismatch(float3 p0, float2 p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must have the same type}} +} + +float test_mad_scalar_mismatch(float p0, 
half p1) { + return mad(p1, p0, p1); + // expected-error@-1 {{call to 'mad' is ambiguous}} +} + +float2 test_mad_element_type_mismatch(half2 p0, float2 p1) { + return mad(p1, p0, p1); + // expected-error@-1 {{call to 'mad' is ambiguous}} +} + +float2 test_builtin_mad_float2_splat(float p0, float2 p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float3 test_builtin_mad_float3_splat(float p0, float3 p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float4 test_builtin_mad_float4_splat(float p0, float4 p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float2 test_mad_float2_int_splat(float2 p0, int p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float3 test_mad_float3_int_splat(float3 p0, int p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float2 test_builtin_mad_int_vect_to_float_vec_promotion(int2 p0, float p1) { + return __builtin_hlsl_mad(p0, p1, p1); + // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}} +} + +float builtin_bool_to_float_type_promotion(float p0, bool p1) { + return __builtin_hlsl_mad(p0, p0, p1); + // expected-error@-1 {{3rd argument must be a vector, integer or floating point type (was 'bool')}} +} + +float builtin_bool_to_float_type_promotion2(bool p0, float p1) { + return __builtin_hlsl_mad(p1, p0, p1); + // expected-error@-1 {{2nd argument must be a vector, integer or floating point type (was 'bool')}} +} + +float builtin_mad_int_to_float_promotion(float p0, int p1) { + return __builtin_hlsl_mad(p0, p0, p1); + // expected-error@-1 {{arguments are of different types ('double' vs 'int')}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index b44d1c6d3d2f0..3096442335ce4 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -31,4 +31,7 @@ def int_dx_lerp : Intrinsic<[LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>,LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, IntrWillReturn] >; + +def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; +def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 33b08ed93e3d0..8f73c08a65809 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -226,6 +226,12 @@ def Round : DXILOpMapping<26, unary, int_round, "within a floating-point type.">; def UMax : DXILOpMapping<39, binary, int_umax, "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">; +def FMad : DXILOpMapping<46, tertiary, int_fmuladd, + "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m * a + b.">; +def IMad : DXILOpMapping<48, tertiary, int_dx_imad, + "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m * a + b.">; +def UMad : DXILOpMapping<49, tertiary, int_dx_umad, + "Unsigned integer arithmetic multiply/add operation. 
umad(m,a,b) = m * a + b.">; def ThreadId : DXILOpMapping<93, threadId, int_dx_thread_id, "Reads the thread ID">; def GroupId : DXILOpMapping<94, groupId, int_dx_group_id,
diff --git a/llvm/test/CodeGen/DirectX/fmad.ll b/llvm/test/CodeGen/DirectX/fmad.ll
new file mode 100644
index 0000000000000..693e237e70dc0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fmad.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fmad are generated for half, float, and double.
+; CHECK:call half @dx.op.tertiary.f16(i32 46, half %{{.*}}, half %{{.*}}, half %{{.*}})
+; CHECK:call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}})
+; CHECK:call double @dx.op.tertiary.f64(i32 46, double %{{.*}}, double %{{.*}}, double %{{.*}})
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define noundef half @fmad_half(half noundef %p0, half noundef %p1, half noundef %p2) #0 {
+entry:
+  %p2.addr = alloca half, align 2
+  %p1.addr = alloca half, align 2
+  %p0.addr = alloca half, align 2
+  store half %p2, ptr %p2.addr, align 2
+  store half %p1, ptr %p1.addr, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %1 = load half, ptr %p1.addr, align 2
+  %2 = load half, ptr %p2.addr, align 2
+  %dx.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2)
+  ret half %dx.fmad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare half @llvm.fmuladd.f16(half, half, half) #2
+
+; Function Attrs: noinline nounwind optnone
+define noundef float @fmad_float(float noundef %p0, float noundef %p1, float noundef %p2) #0 {
+entry:
+  %p2.addr = alloca float, align 4
+  %p1.addr = alloca float, align 4
+  %p0.addr = alloca float, align 4
+  store float %p2, ptr %p2.addr, align 4
+  store float %p1, ptr %p1.addr, align 4
+  store float %p0, ptr %p0.addr, align 4
+  %0 = load float, ptr %p0.addr, align 4
+  %1 = load float, ptr %p1.addr, align 4
+  %2 = load float, ptr %p2.addr, align 4
+  %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+  ret float %dx.fmad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.fmuladd.f32(float, float, float) #2
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @fmad_double(double noundef %p0, double noundef %p1, double noundef %p2) #0 {
+entry:
+  %p2.addr = alloca double, align 8
+  %p1.addr = alloca double, align 8
+  %p0.addr = alloca double, align 8
+  store double %p2, ptr %p2.addr, align 8
+  store double %p1, ptr %p1.addr, align 8
+  store double %p0, ptr %p0.addr, align 8
+  %0 = load double, ptr %p0.addr, align 8
+  %1 = load double, ptr %p1.addr, align 8
+  %2 = load double, ptr %p2.addr, align 8
+  %dx.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2)
+  ret double %dx.fmad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.fmuladd.f64(double, double, double) #2
diff --git a/llvm/test/CodeGen/DirectX/imad.ll b/llvm/test/CodeGen/DirectX/imad.ll
new file mode 100644
index 0000000000000..5b818f86bc7f2
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/imad.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for imad are generated for i16, i32, and i64.
+; CHECK:call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; CHECK:call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK:call i64 @dx.op.tertiary.i64(i32 48, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+; Function Attrs: noinline nounwind optnone
+define noundef i16 @imad_short(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i16, align 2
+  %p1.addr = alloca i16, align 2
+  %p0.addr = alloca i16, align 2
+  store i16 %p2, ptr %p2.addr, align 2
+  store i16 %p1, ptr %p1.addr, align 2
+  store i16 %p0, ptr %p0.addr, align 2
+  %0 = load i16, ptr %p0.addr, align 2
+  %1 = load i16, ptr %p1.addr, align 2
+  %2 = load i16, ptr %p2.addr, align 2
+  %dx.imad = call i16 @llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2)
+  ret i16 %dx.imad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i16 @llvm.dx.imad.i16(i16, i16, i16) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef i32 @imad_int(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i32, align 4
+  %p1.addr = alloca i32, align 4
+  %p0.addr = alloca i32, align 4
+  store i32 %p2, ptr %p2.addr, align 4
+  store i32 %p1, ptr %p1.addr, align 4
+  store i32 %p0, ptr %p0.addr, align 4
+  %0 = load i32, ptr %p0.addr, align 4
+  %1 = load i32, ptr %p1.addr, align 4
+  %2 = load i32, ptr %p2.addr, align 4
+  %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2)
+  ret i32 %dx.imad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i32 @llvm.dx.imad.i32(i32, i32, i32) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef i64 @imad_int64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i64, align 8
+  %p1.addr = alloca i64, align 8
+  %p0.addr = alloca i64, align 8
+  store i64 %p2, ptr %p2.addr, align 8
+  store i64 %p1, ptr %p1.addr, align 8
+  store i64 %p0, ptr %p0.addr, align 8
+  %0 = load i64, ptr %p0.addr, align 8
+  %1 = load i64, ptr %p1.addr, align 8
+  %2 = load i64, ptr %p2.addr, align 8
+  %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2)
+  ret i64 %dx.imad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i64 @llvm.dx.imad.i64(i64, i64, i64) #1
diff --git a/llvm/test/CodeGen/DirectX/umad.ll b/llvm/test/CodeGen/DirectX/umad.ll
new file mode 100644
index 0000000000000..583fdddfe03f3
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/umad.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for umad are generated for i16, i32, and i64.
+; CHECK:call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; CHECK:call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK:call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+; Function Attrs: noinline nounwind optnone
+define noundef i16 @umad_ushort(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i16, align 2
+  %p1.addr = alloca i16, align 2
+  %p0.addr = alloca i16, align 2
+  store i16 %p2, ptr %p2.addr, align 2
+  store i16 %p1, ptr %p1.addr, align 2
+  store i16 %p0, ptr %p0.addr, align 2
+  %0 = load i16, ptr %p0.addr, align 2
+  %1 = load i16, ptr %p1.addr, align 2
+  %2 = load i16, ptr %p2.addr, align 2
+  %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2)
+  ret i16 %dx.umad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i16 @llvm.dx.umad.i16(i16, i16, i16) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef i32 @umad_uint(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i32, align 4
+  %p1.addr = alloca i32, align 4
+  %p0.addr = alloca i32, align 4
+  store i32 %p2, ptr %p2.addr, align 4
+  store i32 %p1, ptr %p1.addr, align 4
+  store i32 %p0, ptr %p0.addr, align 4
+  %0 = load i32, ptr %p0.addr, align 4
+  %1 = load i32, ptr %p1.addr, align 4
+  %2 = load i32, ptr %p2.addr, align 4
+  %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2)
+  ret i32 %dx.umad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i32 @llvm.dx.umad.i32(i32, i32, i32) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef i64 @umad_uint64(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) #0 {
+entry:
+  %p2.addr = alloca i64, align 8
+  %p1.addr = alloca i64, align 8
+  %p0.addr = alloca i64, align 8
+  store i64 %p2, ptr %p2.addr, align 8
+  store i64 %p1, ptr %p1.addr, align 8
+  store i64 %p0, ptr %p0.addr, align 8
+  %0 = load i64, ptr %p0.addr, align 8
+  %1 = load i64, ptr %p1.addr, align 8
+  %2 = load i64, ptr %p2.addr, align 8
+  %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2)
+  ret i64 %dx.umad
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i64 @llvm.dx.umad.i64(i64, i64, i64) #1
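
For reference, a minimal HLSL usage sketch (not part of the patch) of how the new mad overloads are exercised and what they are expected to lower to, based on the CGBuiltin.cpp lowering and the CodeGen tests above; the function and parameter names are illustrative only.

// Illustrative HLSL only. mad(m, a, b) computes m * a + b elementwise.
// Per EmitHLSLBuiltinExpr above, float element types are expected to lower to
// @llvm.fmuladd.*, signed integers to @llvm.dx.imad.*, and unsigned integers
// to @llvm.dx.umad.*.
float4 fmad_example(float4 m, float4 a, float4 b) {
  return mad(m, a, b); // expected: call <4 x float> @llvm.fmuladd.v4f32
}

int3 imad_example(int3 m, int3 a, int3 b) {
  return mad(m, a, b); // expected: call <3 x i32> @llvm.dx.imad.v3i32
}

uint umad_example(uint m, uint a, uint b) {
  return mad(m, a, b); // expected: call i32 @llvm.dx.umad.i32
}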