diff --git a/clang/lib/Headers/openmp_wrappers/math.h b/clang/lib/Headers/openmp_wrappers/math.h
index 1ce22e065c27a8..e917a149b5c9a5 100644
--- a/clang/lib/Headers/openmp_wrappers/math.h
+++ b/clang/lib/Headers/openmp_wrappers/math.h
@@ -7,6 +7,19 @@
  *===-----------------------------------------------------------------------===
  */
 
+// If we are in C++ mode and include <math.h> (not <cmath>) first, we still need
+// to make sure <cmath> is read first. The problem otherwise is that we haven't
+// seen the declarations of the math.h functions when the system math.h includes
+// our cmath overlay. However, our cmath overlay, or better the underlying
+// overlay, e.g. CUDA, uses the math.h functions. Since we haven't declared them
+// yet we get errors. CUDA avoids this by eagerly declaring all math functions
+// (in the __device__ space) but we cannot do this. Instead we break the
+// dependence by forcing cmath to go first. While our cmath will in turn include
+// this file, the cmath guards will prevent recursion.
+#ifdef __cplusplus
+#include <cmath>
+#endif
+
 #ifndef __CLANG_OPENMP_MATH_H__
 #define __CLANG_OPENMP_MATH_H__
 
diff --git a/clang/test/Headers/Inputs/include/math.h b/clang/test/Headers/Inputs/include/math.h
index a60ad45b4d7138..b13b14f2b12443 100644
--- a/clang/test/Headers/Inputs/include/math.h
+++ b/clang/test/Headers/Inputs/include/math.h
@@ -197,3 +197,7 @@ float ynf(int __a, float __b);
  * math functions.
  */
 #define HUGE_VAL (__builtin_huge_val())
+
+#ifdef __cplusplus
+#include <cmath>
+#endif
diff --git a/clang/test/Headers/nvptx_device_math_sincos.cpp b/clang/test/Headers/nvptx_device_math_sincos.cpp
index 5419ee2c351307..cf9b67903bf65a 100644
--- a/clang/test/Headers/nvptx_device_math_sincos.cpp
+++ b/clang/test/Headers/nvptx_device_math_sincos.cpp
@@ -1,8 +1,13 @@
 // REQUIRES: nvptx-registered-target
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers         -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -DCMATH -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
 
+#ifdef CMATH
 #include <cmath>
+#else
+#include <math.h>
+#endif
 
 // 4 calls to sincos(f), all translated to __nv_sincos calls: