diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index a2a30c2941d6b..3e3059570c184 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -5,6 +5,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   math/clc_exp2.cl
   math/clc_exp2_fast.cl
   math/clc_exp10.cl
+  math/clc_fmod.cl
   math/clc_frexp.cl
   math/clc_half_exp.cl
   math/clc_half_exp2.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_fmod.cl b/libclc/clc/lib/amdgpu/math/clc_fmod.cl
new file mode 100644
index 0000000000000..a05b75d968ce7
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_fmod.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_fmod.h"
+
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod
+#define __CLC_BODY "clc/shared/binary_def.inc"
+
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_fmod.cl b/libclc/clc/lib/generic/math/clc_fmod.cl
index 8add0cefd621f..7f60b403b53e6 100644
--- a/libclc/clc/lib/generic/math/clc_fmod.cl
+++ b/libclc/clc/lib/generic/math/clc_fmod.cl
@@ -6,10 +6,187 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clc/internal/clc.h"
+#include <clc/clc_convert.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_floor.h>
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_ldexp.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+#include <clc/shared/clc_max.h>
 
+_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) {
+  int ux = __clc_as_int(x);
+  int ax = ux & EXSIGNBIT_SP32;
+  float xa = __clc_as_float(ax);
+  int sx = ux ^ ax;
+  int ex = ax >> EXPSHIFTBITS_SP32;
+
+  int uy = __clc_as_int(y);
+  int ay = uy & EXSIGNBIT_SP32;
+  float ya = __clc_as_float(ay);
+  int ey = ay >> EXPSHIFTBITS_SP32;
+
+  float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
+  float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
+  int c;
+  int k = ex - ey;
+
+  while (k > 0) {
+    c = xr >= yr;
+    xr -= c ? yr : 0.0f;
+    xr += xr;
+    --k;
+  }
+
+  c = xr >= yr;
+  xr -= c ? yr : 0.0f;
+
+  int lt = ex < ey;
+
+  xr = lt ? xa : xr;
+  yr = lt ? ya : yr;
+
+  float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
+  xr *= lt ? 1.0f : s;
+
+  c = ax == ay;
+  xr = c ? 0.0f : xr;
+
+  xr = __clc_as_float(sx ^ __clc_as_int(xr));
+
+  c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
+      ay == 0;
+  xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
+
+  return xr;
+}
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {
+  ulong ux = __clc_as_ulong(x);
+  ulong ax = ux & ~SIGNBIT_DP64;
+  ulong xsgn = ux ^ ax;
+  double dx = __clc_as_double(ax);
+  int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
+  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
+  xexp1 = xexp < 1 ? xexp1 : xexp;
+
+  ulong uy = __clc_as_ulong(y);
+  ulong ay = uy & ~SIGNBIT_DP64;
+  double dy = __clc_as_double(ay);
+  int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
+  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
+  yexp1 = yexp < 1 ? yexp1 : yexp;
+
+  // First assume |x| > |y|
+
+  // Set ntimes to the number of times we need to do a
+  // partial remainder. If the exponent of x is an exact multiple
+  // of 53 larger than the exponent of y, and the mantissa of x is
+  // less than the mantissa of y, ntimes will be one too large
+  // but it doesn't matter - it just means that we'll go round
+  // the loop below one extra time.
+  int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
+  double w = __clc_ldexp(dy, ntimes * 53);
+  w = ntimes == 0 ? dy : w;
+  double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
+
+  // Each time round the loop we compute a partial remainder.
+  // This is done by subtracting a large multiple of w
+  // from x each time, where w is a scaled up version of y.
+  // The subtraction must be performed exactly in quad
+  // precision, though the result at each stage can
+  // fit exactly in a double precision number.
+  int i;
+  double t, v, p, pp;
+
+  for (i = 0; i < ntimes; i++) {
+    // Compute integral multiplier
+    t = __clc_trunc(dx / w);
+
+    // Compute w * t in quad precision
+    p = w * t;
+    pp = __clc_fma(w, t, -p);
+
+    // Subtract w * t from dx
+    v = dx - p;
+    dx = v + (((dx - v) - p) - pp);
+
+    // If t was one too large, dx will be negative. Add back one w.
+    dx += dx < 0.0 ? w : 0.0;
+
+    // Scale w down by 2^(-53) for the next iteration
+    w *= scale;
+  }
+
+  // One more time
+  t = __clc_floor(dx / w);
+
+  p = w * t;
+  pp = __clc_fma(w, t, -p);
+  v = dx - p;
+  dx = v + (((dx - v) - p) - pp);
+  i = dx < 0.0;
+  dx += i ? w : 0.0;
+
+  // At this point, dx lies in the range [0,dy)
+  double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
+  dx = __clc_as_double(ax);
+
+  // Now handle |x| == |y|
+  int c = dx == dy;
+  t = __clc_as_double(xsgn);
+  ret = c ? t : ret;
+
+  // Next, handle |x| < |y|
+  c = dx < dy;
+  ret = c ? x : ret;
+
+  // We don't need anything special for |x| == 0
+
+  // |y| is 0
+  c = dy == 0.0;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  // y is +-Inf, NaN
+  c = yexp > BIASEDEMAX_DP64;
+  t = y == y ? x : y;
+  ret = c ? t : ret;
+
+  // x is +=Inf, NaN
+  c = xexp > BIASEDEMAX_DP64;
+  ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
+
+  return ret;
+}
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_FUNCTION __clc_fmod
+#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// Forward the half version of this builtin onto the float one
+#define __CLC_HALF_ONLY
 #define __CLC_FUNCTION __clc_fmod
-#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_fmod
-#define __CLC_BODY "clc/shared/binary_def.inc"
+#define __CLC_BODY <clc/math/binary_def_via_fp32.inc>
+#include <clc/math/gentype.inc>
 
-#include "clc/math/gentype.inc"
+#endif