[libc][math] Implement double precision log function correctly rounde…

…d to all rounding modes. Implement double precision log function correctly rounded to all rounding modes. See https://reviews.llvm.org/D150014 for a more detail description of the algorithm. **Performance** - For `0.5 <= x <= 2`, the fast pass hitting rate is about 99.93%. - Reciprocal throughput from CORE-MATH's perf tool on Ryzen 5900X: ``` $ ./perf.sh log GNU libc version: 2.35 GNU libc release: stable -- CORE-MATH reciprocal throughput -- with FMA [####################] 100 % Ntrial = 20 ; Min = 17.465 + 0.596 clc/call; Median-Min = 0.602 clc/call; Max = 18.389 clc/call; -- CORE-MATH reciprocal throughput -- without FMA (-march=x86-64-v2) [####################] 100 % Ntrial = 20 ; Min = 54.961 + 2.606 clc/call; Median-Min = 2.180 clc/call; Max = 59.583 clc/call; -- System LIBC reciprocal throughput -- [####################] 100 % Ntrial = 20 ; Min = 12.608 + 0.276 clc/call; Median-Min = 0.359 clc/call; Max = 13.147 clc/call; -- LIBC reciprocal throughput -- with FMA [####################] 100 % Ntrial = 20 ; Min = 20.952 + 0.468 clc/call; Median-Min = 0.602 clc/call; Max = 21.881 clc/call; -- LIBC reciprocal throughput -- without FMA [####################] 100 % Ntrial = 20 ; Min = 18.569 + 0.552 clc/call; Median-Min = 0.601 clc/call; Max = 19.259 clc/call; ``` - Latency from CORE-MATH's perf tool on Ryzen 5900X: ``` $ ./perf.sh log --latency GNU libc version: 2.35 GNU libc release: stable -- CORE-MATH latency -- with FMA [####################] 100 % Ntrial = 20 ; Min = 48.431 + 0.699 clc/call; Median-Min = 0.073 clc/call; Max = 51.269 clc/call; -- CORE-MATH latency -- without FMA (-march=x86-64-v2) [####################] 100 % Ntrial = 20 ; Min = 64.865 + 3.235 clc/call; Median-Min = 3.475 clc/call; Max = 71.788 clc/call; -- System LIBC latency -- [####################] 100 % Ntrial = 20 ; Min = 42.151 + 2.090 clc/call; Median-Min = 2.270 clc/call; Max = 44.773 clc/call; -- LIBC latency -- with FMA [####################] 100 % Ntrial = 20 ; Min = 35.266 + 0.479 clc/call; Median-Min = 0.373 clc/call; Max = 36.798 clc/call; -- LIBC latency -- without FMA [####################] 100 % Ntrial = 20 ; Min = 48.518 + 0.484 clc/call; Median-Min = 0.500 clc/call; Max = 49.896 clc/call; ``` - Accurate pass latency: ``` $ ./perf.sh log --latency --simple_stat GNU libc version: 2.35 GNU libc release: stable -- CORE-MATH latency -- with FMA 598.306 -- CORE-MATH latency -- without FMA (-march=x86-64-v2) 632.925 -- LIBC latency -- with FMA 455.632 -- LIBC latency -- without FMA 488.564 ``` Reviewed By: zimmermann6 Differential Revision: https://reviews.llvm.org/D150131
llvm · May 23, 2023 · a68bbf4 · a68bbf4
1 parent cc6a6c4
commit a68bbf4
Show file tree

Hide file tree

Showing 15 changed files with 1,866 additions and 777 deletions.
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
@@ -178,6 +178,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.log10f
     libc.src.math.log1pf
     libc.src.math.log2f
+    libc.src.math.log
     libc.src.math.logf
     libc.src.math.logb
     libc.src.math.logbf

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
@@ -289,6 +289,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.log10f
     libc.src.math.log1pf
     libc.src.math.log2f
+    libc.src.math.log
     libc.src.math.logf
     libc.src.math.logb
     libc.src.math.logbf

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
@@ -294,6 +294,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.log10f
     libc.src.math.log1pf
     libc.src.math.log2f
+    libc.src.math.log
     libc.src.math.logf
     libc.src.math.logb
     libc.src.math.logbf

diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
@@ -171,6 +171,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.log10f
     libc.src.math.log1pf
     libc.src.math.log2f
+    libc.src.math.log
     libc.src.math.logf
     libc.src.math.logb
     libc.src.math.logbf

diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
@@ -412,6 +412,7 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"log2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
 
+          FunctionSpec<"log", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"logf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
 
           FunctionSpec<"logb", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,

diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
@@ -118,6 +118,7 @@ add_math_entrypoint_object(log1pf)
 
 add_math_entrypoint_object(log2f)
 
+add_math_entrypoint_object(log)
 add_math_entrypoint_object(logf)
 
 add_math_entrypoint_object(logb)

diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
@@ -768,6 +768,15 @@ add_object_library(
     libc.src.__support.number_pair
 )
 
+add_header_library(
+  log_range_reduction
+  HDRS
+    log_range_reduction.h
+  DEPENDS
+    .common_constants
+    libc.src.__support.FPUtil.dyadic_float
+)
+
 add_entrypoint_object(
   log10
   SRCS
@@ -776,6 +785,7 @@ add_entrypoint_object(
     ../log10.h
   DEPENDS
     .common_constants
+    .log_range_reduction
     libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.multiply_add
@@ -840,6 +850,26 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  log
+  SRCS
+    log.cpp
+  HDRS
+    ../log.h
+  DEPENDS
+    .common_constants
+    .log_range_reduction
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.double_double
+    libc.src.__support.FPUtil.dyadic_float
+    libc.src.__support.macros.optimization
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   logf
   SRCS

diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp
@@ -365,6 +365,78 @@ alignas(64) const NumberPair<double> LOG_R_DD[128] = {
     {0.0, 0.0},
 };
 
+// Logarithm range reduction - Step 2:
+//   r(k) = 2^-16 round(2^16 / (1 + k*2^-14)) for k = -2^6 .. 2^7.
+// Output range:
+//   [-0x1.3ffcp-15, 0x1.3e3dp-15]
+// We store S2[i] = 2^16 (r(i - 2^6) - 1).
+alignas(64) const int S2[193] = {
+    0x101,  0xfd,   0xf9,   0xf5,   0xf1,   0xed,   0xe9,   0xe5,   0xe1,
+    0xdd,   0xd9,   0xd5,   0xd1,   0xcd,   0xc9,   0xc5,   0xc1,   0xbd,
+    0xb9,   0xb4,   0xb0,   0xac,   0xa8,   0xa4,   0xa0,   0x9c,   0x98,
+    0x94,   0x90,   0x8c,   0x88,   0x84,   0x80,   0x7c,   0x78,   0x74,
+    0x70,   0x6c,   0x68,   0x64,   0x60,   0x5c,   0x58,   0x54,   0x50,
+    0x4c,   0x48,   0x44,   0x40,   0x3c,   0x38,   0x34,   0x30,   0x2c,
+    0x28,   0x24,   0x20,   0x1c,   0x18,   0x14,   0x10,   0xc,    0x8,
+    0x4,    0x0,    -0x4,   -0x8,   -0xc,   -0x10,  -0x14,  -0x18,  -0x1c,
+    -0x20,  -0x24,  -0x28,  -0x2c,  -0x30,  -0x34,  -0x38,  -0x3c,  -0x40,
+    -0x44,  -0x48,  -0x4c,  -0x50,  -0x54,  -0x58,  -0x5c,  -0x60,  -0x64,
+    -0x68,  -0x6c,  -0x70,  -0x74,  -0x78,  -0x7c,  -0x80,  -0x84,  -0x88,
+    -0x8c,  -0x90,  -0x94,  -0x98,  -0x9c,  -0xa0,  -0xa4,  -0xa8,  -0xac,
+    -0xb0,  -0xb4,  -0xb7,  -0xbb,  -0xbf,  -0xc3,  -0xc7,  -0xcb,  -0xcf,
+    -0xd3,  -0xd7,  -0xdb,  -0xdf,  -0xe3,  -0xe7,  -0xeb,  -0xef,  -0xf3,
+    -0xf7,  -0xfb,  -0xff,  -0x103, -0x107, -0x10b, -0x10f, -0x113, -0x117,
+    -0x11b, -0x11f, -0x123, -0x127, -0x12b, -0x12f, -0x133, -0x137, -0x13a,
+    -0x13e, -0x142, -0x146, -0x14a, -0x14e, -0x152, -0x156, -0x15a, -0x15e,
+    -0x162, -0x166, -0x16a, -0x16e, -0x172, -0x176, -0x17a, -0x17e, -0x182,
+    -0x186, -0x18a, -0x18e, -0x192, -0x195, -0x199, -0x19d, -0x1a1, -0x1a5,
+    -0x1a9, -0x1ad, -0x1b1, -0x1b5, -0x1b9, -0x1bd, -0x1c1, -0x1c5, -0x1c9,
+    -0x1cd, -0x1d1, -0x1d5, -0x1d9, -0x1dd, -0x1e0, -0x1e4, -0x1e8, -0x1ec,
+    -0x1f0, -0x1f4, -0x1f8, -0x1fc};
+
+// Logarithm range reduction - Step 3:
+//   r(k) = 2^-21 round(2^21 / (1 + k*2^-21)) for k = -80 .. 80.
+// Output range:
+//   [-0x1.01928p-22 , 0x1p-22]
+// We store S[i] = 2^21 (r(i - 80) - 1).
+alignas(64) const int S3[161] = {
+    0x50,  0x4f,  0x4e,  0x4d,  0x4c,  0x4b,  0x4a,  0x49,  0x48,  0x47,  0x46,
+    0x45,  0x44,  0x43,  0x42,  0x41,  0x40,  0x3f,  0x3e,  0x3d,  0x3c,  0x3b,
+    0x3a,  0x39,  0x38,  0x37,  0x36,  0x35,  0x34,  0x33,  0x32,  0x31,  0x30,
+    0x2f,  0x2e,  0x2d,  0x2c,  0x2b,  0x2a,  0x29,  0x28,  0x27,  0x26,  0x25,
+    0x24,  0x23,  0x22,  0x21,  0x20,  0x1f,  0x1e,  0x1d,  0x1c,  0x1b,  0x1a,
+    0x19,  0x18,  0x17,  0x16,  0x15,  0x14,  0x13,  0x12,  0x11,  0x10,  0xf,
+    0xe,   0xd,   0xc,   0xb,   0xa,   0x9,   0x8,   0x7,   0x6,   0x5,   0x4,
+    0x3,   0x2,   0x1,   0x0,   -0x1,  -0x2,  -0x3,  -0x4,  -0x5,  -0x6,  -0x7,
+    -0x8,  -0x9,  -0xa,  -0xb,  -0xc,  -0xd,  -0xe,  -0xf,  -0x10, -0x11, -0x12,
+    -0x13, -0x14, -0x15, -0x16, -0x17, -0x18, -0x19, -0x1a, -0x1b, -0x1c, -0x1d,
+    -0x1e, -0x1f, -0x20, -0x21, -0x22, -0x23, -0x24, -0x25, -0x26, -0x27, -0x28,
+    -0x29, -0x2a, -0x2b, -0x2c, -0x2d, -0x2e, -0x2f, -0x30, -0x31, -0x32, -0x33,
+    -0x34, -0x35, -0x36, -0x37, -0x38, -0x39, -0x3a, -0x3b, -0x3c, -0x3d, -0x3e,
+    -0x3f, -0x40, -0x41, -0x42, -0x43, -0x44, -0x45, -0x46, -0x47, -0x48, -0x49,
+    -0x4a, -0x4b, -0x4c, -0x4d, -0x4e, -0x4f, -0x50,
+};
+
+// Logarithm range reduction - Step 4
+//   r(k) = 2^-28 round(2^28 / (1 + k*2^-28)) for k = -65 .. 64.
+// Output range:
+//   [-0x1.0002143p-29 , 0x1p-29]
+// We store S[i] = 2^28 (r(i - 65) - 1).
+alignas(64) const int S4[130] = {
+    0x41,  0x40,  0x3f,  0x3e,  0x3d,  0x3c,  0x3b,  0x3a,  0x39,  0x38,  0x37,
+    0x36,  0x35,  0x34,  0x33,  0x32,  0x31,  0x30,  0x2f,  0x2e,  0x2d,  0x2c,
+    0x2b,  0x2a,  0x29,  0x28,  0x27,  0x26,  0x25,  0x24,  0x23,  0x22,  0x21,
+    0x20,  0x1f,  0x1e,  0x1d,  0x1c,  0x1b,  0x1a,  0x19,  0x18,  0x17,  0x16,
+    0x15,  0x14,  0x13,  0x12,  0x11,  0x10,  0xf,   0xe,   0xd,   0xc,   0xb,
+    0xa,   0x9,   0x8,   0x7,   0x6,   0x5,   0x4,   0x3,   0x2,   0x1,   0x0,
+    -0x1,  -0x2,  -0x3,  -0x4,  -0x5,  -0x6,  -0x7,  -0x8,  -0x9,  -0xa,  -0xb,
+    -0xc,  -0xd,  -0xe,  -0xf,  -0x10, -0x11, -0x12, -0x13, -0x14, -0x15, -0x16,
+    -0x17, -0x18, -0x19, -0x1a, -0x1b, -0x1c, -0x1d, -0x1e, -0x1f, -0x20, -0x21,
+    -0x22, -0x23, -0x24, -0x25, -0x26, -0x27, -0x28, -0x29, -0x2a, -0x2b, -0x2c,
+    -0x2d, -0x2e, -0x2f, -0x30, -0x31, -0x32, -0x33, -0x34, -0x35, -0x36, -0x37,
+    -0x38, -0x39, -0x3a, -0x3b, -0x3c, -0x3d, -0x3e, -0x3f, -0x40,
+};
+
 // Lookup table for exp(m) with m = -104, ..., 89.
 //   -104 = floor(log(single precision's min denormal))
 //     89 = ceil(log(single precision's max normal))

diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h
@@ -39,6 +39,11 @@ constexpr double LOG_COEFFS[6] = {-0x1.fffffffffffffp-2, 0x1.5555555554a9bp-2,
                                   -0x1.0000000094567p-2, 0x1.99999dcc9823cp-3,
                                   -0x1.55550ac2e537ap-3, 0x1.21a02c4e624d7p-3};
 
+// Logarithm Range Reduction - Step 2, 3, and 4.
+extern const int S2[193];
+extern const int S3[161];
+extern const int S4[130];
+
 // log(2) generated by Sollya with:
 // > a = 2^-43 * nearestint(2^43*log(2));
 // LSB = 2^-43 is chosen so that e_x * LOG_2_HI is exact for -1075 < e_x < 1024.