numpy · seiko2plus · Oct 7, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,24 @@
+# Temporary #
+#############
+.cache
+.direnv
+
+# Compiled source #
+###################
+*.a
+*.dll
+*.exe
+*.o
+*.o.d
+*.py[ocd]
+*.so
+*.mod
+
+# Logs #
+########
+*.log
+
+# Patches #
+###########
+*.patch
+*.diff
diff --git a/.spin/cmds.py b/.spin/cmds.py
@@ -0,0 +1,16 @@
+import pathlib
+import sys
+import click
+
+curdir = pathlib.Path(__file__).parent
+rootdir = curdir.parent
+toolsdir = rootdir / "tools"
+sys.path.insert(0, str(toolsdir))
+
+
+@click.command(help="Generate sollya c++/python based files")
+@click.option("-f", "--force", is_flag=True, help="Force regenerate all files")
+def sollya(*, force):
+    import sollya  # type: ignore[import]
+
+    sollya.main(force=force)
diff --git a/npsr/trig/data/approx.h.sol b/npsr/trig/data/approx.h.sol
@@ -0,0 +1,101 @@
+// Suppress rounding mode information messages and NaN warnings for boundary angles
+suppressmessage(184, 185, 186); // suppress info no rounding, round-up, round-down
+suppressmessage(419); // expected nan when angle is at specific multiples causing derivative singularities
+
+// Generates a 4-element lookup table for fast trigonometric function approximation.
+// 
+// This procedure creates optimized lookup tables that store:
+// - Function values split into high/low precision components
+// - Derivative information with power-of-2 scaling for efficient interpolation
+//
+// The table format per angle is: [deriv, sigma, high, low]
+// where:
+//   deriv = actual_derivative - sigma (reduces storage requirements)
+//   sigma = 2^k, where k = ceil(log2(|actual_derivative|))
+//   high  = main part of function value
+//   low   = residual for extended precision
+//
+// Parameters:
+//   pT        - Type descriptor with .kSize, .kDigits, .kRound
+//   pFunc     - Function to approximate (sin or cos)
+//   pFuncDriv - Derivative of pFunc (cos or -sin)
+procedure ApproxLut4_(pT, pFunc, pFuncDriv) {
+  var r, i, $;
+  // Table size: 512 entries for 64-bit, 256 for 32-bit
+  // More entries for double precision to maintain accuracy
+  // These sizes balance table memory usage with interpolation accuracy:
+  // - 256 entries = 1.4° spacing for float (sufficient for 24-bit mantissa)
+  // - 512 entries = 0.7° spacing for double (needed for 53-bit mantissa)
+  $.num_lut = match pT.kSize
+    with 64: (2^9)
+    default: (2^8);
+
+  // Low part rounding configuration:
+  // - 64-bit: Round to 24 bits with round-to-zero (faster, sufficient for residual)
+  // - 32-bit: Use full precision with round-to-nearest
+  $.low_round = match pT.kSize 
+    with 64: ([|24, RZ|])
+    default: ([|pT.kDigits, RN|]);
+
+  // Scale factor to convert table index to angle in radians
+  $.scale = 2.0 * pi / $.num_lut;
+
+  r = [||];
+  for i from 0 to $.num_lut - 1 do {
+    // Sample angle uniformly distributed from 0 to 2π
+    $.angle = i * $.scale;
+
+    // Compute exact function value
+    $.exact = pFunc($.angle);
+
+    // Split into high and low parts for extended precision
+    // High part gets the main value rounded to type precision
+    $.high = pT.kRound($.exact);
+
+    // Low part stores the residual, rounded to reduced precision
+    // This allows accurate reconstruction: value ≈ high + low
+    $.low = pT.kRound(round($.exact - $.high, $.low_round[0], $.low_round[1]));
+
+    // Compute derivative for interpolation
+    $.deriv_exact = pFuncDriv($.angle);
+
+    // Find power-of-2 scale factor closest to derivative magnitude
+    // This allows efficient storage and reconstruction
+    $.k = ceil(log2(abs($.deriv_exact)));
+    if ($.deriv_exact < 0) then $.k = -$.k;
+
+    // Sigma is the power-of-2 scale factor
+    $.sigma = 2.0^$.k;
+
+    // Store derivative minus sigma (typically a small value)
+    // Actual derivative = sigma + stored_deriv
+    $.deriv = pT.kRound($.deriv_exact - $.sigma);
+
+    r = r @ [|$.deriv, $.sigma, $.high, $.low|];
+  };
+
+  // Format as C array with 4 elements per table entry
+  return CArrayT(pT, r, 4) @ ";";
+};
+
+// Generate C++ header content with specialized lookup tables
+// for both float and double precision sine and cosine
+// Template declarations (empty for unsupported types)
+Append(
+  "template <typename T> inline constexpr char kSinApproxTable[] = {};",
+  "template <> inline constexpr float kSinApproxTable<float>[] = ",
+  ApproxLut4_(Float32, sin(x), cos(x)),  // sin table with cos derivative
+  "",
+  "template <> inline constexpr double kSinApproxTable<double>[] = ",
+  ApproxLut4_(Float64, sin(x), cos(x)),
+  "",
+  "template <typename T> inline constexpr char kCosApproxTable[] = {};",
+  "template <> inline constexpr float kCosApproxTable<float>[] = ",
+  ApproxLut4_(Float32, cos(x), -sin(x)), // cos table with -sin derivative
+  "",
+  "template <> inline constexpr double kCosApproxTable<double>[] = ",
+  ApproxLut4_(Float64, cos(x), -sin(x)),
+  ""
+);
+
+WriteCPPHeader("npsr::trig::data");
diff --git a/npsr/trig/data/constants.h.sol b/npsr/trig/data/constants.h.sol
@@ -0,0 +1,97 @@
+// Suppress rounding mode information messages that are expected during constant generation
+suppressmessage(185, 186); // suppress expected info round-up, round-down
+
+// Helper procedure to format constants array for C++ output
+// Takes a type descriptor followed by constant values and formats them
+// as a C array with 4 elements per line
+procedure KArray_(pArgs = ...) {
+  var pT;
+  pT = head(pArgs);
+  return CArrayT(pT, Constants @ tail(pArgs), 4) @ ";";
+};
+
+// Generate C++ header with various π-related constants for Cody-Waite reduction
+// These constants enable accurate computation of r = x - n*π with extended precision
+//
+// The Cody-Waite method splits π into multiple parts:
+//   r = x - n*π₁ - n*π₂ - n*π₃ ...
+// where each πᵢ has limited precision to ensure exact multiplication
+//
+// Different versions are provided for:
+// - FMA (Fused Multiply-Add) vs non-FMA architectures
+// - float vs double precision
+// - Various precision requirements
+Append(
+  // Generic template declaration
+  "template <typename T, bool FMA> inline constexpr char kPi[] = {};",
+
+  // Float π constants for Low precision implementation
+  "template <> inline constexpr float kPi<float, true>[] = " @
+  KArray_(Float32, pi, [|RN, 24, 24, 24|]),  // FMA: 3x24-bit pieces (full precision each)
+
+  "template <> inline constexpr float kPi<float, false>[] = " @
+  KArray_(Float32, pi, [|RD, 11, 11, 11|], [|RN, 24|]), // no FMA: 3x11-bit + 1x24-bit
+  // The 11-bit pieces ensure n*πᵢ is exact (no rounding) for |n| < 2^13
+
+  // Double π constants for Low precision implementation
+  "template <> inline constexpr double kPi<double, true>[] = " @
+  KArray_(Float64, pi, [|RN, 53|], [|RD, 53|], [|RU, 53|]), // FMA: Different roundings for error compensation
+
+  "template <> inline constexpr double kPi<double, false>[] = " @
+  KArray_(Float64, pi, [|RN, 24, 24, 24|], [|RN, 53|]), // no FMA: 3x24-bit + 1x53-bit
+  // The 24-bit pieces ensure n*πᵢ is exact for |n| < 2^29
+  "",
+
+  // Special 35-bit precision π for specific algorithms
+  "template <bool FMA> inline constexpr double kPiPrec35[] = " @
+  KArray_(Float64, pi, [|RN, 35|], [|RD, 53|]),
+
+  "template <> inline constexpr double kPiPrec35<false>[] = " @
+  KArray_(Float64, pi, [|RN, 24, 24, 24|]),
+  "",
+
+  // 2π constants for angle wrapping
+  "template <typename T> inline constexpr char kPiMul2[] = {};",
+  "template <> inline constexpr float kPiMul2<float>[] = " @
+  KArray_(Float32, pi*2, [|RN, 24, 24|]),  // 2x24-bit pieces
+
+  "template <> inline constexpr double kPiMul2<double>[] = " @
+  KArray_(Float64, pi*2, [|RN, 53, 53|]),  // 2x53-bit pieces
+  "" 
+);
+
+// Non-FMA version of π/16 for High precision implementation
+// Special handling: components are reordered [0,2,3,1] for proper evaluation
+// Without FMA, multiplication order matters to minimize rounding errors
+vNFma = Constants(pi/16, [|RN, 27, 27|], [|RN, 29|], [|RN, 53|]);
+Append(
+  "template <bool FMA> inline constexpr double kPiDiv16Prec29[] = " @
+  KArray_(Float64, pi/16, [|RN, 53|], [|RN, 29|], [|RN, 53|]),
+
+  // Non-FMA version reorders components: [0], [2], [3], [1]
+  // This ordering ensures proper evaluation without FMA:
+  // r = x - n*π₁/16 - n*π₃/16 - n*π₄/16 - n*π₂/16
+  "template <> inline constexpr double kPiDiv16Prec29<false>[] = " @
+  CArray([|vNFma[0], vNFma[2], vNFma[3], vNFma[1]|], 4) @ ";",
+  "",
+
+  // Simple scalar constants
+  "template <typename T> inline constexpr char kInvPi = '_';",
+  "template <> inline constexpr float kInvPi<float> = " @ single(1/pi) @ "f;",  
+  "template <> inline constexpr double kInvPi<double> = " @ double(1/pi) @ ";",
+  "",
+
+  "template <typename T> inline constexpr char kHalfPi = '_';",
+  "template <> inline constexpr float kHalfPi<float> = " @ single(pi/2) @ "f;",
+  "template <> inline constexpr double kHalfPi<double> = " @ double(pi/2) @ ";",
+  "",
+
+  "template <typename T> inline constexpr char k16DivPi = '_';",
+  "template <> inline constexpr float k16DivPi<float> = " @ single(16/pi) @ "f;",
+  "template <> inline constexpr double k16DivPi<double> = " @ double(16/pi) @ ";",
+  ""
+); 
+// Dump();
+
+WriteCPPHeader("npsr::trig::data");
+
diff --git a/npsr/trig/data/data.h.sol b/npsr/trig/data/data.h.sol
@@ -0,0 +1,13 @@
+{
+var header;
+Append("#include \"npsr/lut-inl.h\"");
+for header in [|"constants", "kpi16-inl", "approx", "reduction"|] do {
+  Append(
+    "#include \"npsr/trig/data/" @ header @ ".h\""
+  );
+};
+};
+
+Write();
+
+
diff --git a/npsr/trig/data/kpi16-inl.h.sol b/npsr/trig/data/kpi16-inl.h.sol
@@ -0,0 +1,86 @@
+// Generates lookup table for sin(k·π/16) and cos(k·π/16) values
+// Used in the high-precision trigonometric implementation for range reduction
+//
+// This table supports the algorithm where input x is reduced to:
+//   x = n*(π/16) + r, where |r| < π/16
+// Then sin(x) and cos(x) are reconstructed using angle addition formulas
+//
+// Parameters:
+//   pT    - Type descriptor (Float64 in this case)
+//   pFunc - Function to evaluate (sin or cos)
+//   pBy   - Divisor for π (16 in this case, giving π/16 intervals)
+procedure PiDivTable_(pT, pFunc, pBy) {
+  var r, i, pi_by;
+  pi_by = pi / pBy;
+  r = [||];
+
+  // Generate function values at k*π/16 for k = 0, 1, ..., 15
+  for i from 0 to pBy - 1 do {
+    r = r :. pT.kRound(pFunc(i * pi_by));
+  };
+
+  // Format as C array with 4 elements per line
+  return CArrayT(pT, r, 4);
+};
+
+// Generates packed low-precision parts of sin and cos values
+// This packing scheme saves memory by storing two 32-bit values in one 64-bit word
+//
+// The packing works as follows for double (64-bit):
+// - sin_low occupies bits [31:0] (lower 32 bits)
+// - cos_low occupies bits [63:32] (upper 32 bits)
+//
+// This is why in the C++ code:
+// - cos_lo can be used directly (it's already in the upper bits)
+// - sin_lo needs to be extracted with a 32-bit left shift
+procedure PiDivPackLowTable_(pT, pFunc0, pFunc1, pBy) {
+  var r, i, digits, $;
+  $.pi_by = pi / pBy;
+  r = [||];
+
+  // First, compute the low precision parts (residuals after high precision)
+  for i from 0 to pBy - 1 do {
+    $.hi0 = pT.kRound(pFunc0(i * $.pi_by));  // High precision sin
+    $.hi1 = pT.kRound(pFunc1(i * $.pi_by));  // High precision cos
+    // Low precision parts: exact value minus high precision part
+    $.hi0_low = pT.kRound(pFunc0(i * $.pi_by) - $.hi0);  // sin_low
+    $.hi1_low = pT.kRound(pFunc1(i * $.pi_by) - $.hi1);  // cos_low
+    r = r @ [|$.hi0_low, $.hi1_low|];
+  }; 
+
+  // Convert to binary representation for bit manipulation
+  digits = ToDigits(pT, r);
+  $.half_size = pT.kSize / 2;  // 32 for double
+  $.lower_bits = 2^$.half_size;  // Mask for lower 32 bits
+
+  r = [||];
+  // Pack pairs of values into single 64-bit words
+  for i from 0 to length(digits) - 1 by 2 do {
+    $.hi0 = digits[i];     // sin_low bits
+    $.hi1 = digits[i + 1]; // cos_low bits
+    $.pack = mod(RightShift($.hi0, $.half_size), $.lower_bits);
+    $.pack = $.pack + $.hi1 - mod($.hi1, $.lower_bits);
+    r = r :. $.pack; 
+  };
+
+  // Convert back from binary representation
+  r = FromDigits(pT, r);
+  return CArrayT(pT, r, 4);
+};
+
+Append(
+  "inline constexpr auto kKPi16Table = MakeLut<double>(",
+  "// High parts of sin(k·π/16) where k = 0, 1, ..., 15",
+  PiDivTable_(Float64, sin(x), 16) @ ",",
+  "// High parts of cos(k·π/16) where k = 0, 1, ..., 15",
+  PiDivTable_(Float64, cos(x), 16) @ ",",
+  "// Lower parts of sin(k·π/16) and cos(k·π/16) packed together",
+  "// Format: bits [63:32] = cos_low, bits [31:0] = sin_low",
+  "// This packing saves 16×8 = 128 bytes of memory",
+  PiDivPackLowTable_(Float64, sin(x), cos(x), 16),
+  "",
+  ");"
+);
+
+WriteHighwayHeader("npsr::HWY_NAMESPACE::trig");
+