Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Temporary #
#############
.cache
.direnv

# Compiled source #
###################
*.a
*.dll
*.exe
*.o
*.o.d
*.py[ocd]
*.so
*.mod

# Logs #
########
*.log

# Patches #
###########
*.patch
*.diff
16 changes: 16 additions & 0 deletions .spin/cmds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pathlib
import sys
import click

curdir = pathlib.Path(__file__).parent
rootdir = curdir.parent
toolsdir = rootdir / "tools"
sys.path.insert(0, str(toolsdir))


@click.command(help="Generate sollya c++/python based files")
@click.option("-f", "--force", is_flag=True, help="Force regenerate all files")
def sollya(*, force):
import sollya # type: ignore[import]

sollya.main(force=force)
101 changes: 101 additions & 0 deletions npsr/trig/data/approx.h.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Suppress rounding mode information messages and NaN warnings for boundary angles
suppressmessage(184, 185, 186); // suppress info no rounding, round-up, round-down
suppressmessage(419); // expected nan when angle is at specific multiples causing derivative singularities

// Generates a 4-element lookup table for fast trigonometric function approximation.
//
// This procedure creates optimized lookup tables that store:
// - Function values split into high/low precision components
// - Derivative information with power-of-2 scaling for efficient interpolation
//
// The table format per angle is: [deriv, sigma, high, low]
// where:
// deriv = actual_derivative - sigma (reduces storage requirements)
// sigma = 2^k, where k = ceil(log2(|actual_derivative|))
// high = main part of function value
// low = residual for extended precision
//
// Parameters:
// pT - Type descriptor with .kSize, .kDigits, .kRound
// pFunc - Function to approximate (sin or cos)
// pFuncDriv - Derivative of pFunc (cos or -sin)
procedure ApproxLut4_(pT, pFunc, pFuncDriv) {
var r, i, $;
// Table size: 512 entries for 64-bit, 256 for 32-bit
// More entries for double precision to maintain accuracy
// These sizes balance table memory usage with interpolation accuracy:
// - 256 entries = 1.4° spacing for float (sufficient for 24-bit mantissa)
// - 512 entries = 0.7° spacing for double (needed for 53-bit mantissa)
$.num_lut = match pT.kSize
with 64: (2^9)
default: (2^8);

// Low part rounding configuration:
// - 64-bit: Round to 24 bits with round-to-zero (faster, sufficient for residual)
// - 32-bit: Use full precision with round-to-nearest
$.low_round = match pT.kSize
with 64: ([|24, RZ|])
default: ([|pT.kDigits, RN|]);

// Scale factor to convert table index to angle in radians
$.scale = 2.0 * pi / $.num_lut;

r = [||];
for i from 0 to $.num_lut - 1 do {
// Sample angle uniformly distributed from 0 to 2π
$.angle = i * $.scale;

// Compute exact function value
$.exact = pFunc($.angle);

// Split into high and low parts for extended precision
// High part gets the main value rounded to type precision
$.high = pT.kRound($.exact);

// Low part stores the residual, rounded to reduced precision
// This allows accurate reconstruction: value ≈ high + low
$.low = pT.kRound(round($.exact - $.high, $.low_round[0], $.low_round[1]));

// Compute derivative for interpolation
$.deriv_exact = pFuncDriv($.angle);

// Find power-of-2 scale factor closest to derivative magnitude
// This allows efficient storage and reconstruction
$.k = ceil(log2(abs($.deriv_exact)));
if ($.deriv_exact < 0) then $.k = -$.k;

// Sigma is the power-of-2 scale factor
$.sigma = 2.0^$.k;

// Store derivative minus sigma (typically a small value)
// Actual derivative = sigma + stored_deriv
$.deriv = pT.kRound($.deriv_exact - $.sigma);

r = r @ [|$.deriv, $.sigma, $.high, $.low|];
};

// Format as C array with 4 elements per table entry
return CArrayT(pT, r, 4) @ ";";
};

// Generate C++ header content with specialized lookup tables
// for both float and double precision sine and cosine
// Template declarations (empty for unsupported types)
Append(
"template <typename T> inline constexpr char kSinApproxTable[] = {};",
"template <> inline constexpr float kSinApproxTable<float>[] = ",
ApproxLut4_(Float32, sin(x), cos(x)), // sin table with cos derivative
"",
"template <> inline constexpr double kSinApproxTable<double>[] = ",
ApproxLut4_(Float64, sin(x), cos(x)),
"",
"template <typename T> inline constexpr char kCosApproxTable[] = {};",
"template <> inline constexpr float kCosApproxTable<float>[] = ",
ApproxLut4_(Float32, cos(x), -sin(x)), // cos table with -sin derivative
"",
"template <> inline constexpr double kCosApproxTable<double>[] = ",
ApproxLut4_(Float64, cos(x), -sin(x)),
""
);

WriteCPPHeader("npsr::trig::data");
97 changes: 97 additions & 0 deletions npsr/trig/data/constants.h.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Suppress rounding mode information messages that are expected during constant generation
suppressmessage(185, 186); // suppress expected info round-up, round-down

// Helper procedure to format constants array for C++ output
// Takes a type descriptor followed by constant values and formats them
// as a C array with 4 elements per line
procedure KArray_(pArgs = ...) {
var pT;
pT = head(pArgs);
return CArrayT(pT, Constants @ tail(pArgs), 4) @ ";";
};

// Generate C++ header with various π-related constants for Cody-Waite reduction
// These constants enable accurate computation of r = x - n*π with extended precision
//
// The Cody-Waite method splits π into multiple parts:
// r = x - n*π₁ - n*π₂ - n*π₃ ...
// where each πᵢ has limited precision to ensure exact multiplication
//
// Different versions are provided for:
// - FMA (Fused Multiply-Add) vs non-FMA architectures
// - float vs double precision
// - Various precision requirements
Append(
// Generic template declaration
"template <typename T, bool FMA> inline constexpr char kPi[] = {};",

// Float π constants for Low precision implementation
"template <> inline constexpr float kPi<float, true>[] = " @
KArray_(Float32, pi, [|RN, 24, 24, 24|]), // FMA: 3x24-bit pieces (full precision each)

"template <> inline constexpr float kPi<float, false>[] = " @
KArray_(Float32, pi, [|RD, 11, 11, 11|], [|RN, 24|]), // no FMA: 3x11-bit + 1x24-bit
// The 11-bit pieces ensure n*πᵢ is exact (no rounding) for |n| < 2^13

// Double π constants for Low precision implementation
"template <> inline constexpr double kPi<double, true>[] = " @
KArray_(Float64, pi, [|RN, 53|], [|RD, 53|], [|RU, 53|]), // FMA: Different roundings for error compensation

"template <> inline constexpr double kPi<double, false>[] = " @
KArray_(Float64, pi, [|RN, 24, 24, 24|], [|RN, 53|]), // no FMA: 3x24-bit + 1x53-bit
// The 24-bit pieces ensure n*πᵢ is exact for |n| < 2^29
"",

// Special 35-bit precision π for specific algorithms
"template <bool FMA> inline constexpr double kPiPrec35[] = " @
KArray_(Float64, pi, [|RN, 35|], [|RD, 53|]),

"template <> inline constexpr double kPiPrec35<false>[] = " @
KArray_(Float64, pi, [|RN, 24, 24, 24|]),
"",

// 2π constants for angle wrapping
"template <typename T> inline constexpr char kPiMul2[] = {};",
"template <> inline constexpr float kPiMul2<float>[] = " @
KArray_(Float32, pi*2, [|RN, 24, 24|]), // 2x24-bit pieces

"template <> inline constexpr double kPiMul2<double>[] = " @
KArray_(Float64, pi*2, [|RN, 53, 53|]), // 2x53-bit pieces
""
);

// Non-FMA version of π/16 for High precision implementation
// Special handling: components are reordered [0,2,3,1] for proper evaluation
// Without FMA, multiplication order matters to minimize rounding errors
vNFma = Constants(pi/16, [|RN, 27, 27|], [|RN, 29|], [|RN, 53|]);
Append(
"template <bool FMA> inline constexpr double kPiDiv16Prec29[] = " @
KArray_(Float64, pi/16, [|RN, 53|], [|RN, 29|], [|RN, 53|]),

// Non-FMA version reorders components: [0], [2], [3], [1]
// This ordering ensures proper evaluation without FMA:
// r = x - n*π₁/16 - n*π₃/16 - n*π₄/16 - n*π₂/16
"template <> inline constexpr double kPiDiv16Prec29<false>[] = " @
CArray([|vNFma[0], vNFma[2], vNFma[3], vNFma[1]|], 4) @ ";",
"",

// Simple scalar constants
"template <typename T> inline constexpr char kInvPi = '_';",
"template <> inline constexpr float kInvPi<float> = " @ single(1/pi) @ "f;",
"template <> inline constexpr double kInvPi<double> = " @ double(1/pi) @ ";",
"",

"template <typename T> inline constexpr char kHalfPi = '_';",
"template <> inline constexpr float kHalfPi<float> = " @ single(pi/2) @ "f;",
"template <> inline constexpr double kHalfPi<double> = " @ double(pi/2) @ ";",
"",

"template <typename T> inline constexpr char k16DivPi = '_';",
"template <> inline constexpr float k16DivPi<float> = " @ single(16/pi) @ "f;",
"template <> inline constexpr double k16DivPi<double> = " @ double(16/pi) @ ";",
""
);
// Dump();

WriteCPPHeader("npsr::trig::data");

13 changes: 13 additions & 0 deletions npsr/trig/data/data.h.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
var header;
Append("#include \"npsr/lut-inl.h\"");
for header in [|"constants", "kpi16-inl", "approx", "reduction"|] do {
Append(
"#include \"npsr/trig/data/" @ header @ ".h\""
);
};
};

Write();


86 changes: 86 additions & 0 deletions npsr/trig/data/kpi16-inl.h.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Generates lookup table for sin(k·π/16) and cos(k·π/16) values
// Used in the high-precision trigonometric implementation for range reduction
//
// This table supports the algorithm where input x is reduced to:
// x = n*(π/16) + r, where |r| < π/16
// Then sin(x) and cos(x) are reconstructed using angle addition formulas
//
// Parameters:
// pT - Type descriptor (Float64 in this case)
// pFunc - Function to evaluate (sin or cos)
// pBy - Divisor for π (16 in this case, giving π/16 intervals)
procedure PiDivTable_(pT, pFunc, pBy) {
var r, i, pi_by;
pi_by = pi / pBy;
r = [||];

// Generate function values at k*π/16 for k = 0, 1, ..., 15
for i from 0 to pBy - 1 do {
r = r :. pT.kRound(pFunc(i * pi_by));
};

// Format as C array with 4 elements per line
return CArrayT(pT, r, 4);
};

// Generates packed low-precision parts of sin and cos values
// This packing scheme saves memory by storing two 32-bit values in one 64-bit word
//
// The packing works as follows for double (64-bit):
// - sin_low occupies bits [31:0] (lower 32 bits)
// - cos_low occupies bits [63:32] (upper 32 bits)
//
// This is why in the C++ code:
// - cos_lo can be used directly (it's already in the upper bits)
// - sin_lo needs to be extracted with a 32-bit left shift
procedure PiDivPackLowTable_(pT, pFunc0, pFunc1, pBy) {
var r, i, digits, $;
$.pi_by = pi / pBy;
r = [||];

// First, compute the low precision parts (residuals after high precision)
for i from 0 to pBy - 1 do {
$.hi0 = pT.kRound(pFunc0(i * $.pi_by)); // High precision sin
$.hi1 = pT.kRound(pFunc1(i * $.pi_by)); // High precision cos
// Low precision parts: exact value minus high precision part
$.hi0_low = pT.kRound(pFunc0(i * $.pi_by) - $.hi0); // sin_low
$.hi1_low = pT.kRound(pFunc1(i * $.pi_by) - $.hi1); // cos_low
r = r @ [|$.hi0_low, $.hi1_low|];
};

// Convert to binary representation for bit manipulation
digits = ToDigits(pT, r);
$.half_size = pT.kSize / 2; // 32 for double
$.lower_bits = 2^$.half_size; // Mask for lower 32 bits

r = [||];
// Pack pairs of values into single 64-bit words
for i from 0 to length(digits) - 1 by 2 do {
$.hi0 = digits[i]; // sin_low bits
$.hi1 = digits[i + 1]; // cos_low bits
$.pack = mod(RightShift($.hi0, $.half_size), $.lower_bits);
$.pack = $.pack + $.hi1 - mod($.hi1, $.lower_bits);
r = r :. $.pack;
};

// Convert back from binary representation
r = FromDigits(pT, r);
return CArrayT(pT, r, 4);
};

Append(
"inline constexpr auto kKPi16Table = MakeLut<double>(",
"// High parts of sin(k·π/16) where k = 0, 1, ..., 15",
PiDivTable_(Float64, sin(x), 16) @ ",",
"// High parts of cos(k·π/16) where k = 0, 1, ..., 15",
PiDivTable_(Float64, cos(x), 16) @ ",",
"// Lower parts of sin(k·π/16) and cos(k·π/16) packed together",
"// Format: bits [63:32] = cos_low, bits [31:0] = sin_low",
"// This packing saves 16×8 = 128 bytes of memory",
PiDivPackLowTable_(Float64, sin(x), cos(x), 16),
"",
");"
);

WriteHighwayHeader("npsr::HWY_NAMESPACE::trig");

Loading