fix: use roundTiesToEven mode for rounding (#956)

petamoriken · Jul 30, 2023 · 92e5f2c · 92e5f2c
1 parent 4fa5480
commit 92e5f2c
Show file tree

Hide file tree

Showing 6 changed files with 123 additions and 94 deletions.
diff --git a/README.md b/README.md
@@ -177,7 +177,7 @@ environments:
 ```js
 const array = new Float16Array([1.0, 1.1, 1.2, 1.3]);
 for (const value of array) {
-  // 1, 1.099609375, 1.19921875, 1.2998046875
+  // 1, 1.099609375, 1.2001953125, 1.2998046875
   console.log(value);
 }
 

diff --git a/src/_util/converter.mjs b/src/_util/converter.mjs
@@ -1,93 +1,114 @@
-// algorithm: http://fox-toolkit.org/ftp/fasthalffloatconversion.pdf
-
 import {
+  MathAbs,
+  MathFloor,
+  MathLog2,
+  MathPow,
+  MathSign,
+  MathTrunc,
   NativeArrayBuffer,
   NativeFloat32Array,
+  NativeUint16Array,
   NativeUint32Array,
+  NumberIsFinite,
+  NumberIsNaN,
+  ObjectIs,
 } from "./primordials.mjs";
 
-const buffer = new NativeArrayBuffer(4);
-const floatView = new NativeFloat32Array(buffer);
-const uint32View = new NativeUint32Array(buffer);
+// base algorithm: https://github.com/feross/ieee754
+// BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource>
 
-const baseTable = new NativeUint32Array(512);
-const shiftTable = new NativeUint32Array(512);
-
-for (let i = 0; i < 256; ++i) {
-  const e = i - 127;
-
-  // very small number (0, -0)
-  if (e < -27) {
-    baseTable[i]         = 0x0000;
-    baseTable[i | 0x100] = 0x8000;
-    shiftTable[i]         = 24;
-    shiftTable[i | 0x100] = 24;
-
-  // small number (denorm)
-  } else if (e < -14) {
-    baseTable[i]         =  0x0400 >> (-e - 14);
-    baseTable[i | 0x100] = (0x0400 >> (-e - 14)) | 0x8000;
-    shiftTable[i]         = -e - 1;
-    shiftTable[i | 0x100] = -e - 1;
-
-  // normal number
-  } else if (e <= 15) {
-    baseTable[i]         =  (e + 15) << 10;
-    baseTable[i | 0x100] = ((e + 15) << 10) | 0x8000;
-    shiftTable[i]         = 13;
-    shiftTable[i | 0x100] = 13;
-
-  // large number (Infinity, -Infinity)
-  } else if (e < 128) {
-    baseTable[i]         = 0x7c00;
-    baseTable[i | 0x100] = 0xfc00;
-    shiftTable[i]         = 24;
-    shiftTable[i | 0x100] = 24;
-
-  // stay (NaN, Infinity, -Infinity)
-  } else {
-    baseTable[i]         = 0x7c00;
-    baseTable[i | 0x100] = 0xfc00;
-    shiftTable[i]         = 13;
-    shiftTable[i | 0x100] = 13;
+/**
+ * round a number to the nearest integer; if the number falls midway between
+ * two integers, it is rounded to the even one (roundTiesToEven mode).
+ * @param {number} num - double float
+ * @returns {number} rounded integer
+ */
+function roundTiesToEven(num) {
+  const truncated = MathTrunc(num);
+  const isOdd = truncated % 2 !== 0;
+  const delta = MathAbs(num - truncated);
+  if (delta > 0.5 || delta === 0.5 && isOdd) {
+    return truncated + MathSign(num);
   }
+  return truncated;
 }
 
+const f16EMax = 31;
+const f16EBias = 15;
+const f16MLen = 10;
+const f16MMask = 0x3ff;
+
 /**
  * round a number to a half float number bits
  * @param {unknown} num - double float
  * @returns {number} half float number bits
  */
 export function roundToFloat16Bits(num) {
-  floatView[0] = /** @type {any} */ (num);
-  const f = uint32View[0];
-  const e = (f >> 23) & 0x1ff;
-  return baseTable[e] + ((f & 0x007fffff) >> shiftTable[e]);
+  const absNum = MathAbs(/** @type {number} */ (num));
+  const s = /** @type {number} */ (num) < 0 || ObjectIs(num, -0) ? 1 : 0;
+  let m, e;
+
+  // NaN, Infinity, -Infinity
+  if (!NumberIsFinite(absNum)) {
+    m = NumberIsNaN(absNum) ? 0x200 : 0;
+    e = f16EMax;
+
+  // finite
+  } else {
+    let rawE = MathFloor(MathLog2(absNum));
+    let c = MathPow(2, -rawE);
+    if (absNum * c < 1) {
+      --rawE;
+      c *= 2;
+    }
+    if (absNum * c >= 2) {
+      ++rawE;
+      c /= 2;
+    }
+    if (rawE + f16EBias >= f16EMax) {
+      m = 0;
+      e = f16EMax;
+    } else if (rawE + f16EBias >= 1) {
+      m = roundTiesToEven(((absNum * c) - 1) * 0x400);
+      e = rawE + f16EBias;
+    } else {
+      m = roundTiesToEven(absNum * 0x1000000);
+      e = 0;
+    }
+    // rounding can yield m === 0x400; carry it into the exponent, not drop it
+    e += m >> f16MLen;
+    m &= f16MMask;
+  }
+  return s << 15 | e << f16MLen | m;
 }
 
-const mantissaTable = new NativeUint32Array(2048);
-const exponentTable = new NativeUint32Array(64);
-const offsetTable = new NativeUint32Array(64);
+// base algorithm: http://fox-toolkit.org/ftp/fasthalffloatconversion.pdf
 
+const buffer = new NativeArrayBuffer(4);
+const floatView = new NativeFloat32Array(buffer);
+const uint32View = new NativeUint32Array(buffer);
+
+const mantissaTable = new NativeUint32Array(2048);
 for (let i = 1; i < 1024; ++i) {
-  let m = i << 13;    // zero pad mantissa bits
-  let e = 0;          // zero exponent
+  let m = i << 13; // zero pad mantissa bits
+  let e = 0; // zero exponent
 
   // normalized
-  while((m & 0x00800000) === 0) {
+  while ((m & 0x00800000) === 0) {
     m <<= 1;
-    e -= 0x00800000;  // decrement exponent
+    e -= 0x00800000; // decrement exponent
   }
 
-  m &= ~0x00800000;   // clear leading 1 bit
-  e += 0x38800000;    // adjust bias
+  m &= ~0x00800000; // clear leading 1 bit
+  e += 0x38800000; // adjust bias
 
   mantissaTable[i] = m | e;
 }
 for (let i = 1024; i < 2048; ++i) {
   mantissaTable[i] = 0x38000000 + ((i - 1024) << 13);
 }
 
+const exponentTable = new NativeUint32Array(64);
 for (let i = 1; i < 31; ++i) {
   exponentTable[i] = i << 23;
 }
@@ -98,6 +119,7 @@ for (let i = 33; i < 63; ++i) {
 }
 exponentTable[63] = 0xc7800000;
 
+const offsetTable = new NativeUint16Array(64);
 for (let i = 1; i < 64; ++i) {
   if (i !== 32) {
     offsetTable[i] = 1024;
@@ -110,7 +132,7 @@ for (let i = 1; i < 64; ++i) {
  * @returns {number} double float
  */
 export function convertToNumber(float16bits) {
-  const m = float16bits >> 10;
-  uint32View[0] = mantissaTable[offsetTable[m] + (float16bits & 0x3ff)] + exponentTable[m];
+  const i = float16bits >> 10;
+  uint32View[0] = mantissaTable[offsetTable[i] + (float16bits & 0x3ff)] + exponentTable[i];
   return floatView[0];
 }
diff --git a/src/_util/primordials.mjs b/src/_util/primordials.mjs
@@ -104,7 +104,14 @@ export const NativeArrayPrototypeSymbolIterator = ArrayPrototype[SymbolIterator]
 export const ArrayPrototypeSymbolIterator = uncurryThis(NativeArrayPrototypeSymbolIterator);
 
 // Math
-export const MathTrunc = Math.trunc;
+export const {
+  abs: MathAbs,
+  floor: MathFloor,
+  log2: MathLog2,
+  pow: MathPow,
+  sign: MathSign,
+  trunc: MathTrunc,
+} = Math;
 
 // ArrayBuffer
 export const NativeArrayBuffer = ArrayBuffer;

diff --git a/test/Float16Array.js b/test/Float16Array.js
@@ -195,7 +195,7 @@ describe("Float16Array", () => {
   });
 
   it("iterate", () => {
-    const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+    const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
     const float16 = new Float16Array([1, 1.1, 1.2, 1.3]);
     for (const val of float16) {
@@ -263,7 +263,7 @@ describe("Float16Array", () => {
 
     it("input Array or TypedArray", () => {
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16_1 = new Float16Array(array);
 
@@ -289,7 +289,7 @@ describe("Float16Array", () => {
     it("input custom Array", () => {
       class FooArray extends Array {}
       const array = FooArray.from([1, 1.1, 1.2, 1.3]);
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16_1 = new Float16Array(array);
 
@@ -348,7 +348,7 @@ describe("Float16Array", () => {
 
     it("input Iterable", () => {
       const iterable = [1, 1.1, 1.2, 1.3][Symbol.iterator]();
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = new Float16Array(iterable);
 
@@ -361,7 +361,7 @@ describe("Float16Array", () => {
 
     it("input ArrayLike", () => {
       const arrayLike = { "0": 1, "1": 1.1, "2": 1.2, "3": 1.3, length: 4 };
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = new Float16Array(arrayLike);
 
@@ -374,7 +374,7 @@ describe("Float16Array", () => {
 
     it("input Float16Array", () => {
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = new Float16Array(new Float16Array(array));
 
@@ -402,7 +402,7 @@ describe("Float16Array", () => {
       }
 
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = new Float16Array(new AnotherRealmFloat16Array(array));
 
@@ -414,7 +414,7 @@ describe("Float16Array", () => {
     });
 
     it("input ArrayBuffer", () => {
-      const buffer = new Uint16Array([15360, 15462, 15564, 15667]).buffer;
+      const buffer = new Uint16Array([15360, 15462, 15565, 15667]).buffer;
 
       const float16_1 = new Float16Array(buffer);
 
@@ -426,7 +426,7 @@ describe("Float16Array", () => {
       assert.equalFloat16ArrayValues(float16_1, [
         1,
         1.099609375,
-        1.19921875,
+        1.2001953125,
         1.2998046875,
       ]);
 
@@ -437,7 +437,7 @@ describe("Float16Array", () => {
       assert(float16_2.byteOffset === 2);
       assert(float16_2.byteLength === 4);
       assert(float16_2.length === 2);
-      assert.equalFloat16ArrayValues(float16_2, [1.099609375, 1.19921875]);
+      assert.equalFloat16ArrayValues(float16_2, [1.099609375, 1.2001953125]);
     });
 
     it("input detached ArrayBuffer", function () {
@@ -473,7 +473,7 @@ describe("Float16Array", () => {
 
     it("input Array or TypedArray", () => {
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16_1 = Float16Array.from(array);
 
@@ -488,7 +488,7 @@ describe("Float16Array", () => {
 
     it("input Iterable", () => {
       const iterable = [1, 1.1, 1.2, 1.3][Symbol.iterator]();
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = Float16Array.from(iterable);
 
@@ -498,7 +498,7 @@ describe("Float16Array", () => {
 
     it("input ArrayLike", () => {
       const arrayLike = { 0: 1, 1: 1.1, 2: 1.2, 3: 1.3, length: 4 };
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = Float16Array.from(arrayLike);
 
@@ -508,7 +508,7 @@ describe("Float16Array", () => {
 
     it("input Float16Array", () => {
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = Float16Array.from(new Float16Array(array));
 
@@ -522,7 +522,7 @@ describe("Float16Array", () => {
       }
 
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = Float16Array.from(new AnotherRealmFloat16Array(array));
 
@@ -533,7 +533,7 @@ describe("Float16Array", () => {
     it("call from subclass", () => {
       class Foo extends Float16Array {}
 
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const array = [1, 1.1, 1.2, 1.3];
       const foo1 = Foo.from(array);
@@ -603,7 +603,7 @@ describe("Float16Array", () => {
 
     it("input", () => {
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const float16 = Float16Array.of(...array);
 
@@ -615,7 +615,7 @@ describe("Float16Array", () => {
       class Foo extends Float16Array {}
 
       const array = [1, 1.1, 1.2, 1.3];
-      const checkArray = [1, 1.099609375, 1.19921875, 1.2998046875];
+      const checkArray = [1, 1.099609375, 1.2001953125, 1.2998046875];
 
       const foo = Foo.of(...array);
 

diff --git a/test/f16round.js b/test/f16round.js
@@ -53,8 +53,8 @@ describe("f16round()", () => {
   });
 
   it("return ±float16 min value when value is ±float16 min value / 2 ± a bit number", () => {
-    assert(f16round(minFloat16 / 2 + 2 ** -25) === minFloat16);
-    assert(f16round(-minFloat16 / 2 - 2 ** -25) === -minFloat16);
+    assert(f16round(2.980232238769531911744490042422139897126953655970282852649688720703125e-8) === minFloat16);
+    assert(f16round(-2.980232238769531911744490042422139897126953655970282852649688720703125e-8) === -minFloat16);
   });
 
   it("return 1.3369140625 when value is 1.337", () => {