msgpack · gfx · May 29, 2019 · May 29, 2019 · May 29, 2019 · May 29, 2019
diff --git a/benchmark/decode-string.ts b/benchmark/decode-string.ts
@@ -1,50 +1,45 @@
 /* eslint-disable no-console */
-import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
+import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
 import { utf8DecodeWasm } from "../src/wasmFunctions";
 
 // @ts-ignore
 import Benchmark from "benchmark";
 
-const textDecoder = new TextDecoder();
-
-const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
-  return "a".repeat(n);
-});
-
-for (const str of dataSet) {
-  const byteLength = utf8Count(str);
-  const bytes = new Uint8Array(new ArrayBuffer(byteLength));
-  utf8Encode(str, bytes, 0);
-
-  console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);
-
-  const suite = new Benchmark.Suite();
-
-  const N = Math.round(100_0000 / str.length);
-
-  // use the result to avoid void-context optimizations
-  let count = 0;
-
-  suite.add("utf8Decode", () => {
-    if (utf8Decode(bytes, 0, byteLength) !== str) {
-      throw new Error("wrong result!");
-    }
-  });
-
-  suite.add("utf8DecodeWasm", () => {
-    if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
-      throw new Error("wrong result!");
-    }
-  });
-
-  suite.add("TextDecoder", () => {
-    if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
-      throw new Error("wrong result!");
-    }
-  });
-  suite.on("cycle", (event: any) => {
-    console.log(String(event.target));
+for (const baseStr of ["A", "あ", "🌏"]) {
+  const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
+    return baseStr.repeat(n);
   });
 
-  suite.run();
+  for (const str of dataSet) {
+    // Encode once up front so the timed callbacks below only decode and compare.
+    const byteLength = utf8Count(str);
+    const bytes = new Uint8Array(new ArrayBuffer(byteLength));
+    utf8Encode(str, bytes, 0);
+
+    // `str.length` counts UTF-16 code units (2 per "🌏"); divide to print the real repeat count.
+    console.log(`\n## string "${baseStr}" x ${str.length / baseStr.length} (byteLength=${byteLength})\n`);
+    const suite = new Benchmark.Suite();
+
+    // Each case throws if the decoded string differs from `str`.
+    suite.add("utf8DecodeJs", () => {
+      if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
+        throw new Error("wrong result!");
+      }
+    });
+    suite.add("utf8DecodeWasm", () => {
+      if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
+        throw new Error("wrong result!");
+      }
+    });
+    suite.add("TextDecoder", () => {
+      if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
+        throw new Error("wrong result!");
+      }
+    });
+    suite.on("cycle", (event: any) => {
+      console.log(String(event.target));
+    });
+
+    suite.run();
+  }
 }
diff --git a/package.json b/package.json
@@ -12,13 +12,15 @@
     "prepublishOnly": "run-p 'test:dist:*' && npm run test:browser",
     "clean": "rimraf build dist dist.*",
     "test": "mocha 'test/**/*.test.ts'",
-    "test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
-    "test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
+    "test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
+    "test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
+    "test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'",
     "test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot",
     "test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot",
-    "test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report",
+    "test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report",
     "test:cover:purejs": "npx nyc --no-clean npm run test:purejs",
     "test:cover:wasm": "npx nyc --no-clean npm run test:wasm",
+    "test:cover:td": "npx nyc --no-clean npm run test:td",
     "cover:clean": "rimraf .nyc_output coverage/",
     "cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html",
     "test:browser": "karma start --single-run",

diff --git a/src/Decoder.ts b/src/Decoder.ts
@@ -1,7 +1,7 @@
 import { prettyByte } from "./utils/prettyByte";
 import { ExtensionCodec } from "./ExtensionCodec";
 import { getInt64, getUint64 } from "./utils/int";
-import { utf8Decode } from "./utils/utf8";
+import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8";
 import { createDataView, ensureUint8Array } from "./utils/typedArrays";
 import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";
 
@@ -400,10 +400,14 @@ export class Decoder {
     }
 
     const offset = this.pos + headerOffset;
-    const object =
-      WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
-        ? utf8DecodeWasm(this.bytes, offset, byteLength)
-        : utf8Decode(this.bytes, offset, byteLength);
+    let object: string;
+    if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
+      object = utf8DecodeTD(this.bytes, offset, byteLength);
+    } else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
+      object = utf8DecodeWasm(this.bytes, offset, byteLength);
+    } else {
+      object = utf8DecodeJs(this.bytes, offset, byteLength);
+    }
     this.pos += headerOffset + byteLength;
     return object;
   }

diff --git a/src/utils/utf8.ts b/src/utils/utf8.ts
@@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number
 
 const CHUNK_SIZE = 0x10_000;
 
-export function safeStringFromCharCode(units: Array<number> | Uint16Array) {
-  if (units.length <= CHUNK_SIZE) {
-    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
-    // in case `units` is a typed array
-    return String.fromCharCode.apply(String, units as any);
-  }
-
-  let result = "";
-  for (let i = 0; i < units.length; i++) {
-    const chunk = units.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE);
-    result += String.fromCharCode.apply(String, chunk as any);
-  }
-  return result;
-}
-
-const MIN_TEXT_DECODER_STRING_LENGTH = 200;
-const defaultEncoding = "utf-8";
-const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;
-
-export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
+export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
   let offset = inputOffset;
   const end = offset + byteLength;
 
-  if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
-    const stringBytes = bytes.subarray(offset, end);
-    return sharedTextDecoder.decode(stringBytes);
-  }
-
-  const out: Array<number> = [];
+  const units: Array<number> = [];
+  let result = "";
   while (offset < end) {
     const byte1 = bytes[offset++];
     if ((byte1 & 0x80) === 0) {
       // 1 byte
-      out.push(byte1);
+      units.push(byte1);
     } else if ((byte1 & 0xe0) === 0xc0) {
       // 2 bytes
       const byte2 = bytes[offset++] & 0x3f;
-      out.push(((byte1 & 0x1f) << 6) | byte2);
+      units.push(((byte1 & 0x1f) << 6) | byte2);
     } else if ((byte1 & 0xf0) === 0xe0) {
       // 3 bytes
       const byte2 = bytes[offset++] & 0x3f;
       const byte3 = bytes[offset++] & 0x3f;
-      out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
+      units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
     } else if ((byte1 & 0xf8) === 0xf0) {
       // 4 bytes
       const byte2 = bytes[offset++] & 0x3f;
@@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
       let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
       if (unit > 0xffff) {
         unit -= 0x10000;
-        out.push(((unit >>> 10) & 0x3ff) | 0xd800);
+        units.push(((unit >>> 10) & 0x3ff) | 0xd800);
         unit = 0xdc00 | (unit & 0x3ff);
       }
-      out.push(unit);
+      units.push(unit);
     } else {
-      out.push(byte1);
+      units.push(byte1);
+    }
+
+    if (units.length - 4 >= CHUNK_SIZE) {
+      result += String.fromCharCode(...units);
+      units.length = 0;
     }
   }
 
-  return safeStringFromCharCode(out);
+  if (units.length > 0) {
+    result += String.fromCharCode(...units);
+  }
+
+  return result;
+}
+
+// TEXT_DECODER=never - disable TextDecoder; TEXT_DECODER=force - drop the length threshold to 0
+const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null;
+export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder;
+export const TEXT_DECODER_THRESHOLD = process.env.TEXT_DECODER !== "force" ? 200 : 0;
+export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
+  // Decodes a subarray view of the input; sharedTextDecoder is null where TextDecoder is missing.
+  // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
+  return sharedTextDecoder!.decode(bytes.subarray(inputOffset, inputOffset + byteLength));
 }
diff --git a/src/wasmFunctions.ts b/src/wasmFunctions.ts
@@ -1,5 +1,3 @@
-import { safeStringFromCharCode } from "./utils/utf8";
-
 // WASM=never - disable WASM functions
 // WASM=force - force to use WASM functions
 const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
@@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
   }
 }
 
+const CHUNK_SIZE = 0x10_000;
+
+/** Joins UTF-16 code `units` into a string, passing at most CHUNK_SIZE units per `apply()` call. */
+function safeStringFromCharCodeU16(units: Uint16Array) {
+  // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
+  // in case `units` is a typed array
+  if (units.length <= CHUNK_SIZE) {
+    return String.fromCharCode.apply(String, units as any);
+  }
+  let result = "";
+  // Step by chunk start so the loop ends with the data instead of running units.length times.
+  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
+    result += String.fromCharCode.apply(String, units.subarray(start, start + CHUNK_SIZE) as any);
+  }
+  return result;
+}
+
 // A wrapper function for utf8DecodeToUint16Array()
 export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
   const inputPtr: pointer = wm.malloc(byteLength);
@@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt
 
     const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
     const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
-    return safeStringFromCharCode(units);
+    return safeStringFromCharCodeU16(units);
   } finally {
     wm.free(inputPtr);
     wm.free(outputPtr);

diff --git a/test/msgpack-test-suite.test.ts b/test/msgpack-test-suite.test.ts
@@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => {
       FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY,
       FLOAT64_NAN: Number.NaN,
       STR16: "a".repeat(0x100),
+      STR16_MBS: "🌏".repeat(0x100),
       STR32: "b".repeat(0x10_000),
+      STR32_MBS: "🍣".repeat(0x10_000),
       STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implelementions
-      STR_INCLUDING_NUL: "foo\0bar",
+      STR_INCLUDING_NUL: "foo\0bar\0",
       STR_BROKEN_FF: "\xff",
-      STR_LONE_SURROGATE_1: "\ud800",
-      STR_LONE_SURROGATE_2: "\udbff",
       BIN16: new Uint8Array(0x100).fill(0xff),
       BIN32: new Uint8Array(0x10000).fill(0xff),
       ARRAY16: new Array<boolean>(0x100).fill(true),