Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 36 additions & 41 deletions benchmark/decode-string.ts
Original file line number Diff line number Diff line change
@@ -1,50 +1,45 @@
/* eslint-disable no-console */
import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To benchmark them separately, I have split the `utf8Decode` function into `utf8DecodeJs` and `utf8DecodeTD`.

import { utf8DecodeWasm } from "../src/wasmFunctions";

// @ts-ignore
import Benchmark from "benchmark";

const textDecoder = new TextDecoder();

const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
return "a".repeat(n);
});

for (const str of dataSet) {
const byteLength = utf8Count(str);
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
utf8Encode(str, bytes, 0);

console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);

const suite = new Benchmark.Suite();

const N = Math.round(100_0000 / str.length);

// use the result to avoid void-context optimizations
let count = 0;

suite.add("utf8Decode", () => {
if (utf8Decode(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("utf8DecodeWasm", () => {
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("TextDecoder", () => {
if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
throw new Error("wrong result!");
}
});
suite.on("cycle", (event: any) => {
console.log(String(event.target));
for (const baseStr of ["A", "あ", "🌏"]) {
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
return baseStr.repeat(n);
});

suite.run();
for (const str of dataSet) {
const byteLength = utf8Count(str);
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
utf8Encode(str, bytes, 0);

console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`);

const suite = new Benchmark.Suite();

suite.add("utf8DecodeJs", () => {
if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("utf8DecodeWasm", () => {
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("TextDecoder", () => {
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});
suite.on("cycle", (event: any) => {
console.log(String(event.target));
});

suite.run();
}
}
8 changes: 5 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
"prepublishOnly": "run-p 'test:dist:*' && npm run test:browser",
"clean": "rimraf build dist dist.*",
"test": "mocha 'test/**/*.test.ts'",
"test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
"test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
"test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
"test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
"test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'",
"test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot",
"test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot",
"test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report",
"test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report",
"test:cover:purejs": "npx nyc --no-clean npm run test:purejs",
"test:cover:wasm": "npx nyc --no-clean npm run test:wasm",
"test:cover:td": "npx nyc --no-clean npm run test:td",
"cover:clean": "rimraf .nyc_output coverage/",
"cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html",
"test:browser": "karma start --single-run",
Expand Down
14 changes: 9 additions & 5 deletions src/Decoder.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { prettyByte } from "./utils/prettyByte";
import { ExtensionCodec } from "./ExtensionCodec";
import { getInt64, getUint64 } from "./utils/int";
import { utf8Decode } from "./utils/utf8";
import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8";
import { createDataView, ensureUint8Array } from "./utils/typedArrays";
import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";

Expand Down Expand Up @@ -400,10 +400,14 @@ export class Decoder {
}

const offset = this.pos + headerOffset;
const object =
WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
? utf8DecodeWasm(this.bytes, offset, byteLength)
: utf8Decode(this.bytes, offset, byteLength);
let object: string;
if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
object = utf8DecodeTD(this.bytes, offset, byteLength);
} else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
object = utf8DecodeWasm(this.bytes, offset, byteLength);
} else {
object = utf8DecodeJs(this.bytes, offset, byteLength);
}
this.pos += headerOffset + byteLength;
return object;
}
Expand Down
62 changes: 29 additions & 33 deletions src/utils/utf8.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number

/** Maximum number of UTF-16 code units handed to a single `String.fromCharCode` call. */
const CHUNK_SIZE = 0x10_000;

/**
 * Builds a string from UTF-16 code units, splitting large inputs into chunks
 * of at most `CHUNK_SIZE` units so that no single `String.fromCharCode` call
 * receives the whole input as arguments.
 *
 * @param units UTF-16 code units, as a plain array or a `Uint16Array`.
 * @returns The string made of `units`, in order.
 */
export function safeStringFromCharCode(units: Array<number> | Uint16Array): string {
  if (units.length <= CHUNK_SIZE) {
    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
    // in case `units` is a typed array
    return String.fromCharCode.apply(String, units as any);
  }

  let result = "";
  // Advance by whole chunks. Counting the index up to `units.length` would keep
  // looping long after the last chunk, converting empty slices for nothing.
  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
    const chunk = units.slice(start, start + CHUNK_SIZE);
    result += String.fromCharCode.apply(String, chunk as any);
  }
  return result;
}

const MIN_TEXT_DECODER_STRING_LENGTH = 200;
const defaultEncoding = "utf-8";
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;

export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
let offset = inputOffset;
const end = offset + byteLength;

if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
const stringBytes = bytes.subarray(offset, end);
return sharedTextDecoder.decode(stringBytes);
}

const out: Array<number> = [];
const units: Array<number> = [];
let result = "";
while (offset < end) {
const byte1 = bytes[offset++];
if ((byte1 & 0x80) === 0) {
// 1 byte
out.push(byte1);
units.push(byte1);
} else if ((byte1 & 0xe0) === 0xc0) {
// 2 bytes
const byte2 = bytes[offset++] & 0x3f;
out.push(((byte1 & 0x1f) << 6) | byte2);
units.push(((byte1 & 0x1f) << 6) | byte2);
} else if ((byte1 & 0xf0) === 0xe0) {
// 3 bytes
const byte2 = bytes[offset++] & 0x3f;
const byte3 = bytes[offset++] & 0x3f;
out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
} else if ((byte1 & 0xf8) === 0xf0) {
// 4 bytes
const byte2 = bytes[offset++] & 0x3f;
Expand All @@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
if (unit > 0xffff) {
unit -= 0x10000;
out.push(((unit >>> 10) & 0x3ff) | 0xd800);
units.push(((unit >>> 10) & 0x3ff) | 0xd800);
unit = 0xdc00 | (unit & 0x3ff);
}
out.push(unit);
units.push(unit);
} else {
out.push(byte1);
units.push(byte1);
}

if (units.length - 4 >= CHUNK_SIZE) {
result += String.fromCharCode(...units);
units.length = 0;
}
}

return safeStringFromCharCode(out);
if (units.length > 0) {
result += String.fromCharCode(...units);
}

return result;
}

// Created once at module load; `null` when the runtime has no global `TextDecoder`.
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null;

// TEXT_DECODER=never - disable the TextDecoder-based decoder
// TEXT_DECODER=force - use the TextDecoder-based decoder for every string length
// The `typeof` guard keeps module loading from throwing a ReferenceError in
// environments that have no `process` global.
const textDecoderMode = typeof process !== "undefined" && process.env ? process.env.TEXT_DECODER : undefined;

/** `true` when a global `TextDecoder` exists and `TEXT_DECODER` is not `"never"`. */
export const TEXT_DECODER_AVAILABLE = textDecoderMode !== "never" && !!sharedTextDecoder;

/** Byte-length threshold for the TextDecoder path: `0` when `TEXT_DECODER` is `"force"`, otherwise `200`. */
export const TEXT_DECODER_THRESHOLD = textDecoderMode !== "force" ? 200 : 0;

/**
 * Decodes `byteLength` bytes of `bytes`, starting at `inputOffset`, with the
 * shared `TextDecoder` instance.
 *
 * The shared decoder is `null` when the runtime has no `TextDecoder`, in which
 * case this call throws; check `TEXT_DECODER_AVAILABLE` before calling.
 *
 * @param bytes Buffer containing the encoded string.
 * @param inputOffset Index of the first byte to decode.
 * @param byteLength Number of bytes to decode.
 * @returns The decoded string.
 */
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  // `subarray` creates a view on the same buffer, so no bytes are copied here.
  const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength);
  // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
  return sharedTextDecoder!.decode(stringBytes);
}
21 changes: 18 additions & 3 deletions src/wasmFunctions.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { safeStringFromCharCode } from "./utils/utf8";

// WASM=never - disable WASM functions
// WASM=force - force to use WASM functions
const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
Expand Down Expand Up @@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
}
}

/** Maximum number of UTF-16 code units handed to a single `String.fromCharCode` call. */
const CHUNK_SIZE = 0x10_000;

/**
 * Builds a string from a `Uint16Array` of UTF-16 code units, splitting large
 * inputs into chunks of at most `CHUNK_SIZE` units so that no single
 * `String.fromCharCode` call receives the whole input as arguments.
 *
 * @param units UTF-16 code units to convert.
 * @returns The string made of `units`, in order.
 */
function safeStringFromCharCodeU16(units: Uint16Array): string {
  if (units.length <= CHUNK_SIZE) {
    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
    // in case `units` is a typed array
    return String.fromCharCode.apply(String, units as any);
  }

  let result = "";
  // Advance by whole chunks. Counting the index up to `units.length` would keep
  // looping long after the last chunk, converting empty views for nothing.
  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
    // `subarray` is a view on the same buffer, so chunking copies nothing.
    const chunk = units.subarray(start, start + CHUNK_SIZE);
    result += String.fromCharCode.apply(String, chunk as any);
  }
  return result;
}

// A wrapper function for utf8DecodeToUint16Array()
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
const inputPtr: pointer = wm.malloc(byteLength);
Expand All @@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt

const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
return safeStringFromCharCode(units);
return safeStringFromCharCodeU16(units);
} finally {
wm.free(inputPtr);
wm.free(outputPtr);
Expand Down
6 changes: 3 additions & 3 deletions test/msgpack-test-suite.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => {
FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY,
FLOAT64_NAN: Number.NaN,
STR16: "a".repeat(0x100),
STR16_MBS: "🌏".repeat(0x100),
STR32: "b".repeat(0x10_000),
STR32_MBS: "🍣".repeat(0x10_000),
STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implementations
STR_INCLUDING_NUL: "foo\0bar",
STR_INCLUDING_NUL: "foo\0bar\0",
STR_BROKEN_FF: "\xff",
STR_LONE_SURROGATE_1: "\ud800",
STR_LONE_SURROGATE_2: "\udbff",
BIN16: new Uint8Array(0x100).fill(0xff),
BIN32: new Uint8Array(0x10000).fill(0xff),
ARRAY16: new Array<boolean>(0x100).fill(true),
Expand Down