Skip to content

Commit 90c2724

Browse files
authored
fix(native-preview): preserve lone surrogate string literals (#3518)
1 parent cb1910b commit 90c2724

9 files changed

Lines changed: 210 additions & 12 deletions

File tree

_packages/native-preview/src/api/async/api.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import {
3131
readSourceFileHash,
3232
RemoteSourceFile,
3333
} from "../node/node.ts";
34+
import { Wtf8Decoder } from "../node/wtf8.ts";
3435
import type {
3536
APIOptions,
3637
LSPConnectionOptions,
@@ -541,7 +542,7 @@ export class Program {
541542
private client: Client;
542543
private sourceFileCache: SourceFileCache;
543544
private toPath: (fileName: string) => Path;
544-
private decoder = new TextDecoder();
545+
private decoder = new Wtf8Decoder();
545546

546547
constructor(
547548
snapshotId: number,

_packages/native-preview/src/api/node/msgpack.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// Minimal msgpack encoder/decoder.
22
// Supports: arrays, unsigned integers, strings, booleans, binary data.
33

4+
import { Wtf8Decoder } from "./wtf8.ts";
5+
46
// ── MessagePack format constants ────────────────────────────────────
57
export const MSGPACK_FIXARRAY3 = 0x93; // 3-element fixarray
68
export const MSGPACK_BIN8 = 0xc4;
@@ -39,7 +41,7 @@ export function writeBinHeader(buf: Uint8Array, off: number, len: number): numbe
3941
}
4042

4143
const encoder = new TextEncoder();
42-
const decoder = new TextDecoder();
44+
const decoder = new Wtf8Decoder();
4345

4446
export class MsgpackWriter {
4547
private buf: Uint8Array;

_packages/native-preview/src/api/node/node.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import {
2626
NODE_OFFSET_KIND,
2727
NODE_OFFSET_PARENT,
2828
} from "./protocol.ts";
29+
import { Wtf8Decoder } from "./wtf8.ts";
2930

3031
// Re-export everything consumers need from the other two files.
3132
export { RemoteNode, RemoteNodeList } from "./node.generated.ts";
@@ -290,7 +291,7 @@ export function parseNodeHandle(handle: string): ParsedNodeHandle {
290291
* (e.g. from typeToTypeNode) that don't have a source file.
291292
*/
292293
export function decodeNode(data: Uint8Array): Node {
293-
const sf = new RemoteSourceFile(data, new TextDecoder());
294+
const sf = new RemoteSourceFile(data, new Wtf8Decoder());
294295
return sf as unknown as Node;
295296
}
296297

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import { Buffer } from "node:buffer";
2+
3+
// WTF-8 represents a lone UTF-16 surrogate (U+D800..U+DFFF) as the three-byte
// sequence ED [A0..BF] [80..BF]. Strict UTF-8 forbids exactly this range, so a
// plain TextDecoder would turn these bytes into U+FFFD replacement characters.
const surrogateLeadByte = 0xED;
const surrogateSecondByteMin = 0xA0;
const surrogateSecondByteMax = 0xBF;
const continuationByteMin = 0x80;
const continuationByteMax = 0xBF;
type DecodeOptions = Parameters<TextDecoder["decode"]>[1];

/**
 * Returns true when the three bytes starting at `index` form a WTF-8 encoded
 * surrogate (ED, then A0..BF, then 80..BF). Returns false if fewer than three
 * bytes remain.
 */
function isWtf8Surrogate(bytes: Uint8Array, index: number): boolean {
    return index + 2 < bytes.length
        && bytes[index] === surrogateLeadByte
        && bytes[index + 1] >= surrogateSecondByteMin
        && bytes[index + 1] <= surrogateSecondByteMax
        && bytes[index + 2] >= continuationByteMin
        && bytes[index + 2] <= continuationByteMax;
}

/**
 * Computes the UTF-16 code unit for the surrogate sequence starting at `index`.
 * The caller must have validated the sequence with `isWtf8Surrogate`.
 */
function getSurrogateCodeUnit(bytes: Uint8Array, index: number): number {
    // The lead byte is always 0xED, whose payload nibble (0xD) supplies bits
    // 12-15; each following byte contributes its low six bits.
    return 0xD000 | ((bytes[index + 1] & 0x3F) << 6) | (bytes[index + 2] & 0x3F);
}

/**
 * Cheap pre-check: a WTF-8 surrogate cannot exist without a 0xED byte, so
 * inputs lacking one can go straight to the built-in decoder.
 */
function hasSurrogateLeadByte(bytes: Uint8Array): boolean {
    // Wrap the same memory (no copy) in a Buffer to use its indexOf.
    return Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength).indexOf(surrogateLeadByte) >= 0;
}

/**
 * Normalizes any accepted buffer source to a Uint8Array over the same memory.
 * A Uint8Array is returned as-is; other views and raw buffers are wrapped
 * without copying.
 */
function toUint8Array(input: NodeJS.AllowSharedBufferSource): Uint8Array {
    if (input instanceof Uint8Array) {
        return input;
    }
    if (ArrayBuffer.isView(input)) {
        return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    return new Uint8Array(input);
}

/**
 * A TextDecoder that additionally accepts WTF-8: lone surrogates encoded as
 * ED [A0..BF] [80..BF] are decoded to the corresponding UTF-16 code unit
 * instead of being replaced with U+FFFD. All other bytes are decoded by the
 * underlying TextDecoder.
 */
export class Wtf8Decoder extends TextDecoder {
    /** Lazily created decoder for segments that do not start at byte 0. */
    private interiorDecoder: TextDecoder | undefined;

    /**
     * Decodes `input` as WTF-8.
     *
     * @param input Bytes to decode; when omitted, defers to TextDecoder.
     * @param options Standard TextDecoder decode options.
     * @returns The decoded string, with lone surrogates preserved.
     */
    override decode(input?: NodeJS.AllowSharedBufferSource, options?: DecodeOptions): string {
        if (input === undefined) {
            return super.decode(input, options);
        }

        const bytes = toUint8Array(input);
        if (!hasSurrogateLeadByte(bytes)) {
            return super.decode(bytes, options);
        }

        const parts: string[] = [];
        let segmentStart = 0;

        for (let i = 0; i < bytes.length; i++) {
            if (!isWtf8Surrogate(bytes, i)) {
                continue;
            }

            // Flush the ordinary UTF-8 run that precedes this surrogate.
            if (segmentStart < i) {
                parts.push(this.decodeSegment(bytes, segmentStart, i, options));
            }
            parts.push(String.fromCharCode(getSurrogateCodeUnit(bytes, i)));
            // Skip the two trailing bytes; the loop increment skips the lead.
            i += 2;
            segmentStart = i + 1;
        }

        // A 0xED byte was present but never began a surrogate sequence.
        if (segmentStart === 0) {
            return super.decode(bytes, options);
        }
        if (segmentStart < bytes.length) {
            parts.push(this.decodeSegment(bytes, segmentStart, bytes.length, options));
        }
        return parts.join("");
    }

    /**
     * Decodes the plain UTF-8 run `bytes[start, end)`.
     *
     * Every non-streaming `TextDecoder.decode` call strips a leading BOM, so
     * decoding an interior segment with the base decoder would drop a U+FEFF
     * that directly follows a surrogate. Interior segments therefore use a
     * separate decoder with `ignoreBOM: true`. The segment at byte 0, decoders
     * already configured with `ignoreBOM`, and streaming calls (whose state
     * lives in the base decoder) keep using the base implementation.
     */
    private decodeSegment(bytes: Uint8Array, start: number, end: number, options: DecodeOptions): string {
        const segment = bytes.subarray(start, end);
        if (start === 0 || this.ignoreBOM || options?.stream) {
            return super.decode(segment, options);
        }
        if (this.interiorDecoder === undefined) {
            this.interiorDecoder = new TextDecoder(this.encoding, { fatal: this.fatal, ignoreBOM: true });
        }
        return this.interiorDecoder.decode(segment);
    }
}

_packages/native-preview/src/api/sync/api.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import {
3939
readSourceFileHash,
4040
RemoteSourceFile,
4141
} from "../node/node.ts";
42+
import { Wtf8Decoder } from "../node/wtf8.ts";
4243
import type {
4344
APIOptions,
4445
LSPConnectionOptions,
@@ -549,7 +550,7 @@ export class Program {
549550
private client: Client;
550551
private sourceFileCache: SourceFileCache;
551552
private toPath: (fileName: string) => Path;
552-
private decoder = new TextDecoder();
553+
private decoder = new Wtf8Decoder();
553554

554555
constructor(
555556
snapshotId: number,

_packages/native-preview/test/async/api.test.ts

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,18 +355,24 @@ test("unicode escapes", async () => {
355355
"/tsconfig.json": "{}",
356356
"/src/1.ts": `"😃"`,
357357
"/src/2.ts": `"\\ud83d\\ude03"`,
358+
"/src/3.ts": `"\\ud800a\\udc00"`,
358359
});
359360
try {
360361
const snapshot = await api.updateSnapshot({ openProject: "/tsconfig.json" });
361362
const project = snapshot.getProject("/tsconfig.json")!;
363+
const expectedTexts = new Map([
364+
["/src/1.ts", "😃"],
365+
["/src/2.ts", "😃"],
366+
["/src/3.ts", "\ud800a\udc00"],
367+
]);
362368

363-
for (const file of ["/src/1.ts", "/src/2.ts"]) {
369+
for (const file of expectedTexts.keys()) {
364370
const sourceFile = await project.program.getSourceFile(file);
365371
assert.ok(sourceFile);
366372

367373
sourceFile.forEachChild(function visit(node) {
368374
if (isStringLiteral(node)) {
369-
assert.equal(node.text, "😃");
375+
assert.equal(node.text, expectedTexts.get(file));
370376
}
371377
node.forEachChild(visit);
372378
});
@@ -377,6 +383,38 @@ test("unicode escapes", async () => {
377383
}
378384
});
379385

386+
test("template unicode escapes", async () => {
    // The template head ends in a lone high surrogate and the tail starts with
    // a lone low surrogate; both must come back from the API unchanged.
    const api = spawnAPI({
        "/tsconfig.json": "{}",
        "/src/index.ts": "`\\ud800${0}\\udc00`",
    });
    try {
        const snapshot = await api.updateSnapshot({ openProject: "/tsconfig.json" });
        const project = snapshot.getProject("/tsconfig.json")!;
        const sourceFile = await project.program.getSourceFile("/src/index.ts");
        assert.ok(sourceFile);

        // Track which template parts the walk actually reached.
        const seen = { head: false, tail: false };
        sourceFile.forEachChild(function walk(child) {
            if (isTemplateHead(child)) {
                assert.equal(child.text, "\ud800");
                seen.head = true;
            }
            else if (isTemplateTail(child)) {
                assert.equal(child.text, "\udc00");
                seen.tail = true;
            }
            child.forEachChild(walk);
        });

        assert.ok(seen.head);
        assert.ok(seen.tail);
    }
    finally {
        await api.close();
    }
});
417+
380418
test("Object equality", async () => {
381419
const api = spawnAPI();
382420
try {

_packages/native-preview/test/sync/api.test.ts

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,18 +363,24 @@ test("unicode escapes", () => {
363363
"/tsconfig.json": "{}",
364364
"/src/1.ts": `"😃"`,
365365
"/src/2.ts": `"\\ud83d\\ude03"`,
366+
"/src/3.ts": `"\\ud800a\\udc00"`,
366367
});
367368
try {
368369
const snapshot = api.updateSnapshot({ openProject: "/tsconfig.json" });
369370
const project = snapshot.getProject("/tsconfig.json")!;
371+
const expectedTexts = new Map([
372+
["/src/1.ts", "😃"],
373+
["/src/2.ts", "😃"],
374+
["/src/3.ts", "\ud800a\udc00"],
375+
]);
370376

371-
for (const file of ["/src/1.ts", "/src/2.ts"]) {
377+
for (const file of expectedTexts.keys()) {
372378
const sourceFile = project.program.getSourceFile(file);
373379
assert.ok(sourceFile);
374380

375381
sourceFile.forEachChild(function visit(node) {
376382
if (isStringLiteral(node)) {
377-
assert.equal(node.text, "😃");
383+
assert.equal(node.text, expectedTexts.get(file));
378384
}
379385
node.forEachChild(visit);
380386
});
@@ -385,6 +391,38 @@ test("unicode escapes", () => {
385391
}
386392
});
387393

394+
test("template unicode escapes", () => {
    // The template head ends in a lone high surrogate and the tail starts with
    // a lone low surrogate; both must come back from the API unchanged.
    const api = spawnAPI({
        "/tsconfig.json": "{}",
        "/src/index.ts": "`\\ud800${0}\\udc00`",
    });
    try {
        const snapshot = api.updateSnapshot({ openProject: "/tsconfig.json" });
        const project = snapshot.getProject("/tsconfig.json")!;
        const sourceFile = project.program.getSourceFile("/src/index.ts");
        assert.ok(sourceFile);

        // Track which template parts the walk actually reached.
        const seen = { head: false, tail: false };
        sourceFile.forEachChild(function walk(child) {
            if (isTemplateHead(child)) {
                assert.equal(child.text, "\ud800");
                seen.head = true;
            }
            else if (isTemplateTail(child)) {
                assert.equal(child.text, "\udc00");
                seen.tail = true;
            }
            child.forEachChild(walk);
        });

        assert.ok(seen.head);
        assert.ok(seen.tail);
    }
    finally {
        api.close();
    }
});
425+
388426
test("Object equality", () => {
389427
const api = spawnAPI();
390428
try {
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import assert from "node:assert";
2+
import {
3+
describe,
4+
test,
5+
} from "node:test";
6+
import { Wtf8Decoder } from "../src/api/node/wtf8.ts";
7+
8+
describe("Wtf8Decoder", () => {
    test("decodes standard UTF-8", () => {
        // Round-trip through TextEncoder: well-formed UTF-8 must be unaffected.
        const original = "hello 🦀";
        const decoded = new Wtf8Decoder().decode(new TextEncoder().encode(original));
        assert.strictEqual(decoded, original);
    });

    test("preserves WTF-8 encoded lone surrogates", () => {
        const crab = [0xF0, 0x9F, 0xA6, 0x80]; // U+1F980
        const encoded = Uint8Array.from([
            ...crab,
            0xED, 0x9F, 0xBF, // U+D7FF, last code point below the surrogate range
            0xED, 0xA0, 0x80, // lone U+D800
            0xED, 0xA0, 0x81, // lone U+D801
            0xED, 0xB0, 0x80, // lone U+DC00
            ...crab,
        ]);

        const decoded = new Wtf8Decoder().decode(encoded);

        // Compare UTF-16 code units so lone surrogates are checked exactly.
        const codeUnits: number[] = [];
        for (let i = 0; i < decoded.length; i++) {
            codeUnits.push(decoded.charCodeAt(i));
        }
        assert.deepStrictEqual(
            codeUnits,
            [0xD83E, 0xDD80, 0xD7FF, 0xD800, 0xD801, 0xDC00, 0xD83E, 0xDD80],
        );
    });
});

internal/api/encoder/encoder.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,11 @@ const (
111111
// String data (variable)
112112
// ----------------------
113113
//
114-
// The string data section contains UTF-8 encoded string data. In typical cases, the entirety of the string data is the
115-
// source file text, and individual nodes with string properties reference their positional slice of the file text. In
116-
// cases where a node's string property is not equal to the slice of file text at its position, the unique string is
117-
// appended to the string data section after the file text.
114+
// The string data section contains UTF-8 encoded string data, with WTF-8 used for JS strings containing lone UTF-16
115+
// surrogates. In typical cases, the entirety of the string data is the source file text, and individual nodes with
116+
// string properties reference their positional slice of the file text. In cases where a node's string property is not
117+
// equal to the slice of file text at its position, the unique string is appended to the string data section after the
118+
// file text.
118119
//
119120
// Extended node data (variable)
120121
// -----------------------------

0 commit comments

Comments
 (0)