Skip to content

Commit a7ae3a1

Browse files
authored
Pin JS casing to generated Unicode data (#4321)
1 parent 551b02d commit a7ae3a1

4 files changed

Lines changed: 2980 additions & 130 deletions

File tree

internal/stringutil/_scripts/generate-unicode-data.mts

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type SpecialCasingEntry = {
3434
codePoint: number;
3535
lower: number[];
3636
upper: number[];
37+
conditionalLower: number[];
3738
condition: string;
3839
};
3940

@@ -47,6 +48,11 @@ async function loadMapping(property: string): Promise<Map<number, number[]>> {
4748
return module.default as Map<number, number[]>;
4849
}
4950

51+
// Load the simple case-mapping table for one Unicode property from the data
// package named by PACKAGE.
//
// `property` is the package-relative property path, e.g.
// "Simple_Case_Mapping/Lowercase". The module's default export is returned
// as-is; the Map<number, number> type (code point -> mapped code point) is
// asserted, not validated at runtime.
async function loadSimpleMapping(property: string): Promise<Map<number, number>> {
    const { default: mapping } = await import(`${PACKAGE}/${property}/code-points.js`);
    return mapping as Map<number, number>;
}
55+
5056
// Group a sorted, de-duplicated run of code points into ranges sharing a
5157
// constant stride. The stride of each range is taken from the gap to the next
5258
// code point, so it never includes a code point that is not in the set; this
@@ -120,7 +126,7 @@ ${r32}
120126
`;
121127
}
122128

123-
async function buildSpecialCasing(): Promise<SpecialCasingEntry[]> {
129+
async function buildSpecialCasing(simpleLowercase: Map<number, number>, simpleUppercase: Map<number, number>): Promise<SpecialCasingEntry[]> {
124130
// The unconditional, locale-insensitive multi-rune mappings. Each map keys a
125131
// code point to its full lower/upper expansion (identity when unchanged).
126132
const lowerMappings = await loadMapping("Special_Casing/Lowercase");
@@ -131,39 +137,52 @@ async function buildSpecialCasing(): Promise<SpecialCasingEntry[]> {
131137

132138
const entries: SpecialCasingEntry[] = [];
133139

134-
const codePoints = new Set([...lowerMappings.keys(), ...upperMappings.keys()]);
140+
const codePoints = new Set([...simpleLowercase.keys(), ...simpleUppercase.keys(), ...lowerMappings.keys(), ...upperMappings.keys()]);
135141
for (const codePoint of codePoints) {
136142
entries.push({
137143
codePoint,
138-
lower: lowerMappings.get(codePoint) ?? [codePoint],
139-
upper: upperMappings.get(codePoint) ?? [codePoint],
144+
lower: lowerMappings.get(codePoint) ?? [simpleLowercase.get(codePoint) ?? codePoint],
145+
upper: upperMappings.get(codePoint) ?? [simpleUppercase.get(codePoint) ?? codePoint],
146+
conditionalLower: [codePoint],
140147
condition: "specialCasingConditionNone",
141148
});
142149
}
143150

144151
for (const [codePoint, lower] of finalSigmaMappings) {
145-
entries.push({
146-
codePoint,
147-
lower,
148-
upper: upperMappings.get(codePoint) ?? [codePoint],
149-
condition: "specialCasingConditionFinalSigma",
150-
});
152+
const entry = entries.find(entry => entry.codePoint === codePoint);
153+
if (entry === undefined) {
154+
entries.push({
155+
codePoint,
156+
lower: [simpleLowercase.get(codePoint) ?? codePoint],
157+
upper: upperMappings.get(codePoint) ?? [simpleUppercase.get(codePoint) ?? codePoint],
158+
conditionalLower: lower,
159+
condition: "specialCasingConditionFinalSigma",
160+
});
161+
}
162+
else {
163+
entry.conditionalLower = lower;
164+
entry.condition = "specialCasingConditionFinalSigma";
165+
}
151166
}
152167

153168
entries.sort((a, b) => a.codePoint - b.codePoint);
154169
return entries;
155170
}
156171

157172
function renderCaseFile(entries: SpecialCasingEntry[], casedTable: RangeTable, caseIgnorableTable: RangeTable): string {
158-
const mappings = entries.map(entry => `\t${goRuneLiteral(entry.codePoint)}: {lower: ${goStringLiteral(entry.lower)}, upper: ${goStringLiteral(entry.upper)}, condition: ${entry.condition}},`).join("\n");
173+
const mappings = entries.map(entry => {
174+
const conditionalLower = entry.condition === "specialCasingConditionFinalSigma" ? `, conditionalLower: ${goStringLiteral(entry.conditionalLower)}` : "";
175+
return `\t${goRuneLiteral(entry.codePoint)}: {lower: ${goStringLiteral(entry.lower)}, upper: ${goStringLiteral(entry.upper)}${conditionalLower}, condition: ${entry.condition}},`;
176+
}).join("\n");
159177

160178
return `// Code generated by generate-unicode-data.mts. DO NOT EDIT.
161179
// Derived from the ${PACKAGE} package (Unicode ${UNICODE_VERSION}).
162180
// Includes only the locale-insensitive multi-rune mappings needed for ECMAScript
163181
// default casing, plus the Final_Sigma context mapping. String.prototype.toLowerCase
164182
// applies Final_Sigma, but Go's unicode package does not, so the caser applies it
165-
// from this data when in context. Go's unicode package handles the simple one-rune
166-
// mappings, so those are omitted here.
183+
// from this data when in context. Simple one-rune mappings are included here too
184+
// so casing stays pinned to this Unicode version, rather than the Go toolchain's
185+
// unicode tables.
167186
168187
package stringutil
169188
@@ -177,9 +196,10 @@ const (
177196
)
178197
179198
type specialCasingMapping struct {
180-
\tlower string
181-
\tupper string
182-
\tcondition specialCasingCondition
199+
\tlower string
200+
\tupper string
201+
\tconditionalLower string
202+
\tcondition specialCasingCondition
183203
}
184204
185205
var specialCasingMappings = map[rune]specialCasingMapping{
@@ -209,7 +229,9 @@ ${renderRangeTable("unicodeESNextIdentifierPart", partTable)}
209229
}
210230

211231
async function main() {
212-
const entries = await buildSpecialCasing();
232+
const simpleLowercase = await loadSimpleMapping("Simple_Case_Mapping/Lowercase");
233+
const simpleUppercase = await loadSimpleMapping("Simple_Case_Mapping/Uppercase");
234+
const entries = await buildSpecialCasing(simpleLowercase, simpleUppercase);
213235
const casedTable = toRangeTable(await loadCodePoints("Binary_Property/Cased"));
214236
const caseIgnorableTable = toRangeTable(await loadCodePoints("Binary_Property/Case_Ignorable"));
215237
fs.writeFileSync(CASE_OUTPUT_PATH, renderCaseFile(entries, casedTable, caseIgnorableTable));

internal/stringutil/js_case.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@ func ToLowerJS(str string) string {
2626
// bytes because WriteRune would re-encode the surrogate as U+FFFD.
2727
builder.WriteString(EncodeJSStringRune(r))
2828
} else if mapping, ok := specialCasingMappings[r]; ok {
29-
if mapping.condition == specialCasingConditionFinalSigma && !isFinalSigmaContext(casedBefore, str, i) {
30-
builder.WriteRune(unicode.ToLower(r))
29+
if mapping.condition == specialCasingConditionFinalSigma && isFinalSigmaContext(casedBefore, str, i) {
30+
builder.WriteString(mapping.conditionalLower)
3131
} else {
3232
builder.WriteString(mapping.lower)
3333
}
3434
} else {
35-
builder.WriteRune(unicode.ToLower(r))
35+
builder.WriteRune(r)
3636
}
3737
if !isUnicodeCaseIgnorable(r) {
3838
casedBefore = isSigmaCased(r)
@@ -58,10 +58,11 @@ func ToUpperJS(str string) string {
5858
} else if mapping, ok := specialCasingMappings[r]; ok {
5959
builder.WriteString(mapping.upper)
6060
} else {
61-
builder.WriteRune(unicode.ToUpper(r))
61+
builder.WriteRune(r)
6262
}
6363
i += size
6464
}
65+
6566
return builder.String()
6667
}
6768

0 commit comments

Comments
 (0)