Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kLatin1FastPath = Symbol('kLatin1FastPath');
const kWindows1252FastPath = Symbol('kWindows1252FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const {
Expand All @@ -55,7 +55,7 @@ const {
encodeIntoResults,
encodeUtf8String,
decodeUTF8,
decodeLatin1,
decodeWindows1252,
} = binding;

const { Buffer } = require('buffer');
Expand Down Expand Up @@ -420,10 +420,10 @@ function makeTextDecoderICU() {
this[kFatal] = Boolean(options?.fatal);
// Only support fast paths for UTF-8 and windows-1252.
this[kUTF8FastPath] = enc === 'utf-8';
this[kLatin1FastPath] = enc === 'windows-1252';
this[kWindows1252FastPath] = enc === 'windows-1252';
this[kHandle] = undefined;

if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) {
this.#prepareConverter();
}
}
Expand All @@ -440,14 +440,14 @@ function makeTextDecoderICU() {
validateDecoder(this);

this[kUTF8FastPath] &&= !(options?.stream);
this[kLatin1FastPath] &&= !(options?.stream);
this[kWindows1252FastPath] &&= !(options?.stream);

if (this[kUTF8FastPath]) {
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
}

if (this[kLatin1FastPath]) {
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
if (this[kWindows1252FastPath]) {
return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]);
}

this.#prepareConverter();
Expand Down
53 changes: 39 additions & 14 deletions src/encoding_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,8 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
SetMethodNoSideEffect(
isolate, target, "decodeWindows1252", DecodeWindows1252);
}

void BindingData::CreatePerContextProperties(Local<Object> target,
Expand All @@ -239,10 +240,10 @@ void BindingData::RegisterTimerExternalReferences(
registry->Register(DecodeUTF8);
registry->Register(ToASCII);
registry->Register(ToUnicode);
registry->Register(DecodeLatin1);
registry->Register(DecodeWindows1252);
}

void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);

CHECK_GE(args.Length(), 1);
Expand All @@ -255,7 +256,6 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
}

bool ignore_bom = args[1]->IsTrue();
bool has_fatal = args[2]->IsTrue();

ArrayBufferViewContents<uint8_t> buffer(args[0]);
const uint8_t* data = buffer.data();
Expand All @@ -270,20 +270,45 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
return args.GetReturnValue().SetEmptyString();
}

std::string result(length * 2, '\0');

size_t written = simdutf::convert_latin1_to_utf8(
reinterpret_cast<const char*>(data), length, result.data());
// Windows-1252 specific mapping for bytes 128-159
// These differ from Latin-1/ISO-8859-1
static const uint16_t windows1252_mapping[32] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
};

std::string result;
result.reserve(length * 3); // Reserve space for UTF-8 output

for (size_t i = 0; i < length; i++) {
uint8_t byte = data[i];
uint32_t codepoint;

// Check if byte is in the special Windows-1252 range (128-159)
if (byte >= 0x80 && byte <= 0x9F) {
codepoint = windows1252_mapping[byte - 0x80];
} else {
// For all other bytes, Windows-1252 is identical to Latin-1
codepoint = byte;
}

if (has_fatal && written == 0) {
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(), "The encoded data was not valid for encoding latin1");
// Convert codepoint to UTF-8
if (codepoint < 0x80) {
result.push_back(static_cast<char>(codepoint));
} else if (codepoint < 0x800) {
result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
} else {
result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
}
}

std::string_view view(result.c_str(), written);

Local<Value> ret;
if (ToV8Value(env->context(), view, env->isolate()).ToLocal(&ret)) {
if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/encoding_binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ class BindingData : public SnapshotableObject {
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeWindows1252(
const v8::FunctionCallbackInfo<v8::Value>& args);

static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);
Expand Down
47 changes: 26 additions & 21 deletions test/parallel/test-internal-encoding-binding.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,46 @@ const assert = require('node:assert');
const { internalBinding } = require('internal/test/binding');
const binding = internalBinding('encoding_binding');

// Windows-1252 specific tests
{
// Valid input
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
assert.strictEqual(binding.decodeLatin1(buf, false, false), 'Áéó');
// Test Windows-1252 special characters in 128-159 range
// These differ from Latin-1
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
}

{
// Empty input
const buf = Uint8Array.from([]);
assert.strictEqual(binding.decodeLatin1(buf, false, false), '');
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
}

{
// Invalid input, but Latin1 has no invalid chars and should never throw.
const buf = new TextEncoder().encode('Invalid Latin1 🧑‍🧑‍🧒‍🧒');
assert.strictEqual(
binding.decodeLatin1(buf, false, false),
'Invalid Latin1 ð\x9F§\x91â\x80\x8Dð\x9F§\x91â\x80\x8Dð\x9F§\x92â\x80\x8Dð\x9F§\x92'
);
// Empty input
const buf = Uint8Array.from([]);
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
}

// Windows-1252 specific tests
{
// IgnoreBOM with BOM
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
assert.strictEqual(binding.decodeLatin1(buf, true, false), 'þÿÁéó');
// Test Windows-1252 special characters in 128-159 range
// These differ from Latin-1
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
}

{
// Fatal and InvalidInput, but Latin1 has no invalid chars and should never throw.
const buf = Uint8Array.from([0xFF, 0xFF, 0xFF]);
assert.strictEqual(binding.decodeLatin1(buf, false, true), 'ÿÿÿ');
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
}

{
// IgnoreBOM and Fatal, but Latin1 has no invalid chars and should never throw.
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
assert.strictEqual(binding.decodeLatin1(buf, true, true), 'þÿÁéó');
// Empty input
const buf = Uint8Array.from([]);
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
}
46 changes: 46 additions & 0 deletions test/parallel/test-util-text-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,49 @@ test('TextDecoder correctly decodes windows-1252 encoded data', { skip: !common.

assert.strictEqual(decodedString, expectedString);
});

// Test for the difference between Latin1 and Windows-1252 in the 128-159
// range
// Ref: https://github.com/nodejs/node/issues/60888
test('TextDecoder correctly decodes windows-1252 special characters in ' +
     '128-159 range', { skip: !common.hasIntl }, () => {
  const decoder = new TextDecoder('windows-1252');

  // Renders every code point of `str` as "U+<hex>" for failure messages.
  // It never throws, even for an empty string, so a bad decode surfaces as
  // an assertion failure instead of a TypeError raised while the message is
  // being built.
  const formatCodePoints = (str) => {
    const points = Array.from(
      str, (ch) => `U+${ch.codePointAt(0).toString(16)}`);
    return points.length > 0 ? points.join(' ') : '(empty string)';
  };

  // Spot-check a few bytes whose windows-1252 mapping differs from Latin1.
  // The whole decoded string is compared (not just its first code point) so
  // that stray extra characters in the output are caught as well.
  // € Euro sign (U+20AC)
  assert.strictEqual(decoder.decode(Uint8Array.of(128)), '\u20AC');
  // ‚ Single low-9 quotation mark (U+201A)
  assert.strictEqual(decoder.decode(Uint8Array.of(130)), '\u201A');
  // ƒ Latin small letter f with hook (U+0192)
  assert.strictEqual(decoder.decode(Uint8Array.of(131)), '\u0192');
  // Ÿ Latin capital letter Y with diaeresis (U+0178)
  assert.strictEqual(decoder.decode(Uint8Array.of(159)), '\u0178');

  // Check the full 128-159 range so that no byte is decoded directly as
  // Latin1. Each entry is [input byte, expected code point]; bytes 129, 141,
  // 143, 144 and 157 are expected to map to the same-numbered code point.
  const expectedMappings = [
    [128, 8364], [129, 129], [130, 8218], [131, 402], [132, 8222],
    [133, 8230], [134, 8224], [135, 8225], [136, 710], [137, 8240],
    [138, 352], [139, 8249], [140, 338], [141, 141], [142, 381],
    [143, 143], [144, 144], [145, 8216], [146, 8217], [147, 8220],
    [148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732],
    [153, 8482], [154, 353], [155, 8250], [156, 339], [157, 157],
    [158, 382], [159, 376],
  ];

  for (const [byte, expectedCodePoint] of expectedMappings) {
    const result = decoder.decode(Uint8Array.of(byte));
    assert.strictEqual(
      result,
      String.fromCodePoint(expectedCodePoint),
      `Byte 0x${byte.toString(16)} should decode to ` +
      `U+${expectedCodePoint.toString(16)} but got ` +
      `${formatCodePoints(result)}`
    );
  }
});
2 changes: 1 addition & 1 deletion typings/internalBinding/encoding_binding.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ export interface EncodingBinding {
decodeUTF8(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
toASCII(input: string): string;
toUnicode(input: string): string;
decodeLatin1(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
}
Loading