From 4e651983e5bda23c15271caf00017b7c6dcba3f4 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 31 Aug 2018 16:57:03 +0200 Subject: [PATCH] src: allow UTF-16 in generic StringBytes decode call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows removing a number of special cases in other parts of the code, at least one of which was incorrect in expecting aligned input when that was not guaranteed. Fixes: https://github.com/nodejs/node/issues/22358 PR-URL: https://github.com/nodejs/node/pull/22622 Reviewed-By: Tobias Nießen --- src/node.h | 1 - src/node_buffer.cc | 59 ---------------------------- src/string_bytes.cc | 57 +++++++++++++++------------ src/string_bytes.h | 1 - src/string_decoder.cc | 10 ----- test/parallel/test-string-decoder.js | 8 ++++ 6 files changed, 40 insertions(+), 96 deletions(-) diff --git a/src/node.h b/src/node.h index 2429e45936b572..fd99b9e4eaa358 100644 --- a/src/node.h +++ b/src/node.h @@ -418,7 +418,6 @@ NODE_DEPRECATED("Use FatalException(isolate, ...)", return FatalException(v8::Isolate::GetCurrent(), try_catch); }) -// Don't call with encoding=UCS2. NODE_EXTERN v8::Local Encode(v8::Isolate* isolate, const char* buf, size_t len, diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 4a29873c34b821..7c4c0d9f657b24 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -467,65 +467,6 @@ void StringSlice(const FunctionCallbackInfo& args) { } -template <> -void StringSlice(const FunctionCallbackInfo& args) { - Isolate* isolate = args.GetIsolate(); - Environment* env = Environment::GetCurrent(isolate); - - THROW_AND_RETURN_UNLESS_BUFFER(env, args.This()); - SPREAD_BUFFER_ARG(args.This(), ts_obj); - - if (ts_obj_length == 0) - return args.GetReturnValue().SetEmptyString(); - - SLICE_START_END(args[0], args[1], ts_obj_length) - length /= 2; - - const char* data = ts_obj_data + start; - const uint16_t* buf; - bool release = false; - - // Node's "ucs2" encoding expects LE character data inside a Buffer, so we - // need to reorder on BE platforms. See https://nodejs.org/api/buffer.html - // regarding Node's "ucs2" encoding specification. - const bool aligned = (reinterpret_cast(data) % sizeof(*buf) == 0); - if (IsLittleEndian() && !aligned) { - // Make a copy to avoid unaligned accesses in v8::String::NewFromTwoByte(). - // This applies ONLY to little endian platforms, as misalignment will be - // handled by a byte-swapping operation in StringBytes::Encode on - // big endian platforms. - uint16_t* copy = new uint16_t[length]; - for (size_t i = 0, k = 0; i < length; i += 1, k += 2) { - // Assumes that the input is little endian. - const uint8_t lo = static_cast(data[k + 0]); - const uint8_t hi = static_cast(data[k + 1]); - copy[i] = lo | hi << 8; - } - buf = copy; - release = true; - } else { - buf = reinterpret_cast(data); - } - - Local error; - MaybeLocal ret = - StringBytes::Encode(isolate, - buf, - length, - &error); - - if (release) - delete[] buf; - - if (ret.IsEmpty()) { - CHECK(!error.IsEmpty()); - isolate->ThrowException(error); - return; - } - args.GetReturnValue().Set(ret.ToLocalChecked()); -} - - // bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd]) void Copy(const FunctionCallbackInfo &args) { Environment* env = Environment::GetCurrent(args); diff --git a/src/string_bytes.cc b/src/string_bytes.cc index c38f368d41878e..e4c4c0a7dbb34c 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -625,7 +625,6 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, size_t buflen, enum encoding encoding, Local* error) { - CHECK_NE(encoding, UCS2); CHECK_BUFLEN_IN_RANGE(buflen); if (!buflen && encoding != BUFFER) { @@ -703,6 +702,37 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, return ExternOneByteString::New(isolate, dst, dlen, error); } + case UCS2: { + if (IsBigEndian()) { + uint16_t* dst = node::UncheckedMalloc(buflen / 2); + if (dst == nullptr) { + *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate); + return MaybeLocal(); + } + for (size_t i = 0, k = 0; k < buflen / 2; i += 2, k += 1) { + // The input is in *little endian*, because that's what Node.js + // expects, so the high byte comes after the low byte. + const uint8_t hi = static_cast(buf[i + 1]); + const uint8_t lo = static_cast(buf[i + 0]); + dst[k] = static_cast(hi) << 8 | lo; + } + return ExternTwoByteString::New(isolate, dst, buflen / 2, error); + } + if (reinterpret_cast(buf) % 2 != 0) { + // Unaligned data still means we can't directly pass it to V8. + char* dst = node::UncheckedMalloc(buflen); + if (dst == nullptr) { + *error = node::ERR_MEMORY_ALLOCATION_FAILED(isolate); + return MaybeLocal(); + } + memcpy(dst, buf, buflen); + return ExternTwoByteString::New( + isolate, reinterpret_cast(dst), buflen / 2, error); + } + return ExternTwoByteString::NewFromCopy( + isolate, reinterpret_cast(buf), buflen / 2, error); + } + default: CHECK(0 && "unknown encoding"); break; @@ -742,30 +772,7 @@ MaybeLocal StringBytes::Encode(Isolate* isolate, enum encoding encoding, Local* error) { const size_t len = strlen(buf); - MaybeLocal ret; - if (encoding == UCS2) { - // In Node, UCS2 means utf16le. The data must be in little-endian - // order and must be aligned on 2-bytes. This returns an empty - // value if it's not aligned and ensures the appropriate byte order - // on big endian architectures. - const bool be = IsBigEndian(); - if (len % 2 != 0) - return ret; - std::vector vec(len / 2); - for (size_t i = 0, k = 0; i < len; i += 2, k += 1) { - const uint8_t hi = static_cast(buf[i + 0]); - const uint8_t lo = static_cast(buf[i + 1]); - vec[k] = be ? - static_cast(hi) << 8 | lo - : static_cast(lo) << 8 | hi; - } - ret = vec.empty() ? - static_cast< Local >(String::Empty(isolate)) - : StringBytes::Encode(isolate, &vec[0], vec.size(), error); - } else { - ret = StringBytes::Encode(isolate, buf, len, encoding, error); - } - return ret; + return Encode(isolate, buf, len, encoding, error); } } // namespace node diff --git a/src/string_bytes.h b/src/string_bytes.h index 9e64fa0ee5f90a..87a296663e15aa 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -90,7 +90,6 @@ class StringBytes { int* chars_written = nullptr); // Take the bytes in the src, and turn it into a Buffer or String. - // Don't call with encoding=UCS2. static v8::MaybeLocal Encode(v8::Isolate* isolate, const char* buf, size_t buflen, diff --git a/src/string_decoder.cc b/src/string_decoder.cc index b75169ff00cafb..fa8201faff2944 100644 --- a/src/string_decoder.cc +++ b/src/string_decoder.cc @@ -30,16 +30,6 @@ MaybeLocal MakeString(Isolate* isolate, data, v8::NewStringType::kNormal, length); - } else if (encoding == UCS2) { -#ifdef DEBUG - CHECK_EQ(reinterpret_cast(data) % 2, 0); - CHECK_EQ(length % 2, 0); -#endif - ret = StringBytes::Encode( - isolate, - reinterpret_cast(data), - length / 2, - &error); } else { ret = StringBytes::Encode( isolate, diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 5571aaeeb78dc2..6e4f4b121d20d7 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -143,6 +143,14 @@ decoder = new StringDecoder('utf16le'); assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d'); assert.strictEqual(decoder.end(), ''); +// Regression test for https://github.com/nodejs/node/issues/22358 +// (unaligned UTF-16 access). +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.alloc(1)), ''); +assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10)); +assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24)); +assert.strictEqual(decoder.end(), ''); + common.expectsError( () => new StringDecoder(1), {