From 7eae61d1ca0da63491e925c74fba45f723a34b70 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Thu, 1 Feb 2018 02:28:39 +0100 Subject: [PATCH 1/4] string_decoder: reimplement in C++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation shows some performance degradation), that this can now be used more easily to add native decoding support to C++ streams and (arguably) more readable variable names. $ ./node benchmark/compare.js --new ./node --old ./node-before --set n=25e4 string_decoder | Rscript benchmark/compare.R [03:07:14|% 100| 2/2 files | 60/60 runs | 80/80 configs]: Done confidence improvement accuracy (*) (**) (***) string_decoder/string-decoder-create.js n=250000 encoding="ascii" *** -77.32 % ±4.32% ±5.80% ±7.68% string_decoder/string-decoder-create.js n=250000 encoding="AscII" *** -74.15 % ±2.55% ±3.43% ±4.53% string_decoder/string-decoder-create.js n=250000 encoding="base64" *** -56.88 % ±4.75% ±6.36% ±8.37% string_decoder/string-decoder-create.js n=250000 encoding="ucs2" *** -64.00 % ±3.53% ±4.72% ±6.21% string_decoder/string-decoder-create.js n=250000 encoding="UTF-16LE" *** -54.64 % ±3.82% ±5.12% ±6.74% string_decoder/string-decoder-create.js n=250000 encoding="utf-8" *** -66.98 % ±4.92% ±6.62% ±8.75% string_decoder/string-decoder-create.js n=250000 encoding="UTF-8" *** -59.94 % ±3.45% ±4.61% ±6.03% string_decoder/string-decoder-create.js n=250000 encoding="utf8" *** -66.55 % ±4.65% ±6.26% ±8.28% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=1024 encoding="ascii" 12.89 % ±14.67% ±19.54% ±25.46% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=1024 encoding="base64-ascii" ** 12.83 % ±9.20% ±12.24% ±15.93% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=1024 encoding="base64-utf8" ** 12.19 % ±8.97% ±11.93% ±15.53% string_decoder/string-decoder.js n=250000 chunkLen=1024 
inLen=1024 encoding="utf16le" 7.69 % ±11.02% ±14.67% ±19.12% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=1024 encoding="utf8" 0.55 % ±8.17% ±10.88% ±14.17% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=128 encoding="ascii" * 23.55 % ±17.71% ±23.57% ±30.69% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=128 encoding="base64-ascii" *** 45.47 % ±13.60% ±18.13% ±23.66% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=128 encoding="base64-utf8" *** 39.64 % ±14.04% ±18.70% ±24.37% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=128 encoding="utf16le" * 13.59 % ±13.43% ±17.86% ±23.26% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=128 encoding="utf8" 11.52 % ±12.00% ±15.98% ±20.81% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=32 encoding="ascii" * 14.10 % ±13.51% ±18.01% ±23.50% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=32 encoding="base64-ascii" *** 67.77 % ±17.03% ±22.72% ±29.69% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=32 encoding="base64-utf8" *** 54.15 % ±16.54% ±22.08% ±28.87% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=32 encoding="utf16le" 15.89 % ±16.84% ±22.44% ±29.26% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=32 encoding="utf8" 9.28 % ±12.42% ±16.52% ±21.51% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=4096 encoding="ascii" 10.09 % ±10.59% ±14.09% ±18.34% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=4096 encoding="base64-ascii" 6.93 % ±8.86% ±11.79% ±15.36% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=4096 encoding="base64-utf8" * 9.24 % ±7.98% ±10.62% ±13.83% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=4096 encoding="utf16le" -0.67 % ±9.40% ±12.51% ±16.31% string_decoder/string-decoder.js n=250000 chunkLen=1024 inLen=4096 encoding="utf8" 2.27 % ±8.52% ±11.34% ±14.76% string_decoder/string-decoder.js n=250000 
chunkLen=16 inLen=1024 encoding="ascii" 8.53 % ±10.65% ±14.18% ±18.48% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=1024 encoding="base64-ascii" *** 54.82 % ±10.50% ±14.00% ±18.29% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=1024 encoding="base64-utf8" *** 51.75 % ±10.83% ±14.43% ±18.85% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=1024 encoding="utf16le" -5.01 % ±9.15% ±12.18% ±15.85% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=1024 encoding="utf8" 4.55 % ±8.20% ±10.92% ±14.21% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=128 encoding="ascii" * 13.88 % ±12.62% ±16.81% ±21.91% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=128 encoding="base64-ascii" *** 52.79 % ±10.56% ±14.08% ±18.39% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=128 encoding="base64-utf8" *** 52.56 % ±11.39% ±15.21% ±19.90% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=128 encoding="utf16le" 0.43 % ±10.06% ±13.38% ±17.42% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=128 encoding="utf8" 3.33 % ±10.17% ±13.53% ±17.63% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=32 encoding="ascii" *** 31.81 % ±14.71% ±19.58% ±25.52% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=32 encoding="base64-ascii" *** 51.38 % ±13.23% ±17.65% ±23.09% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=32 encoding="base64-utf8" *** 48.89 % ±14.06% ±18.82% ±24.71% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=32 encoding="utf16le" 8.70 % ±12.70% ±16.90% ±22.01% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=32 encoding="utf8" * 17.94 % ±14.58% ±19.40% ±25.26% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=4096 encoding="ascii" ** 13.42 % ±9.78% ±13.03% ±17.00% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=4096 encoding="base64-ascii" *** 49.02 % ±10.60% ±14.13% ±18.45% string_decoder/string-decoder.js 
n=250000 chunkLen=16 inLen=4096 encoding="base64-utf8" *** 52.44 % ±11.03% ±14.72% ±19.23% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=4096 encoding="utf16le" -5.55 % ±9.18% ±12.22% ±15.92% string_decoder/string-decoder.js n=250000 chunkLen=16 inLen=4096 encoding="utf8" 6.32 % ±8.12% ±10.80% ±14.07% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=1024 encoding="ascii" 8.21 % ±12.47% ±16.61% ±21.65% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=1024 encoding="base64-ascii" *** 28.39 % ±9.32% ±12.42% ±16.19% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=1024 encoding="base64-utf8" *** 21.83 % ±9.94% ±13.24% ±17.26% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=1024 encoding="utf16le" 5.92 % ±10.21% ±13.60% ±17.71% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=1024 encoding="utf8" 2.83 % ±8.47% ±11.27% ±14.67% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=128 encoding="ascii" * 16.92 % ±16.02% ±21.34% ±27.81% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=128 encoding="base64-ascii" *** 36.24 % ±14.01% ±18.68% ±24.37% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=128 encoding="base64-utf8" *** 24.74 % ±13.22% ±17.65% ±23.10% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=128 encoding="utf16le" 6.26 % ±12.01% ±15.98% ±20.81% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=128 encoding="utf8" 1.51 % ±10.89% ±14.50% ±18.88% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=32 encoding="ascii" ** 25.58 % ±19.19% ±25.54% ±33.28% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=32 encoding="base64-ascii" *** 67.36 % ±15.88% ±21.21% ±27.76% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=32 encoding="base64-utf8" *** 55.56 % ±17.89% ±23.92% ±31.36% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=32 encoding="utf16le" ** 22.91 % ±15.06% ±20.03% ±26.07% 
string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=32 encoding="utf8" 11.45 % ±12.80% ±17.03% ±22.18% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=4096 encoding="ascii" * 10.71 % ±10.00% ±13.32% ±17.35% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=4096 encoding="base64-ascii" *** 21.56 % ±9.11% ±12.12% ±15.77% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=4096 encoding="base64-utf8" *** 25.72 % ±9.04% ±12.03% ±15.68% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=4096 encoding="utf16le" 0.64 % ±9.49% ±12.64% ±16.48% string_decoder/string-decoder.js n=250000 chunkLen=256 inLen=4096 encoding="utf8" 1.64 % ±8.54% ±11.37% ±14.80% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=1024 encoding="ascii" 11.38 % ±11.58% ±15.44% ±20.15% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=1024 encoding="base64-ascii" *** 44.38 % ±10.72% ±14.31% ±18.70% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=1024 encoding="base64-utf8" *** 40.75 % ±10.92% ±14.57% ±19.03% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=1024 encoding="utf16le" 0.01 % ±9.09% ±12.09% ±15.73% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=1024 encoding="utf8" 4.25 % ±8.23% ±10.95% ±14.26% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=128 encoding="ascii" * 21.15 % ±16.30% ±21.71% ±28.31% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=128 encoding="base64-ascii" *** 50.31 % ±11.71% ±15.64% ±20.46% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=128 encoding="base64-utf8" *** 42.86 % ±11.50% ±15.34% ±20.07% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=128 encoding="utf16le" 3.08 % ±14.43% ±19.21% ±25.01% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=128 encoding="utf8" 5.19 % ±10.15% ±13.51% ±17.59% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=32 encoding="ascii" 12.04 % ±16.01% ±21.32% ±27.79% 
string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=32 encoding="base64-ascii" *** 52.38 % ±15.90% ±21.27% ±27.92% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=32 encoding="base64-utf8" *** 38.86 % ±16.88% ±22.57% ±29.60% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=32 encoding="utf16le" ** 19.55 % ±13.93% ±18.54% ±24.15% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=32 encoding="utf8" ** 19.12 % ±12.42% ±16.52% ±21.51% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=4096 encoding="ascii" ** 14.49 % ±9.70% ±12.91% ±16.81% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=4096 encoding="base64-ascii" *** 37.39 % ±10.67% ±14.21% ±18.52% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=4096 encoding="base64-utf8" *** 46.33 % ±10.02% ±13.34% ±17.40% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=4096 encoding="utf16le" -1.83 % ±9.42% ±12.54% ±16.32% string_decoder/string-decoder.js n=250000 chunkLen=64 inLen=4096 encoding="utf8" 2.81 % ±8.96% ±11.93% ±15.52% --- lib/string_decoder.js | 283 ++++------------------- node.gyp | 4 + src/node_internals.h | 1 + src/string_decoder-inl.h | 38 +++ src/string_decoder.cc | 334 +++++++++++++++++++++++++++ src/string_decoder.h | 50 ++++ test/parallel/test-string-decoder.js | 4 + 7 files changed, 476 insertions(+), 238 deletions(-) create mode 100644 src/string_decoder-inl.h create mode 100644 src/string_decoder.cc create mode 100644 src/string_decoder.h diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 1e569ba6b26a4c..69514038a85104 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -22,10 +22,23 @@ 'use strict'; const { Buffer } = require('buffer'); +const { + kIncompleteCharactersStart, + kIncompleteCharactersEnd, + kMissingBytes, + kBufferedBytes, + kEncodingField, + kSize, + decode, + flush, + encodings +} = internalBinding('string_decoder'); const internalUtil = require('internal/util'); const errors = 
require('internal/errors'); const isEncoding = Buffer[internalUtil.kIsEncodingSymbol]; +const kNativeDecoder = Symbol('kNativeDecoder'); + // Do not cache `Buffer.isEncoding` when checking encoding names as some // modules monkey-patch it to support additional encodings function normalizeEncoding(enc) { @@ -36,258 +49,52 @@ function normalizeEncoding(enc) { return nenc || enc; } +const encodingsMap = {}; +for (var i = 0; i < encodings.length; ++i) + encodingsMap[encodings[i]] = i; + // StringDecoder provides an interface for efficiently splitting a series of // buffers into a series of JS strings without breaking apart multi-byte // characters. -exports.StringDecoder = StringDecoder; -function StringDecoder(encoding) { - this.encoding = normalizeEncoding(encoding); - var nb; - switch (this.encoding) { - case 'utf16le': - this.text = utf16Text; - this.end = utf16End; - nb = 4; - break; - case 'utf8': - this.fillLast = utf8FillLast; - nb = 4; - break; - case 'base64': - this.text = base64Text; - this.end = base64End; - nb = 3; - break; - default: - this.write = simpleWrite; - this.end = simpleEnd; - return; - } - this.lastNeed = 0; - this.lastTotal = 0; - this.lastChar = Buffer.allocUnsafe(nb); -} - -StringDecoder.prototype.write = function(buf) { - if (buf.length === 0) - return ''; - var r; - var i; - if (this.lastNeed) { - r = this.fillLast(buf); - if (r === undefined) - return ''; - i = this.lastNeed; - this.lastNeed = 0; - } else { - i = 0; - } - if (i < buf.length) - return (r ? 
r + this.text(buf, i) : this.text(buf, i)); - return r || ''; -}; - -StringDecoder.prototype.end = utf8End; - -// Returns only complete characters in a Buffer -StringDecoder.prototype.text = utf8Text; - -// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer -StringDecoder.prototype.fillLast = function(buf) { - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); - } - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); - this.lastNeed -= buf.length; -}; - -// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. If an invalid byte is detected, -2 is returned. -function utf8CheckByte(byte) { - if (byte <= 0x7F) - return 0; - else if (byte >> 5 === 0x06) - return 2; - else if (byte >> 4 === 0x0E) - return 3; - else if (byte >> 3 === 0x1E) - return 4; - return (byte >> 6 === 0x02 ? -1 : -2); -} - -// Checks at most 3 bytes at the end of a Buffer in order to detect an -// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) -// needed to complete the UTF-8 character (if applicable) are returned. -function utf8CheckIncomplete(self, buf, i) { - var j = buf.length - 1; - if (j < i) - return 0; - var nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 1; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb - 2; - return nb; - } - if (--j < i || nb === -2) - return 0; - nb = utf8CheckByte(buf[j]); - if (nb >= 0) { - if (nb > 0) { - if (nb === 2) - nb = 0; - else - self.lastNeed = nb - 3; - } - return nb; - } - return 0; -} - -// Validates as many continuation bytes for a multi-byte UTF-8 character as -// needed or are available. 
If we see a non-continuation byte where we expect -// one, we "replace" the validated continuation bytes we've seen so far with -// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding -// behavior. The continuation byte check is included three times in the case -// where all of the continuation bytes for a character exist in the same buffer. -// It is also done this way as a slight performance increase instead of using a -// loop. -function utf8CheckExtraBytes(self, buf, p) { - if ((buf[0] & 0xC0) !== 0x80) { - self.lastNeed = 0; - return '\ufffd'; - } - if (self.lastNeed > 1 && buf.length > 1) { - if ((buf[1] & 0xC0) !== 0x80) { - self.lastNeed = 1; - return '\ufffd'; - } - if (self.lastNeed > 2 && buf.length > 2) { - if ((buf[2] & 0xC0) !== 0x80) { - self.lastNeed = 2; - return '\ufffd'; - } - } +class StringDecoder { + constructor(encoding) { + this.encoding = normalizeEncoding(encoding); + this[kNativeDecoder] = Buffer.alloc(kSize); + this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding]; } -} -// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. -function utf8FillLast(buf) { - const p = this.lastTotal - this.lastNeed; - var r = utf8CheckExtraBytes(this, buf, p); - if (r !== undefined) - return r; - if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, p, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); + write(buf) { + if (typeof buf === 'string') + return buf; + if (!ArrayBuffer.isView(buf)) + throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf', + ['Buffer', 'Uint8Array', 'ArrayBufferView']); + return decode(this[kNativeDecoder], buf); } - buf.copy(this.lastChar, p, 0, buf.length); - this.lastNeed -= buf.length; -} -// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a -// partial character, the character's bytes are buffered until the required -// number of bytes are available. 
-function utf8Text(buf, i) { - const total = utf8CheckIncomplete(this, buf, i); - if (!this.lastNeed) - return buf.toString('utf8', i); - this.lastTotal = total; - const end = buf.length - (total - this.lastNeed); - buf.copy(this.lastChar, 0, end); - return buf.toString('utf8', i, end); -} - -// For UTF-8, a replacement character is added when ending on a partial -// character. -function utf8End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - this.lastNeed = 0; - this.lastTotal = 0; - return r + '\ufffd'; + end(buf) { + let ret = ''; + if (buf !== undefined) + ret = this.write(buf); + return ret + flush(this[kNativeDecoder]); } - return r; -} -// UTF-16LE typically needs two bytes per character, but even if we have an even -// number of bytes available, we need to check if we end on a leading/high -// surrogate. In that case, we need to wait for the next two bytes in order to -// decode the last character properly. -function utf16Text(buf, i) { - if ((buf.length - i) % 2 === 0) { - const r = buf.toString('utf16le', i); - if (r) { - const c = r.charCodeAt(r.length - 1); - if (c >= 0xD800 && c <= 0xDBFF) { - this.lastNeed = 2; - this.lastTotal = 4; - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; - return r.slice(0, -1); - } - } - return r; - } - this.lastNeed = 1; - this.lastTotal = 2; - this.lastChar[0] = buf[buf.length - 1]; - return buf.toString('utf16le', i, buf.length - 1); -} + /* Everything below this line is undocumented legacy stuff. */ -// For UTF-16LE we do not explicitly append special replacement characters if we -// end on a partial character, we simply let v8 handle that. -function utf16End(buf) { - const r = (buf && buf.length ? 
this.write(buf) : ''); - if (this.lastNeed) { - const end = this.lastTotal - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('utf16le', 0, end); + text(buf, offset) { + this[kNativeDecoder][kMissingBytes] = 0; + this[kNativeDecoder][kBufferedBytes] = 0; + return this.write(buf.slice(offset)); } - return r; -} -function base64Text(buf, i) { - const n = (buf.length - i) % 3; - if (n === 0) - return buf.toString('base64', i); - this.lastNeed = 3 - n; - this.lastTotal = 3; - if (n === 1) { - this.lastChar[0] = buf[buf.length - 1]; - } else { - this.lastChar[0] = buf[buf.length - 2]; - this.lastChar[1] = buf[buf.length - 1]; + get lastTotal() { + return this[kNativeDecoder][kBufferedBytes] + this.lastNeed; } - return buf.toString('base64', i, buf.length - n); -} - -function base64End(buf) { - const r = (buf && buf.length ? this.write(buf) : ''); - if (this.lastNeed) { - const end = 3 - this.lastNeed; - this.lastNeed = 0; - this.lastTotal = 0; - return r + this.lastChar.toString('base64', 0, end); + get lastChar() { + return this[kNativeDecoder].subarray(kIncompleteCharactersStart, + kIncompleteCharactersEnd); } - return r; } -// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex) -function simpleWrite(buf) { - return buf.toString(this.encoding); -} - -function simpleEnd(buf) { - return (buf && buf.length ? 
this.write(buf) : ''); -} +exports.StringDecoder = StringDecoder; diff --git a/node.gyp b/node.gyp index dc860d3e56f56a..31dd26563b6566 100644 --- a/node.gyp +++ b/node.gyp @@ -325,6 +325,7 @@ 'src/signal_wrap.cc', 'src/spawn_sync.cc', 'src/string_bytes.cc', + 'src/string_decoder.cc', 'src/string_search.cc', 'src/stream_base.cc', 'src/stream_wrap.cc', @@ -378,6 +379,8 @@ 'src/req_wrap.h', 'src/req_wrap-inl.h', 'src/string_bytes.h', + 'src/string_decoder.h', + 'src/string_decoder-inl.h', 'src/stream_base.h', 'src/stream_base-inl.h', 'src/stream_wrap.h', @@ -988,6 +991,7 @@ '<(obj_path)<(obj_separator)node_url.<(obj_suffix)', '<(obj_path)<(obj_separator)util.<(obj_suffix)', '<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)', + '<(obj_path)<(obj_separator)string_decoder.<(obj_suffix)', '<(obj_path)<(obj_separator)string_search.<(obj_suffix)', '<(obj_path)<(obj_separator)stream_base.<(obj_suffix)', '<(obj_path)<(obj_separator)node_constants.<(obj_suffix)', diff --git a/src/node_internals.h b/src/node_internals.h index b3e1f5cd9f270c..094fcc2d839d5f 100644 --- a/src/node_internals.h +++ b/src/node_internals.h @@ -120,6 +120,7 @@ struct sockaddr; V(signal_wrap) \ V(spawn_sync) \ V(stream_wrap) \ + V(string_decoder) \ V(tcp_wrap) \ V(timer_wrap) \ V(trace_events) \ diff --git a/src/string_decoder-inl.h b/src/string_decoder-inl.h new file mode 100644 index 00000000000000..77d018579a1767 --- /dev/null +++ b/src/string_decoder-inl.h @@ -0,0 +1,38 @@ +#ifndef SRC_STRING_DECODER_INL_H_ +#define SRC_STRING_DECODER_INL_H_ + +#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#include "string_decoder.h" +#include "util.h" + +namespace node { + +inline void StringDecoder::SetEncoding(enum encoding encoding) { + state_[kBufferedBytes] = 0; + state_[kMissingBytes] = 0; + state_[kEncodingField] = encoding; +} + +inline enum encoding StringDecoder::Encoding() const { + return static_cast(state_[kEncodingField]); +} + +inline unsigned StringDecoder::BufferedBytes() const { 
+ return state_[kBufferedBytes]; +} + +inline unsigned StringDecoder::MissingBytes() const { + return state_[kMissingBytes]; +} + +inline char* StringDecoder::IncompleteCharacterBuffer() { + return reinterpret_cast(state_ + kIncompleteCharactersStart); +} + + +} // namespace node + +#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#endif // SRC_STRING_DECODER_INL_H_ diff --git a/src/string_decoder.cc b/src/string_decoder.cc new file mode 100644 index 00000000000000..000215e5a9fb2e --- /dev/null +++ b/src/string_decoder.cc @@ -0,0 +1,334 @@ +#include "string_decoder-inl.h" +#include "string_bytes.h" +#include "node_internals.h" +#include "node_buffer.h" + +using v8::Array; +using v8::Context; +using v8::FunctionCallbackInfo; +using v8::Integer; +using v8::Isolate; +using v8::Local; +using v8::MaybeLocal; +using v8::Object; +using v8::String; +using v8::Value; + +namespace node { + +namespace { + +MaybeLocal MakeString(Isolate* isolate, + const char* data, + size_t length, + enum encoding encoding) { + Local error; + MaybeLocal ret; + if (encoding == UTF8) { + return String::NewFromUtf8( + isolate, + data, + v8::NewStringType::kNormal, + length); + } else if (encoding == UCS2) { +#ifdef DEBUG + CHECK_EQ(reinterpret_cast(data) % 2, 0); + CHECK_EQ(length % 2, 0); +#endif + ret = StringBytes::Encode( + isolate, + reinterpret_cast(data), + length / 2, + &error); + } else { + ret = StringBytes::Encode( + isolate, + data, + length, + encoding, + &error); + } + + if (ret.IsEmpty()) { + CHECK(!error.IsEmpty()); + isolate->ThrowException(error); + } + +#ifdef DEBUG + CHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString()); +#endif + return ret.FromMaybe(Local()).As(); +} + +} // anonymous namespace + + +MaybeLocal StringDecoder::DecodeData(Isolate* isolate, + const char* data, + ssize_t* nread_ptr) { + Local prepend, body; + + size_t nread = *nread_ptr; + + if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) { + // See if we want bytes to 
finish a character from the previous + // chunk; if so, copy the new bytes to the missing bytes buffer + // and create a small string from it that is to be prepended to the + // main body. + if (MissingBytes() > 0) { + // There are never more bytes missing than the pre-calculated maximum. + CHECK_LE(MissingBytes() + BufferedBytes(), + kIncompleteCharactersEnd); + if (Encoding() == UTF8) { + // For UTF-8, we need special treatment to align with the V8 decoder: + // If an incomplete character is found at a chunk boundary, we turn + // that character into a single invalid one. + for (size_t i = 0; i < nread && i < MissingBytes(); ++i) { + if ((data[i] & 0xC0) != 0x80) { + // This byte is not a continuation byte even though it should have + // been one. + // Act as if there was a 1-byte incomplete character, which does + // not make sense but works here because we know it's invalid. + state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 1; + data += i; + nread -= i; + break; + } + } + } + + size_t found_bytes = + std::min(nread, static_cast(MissingBytes())); + memcpy(IncompleteCharacterBuffer() + BufferedBytes(), + data, + found_bytes); + // Adjust the two buffers. + data += found_bytes; + nread -= found_bytes; + + state_[kMissingBytes] -= found_bytes; + state_[kBufferedBytes] += found_bytes; + + if (LIKELY(MissingBytes() == 0)) { + // If no more bytes are missing, create a small string that we + // will later prepend. + if (!MakeString(isolate, + IncompleteCharacterBuffer(), + BufferedBytes(), + Encoding()).ToLocal(&prepend)) { + return MaybeLocal(); + } + + *nread_ptr += BufferedBytes(); + // No more buffered bytes. + state_[kBufferedBytes] = 0; + } + } + + // It could be that trying to finish the previous chunk already + // consumed all data that we received in this chunk. + if (UNLIKELY(nread == 0)) { + body = !prepend.IsEmpty() ? 
prepend : String::Empty(isolate); + prepend = Local(); + } else { +#ifdef DEBUG + // If not, that means is no character left to finish at this point. + CHECK_EQ(MissingBytes(), 0); + CHECK_EQ(BufferedBytes(), 0); +#endif + + // See whether there is a character that we may have to cut off and + // finish when receiving the next chunk. + if (Encoding() == UTF8 && data[nread - 1] & 0x80) { + // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte. + // This means we'll need to figure out where the character to which + // the byte belongs begins. + for (size_t i = nread - 1; ; --i) { +#ifdef DEBUG + CHECK_LT(i, nread); +#endif + state_[kBufferedBytes]++; + if ((data[i] & 0xC0) == 0x80) { + // This byte does not start a character (a "trailing" bytes). + if (state_[kBufferedBytes] >= 4 || i == 0) { + // We either have more then 4 trailing bytes (which means + // the current character would not be inside the range for + // valid Unicode, and in particular cannot be represented + // through JavaScript's UTF-16-based approach to strings), or the + // current buffer does not contain the start of an UTF-8 character + // at all. Either way, this is invalid UTF8 and we can just + // let the engine's decoder handle it. + state_[kBufferedBytes] = 0; + break; + } + } else { + // Found the first byte of a UTF-8 character. By looking at the + // upper bits we can tell how long the character *should* be. + if ((data[i] & 0xE0) == 0xC0) { + state_[kMissingBytes] = 2; + } else if ((data[i] & 0xF0) == 0xE0) { + state_[kMissingBytes] = 3; + } else if ((data[i] & 0xF8) == 0xF0) { + state_[kMissingBytes] = 4; + } else { + // This lead byte would indicate a character outside of the + // representable range. + state_[kBufferedBytes] = 0; + break; + } + + if (BufferedBytes() >= MissingBytes()) { + // Received more or exactly as many trailing bytes than the lead + // character would indicate. 
In the "==" case, we have valid + // data and don't need to slice anything off; + // in the ">" case, this is invalid UTF-8 anyway. + state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 0; + } + + state_[kMissingBytes] -= state_[kBufferedBytes]; + break; + } + } + } else if (Encoding() == UCS2) { + if ((nread % 2) == 1) { + // We got half a codepoint, and need the second byte of it. + state_[kBufferedBytes] = 1; + state_[kMissingBytes] = 1; + } else if ((data[nread - 1] & 0xFC) == 0xD8) { + // Half a split UTF-16 character. + state_[kBufferedBytes] = 2; + state_[kMissingBytes] = 2; + } + } else if (Encoding() == BASE64) { + state_[kBufferedBytes] = nread % 3; + if (state_[kBufferedBytes] > 0) + state_[kMissingBytes] = 3 - BufferedBytes(); + } + + if (BufferedBytes() > 0) { + // Copy the requested number of buffered bytes from the end of the + // input into the incomplete character buffer. + nread -= BufferedBytes(); + *nread_ptr -= BufferedBytes(); + memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes()); + } + + if (nread > 0) { + if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body)) + return MaybeLocal(); + } else { + body = String::Empty(isolate); + } + } + + if (!prepend.IsEmpty()) { + return String::Concat(prepend, body); + } + + return body; + } else { + CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1); + return MakeString(isolate, data, nread, Encoding()); + } +} + +MaybeLocal StringDecoder::FlushData(Isolate* isolate) { + if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) { + CHECK_EQ(MissingBytes(), 0); + CHECK_EQ(BufferedBytes(), 0); + } + + if (Encoding() == UCS2 && (BufferedBytes() % 2) == 1) { + // Ignore a single trailing byte, like the JS decoder does. 
+ state_[kMissingBytes]--; + state_[kBufferedBytes]--; + } + + if (BufferedBytes() == 0) + return String::Empty(isolate); + + MaybeLocal ret = + MakeString(isolate, + IncompleteCharacterBuffer(), + BufferedBytes(), + Encoding()); + + state_[kMissingBytes] = 0; + state_[kBufferedBytes] = 0; + + return ret; +} + +namespace { + +void DecodeData(const FunctionCallbackInfo& args) { + StringDecoder* decoder = + reinterpret_cast(Buffer::Data(args[0])); + CHECK_NE(decoder, nullptr); + ssize_t nread = Buffer::Length(args[1]); + MaybeLocal ret = + decoder->DecodeData(args.GetIsolate(), Buffer::Data(args[1]), &nread); + if (!ret.IsEmpty()) + args.GetReturnValue().Set(ret.ToLocalChecked()); +} + +void FlushData(const FunctionCallbackInfo& args) { + StringDecoder* decoder = + reinterpret_cast(Buffer::Data(args[0])); + CHECK_NE(decoder, nullptr); + MaybeLocal ret = decoder->FlushData(args.GetIsolate()); + if (!ret.IsEmpty()) + args.GetReturnValue().Set(ret.ToLocalChecked()); +} + +void InitializeStringDecoder(Local target, + Local unused, + Local context) { + Environment* env = Environment::GetCurrent(context); + Isolate* isolate = env->isolate(); + +#define SET_DECODER_CONSTANT(name) \ + target->Set(context, \ + FIXED_ONE_BYTE_STRING(isolate, #name), \ + Integer::New(isolate, StringDecoder::name)).FromJust() + + SET_DECODER_CONSTANT(kIncompleteCharactersStart); + SET_DECODER_CONSTANT(kIncompleteCharactersEnd); + SET_DECODER_CONSTANT(kMissingBytes); + SET_DECODER_CONSTANT(kBufferedBytes); + SET_DECODER_CONSTANT(kEncodingField); + SET_DECODER_CONSTANT(kNumFields); + + Local encodings = Array::New(isolate); +#define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \ + encodings->Set(context, \ + static_cast(cname), \ + FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust() + ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii"); + ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8"); + ADD_TO_ENCODINGS_ARRAY(BASE64, "base64"); + ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le"); + ADD_TO_ENCODINGS_ARRAY(HEX, "hex"); + 
ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer"); + ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1"); + + target->Set(context, + FIXED_ONE_BYTE_STRING(isolate, "encodings"), + encodings).FromJust(); + + target->Set(context, + FIXED_ONE_BYTE_STRING(isolate, "kSize"), + Integer::New(isolate, sizeof(StringDecoder))).FromJust(); + + env->SetMethod(target, "decode", DecodeData); + env->SetMethod(target, "flush", FlushData); +} + +} // anonymous namespace + +} // namespace node + +NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder, + node::InitializeStringDecoder) diff --git a/src/string_decoder.h b/src/string_decoder.h new file mode 100644 index 00000000000000..fd7ba579f917cb --- /dev/null +++ b/src/string_decoder.h @@ -0,0 +1,50 @@ +#ifndef SRC_STRING_DECODER_H_ +#define SRC_STRING_DECODER_H_ + +#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#include "node.h" + +namespace node { + +class StringDecoder { + public: + StringDecoder() { state_[kEncodingField] = BUFFER; } + void SetEncoding(enum encoding encoding); + enum encoding Encoding() const; + + char* IncompleteCharacterBuffer(); + unsigned MissingBytes() const; + unsigned BufferedBytes() const; + + // Decode a string from the specified encoding. + // The value pointed to by `nread` will be modified to reflect that + // less data may have been read because it ended on an incomplete character + // and more data may have been read because a previously incomplete character + // was finished. + v8::MaybeLocal DecodeData(v8::Isolate* isolate, + const char* data, + ssize_t* nread); + // Flush an incomplete character. For character encodings like UTF8 this + // means printing replacement characters, buf for e.g. Base64 the returned + // string contains more data. 
+ v8::MaybeLocal FlushData(v8::Isolate* isolate); + + enum Fields { + kIncompleteCharactersStart = 0, + kIncompleteCharactersEnd = 4, + kMissingBytes = 4, + kBufferedBytes = 5, + kEncodingField = 6, + kNumFields = 7 + }; + + private: + uint8_t state_[kNumFields] = {}; +}; + +} // namespace node + +#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS + +#endif // SRC_STRING_DECODER_H_ diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 9d1fe69a25df73..21a0b6c3e38539 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -128,6 +128,10 @@ assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), ''); assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), ''); assert.strictEqual(decoder.end(), '\ud83d'); +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d'); +assert.strictEqual(decoder.end(), ''); + common.expectsError( () => new StringDecoder(1), { From 208bf3a94039297d1d97603e2fb4619c59a70f88 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Sat, 3 Feb 2018 20:04:07 +0100 Subject: [PATCH 2/4] [squash] skip call into native inside flush if no bytes have been buffered --- lib/string_decoder.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 69514038a85104..d955a663307de9 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -76,7 +76,9 @@ class StringDecoder { let ret = ''; if (buf !== undefined) ret = this.write(buf); - return ret + flush(this[kNativeDecoder]); + if (this[kNativeDecoder][kBufferedBytes] > 0) + ret += flush(this[kNativeDecoder]); + return ret; } /* Everything below this line is undocumented legacy stuff. 
*/ From ebf70eb95164e0b8a673e00bfb7b39896a1eb6a8 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 9 Feb 2018 10:18:13 +0100 Subject: [PATCH 3/4] [squash] comments --- src/string_decoder.cc | 14 +++++++------- src/string_decoder.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/string_decoder.cc b/src/string_decoder.cc index 000215e5a9fb2e..ad1bace918c678 100644 --- a/src/string_decoder.cc +++ b/src/string_decoder.cc @@ -65,7 +65,7 @@ MaybeLocal MakeString(Isolate* isolate, MaybeLocal StringDecoder::DecodeData(Isolate* isolate, const char* data, - ssize_t* nread_ptr) { + size_t* nread_ptr) { Local prepend, body; size_t nread = *nread_ptr; @@ -150,7 +150,7 @@ MaybeLocal StringDecoder::DecodeData(Isolate* isolate, #endif state_[kBufferedBytes]++; if ((data[i] & 0xC0) == 0x80) { - // This byte does not start a character (a "trailing" bytes). + // This byte does not start a character (a "trailing" byte). if (state_[kBufferedBytes] >= 4 || i == 0) { // We either have more then 4 trailing bytes (which means // the current character would not be inside the range for @@ -223,11 +223,11 @@ MaybeLocal StringDecoder::DecodeData(Isolate* isolate, } } - if (!prepend.IsEmpty()) { + if (prepend.IsEmpty()) { + return body; + } else { return String::Concat(prepend, body); } - - return body; } else { CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1); return MakeString(isolate, data, nread, Encoding()); @@ -240,7 +240,7 @@ MaybeLocal StringDecoder::FlushData(Isolate* isolate) { CHECK_EQ(BufferedBytes(), 0); } - if (Encoding() == UCS2 && (BufferedBytes() % 2) == 1) { + if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) { // Ignore a single trailing byte, like the JS decoder does. 
state_[kMissingBytes]--; state_[kBufferedBytes]--; @@ -267,7 +267,7 @@ void DecodeData(const FunctionCallbackInfo& args) { StringDecoder* decoder = reinterpret_cast(Buffer::Data(args[0])); CHECK_NE(decoder, nullptr); - ssize_t nread = Buffer::Length(args[1]); + size_t nread = Buffer::Length(args[1]); MaybeLocal ret = decoder->DecodeData(args.GetIsolate(), Buffer::Data(args[1]), &nread); if (!ret.IsEmpty()) diff --git a/src/string_decoder.h b/src/string_decoder.h index fd7ba579f917cb..75b6d0255f926a 100644 --- a/src/string_decoder.h +++ b/src/string_decoder.h @@ -24,7 +24,7 @@ class StringDecoder { // was finished. v8::MaybeLocal DecodeData(v8::Isolate* isolate, const char* data, - ssize_t* nread); + size_t* nread); // Flush an incomplete character. For character encodings like UTF8 this // means printing replacement characters, buf for e.g. Base64 the returned // string contains more data. From 4841d864062579e63c6414e08984d167cfa7ef90 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 9 Feb 2018 16:13:41 +0100 Subject: [PATCH 4/4] [squash] move inline tags into main header --- src/string_decoder-inl.h | 10 +++++----- src/string_decoder.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/string_decoder-inl.h b/src/string_decoder-inl.h index 77d018579a1767..8a04211906f759 100644 --- a/src/string_decoder-inl.h +++ b/src/string_decoder-inl.h @@ -8,25 +8,25 @@ namespace node { -inline void StringDecoder::SetEncoding(enum encoding encoding) { +void StringDecoder::SetEncoding(enum encoding encoding) { state_[kBufferedBytes] = 0; state_[kMissingBytes] = 0; state_[kEncodingField] = encoding; } -inline enum encoding StringDecoder::Encoding() const { +enum encoding StringDecoder::Encoding() const { return static_cast(state_[kEncodingField]); } -inline unsigned StringDecoder::BufferedBytes() const { +unsigned StringDecoder::BufferedBytes() const { return state_[kBufferedBytes]; } -inline unsigned StringDecoder::MissingBytes() const { 
+unsigned StringDecoder::MissingBytes() const { return state_[kMissingBytes]; } -inline char* StringDecoder::IncompleteCharacterBuffer() { +char* StringDecoder::IncompleteCharacterBuffer() { return reinterpret_cast(state_ + kIncompleteCharactersStart); } diff --git a/src/string_decoder.h b/src/string_decoder.h index 75b6d0255f926a..9059eeaa9d2eb7 100644 --- a/src/string_decoder.h +++ b/src/string_decoder.h @@ -10,12 +10,12 @@ namespace node { class StringDecoder { public: StringDecoder() { state_[kEncodingField] = BUFFER; } - void SetEncoding(enum encoding encoding); - enum encoding Encoding() const; + inline void SetEncoding(enum encoding encoding); + inline enum encoding Encoding() const; - char* IncompleteCharacterBuffer(); - unsigned MissingBytes() const; - unsigned BufferedBytes() const; + inline char* IncompleteCharacterBuffer(); + inline unsigned MissingBytes() const; + inline unsigned BufferedBytes() const; // Decode a string from the specified encoding. // The value pointed to by `nread` will be modified to reflect that