Skip to content

Commit 180af17

Browse files
addaleax authored and BridgeAR committed
string_decoder: reimplement in C++
Implement the string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation shows some performance degradation), that this can now be used more easily to add native decoding support to C++ streams, and (arguably) more readable variable names. PR-URL: #18537 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
1 parent de848ac commit 180af17

File tree

7 files changed

+478
-238
lines changed

7 files changed

+478
-238
lines changed

lib/string_decoder.js

Lines changed: 47 additions & 238 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,23 @@
2222
'use strict';
2323

2424
const { Buffer } = require('buffer');
25+
const {
26+
kIncompleteCharactersStart,
27+
kIncompleteCharactersEnd,
28+
kMissingBytes,
29+
kBufferedBytes,
30+
kEncodingField,
31+
kSize,
32+
decode,
33+
flush,
34+
encodings
35+
} = internalBinding('string_decoder');
2536
const internalUtil = require('internal/util');
2637
const errors = require('internal/errors');
2738
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
2839

40+
const kNativeDecoder = Symbol('kNativeDecoder');
41+
2942
// Do not cache `Buffer.isEncoding` when checking encoding names as some
3043
// modules monkey-patch it to support additional encodings
3144
function normalizeEncoding(enc) {
@@ -36,258 +49,54 @@ function normalizeEncoding(enc) {
3649
return nenc || enc;
3750
}
3851

52+
const encodingsMap = {};
53+
for (var i = 0; i < encodings.length; ++i)
54+
encodingsMap[encodings[i]] = i;
55+
3956
// StringDecoder provides an interface for efficiently splitting a series of
4057
// buffers into a series of JS strings without breaking apart multi-byte
4158
// characters.
42-
exports.StringDecoder = StringDecoder;
43-
function StringDecoder(encoding) {
44-
this.encoding = normalizeEncoding(encoding);
45-
var nb;
46-
switch (this.encoding) {
47-
case 'utf16le':
48-
this.text = utf16Text;
49-
this.end = utf16End;
50-
nb = 4;
51-
break;
52-
case 'utf8':
53-
this.fillLast = utf8FillLast;
54-
nb = 4;
55-
break;
56-
case 'base64':
57-
this.text = base64Text;
58-
this.end = base64End;
59-
nb = 3;
60-
break;
61-
default:
62-
this.write = simpleWrite;
63-
this.end = simpleEnd;
64-
return;
65-
}
66-
this.lastNeed = 0;
67-
this.lastTotal = 0;
68-
this.lastChar = Buffer.allocUnsafe(nb);
69-
}
70-
71-
StringDecoder.prototype.write = function(buf) {
72-
if (buf.length === 0)
73-
return '';
74-
var r;
75-
var i;
76-
if (this.lastNeed) {
77-
r = this.fillLast(buf);
78-
if (r === undefined)
79-
return '';
80-
i = this.lastNeed;
81-
this.lastNeed = 0;
82-
} else {
83-
i = 0;
84-
}
85-
if (i < buf.length)
86-
return (r ? r + this.text(buf, i) : this.text(buf, i));
87-
return r || '';
88-
};
89-
90-
StringDecoder.prototype.end = utf8End;
91-
92-
// Returns only complete characters in a Buffer
93-
StringDecoder.prototype.text = utf8Text;
94-
95-
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
96-
StringDecoder.prototype.fillLast = function(buf) {
97-
if (this.lastNeed <= buf.length) {
98-
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
99-
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
100-
}
101-
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
102-
this.lastNeed -= buf.length;
103-
};
104-
105-
// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
106-
// continuation byte. If an invalid byte is detected, -2 is returned.
107-
function utf8CheckByte(byte) {
108-
if (byte <= 0x7F)
109-
return 0;
110-
else if (byte >> 5 === 0x06)
111-
return 2;
112-
else if (byte >> 4 === 0x0E)
113-
return 3;
114-
else if (byte >> 3 === 0x1E)
115-
return 4;
116-
return (byte >> 6 === 0x02 ? -1 : -2);
117-
}
118-
119-
// Checks at most 3 bytes at the end of a Buffer in order to detect an
120-
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
121-
// needed to complete the UTF-8 character (if applicable) are returned.
122-
function utf8CheckIncomplete(self, buf, i) {
123-
var j = buf.length - 1;
124-
if (j < i)
125-
return 0;
126-
var nb = utf8CheckByte(buf[j]);
127-
if (nb >= 0) {
128-
if (nb > 0)
129-
self.lastNeed = nb - 1;
130-
return nb;
131-
}
132-
if (--j < i || nb === -2)
133-
return 0;
134-
nb = utf8CheckByte(buf[j]);
135-
if (nb >= 0) {
136-
if (nb > 0)
137-
self.lastNeed = nb - 2;
138-
return nb;
139-
}
140-
if (--j < i || nb === -2)
141-
return 0;
142-
nb = utf8CheckByte(buf[j]);
143-
if (nb >= 0) {
144-
if (nb > 0) {
145-
if (nb === 2)
146-
nb = 0;
147-
else
148-
self.lastNeed = nb - 3;
149-
}
150-
return nb;
151-
}
152-
return 0;
153-
}
154-
155-
// Validates as many continuation bytes for a multi-byte UTF-8 character as
156-
// needed or are available. If we see a non-continuation byte where we expect
157-
// one, we "replace" the validated continuation bytes we've seen so far with
158-
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
159-
// behavior. The continuation byte check is included three times in the case
160-
// where all of the continuation bytes for a character exist in the same buffer.
161-
// It is also done this way as a slight performance increase instead of using a
162-
// loop.
163-
function utf8CheckExtraBytes(self, buf, p) {
164-
if ((buf[0] & 0xC0) !== 0x80) {
165-
self.lastNeed = 0;
166-
return '\ufffd';
167-
}
168-
if (self.lastNeed > 1 && buf.length > 1) {
169-
if ((buf[1] & 0xC0) !== 0x80) {
170-
self.lastNeed = 1;
171-
return '\ufffd';
172-
}
173-
if (self.lastNeed > 2 && buf.length > 2) {
174-
if ((buf[2] & 0xC0) !== 0x80) {
175-
self.lastNeed = 2;
176-
return '\ufffd';
177-
}
178-
}
59+
class StringDecoder {
60+
constructor(encoding) {
61+
this.encoding = normalizeEncoding(encoding);
62+
this[kNativeDecoder] = Buffer.alloc(kSize);
63+
this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding];
17964
}
180-
}
18165

182-
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
183-
function utf8FillLast(buf) {
184-
const p = this.lastTotal - this.lastNeed;
185-
var r = utf8CheckExtraBytes(this, buf, p);
186-
if (r !== undefined)
187-
return r;
188-
if (this.lastNeed <= buf.length) {
189-
buf.copy(this.lastChar, p, 0, this.lastNeed);
190-
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
66+
write(buf) {
67+
if (typeof buf === 'string')
68+
return buf;
69+
if (!ArrayBuffer.isView(buf))
70+
throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf',
71+
['Buffer', 'Uint8Array', 'ArrayBufferView']);
72+
return decode(this[kNativeDecoder], buf);
19173
}
192-
buf.copy(this.lastChar, p, 0, buf.length);
193-
this.lastNeed -= buf.length;
194-
}
19574

196-
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
197-
// partial character, the character's bytes are buffered until the required
198-
// number of bytes are available.
199-
function utf8Text(buf, i) {
200-
const total = utf8CheckIncomplete(this, buf, i);
201-
if (!this.lastNeed)
202-
return buf.toString('utf8', i);
203-
this.lastTotal = total;
204-
const end = buf.length - (total - this.lastNeed);
205-
buf.copy(this.lastChar, 0, end);
206-
return buf.toString('utf8', i, end);
207-
}
208-
209-
// For UTF-8, a replacement character is added when ending on a partial
210-
// character.
211-
function utf8End(buf) {
212-
const r = (buf && buf.length ? this.write(buf) : '');
213-
if (this.lastNeed) {
214-
this.lastNeed = 0;
215-
this.lastTotal = 0;
216-
return r + '\ufffd';
75+
end(buf) {
76+
let ret = '';
77+
if (buf !== undefined)
78+
ret = this.write(buf);
79+
if (this[kNativeDecoder][kBufferedBytes] > 0)
80+
ret += flush(this[kNativeDecoder]);
81+
return ret;
21782
}
218-
return r;
219-
}
22083

221-
// UTF-16LE typically needs two bytes per character, but even if we have an even
222-
// number of bytes available, we need to check if we end on a leading/high
223-
// surrogate. In that case, we need to wait for the next two bytes in order to
224-
// decode the last character properly.
225-
function utf16Text(buf, i) {
226-
if ((buf.length - i) % 2 === 0) {
227-
const r = buf.toString('utf16le', i);
228-
if (r) {
229-
const c = r.charCodeAt(r.length - 1);
230-
if (c >= 0xD800 && c <= 0xDBFF) {
231-
this.lastNeed = 2;
232-
this.lastTotal = 4;
233-
this.lastChar[0] = buf[buf.length - 2];
234-
this.lastChar[1] = buf[buf.length - 1];
235-
return r.slice(0, -1);
236-
}
237-
}
238-
return r;
239-
}
240-
this.lastNeed = 1;
241-
this.lastTotal = 2;
242-
this.lastChar[0] = buf[buf.length - 1];
243-
return buf.toString('utf16le', i, buf.length - 1);
244-
}
84+
/* Everything below this line is undocumented legacy stuff. */
24585

246-
// For UTF-16LE we do not explicitly append special replacement characters if we
247-
// end on a partial character, we simply let v8 handle that.
248-
function utf16End(buf) {
249-
const r = (buf && buf.length ? this.write(buf) : '');
250-
if (this.lastNeed) {
251-
const end = this.lastTotal - this.lastNeed;
252-
this.lastNeed = 0;
253-
this.lastTotal = 0;
254-
return r + this.lastChar.toString('utf16le', 0, end);
86+
text(buf, offset) {
87+
this[kNativeDecoder][kMissingBytes] = 0;
88+
this[kNativeDecoder][kBufferedBytes] = 0;
89+
return this.write(buf.slice(offset));
25590
}
256-
return r;
257-
}
25891

259-
function base64Text(buf, i) {
260-
const n = (buf.length - i) % 3;
261-
if (n === 0)
262-
return buf.toString('base64', i);
263-
this.lastNeed = 3 - n;
264-
this.lastTotal = 3;
265-
if (n === 1) {
266-
this.lastChar[0] = buf[buf.length - 1];
267-
} else {
268-
this.lastChar[0] = buf[buf.length - 2];
269-
this.lastChar[1] = buf[buf.length - 1];
92+
get lastTotal() {
93+
return this[kNativeDecoder][kBufferedBytes] + this.lastNeed;
27094
}
271-
return buf.toString('base64', i, buf.length - n);
272-
}
273-
27495

275-
function base64End(buf) {
276-
const r = (buf && buf.length ? this.write(buf) : '');
277-
if (this.lastNeed) {
278-
const end = 3 - this.lastNeed;
279-
this.lastNeed = 0;
280-
this.lastTotal = 0;
281-
return r + this.lastChar.toString('base64', 0, end);
96+
get lastChar() {
97+
return this[kNativeDecoder].subarray(kIncompleteCharactersStart,
98+
kIncompleteCharactersEnd);
28299
}
283-
return r;
284100
}
285101

286-
// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
287-
function simpleWrite(buf) {
288-
return buf.toString(this.encoding);
289-
}
290-
291-
function simpleEnd(buf) {
292-
return (buf && buf.length ? this.write(buf) : '');
293-
}
102+
exports.StringDecoder = StringDecoder;

node.gyp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -326,6 +326,7 @@
326326
'src/signal_wrap.cc',
327327
'src/spawn_sync.cc',
328328
'src/string_bytes.cc',
329+
'src/string_decoder.cc',
329330
'src/string_search.cc',
330331
'src/stream_base.cc',
331332
'src/stream_wrap.cc',
@@ -379,6 +380,8 @@
379380
'src/req_wrap.h',
380381
'src/req_wrap-inl.h',
381382
'src/string_bytes.h',
383+
'src/string_decoder.h',
384+
'src/string_decoder-inl.h',
382385
'src/stream_base.h',
383386
'src/stream_base-inl.h',
384387
'src/stream_wrap.h',
@@ -989,6 +992,7 @@
989992
'<(obj_path)<(obj_separator)node_url.<(obj_suffix)',
990993
'<(obj_path)<(obj_separator)util.<(obj_suffix)',
991994
'<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)',
995+
'<(obj_path)<(obj_separator)string_decoder.<(obj_suffix)',
992996
'<(obj_path)<(obj_separator)string_search.<(obj_suffix)',
993997
'<(obj_path)<(obj_separator)stream_base.<(obj_suffix)',
994998
'<(obj_path)<(obj_separator)node_constants.<(obj_suffix)',

src/node_internals.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,7 @@ struct sockaddr;
120120
V(signal_wrap) \
121121
V(spawn_sync) \
122122
V(stream_wrap) \
123+
V(string_decoder) \
123124
V(tcp_wrap) \
124125
V(timer_wrap) \
125126
V(trace_events) \

src/string_decoder-inl.h

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,38 @@
1+
#ifndef SRC_STRING_DECODER_INL_H_
2+
#define SRC_STRING_DECODER_INL_H_
3+
4+
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
5+
6+
#include "string_decoder.h"
7+
#include "util.h"
8+
9+
namespace node {
10+
11+
void StringDecoder::SetEncoding(enum encoding encoding) {
12+
state_[kBufferedBytes] = 0;
13+
state_[kMissingBytes] = 0;
14+
state_[kEncodingField] = encoding;
15+
}
16+
17+
enum encoding StringDecoder::Encoding() const {
18+
return static_cast<enum encoding>(state_[kEncodingField]);
19+
}
20+
21+
unsigned StringDecoder::BufferedBytes() const {
22+
return state_[kBufferedBytes];
23+
}
24+
25+
unsigned StringDecoder::MissingBytes() const {
26+
return state_[kMissingBytes];
27+
}
28+
29+
char* StringDecoder::IncompleteCharacterBuffer() {
30+
return reinterpret_cast<char*>(state_ + kIncompleteCharactersStart);
31+
}
32+
33+
34+
} // namespace node
35+
36+
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
37+
38+
#endif // SRC_STRING_DECODER_INL_H_

0 commit comments

Comments
 (0)