22
22
'use strict' ;
23
23
24
24
const { Buffer } = require ( 'buffer' ) ;
25
+ const {
26
+ kIncompleteCharactersStart,
27
+ kIncompleteCharactersEnd,
28
+ kMissingBytes,
29
+ kBufferedBytes,
30
+ kEncodingField,
31
+ kSize,
32
+ decode,
33
+ flush,
34
+ encodings
35
+ } = internalBinding ( 'string_decoder' ) ;
25
36
const internalUtil = require ( 'internal/util' ) ;
26
37
const errors = require ( 'internal/errors' ) ;
27
38
const isEncoding = Buffer [ internalUtil . kIsEncodingSymbol ] ;
28
39
40
+ const kNativeDecoder = Symbol ( 'kNativeDecoder' ) ;
41
+
29
42
// Do not cache `Buffer.isEncoding` when checking encoding names as some
30
43
// modules monkey-patch it to support additional encodings
31
44
function normalizeEncoding ( enc ) {
@@ -36,258 +49,54 @@ function normalizeEncoding(enc) {
36
49
return nenc || enc ;
37
50
}
38
51
52
// Reverse lookup table: maps each name listed in the native binding's
// `encodings` list to its position in that list.
const encodingsMap = {};
for (let index = 0; index < encodings.length; index += 1) {
  const encodingName = encodings[index];
  encodingsMap[encodingName] = index;
}
55
+
39
56
// StringDecoder provides an interface for efficiently splitting a series of
40
57
// buffers into a series of JS strings without breaking apart multi-byte
41
58
// characters.
42
- exports . StringDecoder = StringDecoder ;
43
- function StringDecoder ( encoding ) {
44
- this . encoding = normalizeEncoding ( encoding ) ;
45
- var nb ;
46
- switch ( this . encoding ) {
47
- case 'utf16le' :
48
- this . text = utf16Text ;
49
- this . end = utf16End ;
50
- nb = 4 ;
51
- break ;
52
- case 'utf8' :
53
- this . fillLast = utf8FillLast ;
54
- nb = 4 ;
55
- break ;
56
- case 'base64' :
57
- this . text = base64Text ;
58
- this . end = base64End ;
59
- nb = 3 ;
60
- break ;
61
- default :
62
- this . write = simpleWrite ;
63
- this . end = simpleEnd ;
64
- return ;
65
- }
66
- this . lastNeed = 0 ;
67
- this . lastTotal = 0 ;
68
- this . lastChar = Buffer . allocUnsafe ( nb ) ;
69
- }
70
-
71
- StringDecoder . prototype . write = function ( buf ) {
72
- if ( buf . length === 0 )
73
- return '' ;
74
- var r ;
75
- var i ;
76
- if ( this . lastNeed ) {
77
- r = this . fillLast ( buf ) ;
78
- if ( r === undefined )
79
- return '' ;
80
- i = this . lastNeed ;
81
- this . lastNeed = 0 ;
82
- } else {
83
- i = 0 ;
84
- }
85
- if ( i < buf . length )
86
- return ( r ? r + this . text ( buf , i ) : this . text ( buf , i ) ) ;
87
- return r || '' ;
88
- } ;
89
-
90
- StringDecoder . prototype . end = utf8End ;
91
-
92
- // Returns only complete characters in a Buffer
93
- StringDecoder . prototype . text = utf8Text ;
94
-
95
- // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
96
- StringDecoder . prototype . fillLast = function ( buf ) {
97
- if ( this . lastNeed <= buf . length ) {
98
- buf . copy ( this . lastChar , this . lastTotal - this . lastNeed , 0 , this . lastNeed ) ;
99
- return this . lastChar . toString ( this . encoding , 0 , this . lastTotal ) ;
100
- }
101
- buf . copy ( this . lastChar , this . lastTotal - this . lastNeed , 0 , buf . length ) ;
102
- this . lastNeed -= buf . length ;
103
- } ;
104
-
105
- // Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
106
- // continuation byte. If an invalid byte is detected, -2 is returned.
107
- function utf8CheckByte ( byte ) {
108
- if ( byte <= 0x7F )
109
- return 0 ;
110
- else if ( byte >> 5 === 0x06 )
111
- return 2 ;
112
- else if ( byte >> 4 === 0x0E )
113
- return 3 ;
114
- else if ( byte >> 3 === 0x1E )
115
- return 4 ;
116
- return ( byte >> 6 === 0x02 ? - 1 : - 2 ) ;
117
- }
118
-
119
- // Checks at most 3 bytes at the end of a Buffer in order to detect an
120
- // incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
121
- // needed to complete the UTF-8 character (if applicable) are returned.
122
- function utf8CheckIncomplete ( self , buf , i ) {
123
- var j = buf . length - 1 ;
124
- if ( j < i )
125
- return 0 ;
126
- var nb = utf8CheckByte ( buf [ j ] ) ;
127
- if ( nb >= 0 ) {
128
- if ( nb > 0 )
129
- self . lastNeed = nb - 1 ;
130
- return nb ;
131
- }
132
- if ( -- j < i || nb === - 2 )
133
- return 0 ;
134
- nb = utf8CheckByte ( buf [ j ] ) ;
135
- if ( nb >= 0 ) {
136
- if ( nb > 0 )
137
- self . lastNeed = nb - 2 ;
138
- return nb ;
139
- }
140
- if ( -- j < i || nb === - 2 )
141
- return 0 ;
142
- nb = utf8CheckByte ( buf [ j ] ) ;
143
- if ( nb >= 0 ) {
144
- if ( nb > 0 ) {
145
- if ( nb === 2 )
146
- nb = 0 ;
147
- else
148
- self . lastNeed = nb - 3 ;
149
- }
150
- return nb ;
151
- }
152
- return 0 ;
153
- }
154
-
155
- // Validates as many continuation bytes for a multi-byte UTF-8 character as
156
- // needed or are available. If we see a non-continuation byte where we expect
157
- // one, we "replace" the validated continuation bytes we've seen so far with
158
- // a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
159
- // behavior. The continuation byte check is included three times in the case
160
- // where all of the continuation bytes for a character exist in the same buffer.
161
- // It is also done this way as a slight performance increase instead of using a
162
- // loop.
163
- function utf8CheckExtraBytes ( self , buf , p ) {
164
- if ( ( buf [ 0 ] & 0xC0 ) !== 0x80 ) {
165
- self . lastNeed = 0 ;
166
- return '\ufffd' ;
167
- }
168
- if ( self . lastNeed > 1 && buf . length > 1 ) {
169
- if ( ( buf [ 1 ] & 0xC0 ) !== 0x80 ) {
170
- self . lastNeed = 1 ;
171
- return '\ufffd' ;
172
- }
173
- if ( self . lastNeed > 2 && buf . length > 2 ) {
174
- if ( ( buf [ 2 ] & 0xC0 ) !== 0x80 ) {
175
- self . lastNeed = 2 ;
176
- return '\ufffd' ;
177
- }
178
- }
59
/**
 * Decodes a sequence of buffers into strings by delegating to the native
 * `string_decoder` binding. All per-instance decoder state is kept in a
 * single Buffer stored under the private `kNativeDecoder` symbol.
 */
class StringDecoder {
  /**
   * @param {string} [encoding] Encoding name; it is passed through
   *   `normalizeEncoding()` and the result is exposed as `this.encoding`.
   */
  constructor(encoding) {
    this.encoding = normalizeEncoding(encoding);
    // Zero-filled state block of `kSize` bytes that is handed to the native
    // `decode()` / `flush()` functions on every call.
    this[kNativeDecoder] = Buffer.alloc(kSize);
    // Record the encoding as its index in the binding's `encodings` list.
    this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding];
  }

  /**
   * Decodes `buf` and returns the resulting string.
   *
   * @param {string|ArrayBufferView} buf Input to decode. Strings are
   *   returned unchanged.
   * @returns {string}
   * @throws {TypeError} `ERR_INVALID_ARG_TYPE` when `buf` is neither a string
   *   nor an ArrayBufferView.
   */
  write(buf) {
    if (typeof buf === 'string')
      return buf;
    if (!ArrayBuffer.isView(buf))
      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf',
                                 ['Buffer', 'Uint8Array', 'ArrayBufferView']);
    return decode(this[kNativeDecoder], buf);
  }

  /**
   * Optionally decodes a final chunk, then appends whatever the native
   * `flush()` returns if the state block still reports buffered bytes.
   *
   * @param {string|ArrayBufferView} [buf] Final input; skipped only when it
   *   is exactly `undefined`.
   * @returns {string}
   */
  end(buf) {
    let ret = '';
    if (buf !== undefined)
      ret = this.write(buf);
    if (this[kNativeDecoder][kBufferedBytes] > 0)
      ret += flush(this[kNativeDecoder]);
    return ret;
  }

  /* Everything below this line is undocumented legacy stuff. */

  /**
   * Legacy helper: clears the missing/buffered byte counters in the state
   * block, then decodes `buf` starting at `offset`.
   *
   * @param {Buffer} buf
   * @param {number} offset Start index passed to `buf.slice()`.
   * @returns {string}
   */
  text(buf, offset) {
    this[kNativeDecoder][kMissingBytes] = 0;
    this[kNativeDecoder][kBufferedBytes] = 0;
    return this.write(buf.slice(offset));
  }

  /**
   * Legacy accessor for the `kMissingBytes` field of the state block
   * (presumably the number of bytes still needed to complete the pending
   * character -- confirm against the native implementation).
   *
   * `lastTotal` previously read `this.lastNeed`, which was never defined on
   * this class, so it always evaluated to NaN.
   */
  get lastNeed() {
    return this[kNativeDecoder][kMissingBytes];
  }

  /**
   * Legacy accessor: sum of the `kBufferedBytes` and `kMissingBytes` fields.
   * Reads the fields directly so the result is always a number.
   */
  get lastTotal() {
    const state = this[kNativeDecoder];
    return state[kBufferedBytes] + state[kMissingBytes];
  }

  /**
   * Legacy accessor: a view (not a copy) over the state block between
   * `kIncompleteCharactersStart` and `kIncompleteCharactersEnd`.
   */
  get lastChar() {
    return this[kNativeDecoder].subarray(kIncompleteCharactersStart,
                                         kIncompleteCharactersEnd);
  }
}
285
101
286
- // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
287
- function simpleWrite ( buf ) {
288
- return buf . toString ( this . encoding ) ;
289
- }
290
-
291
- function simpleEnd ( buf ) {
292
- return ( buf && buf . length ? this . write ( buf ) : '' ) ;
293
- }
102
+ exports . StringDecoder = StringDecoder ;
0 commit comments