string_decoder: align UTF-8 handling with V8

mscdex · targos · commit 24ef1e677575 · 2017-01-26T22:46:18.000+01:00
V8 5.5 changed how invalid characters are handled and it now appears to follow the WHATWG Encoding standard, where all of an invalid character's bytes are replaced by a single replacement character (\ufffd) instead of replacing each invalid byte with separate replacement characters. Example: the byte sequence 0xF0,0xB8,0x41 is decoded as '\ufffdA' in V8 5.5, but is decoded as '\ufffd\ufffdA' in previous versions of V8. PR-URL: #9618 Reviewed-By: Ali Ijaz Sheikh <ofrobots@google.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
diff --git a/lib/string_decoder.js b/lib/string_decoder.js
@@ -81,7 +81,7 @@ StringDecoder.prototype.fillLast = function(buf) {
 };
 
 // Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
-// continuation byte.
+// continuation byte. If an invalid byte is detected, -2 is returned.
 function utf8CheckByte(byte) {
   if (byte <= 0x7F)
     return 0;
@@ -91,7 +91,7 @@ function utf8CheckByte(byte) {
     return 3;
   else if (byte >> 3 === 0x1E)
     return 4;
-  return -1;
+  return (byte >> 6 === 0x02 ? -1 : -2);
 }
 
 // Checks at most 3 bytes at the end of a Buffer in order to detect an
@@ -107,15 +107,15 @@ function utf8CheckIncomplete(self, buf, i) {
       self.lastNeed = nb - 1;
     return nb;
   }
-  if (--j < i)
+  if (--j < i || nb === -2)
     return 0;
   nb = utf8CheckByte(buf[j]);
   if (nb >= 0) {
     if (nb > 0)
       self.lastNeed = nb - 2;
     return nb;
   }
-  if (--j < i)
+  if (--j < i || nb === -2)
     return 0;
   nb = utf8CheckByte(buf[j]);
   if (nb >= 0) {
@@ -133,25 +133,25 @@ function utf8CheckIncomplete(self, buf, i) {
 // Validates as many continuation bytes for a multi-byte UTF-8 character as
 // needed or are available. If we see a non-continuation byte where we expect
 // one, we "replace" the validated continuation bytes we've seen so far with
-// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
+// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
 // behavior. The continuation byte check is included three times in the case
 // where all of the continuation bytes for a character exist in the same buffer.
 // It is also done this way as a slight performance increase instead of using a
 // loop.
 function utf8CheckExtraBytes(self, buf, p) {
   if ((buf[0] & 0xC0) !== 0x80) {
     self.lastNeed = 0;
-    return '\ufffd'.repeat(p);
+    return '\ufffd';
   }
   if (self.lastNeed > 1 && buf.length > 1) {
     if ((buf[1] & 0xC0) !== 0x80) {
       self.lastNeed = 1;
-      return '\ufffd'.repeat(p + 1);
+      return '\ufffd';
     }
     if (self.lastNeed > 2 && buf.length > 2) {
       if ((buf[2] & 0xC0) !== 0x80) {
         self.lastNeed = 2;
-        return '\ufffd'.repeat(p + 2);
+        return '\ufffd';
       }
     }
   }
@@ -184,12 +184,12 @@ function utf8Text(buf, i) {
   return buf.toString('utf8', i, end);
 }
 
-// For UTF-8, a replacement character for each buffered byte of a (partial)
-// character needs to be added to the output.
+// For UTF-8, a replacement character is added when ending on a partial
+// character.
 function utf8End(buf) {
   const r = (buf && buf.length ? this.write(buf) : '');
   if (this.lastNeed)
-    return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
+    return r + '\ufffd';
   return r;
 }
 
diff --git a/test/parallel/test-string-decoder-end.js b/test/parallel/test-string-decoder-end.js
@@ -18,17 +18,13 @@ for (let i = 1; i <= 16; i++) {
 
 encodings.forEach(testEncoding);
 
-console.log('ok');
-
 function testEncoding(encoding) {
   bufs.forEach((buf) => {
     testBuf(encoding, buf);
   });
 }
 
 function testBuf(encoding, buf) {
-  console.error('# %s', encoding, buf);
-
   // write one byte at a time.
   let s = new SD(encoding);
   let res1 = '';
@@ -46,9 +42,6 @@ function testBuf(encoding, buf) {
   // .toString() on the buffer
   const res3 = buf.toString(encoding);
 
-  console.log('expect=%j', res3);
-  console.log('res1=%j', res1);
-  console.log('res2=%j', res2);
   assert.strictEqual(res1, res3, 'one byte at a time should match toString');
   assert.strictEqual(res2, res3, 'all bytes at once should match toString');
 }
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
@@ -8,8 +8,6 @@ const StringDecoder = require('string_decoder').StringDecoder;
 let decoder = new StringDecoder();
 assert.strictEqual(decoder.encoding, 'utf8');
 
-process.stdout.write('scanning ');
-
 // UTF-8
 test('utf-8', Buffer.from('$', 'utf-8'), '$');
 test('utf-8', Buffer.from('¢', 'utf-8'), '¢');
@@ -42,32 +40,30 @@ test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
 test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
 test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
 test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
-test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
+test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA');
 test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
 test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
 test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
-test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
+test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338');
 test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
-test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
-     '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
 test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+// CESU-8 of U+1D40D
+test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), '\ufffd\ufffd');
 
 // UCS-2
 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
 
 // UTF-16LE
 test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up
 
-console.log(' crayon!');
-
 // Additional UTF-8 tests
 decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), '');
 assert.strictEqual(decoder.end(), '\ufffd');
 
 decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), '');
-assert.strictEqual(decoder.end(), '\ufffd\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
 
 decoder = new StringDecoder('utf8');
 assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd');
@@ -131,7 +127,6 @@ function test(encoding, input, expected, singleSequence) {
       output += decoder.write(input.slice(write[0], write[1]));
     });
     output += decoder.end();
-    process.stdout.write('.');
     if (output !== expected) {
       const message =
         'Expected "' + unicodeEscape(expected) + '", ' +