fix(deserialize): fix deserialization of 0xFFFD

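When deserializing a string, we previously treated any occurrence of U+FFFD (the Unicode replacement character) in the decoded string as a sign of invalid UTF-8. However, U+FFFD is itself a valid Unicode character, so a string that really contained it in the original input was wrongly rejected. The raw bytes are now validated as UTF-8 before the string is decoded. Fixes NODE-1718. Fixes #277.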
mongodb · Nov 2, 2018 · c682ae3 · c682ae3
1 parent 06af813
commit c682ae3
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 5 deletions.
diff --git a/lib/parser/deserializer.js b/lib/parser/deserializer.js
@@ -14,6 +14,7 @@ const DBRef = require('../db_ref');
 const BSONRegExp = require('../regexp');
 const Binary = require('../binary');
 const constants = require('../constants');
+const validateUtf8 = require('../validate_utf8').validateUtf8;
 
 // Internal long versions
 const JS_INT_MAX_LONG = Long.fromNumber(constants.JS_INT_MAX);
@@ -134,13 +135,12 @@ function deserializeObject(buffer, index, options, isArray) {
       )
         throw new Error('bad string length in bson');
 
-      const s = buffer.toString('utf8', index, index + stringSize - 1);
-      for (i = 0; i < s.length; i++) {
-        if (s.charCodeAt(i) === 0xfffd) {
-          throw new Error('Invalid UTF-8 string in BSON document');
-        }
+      if (!validateUtf8(buffer, index, index + stringSize - 1)) {
+        throw new Error('Invalid UTF-8 string in BSON document');
       }
 
+      const s = buffer.toString('utf8', index, index + stringSize - 1);
+
       object[name] = s;
       index = index + stringSize;
     } else if (elementType === constants.BSON_DATA_OID) {

diff --git a/lib/validate_utf8.js b/lib/validate_utf8.js
@@ -0,0 +1,48 @@
+'use strict';
+
+const FIRST_BIT = 0x80; // mask 1000 0000: top bit, tested to tell ASCII from multi-byte bytes
+const FIRST_TWO_BITS = 0xc0; // mask 1100 0000: top two bits, used to recognise continuation bytes
+const FIRST_THREE_BITS = 0xe0; // mask 1110 0000: top three bits, used to recognise a 2-byte lead
+const FIRST_FOUR_BITS = 0xf0; // mask 1111 0000: top four bits, used to recognise a 3-byte lead
+const FIRST_FIVE_BITS = 0xf8; // mask 1111 1000: top five bits, used to recognise a 4-byte lead
+
+const TWO_BIT_CHAR = 0xc0; // pattern 110xxxxx: lead byte followed by 1 continuation byte ("BIT" in the name means byte)
+const THREE_BIT_CHAR = 0xe0; // pattern 1110xxxx: lead byte followed by 2 continuation bytes
+const FOUR_BIT_CHAR = 0xf0; // pattern 11110xxx: lead byte followed by 3 continuation bytes
+const CONTINUING_CHAR = 0x80; // pattern 10xxxxxx: continuation byte
+
+/**
+ * Determines if the passed in bytes are structurally valid utf8.
+ *
+ * Scans bytes[start] up to (but not including) bytes[end] once, tracking how
+ * many continuation bytes (10xxxxxx) are still owed by the most recent lead
+ * byte. Only the high-bit pattern of each byte is inspected; the encoded code
+ * point is never reconstructed.
+ *
+ * NOTE(review): because only bit patterns are checked, this accepts sequences
+ * with overlong lead bytes (0xC0/0xC1), encodings of UTF-16 surrogates
+ * (0xED followed by 0xA0-0xBF) and lead bytes 0xF5-0xF7. Confirm whether
+ * callers need those rejected too, since a later utf8 decode may still turn
+ * them into U+FFFD.
+ *
+ * @param {Buffer|Uint8Array} bytes An array of 8-bit bytes. Only integer indexing is used here
+ * @param {Number} start The index to start validating (inclusive)
+ * @param {Number} end The index to end validating (exclusive)
+ * @returns {boolean} True if every multi-byte sequence in the range is well-formed and complete
+ */
+function validateUtf8(bytes, start, end) {
+  let continuation = 0; // number of continuation bytes still expected
+
+  for (let i = start; i < end; i += 1) {
+    const byte = bytes[i];
+
+    if (continuation) {
+      if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) { // must look like 10xxxxxx
+        return false;
+      }
+      continuation -= 1;
+    } else if (byte & FIRST_BIT) { // high bit set: must be a lead byte; ASCII bytes need no check
+      if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) { // 110xxxxx
+        continuation = 1;
+      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) { // 1110xxxx
+        continuation = 2;
+      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) { // 11110xxx
+        continuation = 3;
+      } else {
+        return false; // stray continuation byte (10xxxxxx) or 11111xxx
+      }
+    }
+  }
+
+  return !continuation; // false when the range ends in the middle of a sequence
+}
+
+module.exports.validateUtf8 = validateUtf8; // exported by name; the deserializer reads `.validateUtf8` off the required module
diff --git a/test/node/string_test.js b/test/node/string_test.js
@@ -0,0 +1,12 @@
+'use strict';
+
+const BSON = require('../../lib/bson');
+
+describe('string tests', function() { // regression coverage for strings containing U+FFFD
+  it('can serialize and deserialize 0xFFFD', function() {
+    const unicodeString = String.fromCharCode(0x41, 0x42, 0xfffd, 0x43, 0x44); // "AB�CD": a literal U+FFFD between ASCII letters
+
+    const serialized = BSON.serialize({ value: unicodeString });
+    BSON.deserialize(serialized); // passes as long as this does not throw; NOTE(review): the decoded value is not compared with unicodeString
+  });
+});