Skip to content

Commit

Permalink
fix(deserialize): fix deserialization of 0xFFFD
Browse files Browse the repository at this point in the history
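When deserializing a string, we previously treated any occurrence of
U+FFFD (the Unicode replacement character) in the decoded result as a
sign of invalid UTF-8. However, U+FFFD is a valid Unicode character
when it is what was originally input, so such strings were wrongly
rejected. We now validate the raw UTF-8 bytes instead.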

Fixes NODE-1718
Fixes #277
  • Loading branch information
daprahamian committed Nov 2, 2018
1 parent 06af813 commit c682ae3
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 5 deletions.
10 changes: 5 additions & 5 deletions lib/parser/deserializer.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const DBRef = require('../db_ref');
const BSONRegExp = require('../regexp');
const Binary = require('../binary');
const constants = require('../constants');
const validateUtf8 = require('../validate_utf8').validateUtf8;

// Internal long versions
const JS_INT_MAX_LONG = Long.fromNumber(constants.JS_INT_MAX);
Expand Down Expand Up @@ -134,13 +135,12 @@ function deserializeObject(buffer, index, options, isArray) {
)
throw new Error('bad string length in bson');

const s = buffer.toString('utf8', index, index + stringSize - 1);
for (i = 0; i < s.length; i++) {
if (s.charCodeAt(i) === 0xfffd) {
throw new Error('Invalid UTF-8 string in BSON document');
}
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new Error('Invalid UTF-8 string in BSON document');
}

const s = buffer.toString('utf8', index, index + stringSize - 1);

object[name] = s;
index = index + stringSize;
} else if (elementType === constants.BSON_DATA_OID) {
Expand Down
48 changes: 48 additions & 0 deletions lib/validate_utf8.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
'use strict';

// Masks that isolate the leading 1-5 bits of a byte.
const FIRST_BIT = 0x80;
const FIRST_TWO_BITS = 0xc0;
const FIRST_THREE_BITS = 0xe0;
const FIRST_FOUR_BITS = 0xf0;
const FIRST_FIVE_BITS = 0xf8;

// Lead-byte patterns (after masking). Despite the "BIT" in the names, these
// mark the first byte of a two-, three- and four-BYTE sequence respectively.
const TWO_BIT_CHAR = 0xc0;
const THREE_BIT_CHAR = 0xe0;
const FOUR_BIT_CHAR = 0xf0;
// Pattern of a continuation byte (10xxxxxx) after masking with FIRST_TWO_BITS.
const CONTINUING_CHAR = 0x80;

/**
 * Determines if the passed in bytes are well-formed UTF-8.
 *
 * Besides checking that every lead byte is followed by the right number of
 * continuation bytes, this rejects the sequences that are structurally
 * plausible but still ill-formed: overlong encodings, UTF-16 surrogate code
 * points (U+D800..U+DFFF) and code points above U+10FFFF. A UTF-8 decoder
 * replaces such sequences with U+FFFD instead of failing, so accepting them
 * here would let corrupted strings through silently.
 *
 * @param {Buffer|Uint8Array} bytes An array of 8-bit bytes. Must be indexable and have length property
 * @param {Number} start The index to start validating (inclusive)
 * @param {Number} end The index to end validating (exclusive)
 * @returns {boolean} True if the range [start, end) is valid utf8
 */
function validateUtf8(bytes, start, end) {
  // Number of continuation bytes still expected for the current sequence.
  let continuation = 0;
  // Allowed range for the NEXT continuation byte. Only the first continuation
  // byte of a sequence is ever narrowed; afterwards the range is reset.
  let lower = CONTINUING_CHAR;
  let upper = 0xbf;

  for (let i = start; i < end; i += 1) {
    const byte = bytes[i];

    if (continuation) {
      // Written as a negated range test so that a non-numeric read
      // (e.g. undefined past the end of `bytes`) is rejected too.
      if (!(byte >= lower && byte <= upper)) {
        return false;
      }
      lower = CONTINUING_CHAR;
      upper = 0xbf;
      continuation -= 1;
    } else if (byte & FIRST_BIT) {
      if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
        // 0xC0 and 0xC1 could only encode U+0000..U+007F (overlong).
        if (byte < 0xc2) {
          return false;
        }
        continuation = 1;
      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
        if (byte === 0xe0) {
          // E0 80..9F would be an overlong encoding of U+0000..U+07FF.
          lower = 0xa0;
        } else if (byte === 0xed) {
          // ED A0..BF would encode the surrogates U+D800..U+DFFF.
          upper = 0x9f;
        }
        continuation = 2;
      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
        // F5..F7 can only start code points above U+10FFFF.
        if (byte > 0xf4) {
          return false;
        }
        if (byte === 0xf0) {
          // F0 80..8F would be an overlong encoding of U+0000..U+FFFF.
          lower = 0x90;
        } else if (byte === 0xf4) {
          // F4 90..BF would encode code points above U+10FFFF.
          upper = 0x8f;
        }
        continuation = 3;
      } else {
        // Stray continuation byte or an invalid lead byte (0xF8..0xFF).
        return false;
      }
    }
  }

  // A sequence that is still open at `end` is truncated, hence invalid.
  return !continuation;
}

module.exports.validateUtf8 = validateUtf8;
12 changes: 12 additions & 0 deletions test/node/string_test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
'use strict';

const BSON = require('../../lib/bson');

describe('string tests', function() {
  // Regression test: U+FFFD is a legitimate character when it is part of the
  // original input and must survive a serialize/deserialize round trip.
  it('can serialize and deserialize 0xFFFD', function() {
    // Node.js core module, required locally so this block needs no new
    // file-level dependency.
    const assert = require('assert');

    const unicodeString = String.fromCharCode(0x41, 0x42, 0xfffd, 0x43, 0x44); // "AB�CD"

    const serialized = BSON.serialize({ value: unicodeString });
    const deserialized = BSON.deserialize(serialized);

    // Not throwing is not enough: the value must come back unchanged.
    assert.strictEqual(deserialized.value, unicodeString);
  });
});

0 comments on commit c682ae3

Please sign in to comment.