From 41c4eef0de26d9be4ba968657583bb2d2092db48 Mon Sep 17 00:00:00 2001 From: Damien Arrachequesne Date: Sun, 16 Oct 2016 22:53:02 +0200 Subject: [PATCH] Add error-tolerant mode --- README.md | 29 +++++++++++++++++++++++++---- tests/tests.js | 41 ++++++++++++++++++++++++++++++++++++++++- utf8.js | 39 +++++++++++++++++++++++++-------------- 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 36b8de0..81ca405 100644 --- a/README.md +++ b/README.md @@ -60,22 +60,37 @@ require( ## API -### `utf8.encode(string)` +### `utf8.encode(string, opts)` -Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) +Encodes any given JavaScript string (`string`) as UTF-8 (the `opts` object being optional), and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) + +Available options: + +* `strict`: whether encountering a lone surrogate should throw an error (defaults to `true`). Else, each lone surrogate is replaced by the character U+FFFD. ```js // U+00A9 COPYRIGHT SIGN; see http://codepoints.net/U+00A9 utf8.encode('\xA9'); // → '\xC2\xA9' // U+10001 LINEAR B SYLLABLE B038 E; see http://codepoints.net/U+10001 + utf8.encode('\uD800\uDC01'); // → '\xF0\x90\x80\x81' + +utf8.encode('\uDC00'); +// → throws 'Lone surrogate is not a scalar value' error + +utf8.encode('\uDC00', { strict: false }); +// → '\xEF\xBF\xBD' ``` -### `utf8.decode(byteString)` +### `utf8.decode(byteString, opts)` -Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) +Decodes any given UTF-8-encoded string (`byteString`) as UTF-8 (the `opts` object being optional), and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) + +Available options: + +* `strict`: whether encountering a non-scalar value should throw an error (defaults to `true`). Else, each non-scalar value is decoded as U+FFFD. ```js utf8.decode('\xC2\xA9'); @@ -84,6 +99,12 @@ utf8.decode('\xC2\xA9'); utf8.decode('\xF0\x90\x80\x81'); // → '\uD800\uDC01' // → U+10001 LINEAR B SYLLABLE B038 E + +utf8.decode('\xED\xB0\x80'); +// → throws 'Lone surrogate is not a scalar value' error + +utf8.decode('\xED\xB0\x80', { strict: false }); +// → '\uFFFD' ``` ### `utf8.version` diff --git a/tests/tests.js b/tests/tests.js index 2a57115..fd7fd14 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -91,6 +91,11 @@ 'decoded': '\u2C3C', 'encoded': '\xE2\xB0\xBC' }, + { + 'codePoint': 0xFFFD, + 'decoded': '\uFFFD', + 'encoded': '\xEF\xBF\xBD', + }, { 'codePoint': 0xFFFF, 'decoded': '\uFFFF', @@ -101,74 +106,98 @@ { 'codePoint': 0xD800, 'decoded': '\uD800', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xA0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, { 'description': 'High surrogate followed by another high surrogate', 'decoded': '\uD800\uD800', + 'decodedNonStrict': '\uFFFD\uFFFD', 'encoded': '\xED\xA0\x80\xED\xA0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD', 'error': true }, { 'description': 'High surrogate followed by a symbol that is not a surrogate', 'decoded': '\uD800A', + 'decodedNonStrict': '\uFFFDA', 'encoded': '\xED\xA0\x80A', + 'encodedNonStrict': '\xEF\xBF\xBDA', 'error': true }, { 'description': 'Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate', 'decoded': '\uD800\uD834\uDF06\uD800', + 'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD', 'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD', 'error': true }, { 'codePoint': 0xD9AF, 'decoded': '\uD9AF', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xA6\xAF', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, { 'codePoint': 0xDBFF, 'decoded': '\uDBFF', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xAF\xBF', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, // low surrogates: 0xDC00 to 0xDFFF { 'codePoint': 0xDC00, 'decoded': '\uDC00', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xB0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, { 'description': 'Low surrogate followed by another low surrogate', 'decoded': '\uDC00\uDC00', + 'decodedNonStrict': '\uFFFD\uFFFD', 'encoded': '\xED\xB0\x80\xED\xB0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD', 'error': true }, { 'description': 'Low surrogate followed by a symbol that is not a surrogate', 'decoded': '\uDC00A', + 'decodedNonStrict': '\uFFFDA', 'encoded': '\xED\xB0\x80A', + 'encodedNonStrict': '\xEF\xBF\xBDA', 'error': true }, { 'description': 'Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate', 'decoded': '\uDC00\uD834\uDF06\uDC00', + 'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD', 'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80', + 'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD', 'error': true }, { 'codePoint': 0xDEEE, 'decoded': '\uDEEE', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xBB\xAE', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, { 'codePoint': 0xDFFF, 'decoded': '\uDFFF', + 'decodedNonStrict': '\uFFFD', 'encoded': '\xED\xBF\xBF', + 'encodedNonStrict': '\xEF\xBF\xBD', 'error': true }, @@ -204,7 +233,7 @@ test('encode/decode', function() { forEach(data, function(object) { var description = object.description || 'U+' + object.codePoint.toString(16).toUpperCase(); - ; + if (object.error) { raises( function() { @@ -220,6 +249,16 @@ Error, 'Error: non-scalar value detected' ); + equal( + object.encodedNonStrict, + utf8.encode(object.decoded, { strict: false }), + 'Encoding (non-strict): ' + description + ); + equal( + object.decodedNonStrict, + utf8.decode(object.encoded, { strict: false }), + 'Decoding (non-strict): ' + description + ); } else { equal( object.encoded, diff --git a/utf8.js b/utf8.js index 58a3daf..79882b0 100644 --- a/utf8.js +++ b/utf8.js @@ -64,13 +64,17 @@ return output; } - function checkScalarValue(codePoint) { + function checkScalarValue(codePoint, strict) { if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { - throw Error( - 'Lone surrogate U+' + codePoint.toString(16).toUpperCase() + - ' is not a scalar value' - ); + if (strict) { + throw Error( + 'Lone surrogate U+' + codePoint.toString(16).toUpperCase() + + ' is not a scalar value' + ); + } + return false; } + return true; } /*--------------------------------------------------------------------------*/ @@ -78,7 +82,7 @@ return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80); } - function encodeCodePoint(codePoint) { + function encodeCodePoint(codePoint, strict) { if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence return stringFromCharCode(codePoint); } @@ -87,7 +91,9 @@ symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0); } else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence - checkScalarValue(codePoint); + if (!checkScalarValue(codePoint, strict)) { + codePoint = 0xFFFD; + } symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0); symbol += createByte(codePoint, 6); } @@ -100,7 +106,10 @@ return symbol; } - function utf8encode(string) { + function utf8encode(string, opts) { + opts = opts || {}; + var strict = false !== opts.strict; + var codePoints = ucs2decode(string); var length = codePoints.length; var index = -1; @@ -108,7 +117,7 @@ var byteString = ''; while (++index < length) { codePoint = codePoints[index]; - byteString += encodeCodePoint(codePoint); + byteString += encodeCodePoint(codePoint, strict); } return byteString; } @@ -131,7 +140,7 @@ throw Error('Invalid continuation byte'); } - function decodeSymbol() { + function decodeSymbol(strict) { var byte1; var byte2; var byte3; @@ -172,8 +181,7 @@ byte3 = readContinuationByte(); codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; if (codePoint >= 0x0800) { - checkScalarValue(codePoint); - return codePoint; + return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD; } else { throw Error('Invalid continuation byte'); } @@ -197,13 +205,16 @@ var byteArray; var byteCount; var byteIndex; - function utf8decode(byteString) { + function utf8decode(byteString, opts) { + opts = opts || {}; + var strict = false !== opts.strict; + byteArray = ucs2decode(byteString); byteCount = byteArray.length; byteIndex = 0; var codePoints = []; var tmp; - while ((tmp = decodeSymbol()) !== false) { + while ((tmp = decodeSymbol(strict)) !== false) { codePoints.push(tmp); } return ucs2encode(codePoints);