From 41c4eef0de26d9be4ba968657583bb2d2092db48 Mon Sep 17 00:00:00 2001
From: Damien Arrachequesne <damien.arrachequesne@gmail.com>
Date: Sun, 16 Oct 2016 22:53:02 +0200
Subject: [PATCH] Add error-tolerant mode

---
 README.md      | 29 +++++++++++++++++++++++++----
 tests/tests.js | 41 ++++++++++++++++++++++++++++++++++++++++-
 utf8.js        | 39 +++++++++++++++++++++++++--------------
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 36b8de0..81ca405 100644
--- a/README.md
+++ b/README.md
@@ -60,22 +60,37 @@ require(
 
 ## API
 
-### `utf8.encode(string)`
+### `utf8.encode(string, opts)`
 
-Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
+Encodes any given JavaScript string (`string`) as UTF-8 and returns the UTF-8-encoded version of the string. The `opts` object is optional. By default, it throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
+
+Available options:
+
+* `strict`: whether encountering a lone surrogate should throw an error (defaults to `true`). When set to `false`, each lone surrogate is instead encoded as U+FFFD REPLACEMENT CHARACTER.
 
 ```js
 // U+00A9 COPYRIGHT SIGN; see http://codepoints.net/U+00A9
 utf8.encode('\xA9');
 // → '\xC2\xA9'
 // U+10001 LINEAR B SYLLABLE B038 E; see http://codepoints.net/U+10001
+
 utf8.encode('\uD800\uDC01');
 // → '\xF0\x90\x80\x81'
+
+utf8.encode('\uDC00');
+// → throws 'Lone surrogate U+DC00 is not a scalar value' error
+
+utf8.encode('\uDC00', { strict: false });
+// → '\xEF\xBF\xBD'
 ```
 
-### `utf8.decode(byteString)`
+### `utf8.decode(byteString, opts)`
 
-Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
+Decodes any given UTF-8-encoded string (`byteString`) as UTF-8 and returns the UTF-8-decoded version of the string. The `opts` object is optional. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
+
+Available options:
+
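+* `strict`: whether encountering an encoded non-scalar value (i.e. a lone surrogate) should throw an error (defaults to `true`). When set to `false`, each such value is instead decoded as U+FFFD REPLACEMENT CHARACTER; other malformed input (e.g. an invalid continuation byte) still throws.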
 
 ```js
 utf8.decode('\xC2\xA9');
@@ -84,6 +99,12 @@ utf8.decode('\xC2\xA9');
 utf8.decode('\xF0\x90\x80\x81');
 // → '\uD800\uDC01'
 // → U+10001 LINEAR B SYLLABLE B038 E
+
+utf8.decode('\xED\xB0\x80');
+// → throws 'Lone surrogate U+DC00 is not a scalar value' error
+
+utf8.decode('\xED\xB0\x80', { strict: false });
+// → '\uFFFD'
 ```
 
 ### `utf8.version`
diff --git a/tests/tests.js b/tests/tests.js
index 2a57115..fd7fd14 100644
--- a/tests/tests.js
+++ b/tests/tests.js
@@ -91,6 +91,11 @@
 			'decoded': '\u2C3C',
 			'encoded': '\xE2\xB0\xBC'
 		},
+		{
+			'codePoint': 0xFFFD,
+			'decoded': '\uFFFD',
+			'encoded': '\xEF\xBF\xBD',
+		},
 		{
 			'codePoint': 0xFFFF,
 			'decoded': '\uFFFF',
@@ -101,74 +106,98 @@
 		{
 			'codePoint': 0xD800,
 			'decoded': '\uD800',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xA0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'description': 'High surrogate followed by another high surrogate',
 			'decoded': '\uD800\uD800',
+			'decodedNonStrict': '\uFFFD\uFFFD',
 			'encoded': '\xED\xA0\x80\xED\xA0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'description': 'High surrogate followed by a symbol that is not a surrogate',
 			'decoded': '\uD800A',
+			'decodedNonStrict': '\uFFFDA',
 			'encoded': '\xED\xA0\x80A',
+			'encodedNonStrict': '\xEF\xBF\xBDA',
 			'error': true
 		},
 		{
 			'description': 'Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate',
 			'decoded': '\uD800\uD834\uDF06\uD800',
+			'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
 			'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'codePoint': 0xD9AF,
 			'decoded': '\uD9AF',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xA6\xAF',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'codePoint': 0xDBFF,
 			'decoded': '\uDBFF',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xAF\xBF',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 		// low surrogates: 0xDC00 to 0xDFFF
 		{
 			'codePoint': 0xDC00,
 			'decoded': '\uDC00',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xB0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'description': 'Low surrogate followed by another low surrogate',
 			'decoded': '\uDC00\uDC00',
+			'decodedNonStrict': '\uFFFD\uFFFD',
 			'encoded': '\xED\xB0\x80\xED\xB0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'description': 'Low surrogate followed by a symbol that is not a surrogate',
 			'decoded': '\uDC00A',
+			'decodedNonStrict': '\uFFFDA',
 			'encoded': '\xED\xB0\x80A',
+			'encodedNonStrict': '\xEF\xBF\xBDA',
 			'error': true
 		},
 		{
 			'description': 'Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate',
 			'decoded': '\uDC00\uD834\uDF06\uDC00',
+			'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
 			'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80',
+			'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'codePoint': 0xDEEE,
 			'decoded': '\uDEEE',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xBB\xAE',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 		{
 			'codePoint': 0xDFFF,
 			'decoded': '\uDFFF',
+			'decodedNonStrict': '\uFFFD',
 			'encoded': '\xED\xBF\xBF',
+			'encodedNonStrict': '\xEF\xBF\xBD',
 			'error': true
 		},
 
@@ -204,7 +233,7 @@
 	test('encode/decode', function() {
 		forEach(data, function(object) {
 			var description = object.description || 'U+' + object.codePoint.toString(16).toUpperCase();
-			;
+
 			if (object.error) {
 				raises(
 					function() {
@@ -220,6 +249,16 @@
 					Error,
 					'Error: non-scalar value detected'
 				);
+				equal(
+					object.encodedNonStrict,
+					utf8.encode(object.decoded, { strict: false }),
+					'Encoding (non-strict): ' + description
+				);
+				equal(
+					object.decodedNonStrict,
+					utf8.decode(object.encoded, { strict: false }),
+					'Decoding (non-strict): ' + description
+				);
 			} else {
 				equal(
 					object.encoded,
diff --git a/utf8.js b/utf8.js
index 58a3daf..79882b0 100644
--- a/utf8.js
+++ b/utf8.js
@@ -64,13 +64,17 @@
 		return output;
 	}
 
-	function checkScalarValue(codePoint) {
+	function checkScalarValue(codePoint, strict) {
 		if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
-			throw Error(
-				'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
-				' is not a scalar value'
-			);
+			if (strict) {
+				throw Error(
+					'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
+					' is not a scalar value'
+				);
+			}
+			return false;
 		}
+		return true;
 	}
 	/*--------------------------------------------------------------------------*/
 
@@ -78,7 +82,7 @@
 		return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80);
 	}
 
-	function encodeCodePoint(codePoint) {
+	function encodeCodePoint(codePoint, strict) {
 		if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
 			return stringFromCharCode(codePoint);
 		}
@@ -87,7 +91,9 @@
 			symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0);
 		}
 		else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
-			checkScalarValue(codePoint);
+			if (!checkScalarValue(codePoint, strict)) {
+				codePoint = 0xFFFD;
+			}
 			symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0);
 			symbol += createByte(codePoint, 6);
 		}
@@ -100,7 +106,10 @@
 		return symbol;
 	}
 
-	function utf8encode(string) {
+	function utf8encode(string, opts) {
+		opts = opts || {};
+		var strict = false !== opts.strict;
+
 		var codePoints = ucs2decode(string);
 		var length = codePoints.length;
 		var index = -1;
@@ -108,7 +117,7 @@
 		var byteString = '';
 		while (++index < length) {
 			codePoint = codePoints[index];
-			byteString += encodeCodePoint(codePoint);
+			byteString += encodeCodePoint(codePoint, strict);
 		}
 		return byteString;
 	}
@@ -131,7 +140,7 @@
 		throw Error('Invalid continuation byte');
 	}
 
-	function decodeSymbol() {
+	function decodeSymbol(strict) {
 		var byte1;
 		var byte2;
 		var byte3;
@@ -172,8 +181,7 @@
 			byte3 = readContinuationByte();
 			codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
 			if (codePoint >= 0x0800) {
-				checkScalarValue(codePoint);
-				return codePoint;
+				return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
 			} else {
 				throw Error('Invalid continuation byte');
 			}
@@ -197,13 +205,16 @@
 	var byteArray;
 	var byteCount;
 	var byteIndex;
-	function utf8decode(byteString) {
+	function utf8decode(byteString, opts) {
+		opts = opts || {};
+		var strict = false !== opts.strict;
+
 		byteArray = ucs2decode(byteString);
 		byteCount = byteArray.length;
 		byteIndex = 0;
 		var codePoints = [];
 		var tmp;
-		while ((tmp = decodeSymbol()) !== false) {
+		while ((tmp = decodeSymbol(strict)) !== false) {
 			codePoints.push(tmp);
 		}
 		return ucs2encode(codePoints);