Skip to content

Commit

Permalink
Merge 41c4eef into 5566334
Browse files Browse the repository at this point in the history
  • Loading branch information
darrachequesne committed Dec 18, 2016
2 parents 5566334 + 41c4eef commit 11e47ae
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 19 deletions.
29 changes: 25 additions & 4 deletions README.md
Expand Up @@ -60,22 +60,37 @@ require(

## API

### `utf8.encode(string)`
### `utf8.encode(string, opts)`

Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. The `opts` object is optional. By default, it throws an error if the input string contains a non-scalar value, i.e. a lone surrogate; see the `strict` option below. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)

Available options:

* `strict`: whether encountering a lone surrogate should throw an error (defaults to `true`). When set to `false`, each lone surrogate is instead replaced by the character U+FFFD (REPLACEMENT CHARACTER).

```js
// U+00A9 COPYRIGHT SIGN; see http://codepoints.net/U+00A9
utf8.encode('\xA9');
// → '\xC2\xA9'
// U+10001 LINEAR B SYLLABLE B038 E; see http://codepoints.net/U+10001

utf8.encode('\uD800\uDC01');
// → '\xF0\x90\x80\x81'

utf8.encode('\uDC00');
// → throws 'Lone surrogate U+DC00 is not a scalar value' error

utf8.encode('\uDC00', { strict: false });
// → '\xEF\xBF\xBD'
```

### `utf8.decode(byteString)`
### `utf8.decode(byteString, opts)`

Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. The `opts` object is optional. It throws an error when malformed UTF-8 is detected and, by default, when the input encodes a non-scalar value, i.e. a lone surrogate; see the `strict` option below. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)

Available options:

* `strict`: whether encountering a non-scalar value should throw an error (defaults to `true`). When set to `false`, each non-scalar value is instead decoded as U+FFFD (REPLACEMENT CHARACTER).

```js
utf8.decode('\xC2\xA9');
Expand All @@ -84,6 +99,12 @@ utf8.decode('\xC2\xA9');
utf8.decode('\xF0\x90\x80\x81');
// → '\uD800\uDC01'
// → U+10001 LINEAR B SYLLABLE B038 E

utf8.decode('\xED\xB0\x80');
// → throws 'Lone surrogate U+DC00 is not a scalar value' error

utf8.decode('\xED\xB0\x80', { strict: false });
// → '\uFFFD'
```

### `utf8.version`
Expand Down
41 changes: 40 additions & 1 deletion tests/tests.js
Expand Up @@ -91,6 +91,11 @@
'decoded': '\u2C3C',
'encoded': '\xE2\xB0\xBC'
},
{
'codePoint': 0xFFFD,
'decoded': '\uFFFD',
'encoded': '\xEF\xBF\xBD',
},
{
'codePoint': 0xFFFF,
'decoded': '\uFFFF',
Expand All @@ -101,74 +106,98 @@
{
'codePoint': 0xD800,
'decoded': '\uD800',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'description': 'High surrogate followed by another high surrogate',
'decoded': '\uD800\uD800',
'decodedNonStrict': '\uFFFD\uFFFD',
'encoded': '\xED\xA0\x80\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
'error': true
},
{
'description': 'High surrogate followed by a symbol that is not a surrogate',
'decoded': '\uD800A',
'decodedNonStrict': '\uFFFDA',
'encoded': '\xED\xA0\x80A',
'encodedNonStrict': '\xEF\xBF\xBDA',
'error': true
},
{
'description': 'Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate',
'decoded': '\uD800\uD834\uDF06\uD800',
'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xD9AF,
'decoded': '\uD9AF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xA6\xAF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDBFF,
'decoded': '\uDBFF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xAF\xBF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
// low surrogates: 0xDC00 to 0xDFFF
{
'codePoint': 0xDC00,
'decoded': '\uDC00',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'description': 'Low surrogate followed by another low surrogate',
'decoded': '\uDC00\uDC00',
'decodedNonStrict': '\uFFFD\uFFFD',
'encoded': '\xED\xB0\x80\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
'error': true
},
{
'description': 'Low surrogate followed by a symbol that is not a surrogate',
'decoded': '\uDC00A',
'decodedNonStrict': '\uFFFDA',
'encoded': '\xED\xB0\x80A',
'encodedNonStrict': '\xEF\xBF\xBDA',
'error': true
},
{
'description': 'Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate',
'decoded': '\uDC00\uD834\uDF06\uDC00',
'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDEEE,
'decoded': '\uDEEE',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xBB\xAE',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDFFF,
'decoded': '\uDFFF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xBF\xBF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},

Expand Down Expand Up @@ -204,7 +233,7 @@
test('encode/decode', function() {
forEach(data, function(object) {
var description = object.description || 'U+' + object.codePoint.toString(16).toUpperCase();
;

if (object.error) {
raises(
function() {
Expand All @@ -220,6 +249,16 @@
Error,
'Error: non-scalar value detected'
);
equal(
object.encodedNonStrict,
utf8.encode(object.decoded, { strict: false }),
'Encoding (non-strict): ' + description
);
equal(
object.decodedNonStrict,
utf8.decode(object.encoded, { strict: false }),
'Decoding (non-strict): ' + description
);
} else {
equal(
object.encoded,
Expand Down
39 changes: 25 additions & 14 deletions utf8.js
Expand Up @@ -64,21 +64,25 @@
return output;
}

function checkScalarValue(codePoint) {
function checkScalarValue(codePoint, strict) {
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
if (strict) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
}
return false;
}
return true;
}
/*--------------------------------------------------------------------------*/

// Builds a single UTF-8 continuation byte from `codePoint`.
// Takes the six bits starting at bit offset `shift`, sets the high bit so the
// value has the form 10xxxxxx, and passes it to `stringFromCharCode`.
function createByte(codePoint, shift) {
	var sixBits = (codePoint >> shift) & 0x3F; // low six bits after shifting
	var continuationByte = 0x80 | sixBits; // prefix with binary 10
	return stringFromCharCode(continuationByte);
}

function encodeCodePoint(codePoint) {
function encodeCodePoint(codePoint, strict) {
if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
return stringFromCharCode(codePoint);
}
Expand All @@ -87,7 +91,9 @@
symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0);
}
else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
checkScalarValue(codePoint);
if (!checkScalarValue(codePoint, strict)) {
codePoint = 0xFFFD;
}
symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0);
symbol += createByte(codePoint, 6);
}
Expand All @@ -100,15 +106,18 @@
return symbol;
}

function utf8encode(string) {
function utf8encode(string, opts) {
opts = opts || {};
var strict = false !== opts.strict;

var codePoints = ucs2decode(string);
var length = codePoints.length;
var index = -1;
var codePoint;
var byteString = '';
while (++index < length) {
codePoint = codePoints[index];
byteString += encodeCodePoint(codePoint);
byteString += encodeCodePoint(codePoint, strict);
}
return byteString;
}
Expand All @@ -131,7 +140,7 @@
throw Error('Invalid continuation byte');
}

function decodeSymbol() {
function decodeSymbol(strict) {
var byte1;
var byte2;
var byte3;
Expand Down Expand Up @@ -172,8 +181,7 @@
byte3 = readContinuationByte();
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
if (codePoint >= 0x0800) {
checkScalarValue(codePoint);
return codePoint;
return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
} else {
throw Error('Invalid continuation byte');
}
Expand All @@ -197,13 +205,16 @@
var byteArray;
var byteCount;
var byteIndex;
function utf8decode(byteString) {
function utf8decode(byteString, opts) {
opts = opts || {};
var strict = false !== opts.strict;

byteArray = ucs2decode(byteString);
byteCount = byteArray.length;
byteIndex = 0;
var codePoints = [];
var tmp;
while ((tmp = decodeSymbol()) !== false) {
while ((tmp = decodeSymbol(strict)) !== false) {
codePoints.push(tmp);
}
return ucs2encode(codePoints);
Expand Down

0 comments on commit 11e47ae

Please sign in to comment.