diff --git a/.gitignore b/.gitignore index c4b384e..6342324 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ # Generated test data file (> 100 MB) tests/data.json -# JSON version of coverage report -coverage/coverage.json +# Coverage report +coverage # Installed npm modules node_modules diff --git a/.travis.yml b/.travis.yml index 5a39c55..f3e2e1a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,4 +16,6 @@ before_script: - "PREFIX=/usr/lib/jvm; if [ ! -d $PREFIX/java-6-openjdk ]; then for d in $PREFIX/java-6-openjdk-*; do if [ -e $d/jre/lib/rt.jar ]; then sudo ln -s $d $PREFIX/java-6-openjdk; break; fi; done; fi" - "sudo apt-get install -qq python; python --version" script: - "grunt ci" + - "grunt ci" +after_script: + - "grunt shell:cover-coveralls" diff --git a/Gruntfile.js b/Gruntfile.js index 9e827a2..2af2bf5 100644 --- a/Gruntfile.js +++ b/Gruntfile.js @@ -15,9 +15,12 @@ module.exports = function(grunt) { } } }, - 'cover': { + 'cover-html': { 'command': 'istanbul cover --report "html" --verbose --dir "coverage" "tests/tests.js"; istanbul report --root "coverage" --format "html"' }, + 'cover-coveralls': { + 'command': 'istanbul cover --verbose --dir "coverage" "tests/tests.js" && cat coverage/lcov.info | coveralls; rm -rf coverage/lcov*' + }, 'test-narwhal': { 'command': 'echo "Testing in Narwhal..."; export NARWHAL_OPTIMIZATION=-1; narwhal "tests/tests.js"' }, @@ -49,7 +52,7 @@ module.exports = function(grunt) { grunt.loadNpmTasks('grunt-shell'); - grunt.registerTask('cover', 'shell:cover'); + grunt.registerTask('cover', 'shell:cover-html'); grunt.registerTask('ci', [ 'shell:generate-test-data', 'shell:test-narwhal', diff --git a/LICENSE-MIT.txt b/LICENSE-MIT.txt index 97067e5..a41e0a7 100644 --- a/LICENSE-MIT.txt +++ b/LICENSE-MIT.txt @@ -1,4 +1,4 @@ -Copyright Mathias Bynens +Copyright Mathias Bynens Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git 
a/README.md b/README.md index ddee43d..edddcce 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # utf8.js [![Build status](https://travis-ci.org/mathiasbynens/utf8.js.svg?branch=master)](https://travis-ci.org/mathiasbynens/utf8.js) [![Dependency status](https://gemnasium.com/mathiasbynens/utf8.js.svg)](https://gemnasium.com/mathiasbynens/utf8.js) -_utf8.js_ is a well-tested UTF-8 encoder/decoder written in JavaScript. Unlike many other JavaScript solutions, it is designed to be a _proper_ UTF-8 encoder/decoder: it can encode/decode any given Unicode code point, including astral symbols and unpaired surrogates. [Here’s an online demo.](http://mothereff.in/utf-8) +_utf8.js_ is a well-tested UTF-8 encoder/decoder written in JavaScript. Unlike many other JavaScript solutions, it is designed to be a _proper_ UTF-8 encoder/decoder: it can encode/decode any Unicode scalar value, as per [the Encoding Standard](https://encoding.spec.whatwg.org/#utf-8). [Here’s an online demo.](https://mothereff.in/utf-8) Feel free to fork if you see possible improvements! ## Installation -Via [npm](http://npmjs.org/): +Via [npm](https://www.npmjs.org/): ```bash npm install utf8 @@ -30,7 +30,7 @@ In a browser: ``` -In [Narwhal](http://narwhaljs.org/), [Node.js](http://nodejs.org/), and [RingoJS ≥ v0.8.0](http://ringojs.org/): +In [Narwhal](http://narwhaljs.org/), [Node.js](https://nodejs.org/), and [RingoJS ≥ v0.8.0](http://ringojs.org/): ```js var utf8 = require('utf8'); @@ -62,7 +62,7 @@ require( ### `utf8.encode(string)` -Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. +Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) 
```js // U+00A9 COPYRIGHT SIGN; see http://codepoints.net/U+00A9 @@ -75,7 +75,7 @@ utf8.encode('\uD800\uDC01'); ### `utf8.decode(byteString)` -Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. +Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.) ```js utf8.decode('\xC2\xA9'); @@ -112,8 +112,8 @@ Long before utf8.js was created, the `utf8` module on npm was registered and use | [![twitter/mathias](https://gravatar.com/avatar/24e08a9ea84deb17ae121074d0f17125?s=70)](https://twitter.com/mathias "Follow @mathias on Twitter") | |---| -| [Mathias Bynens](http://mathiasbynens.be/) | +| [Mathias Bynens](https://mathiasbynens.be/) | ## License -utf8.js is available under the [MIT](http://mths.be/mit) license. +utf8.js is available under the [MIT](https://mths.be/mit) license. 
diff --git a/package.json b/package.json index e612cea..32e3115 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "utf8", "version": "2.0.0", "description": "A well-tested UTF-8 encoder/decoder written in JavaScript.", - "homepage": "http://mths.be/utf8js", + "homepage": "https://mths.be/utf8js", "main": "utf8.js", "keywords": [ "charset", @@ -10,39 +10,26 @@ "unicode", "utf8" ], - "licenses": [ - { - "type": "MIT", - "url": "http://mths.be/mit" - }, - { - "type": "GPL", - "url": "http://mths.be/gpl" - } - ], + "license": "MIT", "author": { "name": "Mathias Bynens", - "url": "http://mathiasbynens.be/" + "url": "https://mathiasbynens.be/" }, "repository": { "type": "git", "url": "https://github.com/mathiasbynens/utf8.js.git" }, - "bugs": { - "url": "https://github.com/mathiasbynens/utf8.js/issues" - }, - "directories": { - "test": "tests" - }, + "bugs": "https://github.com/mathiasbynens/utf8.js/issues", "scripts": { "test": "node tests/tests.js" }, "devDependencies": { - "grunt": "~0.4.4", - "grunt-shell": "~0.6.4", - "istanbul": "~0.2.6", - "qunit-clib": "~1.3.0", + "coveralls": "^2.11.1", + "grunt": "^0.4.5", + "grunt-shell": "^1.1.1", + "istanbul": "^0.3.5", + "qunit-extras": "^1.4.0", "qunitjs": "~1.11.0", - "requirejs": "~2.1.11" + "requirejs": "^2.1.11" } } diff --git a/tests/generate-test-data.py b/tests/generate-test-data.py index 096b012..08be0d2 100755 --- a/tests/generate-test-data.py +++ b/tests/generate-test-data.py @@ -3,7 +3,7 @@ import re import json -# http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae +# https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae # http://stackoverflow.com/a/13436167/96656 def unisymbol(codePoint): if codePoint >= 0x0000 and codePoint <= 0xFFFF: @@ -25,6 +25,9 @@ def writeFile(filename, contents): data = [] for codePoint in range(0x000000, 0x10FFFF + 1): + # Skip non-scalar values. 
+ if codePoint >= 0xD800 and codePoint <= 0xDFFF: + continue symbol = unisymbol(codePoint) # http://stackoverflow.com/a/17199950/96656 bytes = symbol.encode('utf8').decode('latin1') diff --git a/tests/tests.js b/tests/tests.js index cb01e10..5ebe513 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -1,22 +1,27 @@ -;(function(root) { +(function(root) { 'use strict'; - /** Use a single `load` function */ - var load = typeof require == 'function' ? require : root.load; + var noop = Function.prototype; + + var load = (typeof require == 'function' && !(root.define && define.amd)) ? + require : + (!root.document && root.java && root.load) || noop; - /** The unit testing framework */ var QUnit = (function() { - var noop = Function.prototype; return root.QUnit || ( root.addEventListener || (root.addEventListener = noop), root.setTimeout || (root.setTimeout = noop), root.QUnit = load('../node_modules/qunitjs/qunit/qunit.js') || root.QUnit, - (load('../node_modules/qunit-clib/qunit-clib.js') || { 'runInContext': noop }).runInContext(root), addEventListener === noop && delete root.addEventListener, root.QUnit ); }()); + var qe = load('../node_modules/qunit-extras/qunit-extras.js'); + if (qe) { + qe.runInContext(root); + } + /** The `utf8` object to test */ var utf8 = root.utf8 || (root.utf8 = ( utf8 = load('../utf8.js') || root.utf8, @@ -96,63 +101,75 @@ { 'codePoint': 0xD800, 'decoded': '\uD800', - 'encoded': '\xED\xA0\x80' + 'encoded': '\xED\xA0\x80', + 'error': true }, { 'description': 'High surrogate followed by another high surrogate', 'decoded': '\uD800\uD800', - 'encoded': '\xED\xA0\x80\xED\xA0\x80' + 'encoded': '\xED\xA0\x80\xED\xA0\x80', + 'error': true }, { 'description': 'High surrogate followed by a symbol that is not a surrogate', 'decoded': '\uD800A', - 'encoded': '\xED\xA0\x80A' + 'encoded': '\xED\xA0\x80A', + 'error': true }, { 'description': 'Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate', 'decoded': 
'\uD800\uD834\uDF06\uD800', - 'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80' + 'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80', + 'error': true }, { 'codePoint': 0xD9AF, 'decoded': '\uD9AF', - 'encoded': '\xED\xA6\xAF' + 'encoded': '\xED\xA6\xAF', + 'error': true }, { 'codePoint': 0xDBFF, 'decoded': '\uDBFF', - 'encoded': '\xED\xAF\xBF' + 'encoded': '\xED\xAF\xBF', + 'error': true }, // low surrogates: 0xDC00 to 0xDFFF { 'codePoint': 0xDC00, 'decoded': '\uDC00', - 'encoded': '\xED\xB0\x80' + 'encoded': '\xED\xB0\x80', + 'error': true }, { 'description': 'Low surrogate followed by another low surrogate', 'decoded': '\uDC00\uDC00', - 'encoded': '\xED\xB0\x80\xED\xB0\x80' + 'encoded': '\xED\xB0\x80\xED\xB0\x80', + 'error': true }, { 'description': 'Low surrogate followed by a symbol that is not a surrogate', 'decoded': '\uDC00A', - 'encoded': '\xED\xB0\x80A' + 'encoded': '\xED\xB0\x80A', + 'error': true }, { 'description': 'Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate', 'decoded': '\uDC00\uD834\uDF06\uDC00', - 'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80' + 'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80', + 'error': true }, { 'codePoint': 0xDEEE, 'decoded': '\uDEEE', - 'encoded': '\xED\xBB\xAE' + 'encoded': '\xED\xBB\xAE', + 'error': true }, { 'codePoint': 0xDFFF, 'decoded': '\uDFFF', - 'encoded': '\xED\xBF\xBF' + 'encoded': '\xED\xBF\xBF', + 'error': true }, // 4-byte @@ -188,16 +205,33 @@ forEach(data, function(object) { var description = object.description || 'U+' + object.codePoint.toString(16).toUpperCase(); ; - equal( - object.encoded, - utf8.encode(object.decoded), - 'Encoding: ' + description - ); - equal( - object.decoded, - utf8.decode(object.encoded), - 'Decoding: ' + description - ); + if (object.error) { + raises( + function() { + utf8.decode(object.encoded); + }, + Error, + 'Error: non-scalar value detected' + ); + raises( + function() { + utf8.encode(object.decoded); + }, + 
Error, + 'Error: non-scalar value detected' + ); + } else { + equal( + object.encoded, + utf8.encode(object.decoded), + 'Encoding: ' + description + ); + equal( + object.decoded, + utf8.decode(object.encoded), + 'Decoding: ' + description + ); + } }); // Error handling diff --git a/utf8.js b/utf8.js index be74021..c138a38 100644 --- a/utf8.js +++ b/utf8.js @@ -1,4 +1,4 @@ -/*! http://mths.be/utf8js v2.0.0 by @mathias */ +/*! https://mths.be/utf8js v2.0.0 by @mathias */ ;(function(root) { // Detect free variables `exports` @@ -19,7 +19,7 @@ var stringFromCharCode = String.fromCharCode; - // Taken from http://mths.be/punycode + // Taken from https://mths.be/punycode function ucs2decode(string) { var output = []; var counter = 0; @@ -46,7 +46,7 @@ return output; } - // Taken from http://mths.be/punycode + // Taken from https://mths.be/punycode function ucs2encode(array) { var length = array.length; var index = -1; @@ -64,6 +64,14 @@ return output; } + function checkScalarValue(codePoint) { + if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { + throw Error( + 'Lone surrogate U+' + codePoint.toString(16).toUpperCase() + + ' is not a scalar value' + ); + } + } /*--------------------------------------------------------------------------*/ function createByte(codePoint, shift) { @@ -79,6 +87,7 @@ symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0); } else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence + checkScalarValue(codePoint); symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0); symbol += createByte(codePoint, 6); } @@ -163,6 +172,7 @@ byte3 = readContinuationByte(); codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; if (codePoint >= 0x0800) { + checkScalarValue(codePoint); return codePoint; } else { throw Error('Invalid continuation byte');