Skip to content

Commit

Permalink
Merge 7fa7045 into 93daf73
Browse files Browse the repository at this point in the history
  • Loading branch information
akobler committed Aug 31, 2015
2 parents 93daf73 + 7fa7045 commit a53a932
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 15 deletions.
30 changes: 30 additions & 0 deletions README.md
Expand Up @@ -104,6 +104,36 @@ he.encode('foo © bar ≠ baz 𝌆 qux', {
// → 'foo © bar ≠ baz 𝌆 qux'
```

#### `decimal`

The default value for the `decimal` option is `false`. If the option is enabled, `encode` will generally use decimal escapes (e.g. `&#169;`) rather than hexadecimal escapes (e.g. `&#xA9;`). Apart from this substitution, the basic behaviour remains the same when combined with other options. For example, if both the `useNamedReferences` and `decimal` options are enabled, named references (e.g. `&copy;`) are used in preference to decimal escapes; symbols without a named reference are encoded using decimal escapes.


```js
// Using the global default setting (defaults to `false`):
he.encode('foo © bar ≠ baz 𝌆 qux');
// → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

// Passing an `options` object to `encode`, to explicitly disable decimal escapes:
he.encode('foo © bar ≠ baz 𝌆 qux', {
'decimal': false
});
// → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

// Passing an `options` object to `encode`, to explicitly enable decimal escapes:
he.encode('foo © bar ≠ baz 𝌆 qux', {
'decimal': true
});
// → 'foo &#169; bar &#8800; baz &#119558; qux'

// Passing an `options` object to `encode`, to explicitly allow named references and decimal escapes:
he.encode('foo © bar ≠ baz 𝌆 qux', {
'useNamedReferences': true,
'decimal': true
});
// → 'foo &copy; bar &ne; baz &#119558; qux'
```

#### `encodeEverything`

The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string. When set to `true`, this option takes precedence over `allowUnsafeSymbols` (i.e. setting the latter to `true` in such a case has no effect).
Expand Down
7 changes: 6 additions & 1 deletion bin/he
Expand Up @@ -23,7 +23,7 @@
log([
'\nUsage:\n',
'\the [--escape] string',
'\the [--encode] [--use-named-refs] [--everything] [--allow-unsafe] string',
'\the [--encode] [--use-named-refs] [--everything] [--allow-unsafe] [--decimal] string',
'\the [--decode] [--attribute] [--strict] string',
'\the [-v | --version]',
'\the [-h | --help]',
Expand Down Expand Up @@ -64,6 +64,11 @@
options.allowUnsafeSymbols = true;
return;
}
if (string == '--decimal') {
action = 'encode';
options.decimalOutput = true;
return;
}
if (string == '--decode') {
action = 'decode';
return;
Expand Down
24 changes: 17 additions & 7 deletions he.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/he.1
Expand Up @@ -35,6 +35,8 @@ Enable the use of named character references (like `©`) in the output. If c
Encode every symbol in the input string, even safe printable ASCII symbols.
.It Sy "--encode --allow-unsafe"
Encode non-ASCII characters only. This leaves unsafe HTML/XML symbols like `&`, `<`, `>`, `"`, and `'` intact.
.It Sy "--encode --decimal"
Use decimal escapes rather than hexadecimal escapes for numeric character references in the output (`©` is turned into `&#169;` instead of `&#xA9;`).
.It Sy "--decode"
Takes a string of HTML and decode any named and numerical character references in it using the algorithm described in the HTML spec.
.It Sy "--decode --attribute"
Expand Down
25 changes: 18 additions & 7 deletions src/he.js
Expand Up @@ -131,8 +131,12 @@
return output;
};

var hexEscape = function(symbol) {
return '&#x' + symbol.charCodeAt(0).toString(16).toUpperCase() + ';';
var hexEscape = function(charCode) {
return '&#x' + charCode.toString(16).toUpperCase() + ';';
};

// Format a numeric code (a UTF-16 code unit or a full code point) as a
// decimal character reference, e.g. 169 → '&#169;'.
var decEscape = function(charCode) {
	var reference = '&#' + charCode;
	return reference + ';';
};

var parseError = function(message) {
Expand All @@ -150,14 +154,20 @@
var encodeEverything = options.encodeEverything;
var useNamedReferences = options.useNamedReferences;
var allowUnsafeSymbols = options.allowUnsafeSymbols;
var escapeCharCode = options.decimal ? decEscape : hexEscape;

// Escape a matched symbol as a numeric character reference. Only the first
// UTF-16 code unit of `symbol` is used, and the formatting (decimal or
// hexadecimal) is delegated to `escapeCharCode`.
var escapeSymbol = function(symbol) {
	return escapeCharCode(symbol.charCodeAt(0));
};

if (encodeEverything) {
// Encode ASCII symbols.
string = string.replace(regexAsciiWhitelist, function(symbol) {
// Use named references if requested & possible.
if (useNamedReferences && has(encodeMap, symbol)) {
return '&' + encodeMap[symbol] + ';';
}
return hexEscape(symbol);
return escapeSymbol(symbol);
});
// Shorten a few escapes that represent two symbols, of which at least one
// is within the ASCII range.
Expand Down Expand Up @@ -197,7 +207,7 @@
} else if (!allowUnsafeSymbols) {
// Encode `<>"'&` using numeric escapes (decimal when the `decimal` option is
// enabled, hexadecimal otherwise), now that they’re not handled using named
// character references.
string = string.replace(regexEscape, hexEscape);
string = string.replace(regexEscape, escapeSymbol);
}
return string
// Encode astral symbols.
Expand All @@ -206,18 +216,19 @@
var high = $0.charCodeAt(0);
var low = $0.charCodeAt(1);
var codePoint = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000;
return '&#x' + codePoint.toString(16).toUpperCase() + ';';
return escapeCharCode(codePoint);
})
// Encode any remaining BMP symbols that are not printable ASCII symbols
// using a numeric escape (decimal or hexadecimal, depending on `decimal`).
.replace(regexBmpWhitelist, hexEscape);
.replace(regexBmpWhitelist, escapeSymbol);
};
// Expose default options (so they can be overridden globally).
encode.options = {
'allowUnsafeSymbols': false,
'encodeEverything': false,
'strict': false,
'useNamedReferences': false
'useNamedReferences': false,
'decimal' : false
};

var decode = function(html, options) {
Expand Down
64 changes: 64 additions & 0 deletions tests/tests.js
Expand Up @@ -6620,6 +6620,70 @@
Error,
'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`'
);
equal(
he.encode('\xE4\xF6\xFC\xC4\xD6\xDC', { 'decimal': true}),
'&#228;&#246;&#252;&#196;&#214;&#220;',
'encode to decimal HTML entities'
);
equal(
he.encode('\xE4\xF6\xFC\xC4\xD6\xDC', { 'decimal': true, 'useNamedReferences': true }),
'&auml;&ouml;&uuml;&Auml;&Ouml;&Uuml;',
'encode to named HTML entities whereby `useNamedReferences` takes precedence over `decimal`'
);
equal(
he.encode('a<b', { 'decimal': true, 'encodeEverything': true }),
'&#97;&#60;&#98;',
'`encodeEverything` to decimal HTML entities'
);
equal(
he.encode('\0\x89', {'decimal': true}),
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, neither if `decimal`: true is used'
);
equal(
he.encode('\0\x89', { 'decimal': true, 'encodeEverything': true }),
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` and `decimal: true` is used'
);
equal(
he.encode('foo\xA9<bar\uD834\uDF06>baz\u2603"qux', { 'decimal': true, 'allowUnsafeSymbols': true }),
'foo&#169;<bar&#119558;>baz&#9731;"qux',
'Markup characters pass through when `allowUnsafeSymbols: true`, non-ASCII symbols are encoded to decimal HTML entities'
);
equal(
he.encode('a<b', { 'decimal': true, 'encodeEverything': true, 'allowUnsafeSymbols': true }),
'&#97;&#60;&#98;',
'`encodeEverything` to decimal HTML entities whereby `encodeEverything` takes precedence over `allowUnsafeSymbols`'
);
equal(
he.encode('a<\xE4>', { 'decimal': true, 'allowUnsafeSymbols': true, 'useNamedReferences': true }),
'a<&auml;>',
'encode to named HTML entities whereby `useNamedReferences` takes precedence over `decimal`,unsafe symbols allowed'
);
equal(
	he.encode('a<\u223E>', { 'decimal': true, 'allowUnsafeSymbols': true }),
	'a<&#8766;>',
	'`decimal` only affects non-ASCII symbols when `allowUnsafeSymbols: true`'
);
// With `allowUnsafeSymbols: false`, markup characters are escaped rather than
// rejected, so this is an equality check on the output and not a `raises`
// check (which would need a function to call, not an already-computed string).
equal(
	he.encode('a<\xE4>', { 'decimal': true, 'allowUnsafeSymbols': false }),
	'a&#60;&#228;&#62;',
	'Unsafe symbols and non-ASCII symbols are encoded to decimal HTML entities when `allowUnsafeSymbols: false`'
);
raises(
function() {
he.encode('\0\x01\x02\x03\x04\x05\x06\x07\b\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF\uFFFE\uFFFF\uD83F\uDFFE\uD83F\uDFFF\uD87F\uDFFE\uD87F\uDFFF\uD8BF\uDFFE\uD8BF\uDFFF\uD8FF\uDFFE\uD8FF\uDFFF\uD93F\uDFFE\uD93F\uDFFF\uD97F\uDFFE\uD97F\uDFFF\uD9BF\uDFFE\uD9BF\uDFFF\uD9FF\uDFFE\uD9FF\uDFFF\uDA3F\uDFFE\uDA3F\uDFFF\uDA7F\uDFFE\uDA7F\uDFFF\uDABF\uDFFE\uDABF\uDFFF\uDAFF\uDFFE\uDAFF\uDFFF\uDB3F\uDFFE\uDB3F\uDFFF\uDB7F\uDFFE\uDB7F\uDFFF\uDBBF\uDFFE\uDBBF\uDFFF\uDBFF\uDFFE\uDBFF\uDFFF', { 'decimal': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `decimal: true`, `strict: true`'
);
raises(
function() {
he.encode('\0\x01\x02\x03\x04\x05\x06\x07\b\x0B\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF\uFFFE\uFFFF\uD83F\uDFFE\uD83F\uDFFF\uD87F\uDFFE\uD87F\uDFFF\uD8BF\uDFFE\uD8BF\uDFFF\uD8FF\uDFFE\uD8FF\uDFFF\uD93F\uDFFE\uD93F\uDFFF\uD97F\uDFFE\uD97F\uDFFF\uD9BF\uDFFE\uD9BF\uDFFF\uD9FF\uDFFE\uD9FF\uDFFF\uDA3F\uDFFE\uDA3F\uDFFF\uDA7F\uDFFE\uDA7F\uDFFF\uDABF\uDFFE\uDABF\uDFFF\uDAFF\uDFFE\uDAFF\uDFFF\uDB3F\uDFFE\uDB3F\uDFFF\uDB7F\uDFFE\uDB7F\uDFFF\uDBBF\uDFFE\uDBBF\uDFFF\uDBFF\uDFFE\uDBFF\uDFFF', { 'decimal': true, 'allowUnsafeSymbols': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `decimal: true`, `allowUnsafeSymbols: true` and `strict: true`'
);
});
test('escape', function() {
equal(
Expand Down
64 changes: 64 additions & 0 deletions tests/tests.src.js
Expand Up @@ -6620,6 +6620,70 @@
Error,
'Parse error: forbidden code point when `allowUnsafeSymbols: true` and `strict: true`'
);
equal(
he.encode('\xE4\xF6\xFC\xC4\xD6\xDC', { 'decimal': true}),
'&#228;&#246;&#252;&#196;&#214;&#220;',
'encode to decimal HTML entities'
);
equal(
he.encode('\xE4\xF6\xFC\xC4\xD6\xDC', { 'decimal': true, 'useNamedReferences': true }),
'&auml;&ouml;&uuml;&Auml;&Ouml;&Uuml;',
'encode to named HTML entities whereby `useNamedReferences` takes precedence over `decimal`'
);
equal(
he.encode('a<b', { 'decimal': true, 'encodeEverything': true }),
'&#97;&#60;&#98;',
'`encodeEverything` to decimal HTML entities'
);
equal(
he.encode('\0\x89', {'decimal': true}),
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, neither if `decimal`: true is used'
);
equal(
he.encode('\0\x89', { 'decimal': true, 'encodeEverything': true }),
'\0\x89',
'Does not encode invalid code points whose character references would refer to another code point, even when `encodeEverything: true` and `decimal: true` is used'
);
equal(
he.encode('foo\xA9<bar\uD834\uDF06>baz\u2603"qux', { 'decimal': true, 'allowUnsafeSymbols': true }),
'foo&#169;<bar&#119558;>baz&#9731;"qux',
'Markup characters pass through when `allowUnsafeSymbols: true`, non-ASCII symbols are encoded to decimal HTML entities'
);
equal(
he.encode('a<b', { 'decimal': true, 'encodeEverything': true, 'allowUnsafeSymbols': true }),
'&#97;&#60;&#98;',
'`encodeEverything` to decimal HTML entities whereby `encodeEverything` takes precedence over `allowUnsafeSymbols`'
);
equal(
he.encode('a<\xE4>', { 'decimal': true, 'allowUnsafeSymbols': true, 'useNamedReferences': true }),
'a<&auml;>',
'encode to named HTML entities whereby `useNamedReferences` takes precedence over `decimal`,unsafe symbols allowed'
);
equal(
	he.encode('a<\u223E>', { 'decimal': true, 'allowUnsafeSymbols': true }),
	'a<&#8766;>',
	'`decimal` only affects non-ASCII symbols when `allowUnsafeSymbols: true`'
);
// With `allowUnsafeSymbols: false`, markup characters are escaped rather than
// rejected, so this is an equality check on the output and not a `raises`
// check (which would need a function to call, not an already-computed string).
equal(
	he.encode('a<\xE4>', { 'decimal': true, 'allowUnsafeSymbols': false }),
	'a&#60;&#228;&#62;',
	'Unsafe symbols and non-ASCII symbols are encoded to decimal HTML entities when `allowUnsafeSymbols: false`'
);
raises(
function() {
he.encode(<%= stringInvalidCodePoints %>, { 'decimal': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `decimal: true`, `strict: true`'
);
raises(
function() {
he.encode(<%= stringInvalidCodePoints %>, { 'decimal': true, 'allowUnsafeSymbols': true, 'strict': true });
},
Error,
'Parse error: forbidden code point when `decimal: true`, `allowUnsafeSymbols: true` and `strict: true`'
);
});
test('escape', function() {
equal(
Expand Down

0 comments on commit a53a932

Please sign in to comment.