Skip to content

Commit

Permalink
encode: Add encodeEverything option
Browse files Browse the repository at this point in the history
Ref. #12.
  • Loading branch information
mathiasbynens committed Aug 7, 2013
1 parent 7076d0a commit 41a05b4
Show file tree
Hide file tree
Showing 10 changed files with 286 additions and 30 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ A string representing the semantic version number.

### `he.encode(text, options)`

This function takes a string of text and encodes any symbols that aren’t printable ASCII symbols and that can be replaced with character references. For example, it would turn `©` into `&copy;`, but it wouldn’t turn `+` into `&#x2B;` or `&plus;` since there is no point in doing so. Additionally, it replaces any remaining non-ASCII symbols with a hexadecimal escape sequence (e.g. `&#x1D306;`). The return value of this function is always valid HTML.
This function takes a string of text and encodes (by default) any symbols that aren’t printable ASCII symbols, replacing them with character references. As long as the input string contains allowed code points only, the return value of this function is always valid HTML.

```js
he.encode('foo © bar ≠ baz 𝌆 qux');
Expand Down Expand Up @@ -97,6 +97,29 @@ he.encode('foo © bar ≠ baz 𝌆 qux', {
// → 'foo © bar ≠ baz 𝌆 qux'
```

#### `encodeEverything`

The default value for the `encodeEverything` option is `false`. This means that `encode()` will not use any character references for printable ASCII symbols that don’t need escaping. Set it to `true` to encode every symbol in the input string.

```js
// Using the global default setting (defaults to `false`):
he.encode('foo © bar ≠ baz 𝌆 qux');
// → 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

// Passing an `options` object to `encode`, to explicitly encode all symbols:
he.encode('foo © bar ≠ baz 𝌆 qux', {
'encodeEverything': true
});
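// → '&#x66;&#x6F;&#x6F;&#x20;&#xA9;&#x20;&#x62;&#x61;&#x72;&#x20;&#x2260;&#x20;&#x62;&#x61;&#x7A;&#x20;&#x1D306;&#x20;&#x71;&#x75;&#x78;'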

// This setting can be combined with the `useNamedReferences` option:
he.encode('foo © bar ≠ baz 𝌆 qux', {
'encodeEverything': true,
'useNamedReferences': true
});
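// → '&#x66;&#x6F;&#x6F;&#x20;&copy;&#x20;&#x62;&#x61;&#x72;&#x20;&ne;&#x20;&#x62;&#x61;&#x7A;&#x20;&#x1D306;&#x20;&#x71;&#x75;&#x78;'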
```

#### Overriding default `encode` options globally

The global default setting can be overridden by modifying the `he.encode.options` object. This saves you from passing in an `options` object for every call to `encode` if you want to use the non-default setting.
Expand Down
26 changes: 26 additions & 0 deletions data/encode-lone-code-points.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
[
9,
10,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
46,
47,
58,
59,
60,
61,
62,
63,
64,
91,
92,
93,
94,
95,
96,
123,
124,
125,
160,
161,
162,
Expand Down
27 changes: 27 additions & 0 deletions data/encode-map.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"\uD835\uDC9C": "Ascr",
"\uD835\uDCB6": "ascr",
"\u2254": "colone",
"*": "ast",
"\u224D": "CupCap",
"\u00C3": "Atilde",
"\u00E3": "atilde",
Expand Down Expand Up @@ -175,6 +176,7 @@
"\uD835\uDCB7": "bscr",
"\u204F": "bsemi",
"\u29C5": "bsolb",
"\\": "bsol",
"\u27C8": "bsolhsub",
"\u2022": "bull",
"\u224E": "bump",
Expand Down Expand Up @@ -238,8 +240,11 @@
"\u201D": "rdquo",
"\u2019": "rsquo",
"\u2663": "clubs",
":": "colon",
"\u2237": "Colon",
"\u2A74": "Colone",
",": "comma",
"@": "commat",
"\u2201": "comp",
"\u2218": "compfn",
"\u2102": "Copf",
Expand Down Expand Up @@ -314,6 +319,7 @@
"\u21C3": "dharl",
"\u21C2": "dharr",
"\u02D9": "dot",
"`": "grave",
"\u02DC": "tilde",
"\u22C4": "diam",
"\u2666": "diams",
Expand All @@ -326,6 +332,7 @@
"\u0452": "djcy",
"\u231E": "dlcorn",
"\u230D": "dlcrop",
"$": "dollar",
"\uD835\uDD3B": "Dopf",
"\uD835\uDD55": "dopf",
"\u20DC": "DotDot",
Expand Down Expand Up @@ -424,6 +431,7 @@
"\u03F5": "epsiv",
"\u2242": "esim",
"\u2A75": "Equal",
"=": "equals",
"\u225F": "equest",
"\u21CC": "rlhar",
"\u2A78": "equivDD",
Expand All @@ -440,6 +448,7 @@
"\u00CB": "Euml",
"\u00EB": "euml",
"\u20AC": "euro",
"!": "excl",
"\u2203": "exist",
"\u0424": "Fcy",
"\u0444": "fcy",
Expand All @@ -451,6 +460,7 @@
"\uD835\uDD23": "ffr",
"\uFB01": "filig",
"\u25FC": "FilledSmallSquare",
"fj": "fjlig",
"\u266D": "flat",
"\uFB02": "fllig",
"\u25B1": "fltns",
Expand Down Expand Up @@ -543,6 +553,7 @@
"\u2948": "harrcir",
"\u2194": "harr",
"\u21AD": "harrw",
"^": "Hat",
"\u210F": "hbar",
"\u0124": "Hcirc",
"\u0125": "hcirc",
Expand Down Expand Up @@ -684,6 +695,8 @@
"\u290C": "lbarr",
"\u290E": "lBarr",
"\u2772": "lbbrk",
"{": "lcub",
"[": "lsqb",
"\u298B": "lbrke",
"\u298F": "lbrksld",
"\u298D": "lbrkslu",
Expand Down Expand Up @@ -767,9 +780,11 @@
"\u2A2D": "loplus",
"\u2A34": "lotimes",
"\u2217": "lowast",
"_": "lowbar",
"\u2199": "swarr",
"\u2198": "searr",
"\u25CA": "loz",
"(": "lpar",
"\u2993": "lparlt",
"\u296D": "lrhard",
"\u200E": "lrm",
Expand Down Expand Up @@ -857,6 +872,7 @@
"\u2262": "nequiv",
"\u2928": "toea",
"\u2242\u0338": "nesim",
"\n": "NewLine",
"\u2204": "nexist",
"\uD835\uDD11": "Nfr",
"\uD835\uDD2B": "nfr",
Expand Down Expand Up @@ -946,6 +962,7 @@
"\u00F1": "ntilde",
"\u039D": "Nu",
"\u03BD": "nu",
"#": "num",
"\u2116": "numero",
"\u2007": "numsp",
"\u224D\u20D2": "nvap",
Expand Down Expand Up @@ -1033,6 +1050,8 @@
"\u2202": "part",
"\u041F": "Pcy",
"\u043F": "pcy",
"%": "percnt",
".": "period",
"\u2030": "permil",
"\u2031": "pertenk",
"\uD835\uDD13": "Pfr",
Expand All @@ -1047,6 +1066,7 @@
"\u210E": "planckh",
"\u2A23": "plusacir",
"\u2A22": "pluscir",
"+": "plus",
"\u2A25": "plusdu",
"\u2A72": "pluse",
"\u00B1": "pm",
Expand Down Expand Up @@ -1087,6 +1107,7 @@
"\uD835\uDCAC": "Qscr",
"\uD835\uDCC6": "qscr",
"\u2A16": "quatint",
"?": "quest",
"\"": "quot",
"\u21DB": "rAarr",
"\u223D\u0331": "race",
Expand Down Expand Up @@ -1115,6 +1136,8 @@
"\u291C": "rAtail",
"\u2236": "ratio",
"\u2773": "rbbrk",
"}": "rcub",
"]": "rsqb",
"\u298C": "rbrke",
"\u298E": "rbrksld",
"\u2990": "rbrkslu",
Expand Down Expand Up @@ -1168,6 +1191,7 @@
"\u2A2E": "roplus",
"\u2A35": "rotimes",
"\u2970": "RoundImplies",
")": "rpar",
"\u2994": "rpargt",
"\u2A12": "rppolint",
"\u203A": "rsaquo",
Expand Down Expand Up @@ -1204,6 +1228,7 @@
"\u2A66": "sdote",
"\u21D8": "seArr",
"\u00A7": "sect",
";": "semi",
"\u2929": "tosa",
"\u2736": "sext",
"\uD835\uDD16": "Sfr",
Expand Down Expand Up @@ -1238,6 +1263,7 @@
"\u044C": "softcy",
"\u233F": "solbar",
"\u29C4": "solb",
"/": "sol",
"\uD835\uDD4A": "Sopf",
"\uD835\uDD64": "sopf",
"\u2660": "spades",
Expand Down Expand Up @@ -1411,6 +1437,7 @@
"\u22BB": "veebar",
"\u225A": "veeeq",
"\u22EE": "vellip",
"|": "vert",
"\u2016": "Vert",
"\u2758": "VerticalSeparator",
"\u2240": "wr",
Expand Down
1 change: 1 addition & 0 deletions data/encode-paired-symbols.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"<\u20D2",
"=\u20E5",
">\u20D2",
"fj",
"\u205F\u200A",
"\u219D\u0338",
"\u2202\u0338",
Expand Down
52 changes: 46 additions & 6 deletions he.js

Large diffs are not rendered by default.

51 changes: 44 additions & 7 deletions scripts/export-data.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
var fs = require('fs');
var stringEscape = require('jsesc');
var jsesc = require('jsesc');
var regenerate = require('regenerate');

var object = {};
Expand All @@ -14,24 +14,61 @@ var readJSON = function(fileName) {
if (isArray(object)) {
return object;
}
return stringEscape(object, {
return jsesc(object, {
'compact': true,
'quotes': 'single'
});
};

/**
 * Combine two pattern fragments into one alternation.
 *
 * When both operands are truthy they are joined with a `|` separator;
 * when either one is falsy (e.g. an empty string) the operands are simply
 * added together, so no stray leading or trailing `|` is produced.
 *
 * @param {string} a - Left-hand fragment.
 * @param {string} b - Right-hand fragment.
 * @returns {string} `a + '|' + b` if both are truthy, otherwise `a + b`.
 */
var joinStrings = function(a, b) {
	// Guard: with a missing side there is nothing to separate.
	if (!a || !b) {
		return a + b;
	}
	return a + '|' + b;
};

// Build the ASCII and non-ASCII pattern sources used for encoding. Each one
// combines the multi-symbol sequences with the single code points, joined
// into a single alternation by `joinStrings`.

// Raw data sets loaded through `readJSON`.
var loneCodePoints = readJSON('encode-lone-code-points');
var arrayEncodeMultipleSymbols = readJSON('encode-paired-symbols');
// Keep only the multi-symbol strings made up entirely of characters in the
// range U+0000–U+007F.
var arrayEncodeMultipleSymbolsASCII = arrayEncodeMultipleSymbols
.filter(function(string) {
return /^[\0-\x7F]+$/.test(string);
});

// Single code points, split at the ASCII boundary: the first set drops
// everything from 0x80 up to 0x10FFFF, the second drops 0x00 through 0x7F.
// NOTE(review): `.toString()` presumably yields a regular expression source
// string for the remaining code points — confirm against the `regenerate`
// version in use.
var encodeSingleSymbolsASCII = regenerate(loneCodePoints)
.removeRange(0x7F + 1, 0x10FFFF).toString();
var encodeSingleSymbolsNonASCII = regenerate(loneCodePoints)
.removeRange(0x00, 0x7F).toString();
// Multi-symbol sequences joined with `|` and escaped with `jsesc`.
var encodeMultipleSymbolsASCII = jsesc(
arrayEncodeMultipleSymbolsASCII.join('|')
);
// NOTE(review): `regenerate.difference` is assumed to return the entries of
// the first array that are absent from the second (i.e. the non-ASCII
// sequences) — verify against the `regenerate` API.
var encodeMultipleSymbolsNonASCII = jsesc(
regenerate.difference(
arrayEncodeMultipleSymbols,
arrayEncodeMultipleSymbolsASCII
).join('|')
);
// The multi-symbol alternatives are passed first, so they come before the
// single-symbol part in the joined string.
var encodeASCII = joinStrings(
encodeMultipleSymbolsASCII,
encodeSingleSymbolsASCII
);
var encodeNonASCII = joinStrings(
encodeMultipleSymbolsNonASCII,
encodeSingleSymbolsNonASCII
);

module.exports = {
'encodeMap': readJSON('encode-map'),
'encodeSingleSymbols': regenerate.fromCodePoints(readJSON('encode-lone-code-points')),
'encodeMultipleSymbols': stringEscape(readJSON('encode-paired-symbols').join('|')),
'encodeASCII': encodeASCII, // not used
'encodeNonASCII': encodeNonASCII,
'decodeOverrides': readJSON('decode-map-overrides'),
'decodeMap': readJSON('decode-map'),
'decodeMapLegacy': readJSON('decode-map-legacy'),
'astralSymbols': regenerate.fromCodePointRange(0x010000, 0x10FFFF),
'invalidCodePoints': '[' + readJSON('invalid-code-points').join(',') + ']',
'astralSymbol': regenerate.fromCodePointRange(0x010000, 0x10FFFF),
'invalidCodePoints': jsesc(readJSON('invalid-code-points')),
'regexDecimalEscapeSource': '&#([0-9]+)(;?)',
'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)',
'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
'regexLegacyReferenceSource': '&(' + readJSON('decode-legacy-named-references').join('|') + ')([=a-zA-Z0-9])?',
'regexLegacyReferenceSource': '&(' +
readJSON('decode-legacy-named-references').join('|') + ')([=a-zA-Z0-9])?',
'version': JSON.parse(fs.readFileSync('package.json', 'utf-8')).version
};
4 changes: 2 additions & 2 deletions scripts/process-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ _.forOwn(data, function(value, key) {
var string = value.characters;
var codePoints = value.codepoints;
var tmp;
if (/;$/.test(referenceWithoutLeadingAmpersand) && (!/^[\x20-\x7E\n]+$/g.test(string) || /^[&<>"']+$/g.test(string))) {
// only if the entity has a trailing semicolon, and the original string is not printable ASCII already
if (/;$/.test(referenceWithoutLeadingAmpersand)) {
// only if the entity has a trailing semicolon
tmp = encodeMap[string];
// Prefer short named character references with as few uppercase letters as possible
if ( // only add an entry if…
Expand Down
Loading

0 comments on commit 41a05b4

Please sign in to comment.