From 7bc385195c401002ce8edf596f609a166092376f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matt=C3=A9o=20Delabre=20=E2=9C=8F?= Date: Sun, 4 Sep 2016 21:38:34 +0200 Subject: [PATCH] Support expanding XML entities --- README.md | 11 ++++++ lib/index.js | 1 + lib/static/entities.js | 71 +++++++++++++++++++++++++++++++++++++ lib/static/entities.test.js | 26 ++++++++++++++ 4 files changed, 109 insertions(+) create mode 100644 lib/static/entities.js create mode 100644 lib/static/entities.test.js diff --git a/README.md b/README.md index 89e2222..d2436cd 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,15 @@ for every tag and it takes some time. The result is an object associating the attribute names (as object keys) to their attribute values (as object values). +#### `Saxophone.parseEntities(text)` + +Parses a piece of XML text and expands all XML entities inside it to +the character they represent. Just like attributes, this is not +parsed automatically because it takes some time. + +This ignores invalid entities, including unrecognized ones, leaving them +as-is. + ### Events #### `tagopen` @@ -166,6 +175,8 @@ instruction is passed. Emitted when a text node between two tags is parsed. An object with the `contents` of the text node is passed. +You might need to expand XML entities inside the contents of +the text node, using `Saxophone.parseEntities`. 
#### `cdata` diff --git a/lib/index.js b/lib/index.js index e67598f..786abfa 100644 --- a/lib/index.js +++ b/lib/index.js @@ -25,3 +25,4 @@ const saxophonePrototype = Object.assign( // load the static properties and methods require('./static/attrs')(Saxophone); +require('./static/entities')(Saxophone); diff --git a/lib/static/entities.js b/lib/static/entities.js new file mode 100644 index 0000000..f4545b0 --- /dev/null +++ b/lib/static/entities.js @@ -0,0 +1,71 @@ +const parseEntities = input => { + let position = 0, next = 0; + const parts = []; + + while ((next = input.indexOf('&', position)) !== -1) { + // remember anything there was before the entity + if (next > position) { + parts.push(input.slice(position, next)); + } + + const end = input.indexOf(';', next); + + // ignore unterminated entities + if (end === -1) { + break; + } + + const entity = input.slice(next, end); + + if (entity === '"') { + parts.push('"'); + } else if (entity === '&') { + parts.push('&'); + } else if (entity === '&apos') { + parts.push("'"); + } else if (entity === '<') { + parts.push('<'); + } else if (entity === '>') { + parts.push('>'); + } else { + // ignore unrecognized character entities + if (entity[1] !== '#') { + parts.push(entity + ';'); + } else { + // hexadecimal numeric entities + if (entity[2] == 'x') { + const value = parseInt(entity.slice(3), 16); + + // ignore non-numeric numeric entities + if (isNaN(value)) { + parts.push(entity + ';'); + } else { + parts.push(String.fromCharCode(value)); + } + } else { + // decimal numeric entities + const value = parseInt(entity.slice(2), 10); + + // ignore non-numeric numeric entities + if (isNaN(value)) { + parts.push(entity + ';'); + } else { + parts.push(String.fromCharCode(value)); + } + } + } + } + + position = end + 1; + } + + if (position < input.length) { + parts.push(input.slice(position)); + } + + return parts.join(''); +}; + +module.exports = Saxophone => { + Saxophone.parseEntities = parseEntities; +}; diff --git 
// lib/static/entities.test.js
//
// Tape tests for Saxophone.parseEntities. Inputs are written with literal
// entity references (e.g. '&quot;'); if they show up as the decoded
// characters, the file has been through an HTML-unescaping step and the
// tests no longer exercise anything.
const test = require('tape');
const Saxophone = require('../');

// The five predefined XML entities, alone and combined.
test('should normalize character entity references', assert => {
    assert.equal(Saxophone.parseEntities('&quot;Run!&quot;, he said'), '"Run!", he said', 'normalize &quot;');
    assert.equal(Saxophone.parseEntities('&amp; On &amp; On &amp; On'), '& On & On & On', 'normalize &amp;');
    assert.equal(Saxophone.parseEntities('J&apos;irai demain'), "J'irai demain", 'normalize &apos;');
    assert.equal(Saxophone.parseEntities('&lt;thisIsNotATag&gt;'), '<thisIsNotATag>', 'normalize &gt; and &lt;');
    assert.equal(Saxophone.parseEntities('&lt;&gt;&quot;&amp;&amp;&quot;&apos;&gt;'), '<>"&&"\'>', 'normalize several');
    assert.end();
});

// Decimal (&#NNN;) and hexadecimal (&#xHHH;) character references.
test('should normalize numeric character references', assert => {
    assert.equal(Saxophone.parseEntities('&#xA7;'), '§', 'normalize hexadecimal entities');
    assert.equal(Saxophone.parseEntities('&#167;'), '§', 'normalize decimal entities');
    assert.equal(Saxophone.parseEntities('&#8258;&#x2612;&#12291;&#x2E3B;'), '⁂☒〃⸻', 'normalize mixed entities');
    assert.end();
});

// Anything that is not a valid, terminated reference must come back unchanged.
test('should ignore invalid character entity references', assert => {
    assert.equal(Saxophone.parseEntities('&unknown;'), '&unknown;', 'ignore unknown entity references');
    assert.equal(Saxophone.parseEntities('&amp'), '&amp', 'ignore unterminated entity references');
    assert.equal(Saxophone.parseEntities('&#notanumber;'), '&#notanumber;', 'ignore non-numeric decimal character refrences');
    assert.equal(Saxophone.parseEntities('&#xnotanumber;'), '&#xnotanumber;', 'ignore non-numeric hexa character refrences');
    assert.end();
});