Skip to content

Commit

Permalink
Support expanding XML entities
Browse files Browse the repository at this point in the history
  • Loading branch information
matteodelabre committed Sep 4, 2016
1 parent 309dea5 commit 7bc3851
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 0 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,15 @@ for every tag and it takes some time.
The result is an object associating the attribute names (as object keys)
to their attribute values (as object values).

#### `Saxophone.parseEntities(text)`

Parses a piece of XML text and expands all XML entities inside it to
the characters they represent. Just like attribute parsing, this is not
done automatically because it takes some time.

This ignores invalid entities, including unrecognized ones, leaving them
as-is.

### Events

#### `tagopen`
Expand Down Expand Up @@ -166,6 +175,8 @@ instruction is passed.

Emitted when a text node between two tags is parsed.
An object with the `contents` of the text node is passed.
You might need to expand XML entities inside the contents of
the text node, using `Saxophone.parseEntities`.

#### `cdata`

Expand Down
1 change: 1 addition & 0 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ const saxophonePrototype = Object.assign(

// load the static properties and methods
require('./static/attrs')(Saxophone);
require('./static/entities')(Saxophone);
71 changes: 71 additions & 0 deletions lib/static/entities.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// The five entities predefined by XML, keyed by their full textual form.
// A Map is used so that names such as `&constructor;` cannot accidentally
// resolve to something inherited from Object.prototype.
const PREDEFINED_ENTITIES = new Map([
    ['&quot;', '"'],
    ['&amp;', '&'],
    ['&apos;', "'"],
    ['&lt;', '<'],
    ['&gt;', '>'],
]);

// Strict shapes of numeric character references: digits only, no sign,
// no whitespace, no trailing garbage (which `parseInt` alone would accept).
const DECIMAL_REFERENCE = /^&#([0-9]+);$/;
const HEXADECIMAL_REFERENCE = /^&#x([0-9a-fA-F]+);$/;

// Highest valid Unicode code point; `String.fromCodePoint` throws above it.
const MAX_CODE_POINT = 0x10FFFF;

/**
 * Expand a single reference of the form `&...;`.
 *
 * @param {string} entity Reference text, including the leading `&` and
 * the trailing `;`.
 * @returns {string} The character the reference stands for, or `entity`
 * unchanged when it is unrecognized or malformed.
 */
const expandEntity = entity => {
    if (PREDEFINED_ENTITIES.has(entity)) {
        return PREDEFINED_ENTITIES.get(entity);
    }

    let codePoint;
    const hexadecimal = HEXADECIMAL_REFERENCE.exec(entity);

    if (hexadecimal !== null) {
        codePoint = parseInt(hexadecimal[1], 16);
    } else {
        const decimal = DECIMAL_REFERENCE.exec(entity);

        // neither a known name nor a well-formed numeric reference
        if (decimal === null) {
            return entity;
        }

        codePoint = parseInt(decimal[1], 10);
    }

    // out-of-range values (including absurdly long digit runs) are invalid
    if (codePoint > MAX_CODE_POINT) {
        return entity;
    }

    // fromCodePoint, unlike fromCharCode, handles astral code points
    return String.fromCodePoint(codePoint);
};

/**
 * Expand the XML entities found in a piece of text.
 *
 * Handles the five predefined entities (`&quot;`, `&amp;`, `&apos;`,
 * `&lt;`, `&gt;`) as well as decimal (`&#167;`) and hexadecimal
 * (`&#xA7;`) character references. Unknown, malformed or unterminated
 * references are left in the output exactly as they were written.
 *
 * @param {string} input Text in which to expand entities.
 * @returns {string} The text with every valid entity expanded.
 */
const parseEntities = input => {
    const parts = [];
    let position = 0;
    let next = 0;

    while ((next = input.indexOf('&', position)) !== -1) {
        const end = input.indexOf(';', next);

        // Unterminated reference: nothing after this point can be expanded.
        // Bail out before copying anything so that the tail slice below
        // emits the remaining text exactly once.
        if (end === -1) {
            break;
        }

        // A reference cannot contain '&', so it really starts at the last
        // ampersand before the semicolon; earlier ones are literal text.
        const start = input.lastIndexOf('&', end);

        // remember anything there was before the reference
        if (start > position) {
            parts.push(input.slice(position, start));
        }

        parts.push(expandEntity(input.slice(start, end + 1)));
        position = end + 1;
    }

    if (position < input.length) {
        parts.push(input.slice(position));
    }

    return parts.join('');
};

module.exports = Saxophone => {
Saxophone.parseEntities = parseEntities;
};
26 changes: 26 additions & 0 deletions lib/static/entities.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
const test = require('tape');
const Saxophone = require('../');

// Each predefined XML entity must expand to the literal character it names.
test('should normalize character entity references', assert => {
    // [input, expected output, assertion message]
    const cases = [
        ['&quot;Run!&quot;, he said', '"Run!", he said', 'normalize &quot;'],
        ['&amp; On &amp; On &amp; On', '& On & On & On', 'normalize &amp;'],
        ['J&apos;irai demain', "J'irai demain", 'normalize &apos;'],
        ['&lt;thisIsNotATag&gt;', '<thisIsNotATag>', 'normalize &gt; and &lt;'],
        ['&lt;&gt;&quot;&amp;&amp;&quot;&apos;&gt;', '<>"&&"\'>', 'normalize several'],
    ];

    cases.forEach(([input, expected, message]) => {
        assert.equal(Saxophone.parseEntities(input), expected, message);
    });

    assert.end();
});

// Decimal (&#N;) and hexadecimal (&#xN;) references must expand to the
// character with that code.
test('should normalize numeric character references', assert => {
    const expectExpansion = (input, expected, message) => {
        assert.equal(Saxophone.parseEntities(input), expected, message);
    };

    expectExpansion('&#xA7;', '§', 'normalize hexadecimal entities');
    expectExpansion('&#167;', '§', 'normalize decimal entities');
    expectExpansion(
        '&#8258;&#x2612;&#12291;&#x2E3B;',
        '⁂☒〃⸻',
        'normalize mixed entities'
    );

    assert.end();
});

// Anything that is not a valid, terminated, recognized reference must come
// back from parseEntities unchanged.
test('should ignore invalid character entity references', assert => {
    assert.equal(Saxophone.parseEntities('&unknown;'), '&unknown;', 'ignore unknown entity references');
    assert.equal(Saxophone.parseEntities('&amp'), '&amp', 'ignore unterminated entity references');
    assert.equal(Saxophone.parseEntities('&#notanumber;'), '&#notanumber;', 'ignore non-numeric decimal character references');
    assert.equal(Saxophone.parseEntities('&#xnotanumber;'), '&#xnotanumber;', 'ignore non-numeric hexadecimal character references');
    assert.end();
});

0 comments on commit 7bc3851

Please sign in to comment.