diff --git a/Makefile b/Makefile
index 8e9b769..9c4a0ea 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,6 @@ lint:
./node_modules/.bin/jshint *.js lib test
test:
- node --require should --test
+ node --test
.PHONY: check lint test
diff --git a/lib/stream.js b/lib/stream.js
index a9616cc..4eb46af 100644
--- a/lib/stream.js
+++ b/lib/stream.js
@@ -1,57 +1,47 @@
-const { Transform } = require('stream');
-const { StringDecoder } = require('string_decoder');
-const sax = require('sax');
+const Saxophone = require('@pirxpilot/saxophone');
+const stack = require('./stack');
+
const debug = require('debug')('sax-super-stream');
-const stack = require('./stack');
+/* global TransformStream, EventTarget */
+
module.exports = initParser;
// used to mark all tags that we want to skip
-const IGNORE = Object.create(null);
+const IGNORE = Symbol('ignore');
function stripPrefix(name) {
- const index = name.indexOf(':');
- return index < 0 ? name : name.slice(index + 1);
+ const prefixed = name.split(':', 2);
+ return prefixed[1] ? prefixed : [undefined, prefixed[0]];
}
-function handlers(parserConfig, fn) {
+function makeHandlers(parserConfig, emit) {
+ const ns = Object.create(null);
+ let defaultNS;
const items = stack();
const parsers = stack(parserConfig);
const context = {};
- let cdata;
- function onopentag(node) {
- const tag = node.local;
- const tagParser = verifyNS(parsers.top()[tag]);
- let elem;
+ const target = new EventTarget();
+ target.addEventListener('tagopen', ontagopen);
+ target.addEventListener('tagclose', ontagclose);
+ target.addEventListener('text', ontext);
+ target.addEventListener('cdata', ontext);
+ return target;
- function parseWith(tp) {
- elem = tp.call(items, node, items.top(), context);
- if (elem) {
- items.push(elem, tag);
- }
- }
+ function ontagopen({ detail: { name, attrs, isSelfClosing } }) {
+ const [prefix, tag] = stripPrefix(name);
+ const attrsObj = Saxophone.parseAttrs(attrs);
+ updateNamespaces(attrsObj);
+ const uri = (prefix && ns[prefix]) ?? defaultNS;
- // if parser specifies namespace, it has to much naode namespace
- function verifyNS(tp) {
- if (!tp) {
- return tp;
- }
- if (!tp.$uri) {
- return tp;
- }
- if (tp.$uri === node.uri) {
- return tp;
- }
- }
+ const tagParser = getParser(tag, uri);
- debug('onopentag', tag);
+ debug('tagopen', tag, attrs);
if (!tagParser) {
parsers.push(IGNORE, tag);
- return;
- }
- if (typeof tagParser === 'function') {
+ } else if (typeof tagParser === 'function') {
parseWith(tagParser);
} else {
if (typeof tagParser.$ === 'function') {
@@ -59,136 +49,119 @@ function handlers(parserConfig, fn) {
}
parsers.push(tagParser, tag);
}
- }
+ if (isSelfClosing) {
+ ontagclose({ detail: { name } });
+ }
- function onclosetag(tag) {
- let parser;
- let top;
+ function parseWith(tp) {
+ const attributes = Object.fromEntries(
+ Object.entries(attrsObj).map(
+ ([name, value]) => [
+ name,
+ {
+ name,
+ value: Saxophone.parseEntities(value)
+ }
+ ]
+ )
+ );
+ const node = { prefix, tag, attributes };
+ const elem = tp.call(items, node, items.top(), context);
+ if (elem) {
+ items.push(elem, tag);
+ }
+ }
+ }
- tag = stripPrefix(tag);
+ function ontagclose({ detail: { name } }) {
+ const [, tag] = stripPrefix(name);
- debug('closetag', tag);
+ debug('tagclose', tag);
- parser = parsers.pop(tag);
- if (parser && typeof parser.$after === 'function') {
+ const parser = parsers.pop(tag);
+ if (typeof parser?.$after === 'function') {
+ debug('$after', tag);
parser.$after.call(items, items.top(), context);
}
if (parser !== IGNORE) {
- top = items.pop(tag);
+ const top = items.pop(tag);
// if nothing on the stack emit result
if (top !== undefined && items.empty()) {
- fn(null, top);
+ emit(top);
}
}
}
- function ontext(value) {
- const textParser = parsers.top().$text;
- if (textParser) {
- textParser.call(items, value, items.top(), context);
- }
- }
+ function ontext({ type, detail: { contents } }) {
+ debug('text', contents);
- function onopencdata() {
const textParser = parsers.top().$text;
- if (textParser) {
- cdata = [];
- }
+ textParser?.call(
+ items,
+ type === 'cdata' ? contents : Saxophone.parseEntities(contents),
+ items.top(),
+ context
+ );
}
- function oncdata(value) {
- if (cdata) {
- cdata.push(value);
+ function updateNamespaces(attrsObj) {
+ Object.entries(attrsObj).forEach(xmlns);
+
+ function xmlns([name, value]) {
+ if (name === 'xmlns') {
+ defaultNS = value;
+ } else if (name.startsWith('xmlns:')) {
+ const prefix = name.slice(6); // 'xmlns:'.length === 6
+ ns[prefix] = value;
+ }
}
}
- function onclosecdata() {
- if (!cdata) {
+ function getParser(tag, uri) {
+ const top = parsers.top();
+ if (top === IGNORE) {
return;
}
- const textParser = parsers.top().$text;
- textParser.call(items, cdata.join(''), items.top(), context);
- cdata = undefined;
- }
-
- function onerror(err) {
- debug('Detected error', err);
- // mark error as handled
- this.error = null;
- fn(err);
+ const tp = top[tag];
+ if (!tp) {
+ return;
+ }
+ if (!tp.$uri) {
+ return tp;
+ }
+ // if parser specifies namespace, it has to match node namespace
+ if (tp.$uri === uri) {
+ return tp;
+ }
}
-
- return {
- onopentag,
- onclosetag,
- ontext,
- onopencdata,
- oncdata,
- onclosecdata,
- onerror
- };
}
-function initParser(parserConfig, saxOptions) {
+function initParser(config) {
- saxOptions = Object.assign({
- trim: true,
- normalize: true,
- lowercase: false,
- xmlns: true,
- position: false,
- strictEntities: true,
- noscript: true
- }, saxOptions);
+ const results = [];
- const parser = sax.parser(true, saxOptions);
- const decoder = new StringDecoder('utf8');
- let results = [];
- let parserError;
+ const target = makeHandlers(config, (...objs) => results.push(...objs));
+ const sax = new Saxophone(target);
+ const writer = sax.getWriter();
- Object.assign(parser, handlers(parserConfig, (err, obj) => {
- if (!err) {
- results.push(obj);
- } else {
- // only report the first error
- parserError = parserError || err;
- }
- }));
-
- const ts = new Transform({
- readableObjectMode: true,
- flush(next) {
- parser.close();
- if (parserError) {
- return next(parserError);
- }
- flush(this);
- next();
+ return new TransformStream({
+ async flush(controller) {
+ await writer.ready;
+ await writer.close();
+ flush(controller);
+ controller.terminate();
},
- transform(chunk, encoding, next) {
- if (parserError) {
- return next(parserError);
- }
- write(chunk);
- flush(this);
- next();
+ async transform(chunk, controller) {
+ debug('writing', chunk);
+ await writer.ready;
+ await writer.write(chunk);
+ flush(controller);
}
});
- return ts;
-
- function write(chunk) {
- const str = decoder.write(chunk);
- parser.write(str);
- }
- function flush(stream) {
- if (!results.length) {
- return;
- }
- results.forEach(r => {
- stream.push(r);
- });
- results = [];
+ function flush(controller) {
+ results.forEach(r => controller.enqueue(r));
+ results.length = 0;
}
}
diff --git a/package.json b/package.json
index 217d8e0..152ab48 100644
--- a/package.json
+++ b/package.json
@@ -16,12 +16,11 @@
"object"
],
"dependencies": {
- "debug": "~2 || ~3 || ~4",
- "sax": "^1.2.1"
+ "@pirxpilot/saxophone": "^1.0.0",
+ "debug": "~2 || ~3 || ~4"
},
"devDependencies": {
- "jshint": "~2",
- "should": "~13"
+ "jshint": "~2"
},
"scripts": {
"test": "make check"
@@ -30,4 +29,4 @@
"index.js",
"lib"
]
-}
\ No newline at end of file
+}
diff --git a/test/fixtures/nested.xml b/test/fixtures/nested.xml
new file mode 100644
index 0000000..4ff9ded
--- /dev/null
+++ b/test/fixtures/nested.xml
@@ -0,0 +1,12 @@
+
+<nested>
+  <item>
+    <a>abc</a>
+    <item>
+      <a>def</a>
+      <item>
+        <a>ghi</a>
+      </item>
+    </item>
+  </item>
+</nested>
diff --git a/test/stream.js b/test/stream.js
index 3c6736f..a126be2 100644
--- a/test/stream.js
+++ b/test/stream.js
@@ -1,228 +1,285 @@
-const { describe, it } = require('node:test');
+const test = require('node:test');
+const assert = require('node:assert/strict');
const fs = require('node:fs');
+const path = require('node:path');
const { pipeline } = require('node:stream/promises');
-const { Writable, Readable } = require('stream');
const stream = require('..');
function readStream(name) {
- return fs.createReadStream([__dirname, 'fixtures', name].join('/'));
+ return fs.createReadStream(path.join(__dirname, 'fixtures', name));
}
+/* global WritableStream, ReadableStream, TextDecoderStream */
+
function memory(array) {
- return new Writable({
- objectMode: true,
- write(item, encoding, next) {
- array.push(item);
- next();
- }
+ return new WritableStream({
+ write: item => array.push(item)
});
}
-describe('sax super stream', () => {
- it('should parse a single empty node', async () => {
- const config = {
- 'item': stream.object()
- };
- const result = [];
+test('should parse a single empty node', async () => {
+ const config = {
+ 'item': stream.object()
+ };
+ const result = [];
- await pipeline(
- readStream('one.xml'),
- stream(config),
- memory(result)
- );
- result.should.have.length(1);
- result[0].should.eql({});
+ await pipeline(
+ readStream('one.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
- });
+ assert.deepEqual(result, [{}]);
+});
- it('should parse nodes with text', async () => {
- const config = {
- 'two': {
- 'item': {
- $: stream.object(),
- 'a': { $text: stream.assignTo('A') },
- 'b': { $text: stream.assignTo('B') }
- }
+test('should parse nodes with text', async () => {
+ const config = {
+ 'two': {
+ 'item': {
+ $: stream.object(),
+ 'a': { $text: stream.assignTo('A') },
+ 'b': { $text: stream.assignTo('B') }
}
- };
- const result = [];
-
- await pipeline(
- readStream('two.xml'),
- stream(config),
- memory(result)
- );
- result.should.have.length(2);
- result[0].should.have.property('A', 'abc');
- result[0].should.have.property('B', '15');
- result[1].should.have.property('A', 'def');
- result[1].should.have.property('B', '16');
-
-
- });
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('two.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+ assert.deepEqual(result, [
+ { A: 'abc', B: '15' },
+ { A: 'def', B: '16' }
+ ]);
+});
- it('should parse attributes', async () => {
- const config = {
- 'THREE': {
- 'ITEMS': {
- 'ITEM': {
- $: stream.object(),
- 'A': { $: appendToCollection },
- 'B': { $: addToParent }
- }
+test('should parse nested nodes', async () => {
+ const itemParser = {
+ $: (_, parent) => parent.item = {},
+ a: { $text: stream.assignTo('A') }
+ };
+ itemParser.item = itemParser;
+ const config = {
+ nested: {
+ $: stream.object(),
+ item: itemParser
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('nested.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+ assert.deepEqual(result, [{
+ item: {
+ A: 'abc',
+ item: {
+ A: 'def',
+ item: {
+ A: 'ghi',
}
}
- };
- const result = [];
-
- function appendToCollection({ attributes }, parent) {
- const obj = {
- value: attributes.attr.value
- };
- parent.children = parent.children || [];
- parent.children.push(obj);
- }
-
- function addToParent({ attributes }, parent) {
- parent.b = attributes.attr.value;
}
+ }]);
+});
- await pipeline(
- readStream('three.xml'),
- stream(config, { lowercase: true }),
- memory(result)
- );
- let item;
- let a;
-
- result.should.have.length(1);
-
- item = result[0];
- item.should.have.property('b', '4');
-
- a = item.children;
- a.should.have.length(3);
- a[0].should.have.property('value', '1');
-
- });
-
- it('should call $after parser if specified', async () => {
- let value = 0;
- const config = {
- 'doc': {
- 'item': {
+test('should parse attributes', async () => {
+ const config = {
+ 'THREE': {
+ 'ITEMS': {
+ 'ITEM': {
$: stream.object(),
- $after(obj) { obj.value = value++; }
+ 'A': { $: appendToCollection },
+ 'B': { $: addToParent }
}
}
- };
- const result = [];
-
- await pipeline(
- readStream('ns.xml'),
- stream(config),
- memory(result)
- );
-
- value.should.be.eql(2);
- result.should.have.length(2);
- result[0].should.have.property('value', 0);
- result[1].should.have.property('value', 1);
-
- });
+ }
+ };
+ const result = [];
- it('should ignore namespace if none declared', async () => {
- const config = {
- 'doc': {
- 'item': { $: stream.object() }
- }
+ function appendToCollection({ attributes }, parent) {
+ const obj = {
+ value: attributes.attr.value
};
- const result = [];
-
- await pipeline(
- readStream('ns.xml'),
- stream(config),
- memory(result)
- );
-
- result.should.have.length(2);
- });
+ parent.children = parent.children || [];
+ parent.children.push(obj);
+ }
+
+ function addToParent({ attributes }, parent) {
+ parent.b = attributes.attr.value;
+ }
+
+ await pipeline(
+ readStream('three.xml'),
+ new TextDecoderStream(),
+ stream(config, { lowercase: true }),
+ memory(result)
+ );
+
+ assert.deepEqual(result, [{
+ b: '4',
+ children: [
+ { value: '1' },
+ { value: '2' },
+ { value: '3' }
+ ]
+ }]);
+});
- it('should accept elements if namespace matches $uri attribute', async () => {
- const config = {
- 'doc': {
- $uri: 'http://example.com',
- 'item': { $: stream.object() }
+test('should call $after parser if specified', async () => {
+ let value = 0;
+ const config = {
+ 'doc': {
+ 'item': {
+ $: stream.object(),
+ $after(obj) { obj.value = value++; }
}
- };
- const result = [];
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('ns.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+
+ assert.equal(value, 2);
+ assert.deepEqual(result, [
+ { value: 0 },
+ { value: 1 }
+ ]);
+});
- await pipeline(
- readStream('ns.xml'),
- stream(config),
- memory(result));
- result.should.have.length(2);
+test('should ignore namespace if none declared', async () => {
+ const config = {
+ 'doc': {
+ 'item': { $: stream.object() }
+ }
+ };
+ const result = [];
- });
+ await pipeline(
+ readStream('ns.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
- it('should ignore elements if namespace does not match $uri attribute', async () => {
- const config = {
- 'doc': {
- $uri: 'http://another.com',
- 'item': { $: stream.object() }
- }
- };
- const result = [];
+ assert.deepEqual(result, [{}, {}]);
+});
- await pipeline(
- readStream('ns.xml'),
- stream(config),
- memory(result));
+test('should accept elements if namespace matches $uri attribute', async () => {
+ const config = {
+ 'doc': {
+ $uri: 'http://example.com',
+ 'item': { $: stream.object() }
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('ns.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+ assert.deepEqual(result, [{}, {}]);
+});
- result.should.have.length(0);
- });
+test('should ignore elements if namespace does not match $uri attribute', async () => {
+ const config = {
+ 'doc': {
+ $uri: 'http://another.com',
+ 'item': { $: stream.object() }
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('ns.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+ assert.deepEqual(result, []);
+});
- it('should parse CDATA as text', async () => {
- const config = {
- 'FOUR': {
- 'ITEM': {
- $: stream.object(),
- 'A': { $text: stream.assignTo('a') },
- 'B': { $text: stream.assignTo('b') }
- }
+test('should parse CDATA as text', async () => {
+ const config = {
+ 'FOUR': {
+ 'ITEM': {
+ $: stream.object(),
+ 'A': { $text: stream.assignTo('a') },
+ 'B': { $text: stream.assignTo('b') }
}
- };
- const result = [];
-
- await pipeline(
- readStream('four.xml'),
- stream(config),
- memory(result));
-
- result.should.have.length(2);
- result[0].should.be.eql({ a: 'abc', b: '15' });
- result[1].should.be.eql({ a: 'def', b: '16' });
-
- });
-
- it('should raise errors on invalid XML', async () => {
-
- const config = {
- 'item': { $: stream.object() }
- };
+ }
+ };
+ const result = [];
+
+ await pipeline(
+ readStream('four.xml'),
+ new TextDecoderStream(),
+ stream(config),
+ memory(result)
+ );
+
+ assert.deepEqual(result, [
+ { a: 'abc', b: '15' },
+ { a: 'def', b: '16' }
+ ]);
+});
- const from = new Readable({
- read() {}
- });
+test('should parse entities in text but not in CDATA', async () => {
+ const config = {
+ doc: {
+ item: {
+ $: stream.object(),
+ $text: stream.assignTo('a'),
+ }
+ }
+ };
+ const result = [];
+
+ const from = ReadableStream.from([
+ '',
+ '- 1<2
',
+ ' ',
+ ''
+ ]);
+
+ await pipeline(
+ from,
+ stream(config),
+ memory(result)
+ );
+
+ assert.deepEqual(result, [
+ { a: '1<2' },
+ { a: '1&2' }
+ ]);
+});
- const pipe = pipeline(from, stream(config));
+test('should raise errors on invalid XML', async () => {
+ const config = {
+ 'item': { $: stream.object() }
+ };
- from.push('- ');
- from.push('');
- from.push(null);
+ const from = ReadableStream.from([
+ '
- ',
+ ''
+ ]);
- await pipe.should.be.rejectedWith('Unexpected close tag');
- });
+ const pipe = pipeline(from, stream(config), memory([]));
+ await assert.rejects(pipe, /unclosed tag: item/i);
});