diff --git a/Makefile b/Makefile index 8e9b769..9c4a0ea 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,6 @@ lint: ./node_modules/.bin/jshint *.js lib test test: - node --require should --test + node --test .PHONY: check lint test diff --git a/lib/stream.js b/lib/stream.js index a9616cc..4eb46af 100644 --- a/lib/stream.js +++ b/lib/stream.js @@ -1,57 +1,47 @@ -const { Transform } = require('stream'); -const { StringDecoder } = require('string_decoder'); -const sax = require('sax'); +const Saxophone = require('@pirxpilot/saxophone'); +const stack = require('./stack'); + const debug = require('debug')('sax-super-stream'); -const stack = require('./stack'); +/* global TransformStream, EventTarget */ + module.exports = initParser; // used to mark all tags that we want to skip -const IGNORE = Object.create(null); +const IGNORE = Symbol('ignore'); function stripPrefix(name) { - const index = name.indexOf(':'); - return index < 0 ? name : name.slice(index + 1); + const prefixed = name.split(':', 2); + return prefixed[1] ? prefixed : [undefined, prefixed[0]]; } -function handlers(parserConfig, fn) { +function makeHandlers(parserConfig, emit) { + const ns = Object.create(null); + let defaultNS; const items = stack(); const parsers = stack(parserConfig); const context = {}; - let cdata; - function onopentag(node) { - const tag = node.local; - const tagParser = verifyNS(parsers.top()[tag]); - let elem; + const target = new EventTarget(); + target.addEventListener('tagopen', ontagopen); + target.addEventListener('tagclose', ontagclose); + target.addEventListener('text', ontext); + target.addEventListener('cdata', ontext); + return target; - function parseWith(tp) { - elem = tp.call(items, node, items.top(), context); - if (elem) { - items.push(elem, tag); - } - } + function ontagopen({ detail: { name, attrs, isSelfClosing } }) { + const [prefix, tag] = stripPrefix(name); + const attrsObj = Saxophone.parseAttrs(attrs); + updateNamespaces(attrsObj); + const uri = (prefix && ns[prefix]) ?? defaultNS; - // if parser specifies namespace, it has to much naode namespace - function verifyNS(tp) { - if (!tp) { - return tp; - } - if (!tp.$uri) { - return tp; - } - if (tp.$uri === node.uri) { - return tp; - } - } + const tagParser = getParser(tag, uri); - debug('onopentag', tag); + debug('tagopen', tag, attrs); if (!tagParser) { parsers.push(IGNORE, tag); - return; - } - if (typeof tagParser === 'function') { + } else if (typeof tagParser === 'function') { parseWith(tagParser); } else { if (typeof tagParser.$ === 'function') { @@ -59,136 +49,119 @@ function handlers(parserConfig, fn) { } parsers.push(tagParser, tag); } - } + if (isSelfClosing) { + ontagclose({ detail: { name } }); + } - function onclosetag(tag) { - let parser; - let top; + function parseWith(tp) { + const attributes = Object.fromEntries( + Object.entries(attrsObj).map( + ([name, value]) => [ + name, + { + name, + value: Saxophone.parseEntities(value) + } + ] + ) + ); + const node = { prefix, tag, attributes }; + const elem = tp.call(items, node, items.top(), context); + if (elem) { + items.push(elem, tag); + } + } + } - tag = stripPrefix(tag); + function ontagclose({ detail: { name } }) { + const [, tag] = stripPrefix(name); - debug('closetag', tag); + debug('tagclose', tag); - parser = parsers.pop(tag); - if (parser && typeof parser.$after === 'function') { + const parser = parsers.pop(tag); + if (typeof parser?.$after === 'function') { + debug('$after', tag); parser.$after.call(items, items.top(), context); } if (parser !== IGNORE) { - top = items.pop(tag); + const top = items.pop(tag); // if nothing on the stack emit result if (top !== undefined && items.empty()) { - fn(null, top); + emit(top); } } } - function ontext(value) { - const textParser = parsers.top().$text; - if (textParser) { - textParser.call(items, value, items.top(), context); - } - } + function ontext({ type, detail: { contents } }) { + debug('text', contents); - function onopencdata() { const textParser = parsers.top().$text; - if (textParser) { - cdata = []; - } + textParser?.call( + items, + type === 'cdata' ? contents : Saxophone.parseEntities(contents), + items.top(), + context + ); } - function oncdata(value) { - if (cdata) { - cdata.push(value); + function updateNamespaces(attrsObj) { + Object.entries(attrsObj).forEach(xmlns); + + function xmlns([name, value]) { + if (name === 'xmlns') { + defaultNS = value; + } else if (name.startsWith('xmlns:')) { + const prefix = name.slice(6); // 'xmlns:'.length === 6 + ns[prefix] = value; + } } } - function onclosecdata() { - if (!cdata) { + function getParser(tag, uri) { + const top = parsers.top(); + if (top === IGNORE) { return; } - const textParser = parsers.top().$text; - textParser.call(items, cdata.join(''), items.top(), context); - cdata = undefined; - } - - function onerror(err) { - debug('Detected error', err); - // mark error as handled - this.error = null; - fn(err); + const tp = top[tag]; + if (!tp) { + return; + } + if (!tp.$uri) { + return tp; + } + // if parser specifies namespace, it has to match node namespace + if (tp.$uri === uri) { + return tp; + } } - - return { - onopentag, - onclosetag, - ontext, - onopencdata, - oncdata, - onclosecdata, - onerror - }; } -function initParser(parserConfig, saxOptions) { +function initParser(config) { - saxOptions = Object.assign({ - trim: true, - normalize: true, - lowercase: false, - xmlns: true, - position: false, - strictEntities: true, - noscript: true - }, saxOptions); + const results = []; - const parser = sax.parser(true, saxOptions); - const decoder = new StringDecoder('utf8'); - let results = []; - let parserError; + const target = makeHandlers(config, (...objs) => results.push(...objs)); + const sax = new Saxophone(target); + const writer = sax.getWriter(); - Object.assign(parser, handlers(parserConfig, (err, obj) => { - if (!err) { - results.push(obj); - } else { - // only report the first error - parserError = parserError || err; - } - })); - - const ts = new Transform({ - readableObjectMode: true, - flush(next) { - parser.close(); - if (parserError) { - return next(parserError); - } - flush(this); - next(); + return new TransformStream({ + async flush(controller) { + await writer.ready; + await writer.close(); + flush(controller); + controller.terminate(); }, - transform(chunk, encoding, next) { - if (parserError) { - return next(parserError); - } - write(chunk); - flush(this); - next(); + async transform(chunk, controller) { + debug('writing', chunk); + await writer.ready; + await writer.write(chunk); + flush(controller); } }); - return ts; - - function write(chunk) { - const str = decoder.write(chunk); - parser.write(str); - } - function flush(stream) { - if (!results.length) { - return; - } - results.forEach(r => { - stream.push(r); - }); - results = []; + function flush(controller) { + results.forEach(r => controller.enqueue(r)); + results.length = 0; } } diff --git a/package.json b/package.json index 217d8e0..152ab48 100644 --- a/package.json +++ b/package.json @@ -16,12 +16,11 @@ "object" ], "dependencies": { - "debug": "~2 || ~3 || ~4", - "sax": "^1.2.1" + "@pirxpilot/saxophone": "^1.0.0", + "debug": "~2 || ~3 || ~4" }, "devDependencies": { - "jshint": "~2", - "should": "~13" + "jshint": "~2" }, "scripts": { "test": "make check" @@ -30,4 +29,4 @@ "index.js", "lib" ] -} \ No newline at end of file +} diff --git a/test/fixtures/nested.xml b/test/fixtures/nested.xml new file mode 100644 index 0000000..4ff9ded --- /dev/null +++ b/test/fixtures/nested.xml @@ -0,0 +1,12 @@ + + + + abc + + def + + ghi + + + + diff --git a/test/stream.js b/test/stream.js index 3c6736f..a126be2 100644 --- a/test/stream.js +++ b/test/stream.js @@ -1,228 +1,285 @@ -const { describe, it } = require('node:test'); +const test = require('node:test'); +const assert = require('node:assert/strict'); const fs = require('node:fs'); +const path = require('node:path'); const { pipeline } = require('node:stream/promises'); -const { Writable, Readable } = require('stream'); const stream = require('..'); function readStream(name) { - return fs.createReadStream([__dirname, 'fixtures', name].join('/')); + return fs.createReadStream(path.join(__dirname, 'fixtures', name)); } +/* global WritableStream, ReadableStream, TextDecoderStream */ + function memory(array) { - return new Writable({ - objectMode: true, - write(item, encoding, next) { - array.push(item); - next(); - } + return new WritableStream({ + write: item => array.push(item) }); } -describe('sax super stream', () => { - it('should parse a single empty node', async () => { - const config = { - 'item': stream.object() - }; - const result = []; +test('should parse a single empty node', async () => { + const config = { + 'item': stream.object() + }; + const result = []; - await pipeline( - readStream('one.xml'), - stream(config), - memory(result) - ); - result.should.have.length(1); - result[0].should.eql({}); + await pipeline( + readStream('one.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); - }); + assert.deepEqual(result, [{}]); +}); - it('should parse nodes with text', async () => { - const config = { - 'two': { - 'item': { - $: stream.object(), - 'a': { $text: stream.assignTo('A') }, - 'b': { $text: stream.assignTo('B') } - } +test('should parse nodes with text', async () => { + const config = { + 'two': { + 'item': { + $: stream.object(), + 'a': { $text: stream.assignTo('A') }, + 'b': { $text: stream.assignTo('B') } } - }; - const result = []; - - await pipeline( - readStream('two.xml'), - stream(config), - memory(result) - ); - result.should.have.length(2); - result[0].should.have.property('A', 'abc'); - result[0].should.have.property('B', '15'); - result[1].should.have.property('A', 'def'); - result[1].should.have.property('B', '16'); - - - }); + } + }; + const result = []; + + await pipeline( + readStream('two.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.deepEqual(result, [ + { A: 'abc', B: '15' }, + { A: 'def', B: '16' } + ]); +}); - it('should parse attributes', async () => { - const config = { - 'THREE': { - 'ITEMS': { - 'ITEM': { - $: stream.object(), - 'A': { $: appendToCollection }, - 'B': { $: addToParent } - } +test('should parse nested nodes', async () => { + const itemParser = { + $: (_, parent) => parent.item = {}, + a: { $text: stream.assignTo('A') } + }; + itemParser.item = itemParser; + const config = { + nested: { + $: stream.object(), + item: itemParser + } + }; + const result = []; + + await pipeline( + readStream('nested.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.deepEqual(result, [{ + item: { + A: 'abc', + item: { + A: 'def', + item: { + A: 'ghi', } } - }; - const result = []; - - function appendToCollection({ attributes }, parent) { - const obj = { - value: attributes.attr.value - }; - parent.children = parent.children || []; - parent.children.push(obj); - } - - function addToParent({ attributes }, parent) { - parent.b = attributes.attr.value; } + }]); +}); - await pipeline( - readStream('three.xml'), - stream(config, { lowercase: true }), - memory(result) - ); - let item; - let a; - - result.should.have.length(1); - - item = result[0]; - item.should.have.property('b', '4'); - - a = item.children; - a.should.have.length(3); - a[0].should.have.property('value', '1'); - - }); - - it('should call $after parser if specified', async () => { - let value = 0; - const config = { - 'doc': { - 'item': { +test('should parse attributes', async () => { + const config = { + 'THREE': { + 'ITEMS': { + 'ITEM': { $: stream.object(), - $after(obj) { obj.value = value++; } + 'A': { $: appendToCollection }, + 'B': { $: addToParent } } } - }; - const result = []; - - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result) - ); - - value.should.be.eql(2); - result.should.have.length(2); - result[0].should.have.property('value', 0); - result[1].should.have.property('value', 1); - - }); + } + }; + const result = []; - it('should ignore namespace if none declared', async () => { - const config = { - 'doc': { - 'item': { $: stream.object() } - } + function appendToCollection({ attributes }, parent) { + const obj = { + value: attributes.attr.value }; - const result = []; - - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result) - ); - - result.should.have.length(2); - }); + parent.children = parent.children || []; + parent.children.push(obj); + } + + function addToParent({ attributes }, parent) { + parent.b = attributes.attr.value; + } + + await pipeline( + readStream('three.xml'), + new TextDecoderStream(), + stream(config, { lowercase: true }), + memory(result) + ); + + assert.deepEqual(result, [{ + b: '4', + children: [ + { value: '1' }, + { value: '2' }, + { value: '3' } + ] + }]); +}); - it('should accept elements if namespace matches $uri attribute', async () => { - const config = { - 'doc': { - $uri: 'http://example.com', - 'item': { $: stream.object() } +test('should call $after parser if specified', async () => { + let value = 0; + const config = { + 'doc': { + 'item': { + $: stream.object(), + $after(obj) { obj.value = value++; } } - }; - const result = []; + } + }; + const result = []; + + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + + assert.equal(value, 2); + assert.deepEqual(result, [ + { value: 0 }, + { value: 1 } + ]); +}); - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result)); - result.should.have.length(2); +test('should ignore namespace if none declared', async () => { + const config = { + 'doc': { + 'item': { $: stream.object() } + } + }; + const result = []; - }); + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); - it('should ignore elements if namespace does not match $uri attribute', async () => { - const config = { - 'doc': { - $uri: 'http://another.com', - 'item': { $: stream.object() } - } - }; - const result = []; + assert.deepEqual(result, [{}, {}]); +}); - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result)); +test('should accept elements if namespace matches $uri attribute', async () => { + const config = { + 'doc': { + $uri: 'http://example.com', + 'item': { $: stream.object() } + } + }; + const result = []; + + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.deepEqual(result, [{}, {}]); +}); - result.should.have.length(0); - }); +test('should ignore elements if namespace does not match $uri attribute', async () => { + const config = { + 'doc': { + $uri: 'http://another.com', + 'item': { $: stream.object() } + } + }; + const result = []; + + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.deepEqual(result, []); +}); - it('should parse CDATA as text', async () => { - const config = { - 'FOUR': { - 'ITEM': { - $: stream.object(), - 'A': { $text: stream.assignTo('a') }, - 'B': { $text: stream.assignTo('b') } - } +test('should parse CDATA as text', async () => { + const config = { + 'FOUR': { + 'ITEM': { + $: stream.object(), + 'A': { $text: stream.assignTo('a') }, + 'B': { $text: stream.assignTo('b') } } - }; - const result = []; - - await pipeline( - readStream('four.xml'), - stream(config), - memory(result)); - - result.should.have.length(2); - result[0].should.be.eql({ a: 'abc', b: '15' }); - result[1].should.be.eql({ a: 'def', b: '16' }); - - }); - - it('should raise errors on invalid XML', async () => { - - const config = { - 'item': { $: stream.object() } - }; + } + }; + const result = []; + + await pipeline( + readStream('four.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + + assert.deepEqual(result, [ + { a: 'abc', b: '15' }, + { a: 'def', b: '16' } + ]); +}); - const from = new Readable({ - read() {} - }); +test('should parse entities in text but not in CDATA', async () => { + const config = { + doc: { + item: { + $: stream.object(), + $text: stream.assignTo('a'), + } + } + }; + const result = []; + + const from = ReadableStream.from([ + '', + '1<2', + '', + '' + ]); + + await pipeline( + from, + stream(config), + memory(result) + ); + + assert.deepEqual(result, [ + { a: '1<2' }, + { a: '1&2' } + ]); +}); - const pipe = pipeline(from, stream(config)); +test('should raise errors on invalid XML', async () => { + const config = { + 'item': { $: stream.object() } + }; - from.push(''); - from.push(''); - from.push(null); + const from = ReadableStream.from([ + '', + '' + ]); - await pipe.should.be.rejectedWith('Unexpected close tag'); - }); + const pipe = pipeline(from, stream(config), memory([])); + await assert.rejects(pipe, /unclosed tag: item/i); });