From 3aee5e1fc11bcab28852bb1e16360d8a7d020fb9 Mon Sep 17 00:00:00 2001 From: Damian Krzeminski Date: Mon, 4 Sep 2023 13:03:46 +0200 Subject: [PATCH] rewrite using Saxophone parser please note that TextDecoderStream needs to be used on binary streams this version is using web streams instead of node streams using TransformStream allows direct usage in web browser using in node is also possible since node's pipe accepts web streams --- Makefile | 2 +- lib/stream.js | 223 +++++++++++++----------------- package.json | 9 +- test/stream.js | 367 ++++++++++++++++++++++++------------------------- 4 files changed, 281 insertions(+), 320 deletions(-) diff --git a/Makefile b/Makefile index 8e9b769..9c4a0ea 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,6 @@ lint: ./node_modules/.bin/jshint *.js lib test test: - node --require should --test + node --test .PHONY: check lint test diff --git a/lib/stream.js b/lib/stream.js index a9616cc..6243008 100644 --- a/lib/stream.js +++ b/lib/stream.js @@ -1,39 +1,80 @@ -const { Transform } = require('stream'); -const { StringDecoder } = require('string_decoder'); -const sax = require('sax'); +const Saxophone = require('@pirxpilot/saxophone'); +const stack = require('./stack'); + const debug = require('debug')('sax-super-stream'); -const stack = require('./stack'); +/* global TransformStream, EventTarget */ + module.exports = initParser; // used to mark all tags that we want to skip -const IGNORE = Object.create(null); +const IGNORE = Symbol('ignore'); function stripPrefix(name) { - const index = name.indexOf(':'); - return index < 0 ? name : name.slice(index + 1); + const prefixed = name.split(':', 2); + return prefixed[1] ? prefixed : [undefined, prefixed[0]]; } -function handlers(parserConfig, fn) { +function makeHandlers(parserConfig, fn) { + const ns = Object.create(null); + let defaultNS; const items = stack(); const parsers = stack(parserConfig); const context = {}; - let cdata; - function onopentag(node) { - const tag = node.local; + const target = new EventTarget(); + target.addEventListener('tagopen', ontagopen); + target.addEventListener('tagclose', ontagclose); + target.addEventListener('text', ontext); + target.addEventListener('cdata', ontext); + return target; + + function ontagopen({ detail: { name, attrs, isSelfClosing } }) { + const [prefix, tag] = stripPrefix(name); + const attrsObj = Saxophone.parseAttrs(attrs); + updateNamespaces(attrsObj); + const uri = (prefix && ns[prefix]) ?? defaultNS; + const tagParser = verifyNS(parsers.top()[tag]); - let elem; + + debug('tagopen', tag, attrs); + if (!tagParser) { + parsers.push(IGNORE, tag); + return; + } + if (typeof tagParser === 'function') { + parseWith(tagParser); + } else { + if (typeof tagParser.$ === 'function') { + parseWith(tagParser.$); + } + parsers.push(tagParser, tag); + } + if (isSelfClosing) { + ontagclose({ detail: { name } }); + } function parseWith(tp) { - elem = tp.call(items, node, items.top(), context); + const attributes = Object.fromEntries( + Object.entries(attrsObj).map( + ([name, value]) => [ + name, + { + name, + value: Saxophone.parseEntities(value) + } + ] + ) + ); + const node = { prefix, tag, attributes }; + const elem = tp.call(items, node, items.top(), context); if (elem) { items.push(elem, tag); } } - // if parser specifies namespace, it has to much naode namespace + // if parser specifies namespace, it has to match node namespace function verifyNS(tp) { if (!tp) { return tp; @@ -41,40 +82,24 @@ function handlers(parserConfig, fn) { if (!tp.$uri) { return tp; } - if (tp.$uri === node.uri) { + if (tp.$uri === uri) { return tp; } } - - debug('onopentag', tag); - if (!tagParser) { - parsers.push(IGNORE, tag); - return; - } - if (typeof tagParser === 'function') { - parseWith(tagParser); - } else { - if (typeof tagParser.$ === 'function') { - parseWith(tagParser.$); - } - parsers.push(tagParser, tag); - } } - function onclosetag(tag) { - let parser; - let top; - - tag = stripPrefix(tag); + function ontagclose({ detail: { name } }) { + const [, tag] = stripPrefix(name); - debug('closetag', tag); + debug('tagclose', tag); - parser = parsers.pop(tag); - if (parser && typeof parser.$after === 'function') { + const parser = parsers.pop(tag); + if (typeof parser?.$after === 'function') { + debug('$after', tag); parser.$after.call(items, items.top(), context); } if (parser !== IGNORE) { - top = items.pop(tag); + const top = items.pop(tag); // if nothing on the stack emit result if (top !== undefined && items.empty()) { fn(null, top); @@ -82,113 +107,53 @@ function handlers(parserConfig, fn) { } } - function ontext(value) { - const textParser = parsers.top().$text; - if (textParser) { - textParser.call(items, value, items.top(), context); - } - } + function ontext({ detail: { contents } }) { + debug('text', contents); - function onopencdata() { const textParser = parsers.top().$text; - if (textParser) { - cdata = []; - } + textParser?.call(items, Saxophone.parseEntities(contents), items.top(), context); } - function oncdata(value) { - if (cdata) { - cdata.push(value); - } - } + function updateNamespaces(attrsObj) { + Object.entries(attrsObj).forEach(xmlns); - function onclosecdata() { - if (!cdata) { - return; + function xmlns([name, value]) { + if (name === 'xmlns') { + defaultNS = value; + } else if (name.startsWith('xmlns:')) { + const prefix = name.slice(6); // 'xmlns:'.length === 6 + ns[prefix] = value; + } } - const textParser = parsers.top().$text; - textParser.call(items, cdata.join(''), items.top(), context); - cdata = undefined; - } - - function onerror(err) { - debug('Detected error', err); - // mark error as handled - this.error = null; - fn(err); } - - return { - onopentag, - onclosetag, - ontext, - onopencdata, - oncdata, - onclosecdata, - onerror - }; } -function initParser(parserConfig, saxOptions) { +function initParser(config) { - saxOptions = Object.assign({ - trim: true, - normalize: true, - lowercase: false, - xmlns: true, - position: false, - strictEntities: true, - noscript: true - }, saxOptions); + const results = []; - const parser = sax.parser(true, saxOptions); - const decoder = new StringDecoder('utf8'); - let results = []; - let parserError; + const target = makeHandlers(config, (err, obj) => results.push(obj)); + const sax = new Saxophone(target); + const writer = sax.getWriter(); - Object.assign(parser, handlers(parserConfig, (err, obj) => { - if (!err) { - results.push(obj); - } else { - // only report the first error - parserError = parserError || err; - } - })); - - const ts = new Transform({ - readableObjectMode: true, - flush(next) { - parser.close(); - if (parserError) { - return next(parserError); - } - flush(this); - next(); + return new TransformStream({ + async flush(controller) { + await writer.ready; + await writer.close(); + flush(controller); + controller.terminate(); }, - transform(chunk, encoding, next) { - if (parserError) { - return next(parserError); - } - write(chunk); - flush(this); - next(); + async transform(chunk, controller) { + debug('writing', chunk); + await writer.ready; + await writer.write(chunk); + flush(controller); } }); - return ts; - - function write(chunk) { - const str = decoder.write(chunk); - parser.write(str); - } - function flush(stream) { - if (!results.length) { - return; - } - results.forEach(r => { - stream.push(r); - }); - results = []; + function flush(controller) { + results.forEach(r => controller.enqueue(r)); + results.length = 0; } } diff --git a/package.json b/package.json index 217d8e0..152ab48 100644 --- a/package.json +++ b/package.json @@ -16,12 +16,11 @@ "object" ], "dependencies": { - "debug": "~2 || ~3 || ~4", - "sax": "^1.2.1" + "@pirxpilot/saxophone": "^1.0.0", + "debug": "~2 || ~3 || ~4" }, "devDependencies": { - "jshint": "~2", - "should": "~13" + "jshint": "~2" }, "scripts": { "test": "make check" @@ -30,4 +29,4 @@ "index.js", "lib" ] -} \ No newline at end of file +} diff --git a/test/stream.js b/test/stream.js index 3c6736f..3e5afc0 100644 --- a/test/stream.js +++ b/test/stream.js @@ -1,228 +1,225 @@ -const { describe, it } = require('node:test'); +const test = require('node:test'); +const assert = require('node:assert/strict'); const fs = require('node:fs'); +const path = require('node:path'); const { pipeline } = require('node:stream/promises'); -const { Writable, Readable } = require('stream'); const stream = require('..'); function readStream(name) { - return fs.createReadStream([__dirname, 'fixtures', name].join('/')); + return fs.createReadStream(path.join(__dirname, 'fixtures', name)); } +/* global WritableStream, ReadableStream, TextDecoderStream */ + function memory(array) { - return new Writable({ - objectMode: true, - write(item, encoding, next) { - array.push(item); - next(); - } + return new WritableStream({ + write: item => array.push(item) }); } -describe('sax super stream', () => { - it('should parse a single empty node', async () => { - const config = { - 'item': stream.object() - }; - const result = []; - - await pipeline( - readStream('one.xml'), - stream(config), - memory(result) - ); - result.should.have.length(1); - result[0].should.eql({}); - - }); - - it('should parse nodes with text', async () => { - const config = { - 'two': { - 'item': { - $: stream.object(), - 'a': { $text: stream.assignTo('A') }, - 'b': { $text: stream.assignTo('B') } - } - } - }; - const result = []; - - await pipeline( - readStream('two.xml'), - stream(config), - memory(result) - ); - result.should.have.length(2); - result[0].should.have.property('A', 'abc'); - result[0].should.have.property('B', '15'); - result[1].should.have.property('A', 'def'); - result[1].should.have.property('B', '16'); - - - }); +test('should parse a single empty node', async () => { + const config = { + 'item': stream.object() + }; + const result = []; + + await pipeline( + readStream('one.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + + assert.equal(result.length, 1); + assert.deepEqual(result[0], {}); +}); - it('should parse attributes', async () => { - const config = { - 'THREE': { - 'ITEMS': { - 'ITEM': { - $: stream.object(), - 'A': { $: appendToCollection }, - 'B': { $: addToParent } - } - } +test('should parse nodes with text', async () => { + const config = { + 'two': { + 'item': { + $: stream.object(), + 'a': { $text: stream.assignTo('A') }, + 'b': { $text: stream.assignTo('B') } } - }; - const result = []; - - function appendToCollection({ attributes }, parent) { - const obj = { - value: attributes.attr.value - }; - parent.children = parent.children || []; - parent.children.push(obj); - } - - function addToParent({ attributes }, parent) { - parent.b = attributes.attr.value; } + }; + const result = []; + + await pipeline( + readStream('two.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.equal(result.length, 2); + assert.equal(result[0].A, 'abc'); + assert.equal(result[0].B, '15'); + assert.equal(result[1].A, 'def'); + assert.equal(result[1].B, '16'); +}); - await pipeline( - readStream('three.xml'), - stream(config, { lowercase: true }), - memory(result) - ); - let item; - let a; - - result.should.have.length(1); - - item = result[0]; - item.should.have.property('b', '4'); - - a = item.children; - a.should.have.length(3); - a[0].should.have.property('value', '1'); - - }); - - it('should call $after parser if specified', async () => { - let value = 0; - const config = { - 'doc': { - 'item': { +test('should parse attributes', async () => { + const config = { + 'THREE': { + 'ITEMS': { + 'ITEM': { $: stream.object(), - $after(obj) { obj.value = value++; } + 'A': { $: appendToCollection }, + 'B': { $: addToParent } } } + } + }; + const result = []; + + function appendToCollection({ attributes }, parent) { + const obj = { + value: attributes.attr.value }; - const result = []; + parent.children = parent.children || []; + parent.children.push(obj); + } - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result) - ); + function addToParent({ attributes }, parent) { + parent.b = attributes.attr.value; + } - value.should.be.eql(2); - result.should.have.length(2); - result[0].should.have.property('value', 0); - result[1].should.have.property('value', 1); + await pipeline( + readStream('three.xml'), + new TextDecoderStream(), + stream(config, { lowercase: true }), + memory(result) + ); - }); + assert.equal(result.length, 1); - it('should ignore namespace if none declared', async () => { - const config = { - 'doc': { - 'item': { $: stream.object() } - } - }; - const result = []; + const item = result[0]; + assert.equal(item.b, '4'); - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result) - ); + const a = item.children; + assert.equal(a.length, 3); + assert.equal(a[0].value, '1'); - result.should.have.length(2); - }); +}); - it('should accept elements if namespace matches $uri attribute', async () => { - const config = { - 'doc': { - $uri: 'http://example.com', - 'item': { $: stream.object() } +test('should call $after parser if specified', async () => { + let value = 0; + const config = { + 'doc': { + 'item': { + $: stream.object(), + $after(obj) { obj.value = value++; } } - }; - const result = []; - - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result)); - result.should.have.length(2); - - }); + } + }; + const result = []; + + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + + assert.equal(value, 2); + assert.equal(result.length, 2); + assert.equal(result[0].value, 0); + assert.equal(result[1].value, 1); +}); - it('should ignore elements if namespace does not match $uri attribute', async () => { - const config = { - 'doc': { - $uri: 'http://another.com', - 'item': { $: stream.object() } - } - }; - const result = []; +test('should ignore namespace if none declared', async () => { + const config = { + 'doc': { + 'item': { $: stream.object() } + } + }; + const result = []; - await pipeline( - readStream('ns.xml'), - stream(config), - memory(result)); + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); - result.should.have.length(0); - }); + assert.equal(result.length, 2); +}); +test('should accept elements if namespace matches $uri attribute', async () => { + const config = { + 'doc': { + $uri: 'http://example.com', + 'item': { $: stream.object() } + } + }; + const result = []; - it('should parse CDATA as text', async () => { - const config = { - 'FOUR': { - 'ITEM': { - $: stream.object(), - 'A': { $text: stream.assignTo('a') }, - 'B': { $text: stream.assignTo('b') } - } - } - }; - const result = []; + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + assert.equal(result.length, 2); - await pipeline( - readStream('four.xml'), - stream(config), - memory(result)); +}); - result.should.have.length(2); - result[0].should.be.eql({ a: 'abc', b: '15' }); - result[1].should.be.eql({ a: 'def', b: '16' }); +test('should ignore elements if namespace does not match $uri attribute', async () => { + const config = { + 'doc': { + $uri: 'http://another.com', + 'item': { $: stream.object() } + } + }; + const result = []; - }); + await pipeline( + readStream('ns.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); - it('should raise errors on invalid XML', async () => { + assert.equal(result.length, 0); +}); - const config = { - 'item': { $: stream.object() } - }; - const from = new Readable({ - read() {} - }); +test('should parse CDATA as text', async () => { + const config = { + 'FOUR': { + 'ITEM': { + $: stream.object(), + 'A': { $text: stream.assignTo('a') }, + 'B': { $text: stream.assignTo('b') } + } + } + }; + const result = []; + + await pipeline( + readStream('four.xml'), + new TextDecoderStream(), + stream(config), + memory(result) + ); + + assert.equal(result.length, 2); + assert.deepEqual(result[0], { a: 'abc', b: '15' }); + assert.deepEqual(result[1], { a: 'def', b: '16' }); +}); - const pipe = pipeline(from, stream(config)); +test('should raise errors on invalid XML', async () => { + const config = { + 'item': { $: stream.object() } + }; - from.push(''); - from.push(''); - from.push(null); + const from = ReadableStream.from([ + '', + '' + ]); - await pipe.should.be.rejectedWith('Unexpected close tag'); - }); + const pipe = pipeline(from, stream(config), memory([])); + await assert.rejects(pipe, /unclosed tag: item/i); });