Skip to content

Commit

Permalink
rewrite using Saxophone parser
Browse files Browse the repository at this point in the history
Please note that binary streams must first be passed through TextDecoderStream.

this version is using web streams instead of node streams

Using TransformStream allows direct usage in a web browser.
Usage in Node is also possible, since Node's pipe accepts web streams.
  • Loading branch information
pirxpilot committed Sep 15, 2023
1 parent 8674624 commit a4cbc8d
Show file tree
Hide file tree
Showing 5 changed files with 330 additions and 324 deletions.
2 changes: 1 addition & 1 deletion Makefile
Expand Up @@ -4,6 +4,6 @@ lint:
./node_modules/.bin/jshint *.js lib test

test:
node --require should --test
node --test

.PHONY: check lint test
228 changes: 98 additions & 130 deletions lib/stream.js
@@ -1,52 +1,45 @@
const { Transform } = require('stream');
const { StringDecoder } = require('string_decoder');
const sax = require('sax');
const Saxophone = require('@pirxpilot/saxophone');
const stack = require('./stack');

const debug = require('debug')('sax-super-stream');

const stack = require('./stack');
/* global TransformStream, EventTarget */


module.exports = initParser;

// used to mark all tags that we want to skip
const IGNORE = Object.create(null);
const IGNORE = Symbol('ignore');

function stripPrefix(name) {
const index = name.indexOf(':');
return index < 0 ? name : name.slice(index + 1);
const prefixed = name.split(':', 2);
return prefixed[1] ? prefixed : [undefined, prefixed[0]];
}

function handlers(parserConfig, fn) {
function makeHandlers(parserConfig, emit) {
const ns = Object.create(null);
let defaultNS;
const waiting = [];
const items = stack();
const parsers = stack(parserConfig);
const context = {};
let cdata;

function onopentag(node) {
const tag = node.local;
const tagParser = verifyNS(parsers.top()[tag]);
let elem;
const target = new EventTarget();
target.addEventListener('tagopen', ontagopen);
target.addEventListener('tagclose', ontagclose);
target.addEventListener('text', ontext);
target.addEventListener('cdata', ontext);
return target;

function parseWith(tp) {
elem = tp.call(items, node, items.top(), context);
if (elem) {
items.push(elem, tag);
}
}
function ontagopen({ detail: { name, attrs, isSelfClosing } }) {
const [prefix, tag] = stripPrefix(name);
const attrsObj = Saxophone.parseAttrs(attrs);
updateNamespaces(attrsObj);
const uri = (prefix && ns[prefix]) ?? defaultNS;

// if parser specifies namespace, it has to match node namespace
function verifyNS(tp) {
if (!tp) {
return tp;
}
if (!tp.$uri) {
return tp;
}
if (tp.$uri === node.uri) {
return tp;
}
}
const tagParser = getParser(tag, uri);

debug('onopentag', tag);
debug('tagopen', tag, attrs);
if (!tagParser) {
parsers.push(IGNORE, tag);
return;
Expand All @@ -59,136 +52,111 @@ function handlers(parserConfig, fn) {
}
parsers.push(tagParser, tag);
}
}
if (isSelfClosing) {
ontagclose({ detail: { name } });
}

function onclosetag(tag) {
let parser;
let top;
/**
 * Invokes a tag parser for the tag that is currently being opened.
 *
 * Builds the node handed to the parser from the enclosing `prefix`, `tag`
 * and `attrsObj`, with every attribute value run through
 * `Saxophone.parseEntities`. A truthy result is pushed on the `items` stack
 * under the tag name.
 *
 * @param {Function} tp - tag parser, called with `items` as `this` and
 *   `(node, items.top(), context)` as arguments
 */
function parseWith(tp) {
  // each attribute becomes a `{ name, value }` record keyed by its name
  const toAttribute = ([attrName, rawValue]) => {
    const value = Saxophone.parseEntities(rawValue);
    return [attrName, { name: attrName, value }];
  };
  const attributes = Object.fromEntries(Object.entries(attrsObj).map(toAttribute));

  const parsed = tp.call(items, { prefix, tag, attributes }, items.top(), context);
  if (!parsed) {
    // parser produced nothing - leave the items stack untouched
    return;
  }
  items.push(parsed, tag);
}
}

tag = stripPrefix(tag);
function ontagclose({ detail: { name } }) {
const [, tag] = stripPrefix(name);

debug('closetag', tag);
debug('tagclose', tag);

parser = parsers.pop(tag);
if (parser && typeof parser.$after === 'function') {
const parser = parsers.pop(tag);
if (typeof parser?.$after === 'function') {
debug('$after', tag);
parser.$after.call(items, items.top(), context);
}
if (parser !== IGNORE) {
top = items.pop(tag);
const top = items.pop(tag);
// if nothing on the stack emit result
if (top !== undefined && items.empty()) {
fn(null, top);
emit(top);
}
}
}

function ontext(value) {
const textParser = parsers.top().$text;
if (textParser) {
textParser.call(items, value, items.top(), context);
}
}
function ontext({ detail: { contents } }) {
debug('text', contents);

function onopencdata() {
const textParser = parsers.top().$text;
if (textParser) {
cdata = [];
}
textParser?.call(items, Saxophone.parseEntities(contents), items.top(), context);
}

function oncdata(value) {
if (cdata) {
cdata.push(value);
/**
 * Records the namespace declarations found among a tag's attributes.
 *
 * A bare `xmlns` attribute replaces the current default namespace
 * (`defaultNS`); an `xmlns:<prefix>` attribute stores its value under
 * `<prefix>` in the shared `ns` lookup. All other attributes are ignored.
 *
 * @param {Object} attrsObj - map of attribute name to attribute value
 */
function updateNamespaces(attrsObj) {
  const PREFIX_MARKER = 'xmlns:';

  for (const [attrName, attrValue] of Object.entries(attrsObj)) {
    if (attrName === 'xmlns') {
      defaultNS = attrValue;
      continue;
    }
    if (attrName.startsWith(PREFIX_MARKER)) {
      // the part after 'xmlns:' is the prefix being bound
      ns[attrName.slice(PREFIX_MARKER.length)] = attrValue;
    }
  }
}

function onclosecdata() {
if (!cdata) {
function getParser(tag, uri) {
const top = parsers.top();
const tp = top[tag];
if (!tp) {
return;
}
const textParser = parsers.top().$text;
textParser.call(items, cdata.join(''), items.top(), context);
cdata = undefined;
}

function onerror(err) {
debug('Detected error', err);
// mark error as handled
this.error = null;
fn(err);
if (!tp.$uri) {
return tp;
}
// if parser specifies namespace, it has to match node namespace
if (tp.$uri === uri) {
return tp;
}
}

return {
onopentag,
onclosetag,
ontext,
onopencdata,
oncdata,
onclosecdata,
onerror
};
}


function initParser(parserConfig, saxOptions) {
function initParser(config) {

saxOptions = Object.assign({
trim: true,
normalize: true,
lowercase: false,
xmlns: true,
position: false,
strictEntities: true,
noscript: true
}, saxOptions);
const results = [];

const parser = sax.parser(true, saxOptions);
const decoder = new StringDecoder('utf8');
let results = [];
let parserError;
const target = makeHandlers(config, (...objs) => results.push(...objs));
const sax = new Saxophone(target);
const writer = sax.getWriter();

Object.assign(parser, handlers(parserConfig, (err, obj) => {
if (!err) {
results.push(obj);
} else {
// only report the first error
parserError = parserError || err;
}
}));

const ts = new Transform({
readableObjectMode: true,
flush(next) {
parser.close();
if (parserError) {
return next(parserError);
}
flush(this);
next();
return new TransformStream({
async flush(controller) {
await writer.ready;
await writer.close();
flush(controller);
controller.terminate();
},
transform(chunk, encoding, next) {
if (parserError) {
return next(parserError);
}
write(chunk);
flush(this);
next();
async transform(chunk, controller) {
debug('writing', chunk);
await writer.ready;
await writer.write(chunk);
flush(controller);
}
});
return ts;

function write(chunk) {
const str = decoder.write(chunk);
parser.write(str);
}

function flush(stream) {
if (!results.length) {
return;
}
results.forEach(r => {
stream.push(r);
});
results = [];
function flush(controller) {
results.forEach(r => controller.enqueue(r));
results.length = 0;
}
}
9 changes: 4 additions & 5 deletions package.json
Expand Up @@ -16,12 +16,11 @@
"object"
],
"dependencies": {
"debug": "~2 || ~3 || ~4",
"sax": "^1.2.1"
"@pirxpilot/saxophone": "^1.0.0",
"debug": "~2 || ~3 || ~4"
},
"devDependencies": {
"jshint": "~2",
"should": "~13"
"jshint": "~2"
},
"scripts": {
"test": "make check"
Expand All @@ -30,4 +29,4 @@
"index.js",
"lib"
]
}
}
12 changes: 12 additions & 0 deletions test/fixtures/nested.xml
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<nested>
<item>
<a>abc</a>
<item>
<a>def</a>
<item>
<a>ghi</a>
</item>
</item>
</item>
</nested>

0 comments on commit a4cbc8d

Please sign in to comment.