Skip to content

Commit

Permalink
basic make_canonicaliser with test
Browse files Browse the repository at this point in the history
  • Loading branch information
mohawk2 committed Jan 8, 2020
1 parent 879622a commit b2fcd33
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 1 deletion.
130 changes: 129 additions & 1 deletion lib/XML/Invisible.pm
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use Pegex::Parser;
use XML::Invisible::Receiver;

our $VERSION = '0.06';
our @EXPORT_OK = qw(make_parser ast2xml);
our @EXPORT_OK = qw(make_parser ast2xml make_canonicaliser);

use constant DEBUG => $ENV{XML_INVISIBLE_DEBUG};

Expand All @@ -26,6 +26,100 @@ sub make_parser {
};
}

my $grammar_parser;
sub make_canonicaliser {
my ($grammar_text) = @_;
require Pegex::Compiler;
require Pegex::Grammar::Atoms;
my $grammar_tree = Pegex::Compiler->new->parse($grammar_text)->tree;
my $toprule = $grammar_tree->{'+toprule'};
my $atoms = _atoms2canonical(Pegex::Grammar::Atoms->atoms);
sub {
my ($ast) = @_;
my @results = _extract_canonical($atoms, $ast, $grammar_tree, $toprule);
return undef if grep !defined, @results;
join '', @results;
};
}

my %ATOM2SPECIAL = (
ALL => "",
BLANK => " ",
BREAK => "\n",
BS => "\x08",
CONTROL => "\x00",
CR => "\r",
DOS => "\r\n",
EOL => "\n",
EOS => "",
FF => "\x0C",
HICHAR => "\x7f",
NL => "\n",
TAB => "\t",
WORD => "a",
WS => " ",
_ => "",
__ => " ",
ws => "",
ws1 => "",
ws2 => " ",
);
sub _atoms2canonical {
my ($atoms) = @_;
my %lookup;
for my $atom (keys %$atoms) {
my $c = $atoms->{$atom};
if (exists $ATOM2SPECIAL{$atom}) {
$c = $ATOM2SPECIAL{$atom};
} elsif ($c =~ s/^\\//) {
# all good
} elsif ($c =~ s/^\[//) {
$c = substr $c, 0, 1;
}
$lookup{$atom} = $c;
}
\%lookup;
}

sub _extract_canonical {
my ($atoms, $elt, $grammar_tree, $elt_sought, $grammar_frag) = @_;
$grammar_frag ||= $grammar_tree->{$elt_sought};
return undef if !ref $elt; # just text node - not valid
return undef if defined($elt_sought) and $elt_sought ne $elt->{nodename}; # non-match
if ($grammar_frag->{'.rgx'}) {
# RE, so parent of text nodes
return $elt->{children} ? join('', @{$elt->{children}}) : $elt->{nodename};
}
if (my $all = $grammar_frag->{'.all'}) {
# sequence of productions
return undef if @$all != @{$elt->{children}};
my @results;
for my $i (0..$#$all) {
my @partial = _extract_canonical(
$atoms, $elt->{children}[$i], $grammar_tree, undef, $all->[$i],
);
return undef if grep !defined, @partial; # any non-match
push @results, @partial;
}
return @results;
} elsif (my $ref = $grammar_frag->{'.ref'}) {
return $atoms->{$ref} if exists $atoms->{$ref};
return undef if $elt->{nodename} ne $ref;
my $new_frag = $grammar_tree->{$ref};
if ($new_frag->{'.ref'}) {
# this one is just a single-child empty if it's a match
return undef if @{$elt->{children}} != 1;
return _extract_canonical(
$atoms, $elt->{children}[0], $grammar_tree, $new_frag->{'.ref'}, $new_frag,
);
}
# treat ourselves as if we're the ref-ed to thing
return _extract_canonical(
$atoms, $elt, $grammar_tree, $ref, $new_frag,
);
}
}

my $xml_loaded = 0;
sub ast2xml {
do { require XML::LibXML; $xml_loaded = 1 } unless $xml_loaded;
Expand Down Expand Up @@ -94,6 +188,14 @@ XML::Invisible - transform "invisible XML" documents into XML using a grammar
'print ast2xml(make_parser(join "", <>)->("(a+b)"))->toStringC14N(1)' \
examples/arith-grammar.ixml | xml_pp
# canonicalise a document
use XML::Invisible qw(make_parser make_canonicaliser);
my $ixml_grammar = from_file('examples/arith-grammar.ixml');
my $transformer = make_parser($ixml_grammar);
my $ast = $transformer->(from_file($ixml_input));
my $canonicaliser = make_canonicaliser($ixml_grammar);
my $canonical = $canonicaliser->($ast);
=head1 DESCRIPTION
An implementation of Steven Pemberton's Invisible XML concept, using
Expand Down Expand Up @@ -179,6 +281,32 @@ Arguments:
=back
=head2 make_canonicaliser
Exportable. Returns a function that when called with an AST as produced
from a document by a L</make_parser>, returns a canonical version of
the original document, or C<undef> if it failed.
Arguments:
=over
=item an XML::Invisible grammar
=back
It uses a few heuristics:
=over
=item literals that are 0-1 (C<?>) or any number (C<*>) will be omitted
=item literals that are at least one (C<+>) will be inserted once
=item literal whitespace will be treated specially
=back
=head1 DEBUGGING
To debug, set environment variable C<XML_INVISIBLE_DEBUG> to a true value.
Expand Down
28 changes: 28 additions & 0 deletions t/reverse.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use lib 't/lib';
use JTTest;
use XML::Invisible qw(make_parser make_canonicaliser);

sub run_test {
my ($grammar_text, $doc, $expected_canonical, $label) = @_;
my $parser = make_parser($grammar_text);
my $got_forward = $parser->($doc);
my $got_reversed = make_canonicaliser($grammar_text)->($got_forward);
is $got_reversed, $expected_canonical, "$label reversed";
my $reparsed = $parser->($got_reversed);
is_deeply $reparsed, $got_forward, "$label reparsed is same as first";
}

run_test(
<<'EOF',
expr: target assign source
target: name
assign: (- EQUAL -)
source: name
name: /( ALPHA (: ALPHA | DIGIT )* )/
EOF
'a = b',
'a=b',
'basic',
);

done_testing;

0 comments on commit b2fcd33

Please sign in to comment.