Skip to content

Commit

Permalink
Fix some control character escaping and encoding issues
Browse files Browse the repository at this point in the history
See
* #17 dump_string must not care about Perl's internal representation of a variable
* #18 Dump() can create invalid YAML 1.1
  • Loading branch information
perlpunk committed Jan 28, 2020
1 parent def51cf commit 764b85f
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 31 deletions.
2 changes: 0 additions & 2 deletions examples/schema-perl.pm
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ EOM
regexp => [
<<'EOM',
my $string = 'unblessed';
utf8::upgrade($string);
qr{$string}
EOM
<<"EOM",
Expand All @@ -88,7 +87,6 @@ EOM
regexp_blessed => [
<<'EOM',
my $string = 'blessed';
utf8::upgrade($string);
bless qr{$string}, "Foo"
EOM
<<"EOM",
Expand Down
17 changes: 11 additions & 6 deletions lib/YAML/PP.pm
Original file line number Diff line number Diff line change
Expand Up @@ -579,34 +579,39 @@ The layout is like libyaml output:
my $doc = $ypp->load_string("foo: bar");
my @docs = $ypp->load_string("foo: bar\n---\n- a");
Input should be Unicode characters (decoded).
Input should be Unicode characters.
So if you read from a file, you should decode it, for example with
C<Encode::decode_utf8($bytes)>.
=item load_file
my $doc = $ypp->load_file("file.yaml");
my @docs = $ypp->load_file("file.yaml");
Strings will be loaded as unicode characters (decoded).
Strings will be loaded as unicode characters.
=item dump_string
my $yaml = $ypp->dump_string($doc);
my $yaml = $ypp->dump_string($doc1, $doc2);
my $yaml = $ypp->dump_string(@docs);
Input strings should be Unicode characters. If not, they will be upgraded with
Input strings should be Unicode characters.
C<utf8::upgrade>.
Output will return Unicode characters (decoded).
Output will return Unicode characters.
So if you want to write that to a file (or pass to YAML::XS, for example),
you typically encode it via C<Encode::encode_utf8($yaml)>.
=item dump_file
$ypp->dump_file("file.yaml", $doc);
$ypp->dump_file("file.yaml", $doc1, $doc2);
$ypp->dump_file("file.yaml", @docs);
Input data should be UTF-8 decoded. If not, it will be upgraded with
C<utf8::upgrade>.
Input data should be Unicode characters.
=item dump
Expand Down
39 changes: 34 additions & 5 deletions lib/YAML/PP/Emitter.pm
Original file line number Diff line number Diff line change
Expand Up @@ -335,13 +335,45 @@ my %control = (
"\x1d" => '\x1d',
"\x1e" => '\x1e',
"\x1f" => '\x1f',
"\x7f" => '\x7f',
"\x80" => '\x80',
"\x81" => '\x81',
"\x82" => '\x82',
"\x83" => '\x83',
"\x84" => '\x84',
"\x86" => '\x86',
"\x87" => '\x87',
"\x88" => '\x88',
"\x89" => '\x89',
"\x8a" => '\x8a',
"\x8b" => '\x8b',
"\x8c" => '\x8c',
"\x8d" => '\x8d',
"\x8e" => '\x8e',
"\x8f" => '\x8f',
"\x90" => '\x90',
"\x91" => '\x91',
"\x92" => '\x92',
"\x93" => '\x93',
"\x94" => '\x94',
"\x95" => '\x95',
"\x96" => '\x96',
"\x97" => '\x97',
"\x98" => '\x98',
"\x99" => '\x99',
"\x9a" => '\x9a',
"\x9b" => '\x9b',
"\x9c" => '\x9c',
"\x9d" => '\x9d',
"\x9e" => '\x9e',
"\x9f" => '\x9f',
"\x{2029}" => '\P',
"\x{2028}" => '\L',
"\x85" => '\N',
"\xa0" => '\_',
);

my $control_re = '\x00-\x08\x0b\x0c\x0e-\x1f\x{2029}\x{2028}\x85\xa0';
my $control_re = '\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f\x{d800}-\x{dfff}\x{fffe}\x{ffff}\x{2028}\x{2029}\x85\xa0';
my %to_escape = (
"\n" => '\n',
"\t" => '\t',
Expand All @@ -361,9 +393,6 @@ sub scalar_event {
my $last = $stack->[-1];
my $indent = $last->{indent};
my $value = $info->{value};
unless (utf8::is_utf8($value)) {
utf8::upgrade($value);
}

my $props = '';
my $anchor = $info->{anchor};
Expand Down Expand Up @@ -506,7 +535,7 @@ sub scalar_event {
}
}
else {
$value =~ s/([$escape_re"\\])/$to_escape{ $1 }/g;
$value =~ s/([$escape_re"\\])/$to_escape{ $1 } || sprintf '\\u%04x', ord($1)/eg;
$value = '"' . $value . '"';
}

Expand Down
13 changes: 10 additions & 3 deletions lib/YAML/PP/Schema/Binary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,29 @@ YAML::PP::Schema::Binary - Schema for loading and binary data
my $yp = YAML::PP->new( schema => [qw/ JSON Binary /] );
# or
my $binary = $yp->load_string(<<'EOM');
my ($binary, $same_binary) = $yp->load_string(<<'EOM');
# The binary value a tiny arrow encoded as a gif image.
--- !!binary "\
R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5\
OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+\
+f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC\
AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs="
--- !!binary |
R0lGODlhDAAMAIQAAP//9/X17unp5WZmZgAAAOfn515eXvPz7Y6OjuDg4J+fn5
OTk6enp56enmlpaWNjY6Ojo4SEhP/++f/++f/++f/++f/++f/++f/++f/++f/+
+f/++f/++f/++f/++f/++SH+Dk1hZGUgd2l0aCBHSU1QACwAAAAADAAMAAAFLC
AgjoEwnuNAFOhpEMTRiggcz4BNJHrv/zCFcLiwMWYNG84BwwEeECcgggoBADs=
EOM
=head1 DESCRIPTION
By prepending a base64 encoded binary string with the C<!!binary> tag, it can
be automatically decoded when loading.
If you are using this schema, any string containing C<[\x{7F}-\x{10FFFF}]>
will be dumped as binary. That also includes encoded utf8.
Note that the logic for dumping is probably broken, see
L<https://github.com/perlpunk/YAML-PP-p5/issues/28>.
Suggestions welcome.
=head1 METHODS
Expand Down
2 changes: 0 additions & 2 deletions lib/YAML/PP/Schema/Perl.pm
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,6 @@ YAML:
# Code
my $string = 'unblessed';
utf8::upgrade($string);
qr{$string}
Expand All @@ -727,7 +726,6 @@ YAML:
# Code
my $string = 'blessed';
utf8::upgrade($string);
bless qr{$string}, "Foo"
Expand Down
35 changes: 22 additions & 13 deletions t/45.binary.t
Original file line number Diff line number Diff line change
Expand Up @@ -59,43 +59,52 @@ my @tests = (
[binary => "\xE0\x83\xBF"],
[binary => "\xF0\x80\x83\xBF"],
[binary => "\xF0\x80\xA3\x80"],
[binary => $gif,],
[binary => [$gif, decode_utf8("ä")],],
[binary => [$gif, 'foo'],],
);

subtest roundtrip => sub {
for my $item (@tests) {
select undef, undef, undef, 0.1;
my ($type, $string) = @$item;
my ($type, $data) = @$item;
local $Data::Dumper::Useqq = 1;
my $label = Data::Dumper->Dump([$string], ['string']);
my $label = Data::Dumper->Dump([$data], ['data']);
chomp $label;
note("\n\n\n=============== $type: $label");
my $dump = $yp->dump_string($string);
my $dump = $yp->dump_string($data);
#note("========= YAML:\n$dump");
my $reload = $yp->load_string($dump);
if ($type eq 'binary') {
if (utf8::is_utf8($reload)) {
utf8::downgrade($reload);
}
if (ref $reload eq 'ARRAY') {
cmp_ok($reload->[0], 'eq', $data->[0], "Reload binary->[0] ok ($label)");
cmp_ok($reload->[1], 'eq', $data->[1], "Reload binary->[1] ok ($label)");
}
else {
cmp_ok($reload, 'eq', $data, "Reload binary ok ($label)");
}
cmp_ok($reload, 'eq', $string, "Reload binary ok ($label)");
}
};

subtest roundtrip_binary => sub {
for my $item (@tests) {
my ($type, $string) = @$item;
my ($type, $data) = @$item;
local $Data::Dumper::Useqq = 1;
my $label = Data::Dumper->Dump([$string], ['string']);
my $label = Data::Dumper->Dump([$data], ['data']);
note("=============== $type: $label");
my $dump = $yp_binary->dump_string($string);
my $dump = $yp_binary->dump_string($data);
if ($type eq 'binary') {
like($dump, qr{!!binary}, "Output YAML contains !!binary");
}
else {
unlike($dump, qr{!!binary}, "Output YAML does not contain !!binary");
}
my $reload = $yp_binary->load_string($dump);
cmp_ok($reload, 'eq', $string, "Reload binary ok ($label)");
if (ref $reload eq 'ARRAY') {
cmp_ok($reload->[0], 'eq', $data->[0], "Reload binary->[0] ok ($label)");
cmp_ok($reload->[1], 'eq', $data->[1], "Reload binary->[1] ok ($label)");
}
else {
cmp_ok($reload, 'eq', $data, "Reload binary ok ($label)");
}
}
};

Expand Down

0 comments on commit 764b85f

Please sign in to comment.