Skip to content

Commit

Permalink
Extract host parser
Browse files Browse the repository at this point in the history
  • Loading branch information
wakaba committed Sep 2, 2016
1 parent 2c0c015 commit 0f7ff3b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 24 deletions.
8 changes: 0 additions & 8 deletions README
Expand Up @@ -30,14 +30,6 @@ The |Web::Encoding| module and related modules from the
perl-web-encodings repository, which is a submodule of this
repository, are also required.

* Tests

- modules/tests-web-url = <https://github.com/wakaba/tests-web-url>
- URL/IDN canonicalization tests
- Test harnesses for testing your browser

- t/*.t - Test harnesses for Perl modules

* Distribution

The latest version of these files is available from the Git repository:
Expand Down
61 changes: 46 additions & 15 deletions lib/Web/DomainName/Canonicalize.pm
Expand Up @@ -133,11 +133,37 @@ sub domain_to_ascii ($) {
} # domain_to_ascii

*canonicalize_domain_name = \&canonicalize_url_host;
sub canonicalize_url_host ($;%) {
my ($s, %args) = @_;

## Canonicalize a host string for the |is_file| case of
## |canonicalize_url_host|.
##
## Argument: the host string (only the first argument is read).
## Returns: the canonicalized host string, or |undef| if
## |domain_to_ascii| fails or the result contains a character that is
## rejected by the check in step 5.
sub _canonicalize_url_host_for_file ($;%) {
  my ($host) = @_;

  ## 2.  Replace every |%HH| escape by the byte it denotes.  This is
  ## done between the |encode_web_utf8| and |decode_web_utf8_no_bom|
  ## calls (both defined elsewhere; presumably so that the
  ## substitution operates on bytes rather than on characters).
  my $octets = encode_web_utf8 ($host);
  $octets =~ s{%([0-9A-Fa-f]{2})}{pack ('C', hex ($1))}ge;
  $host = decode_web_utf8_no_bom ($octets);

  ## 3., 4.  A failed conversion makes the whole operation fail.
  $host = domain_to_ascii ($host);
  defined $host or return undef;

  ## Spec: <https://url.spec.whatwg.org/#host-parsing>.
  ## 5.  Any of these characters makes the host unusable.
  # XXX syntax violation
  return undef
      if $host =~ /[\x00\x09\x0A\x0D\x20\x23\x25\x2F\x5B\x5C\x5D]/;

  ## Percent-encode (with uppercase hex digits) the remaining
  ## characters listed below.
  $host =~ s{([\x01-\x08\x0B\x0C\x0E-\x1F\x21\x22\x24\x26-\x2A\x2C\x3B-\x3F\x5E\x60\x7B-\x7D\x7F])}{sprintf ('%%%02X', ord ($1))}ge;

  return $host;
} # _canonicalize_url_host_for_file

## Spec: <https://url.spec.whatwg.org/#host-parsing>.
sub _host_parser_to_ascii ($) {
my $s = $_[0];

## 1.
if ($s =~ /\A\[/) {
Expand Down Expand Up @@ -165,20 +191,11 @@ sub canonicalize_url_host ($;%) {
return undef unless defined $s;

## 5.
return undef if not $args{is_file} and $s =~ /[:?\x40]/; # XXX
if ($s =~ /[\x00\x09\x0A\x0D\x20\x23\x25\x2F\x5B\x5C\x5D]/) {
if ($s =~ /[\x00\x09\x0A\x0D\x20\x23\x25\x2F:?\x40\x5B\x5C\x5D]/) {
# XXX syntax violation
return undef;
}

# XXX
if ($args{is_file}) {
$s =~ s{([\x00-\x2A\x2C\x2F\x3B-\x3F\x5C\x5E\x60\x7B-\x7D\x7F])}{
sprintf '%%%02X', ord $1;
}ge;
return $s;
}

## 6., 7.
my $ipv4 = Web::IPAddr::Canonicalize::_parse_ipv4_addr $s;
if (defined $ipv4) {
Expand All @@ -189,13 +206,27 @@ sub canonicalize_url_host ($;%) {
return undef;
}

## 8. is for toUnicode.

return $s;
} # _host_parser_to_ascii

## Canonicalize a URL host.
##
## Arguments: the host string, followed by optional name/value
## options.  The only option read here is |is_file|; when it is true
## the work is delegated to |_canonicalize_url_host_for_file|.
## Returns: the canonicalized host string, or |undef| if the input is
## |undef| or the host parser rejects it.
sub canonicalize_url_host ($;%) {
  my ($input, %opts) = @_;
  defined $input or return undef;

  return _canonicalize_url_host_for_file ($input) if $opts{is_file};

  my $host = _host_parser_to_ascii ($input);
  defined $host or return undef;

  # XXX
  ## Percent-encode (with uppercase hex digits) the characters listed
  ## below that are still present in the parser's result.
  $host =~ s{([\x01-\x08\x0B\x0C\x0E-\x1F\x21\x22\x24\x26-\x2A\x2C\x3B-\x3E\x5E\x60\x7B-\x7D\x7F])}{sprintf ('%%%02X', ord ($1))}ge;

  ## 8.
  # XXX domain to Unicode, if Unicode flag is true (nothing is done
  # for this step here; the ASCII form is returned as is)
  return $host;
} # canonicalize_url_host

Expand Down
2 changes: 1 addition & 1 deletion t_deps/tests
Submodule tests updated 1 files
+0 −49 url/parsing/data/README

0 comments on commit 0f7ff3b

Please sign in to comment.