Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1002 lines (871 sloc) 31.2 KB
package Web::URL::Checker;
use strict;
use warnings;
our $VERSION = '9.0';
use Web::Encoding;
# XXX URL Standard support
# XXX scheme-dependent conformance
sub new_from_string ($$) {
return bless {value => $_[1]}, $_[0];
} # new_from_string
sub onerror ($;$) {
if (@_ > 1) {
$_[0]->{onerror} = $_[1];
}
return $_[0]->{onerror} ||= sub {
my %args = @_;
warn sprintf qq{"%s": %s (%s)\n},
$args{value}, $args{type}, $args{level};
};
} # onerror
sub _uri_scheme ($) {
if ($_[0]->{value} =~ m!^([^/?#:]+):!) {
return $1;
} else {
return undef;
}
} # _uri_scheme
sub _uri_authority ($) {
if ($_[0]->{value} =~ m!^(?:[^:/?#]+:)?(?://([^/?#]*))?!) {
return $1;
} else {
return undef;
}
} # _uri_authority
sub _uri_userinfo ($) {
my $v = $_[0]->_uri_authority;
if (defined $v and $v =~ /^([^@\[\]]*)\@/) {
return $1;
} else {
return undef;
}
} # _uri_userinfo
sub _uri_host ($) {
my $v = $_[0]->_uri_authority;
if (defined $v) {
$v =~ s/^[^@\[\]]*\@//;
$v =~ s/:[0-9]*\z//;
return $v;
} else {
return undef;
}
} # _uri_host
sub _uri_port ($) {
my $v = $_[0]->_uri_authority;
if (defined $v and $v =~ /:([0-9]*)\z/) {
return $1;
} else {
return undef;
}
} # _uri_port
sub _uri_path ($) {
if ($_[0]->{value} =~ m!\A(?:[^:/?#]+:)?(?://[^/?#]*)?([^?#]*)!) {
return $1;
} else {
die "No URI path";
}
} # _uri_path
sub _uri_query ($) {
if ($_[0]->{value} =~ m!^(?:[^:/?#]+:)?(?://[^/?#]*)?[^?#]*(?:\?([^#]*))?!s) {
return $1;
} else {
return undef;
}
} # _uri_query
sub _uri_fragment ($) {
if ($_[0]->{value} =~ m!^(?:[^:/?#]+:)?(?://[^/?#]*)?[^?#]*(?:\?[^#]*)?(?:#(.*))?!s) {
return $1;
} else {
return undef;
}
} # _uri_fragment
*is_uri = \&is_uri_3986;
sub is_uri_3986 ($) {
my $v = $_[0]->{value};
## Scheme
return 0 unless $v =~ s/^[A-Za-z][A-Za-z0-9+.-]*://s;
## Fragment
if ($v =~ s/#(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z};
}
## Query
if ($v =~ s/\?(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z};
}
## Authority
if ($v =~ s!^//([^/]*)!!s) {
my $w = $1;
$w =~ s/^(?>[A-Za-z0-9._~!\$&'()*+,;=:-]|%[0-9A-Fa-f][0-9A-Fa-f])*\@//os;
$w =~ s/:[0-9]*\z//;
if ($w =~ /^\[(.*)\]\z/s) {
my $x = $1;
unless ($x =~ /\A[vV][0-9A-Fa-f]+\.[A-Za-z0-9._~!\$&'()*+,;=:-]+\z/) {
## IPv6address
my $isv6;
my $ipv4 = qr/(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)(?>\.(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)){3}/;
my $h16 = qr/[0-9A-Fa-f]{1,4}/;
if ($x =~ s/(?:$ipv4|$h16)\z//o) {
if ($x =~ /\A(?>$h16:){6}\z/o or
$x =~ /\A::(?>$h16:){0,5}\z/o or
$x =~ /\A${h16}::(?>$h16:){4}\z/o or
$x =~ /\A$h16(?::$h16)?::(?>$h16:){3}\z/o or
$x =~ /\A$h16(?::$h16){0,2}::(?>$h16:){2}\z/o or
$x =~ /\A$h16(?::$h16){0,3}::$h16:\z/o or
$x =~ /\A$h16(?::$h16){0,4}::\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/$h16\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,5}\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/::\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,6}\z/o) {
$isv6 = 1;
}
}
return 0 unless $isv6;
}
} else {
return 0 unless $w =~ /\A(?>[A-Za-z0-9._~!\$&'()*+,;=-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z/;
}
}
## Path
return 0 unless $v =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}s;
return 1;
} # is_uri_3986
*is_relative_reference = \&is_relative_reference_3986;
sub is_relative_reference_3986 ($) {
my $v = $_[0]->{value};
## No scheme
return 0 if $v =~ s!^[^/?#]*:!!s;
## Fragment
if ($v =~ s/#(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z};
}
## Query
if ($v =~ s/\?(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z};
}
## Authority
if ($v =~ s!^//([^/]*)!!s) {
my $w = $1;
$w =~ s/^(?>[A-Za-z0-9._~!\$&'()*+,;=:-]|%[0-9A-Fa-f][0-9A-Fa-f])*\@//os;
$w =~ s/:[0-9]*\z//;
if ($w =~ /^\[(.*)\]\z/s) {
my $x = $1;
unless ($x =~ /\A[vV][0-9A-Fa-f]+\.[A-Za-z0-9._~!\$&'()*+,;=:-]+\z/) {
## IPv6address
my $isv6;
my $ipv4 = qr/(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)(?>\.(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)){3}/;
my $h16 = qr/[0-9A-Fa-f]{1,4}/;
if ($x =~ s/(?:$ipv4|$h16)\z//o) {
if ($x =~ /\A(?>$h16:){6}\z/o or
$x =~ /\A::(?>$h16:){0,5}\z/o or
$x =~ /\A${h16}::(?>$h16:){4}\z/o or
$x =~ /\A$h16(?::$h16)?::(?>$h16:){3}\z/o or
$x =~ /\A$h16(?::$h16){0,2}::(?>$h16:){2}\z/o or
$x =~ /\A$h16(?::$h16){0,3}::$h16:\z/o or
$x =~ /\A$h16(?::$h16){0,4}::\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/$h16\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,5}\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/::\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,6}\z/o) {
$isv6 = 1;
}
}
return 0 unless $isv6;
}
} else {
return 0 unless $w =~ /\A(?>[A-Za-z0-9._~!\$&'()*+,;=-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z/;
}
}
## Path
return 0 unless $v =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}s;
return 1;
} # is_relative_reference_3986
*is_uri_reference = \&is_uri_reference_3986;
sub is_uri_reference_3986 ($) {
return $_[0]->is_uri_3986 || $_[0]->is_relative_reference_3986;
} # is_uri_reference_3986
*is_absolute_uri = \&is_absolute_uri_3986;
sub is_absolute_uri_3986 ($) {
return $_[0]->{value} !~ /#/ && $_[0]->is_uri_3986;
} # is_uri_reference_3986
sub is_empty_reference ($) {
return $_[0]->{value} eq '';
} # is_empty_reference
*is_iri = \&is_iri_3987;
sub is_iri_3987 ($) {
my $v = $_[0]->{value};
## |ucschar| except for:
## LRM, RLM, LRE, RLE, LRO, RLO, PDF
## U+200E, U+200F, U+202A - U+202E
my $ucschar = qq{\x{00A0}-\x{200D}\x{2010}-\x{2029}\x{202F}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}};
## Scheme
return 0 unless $v =~ s/^[A-Za-z][A-Za-z0-9+.-]*://s;
## Fragment
if ($v =~ s/#(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}o;
}
## Query
if ($v =~ s/\?(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?$ucschar\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}o;
}
## Authority
if ($v =~ s!^//([^/]*)!!s) {
my $w = $1;
$w =~ s/^(?>[A-Za-z0-9._~!\$&'()*+,;=:$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\@//os;
$w =~ s/:[0-9]*\z//;
if ($w =~ /^\[(.*)\]\z/s) {
my $x = $1;
unless ($x =~ /\A[vV][0-9A-Fa-f]+\.[A-Za-z0-9._~!\$&'()*+,;=:-]+\z/) {
## IPv6address
my $isv6;
my $ipv4 = qr/(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)(?>\.(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)){3}/;
my $h16 = qr/[0-9A-Fa-f]{1,4}/;
if ($x =~ s/(?:$ipv4|$h16)\z//o) {
if ($x =~ /\A(?>$h16:){6}\z/o or
$x =~ /\A::(?>$h16:){0,5}\z/o or
$x =~ /\A${h16}::(?>$h16:){4}\z/o or
$x =~ /\A$h16(?::$h16)?::(?>$h16:){3}\z/o or
$x =~ /\A$h16(?::$h16){0,2}::(?>$h16:){2}\z/o or
$x =~ /\A$h16(?::$h16){0,3}::$h16:\z/o or
$x =~ /\A$h16(?::$h16){0,4}::\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/$h16\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,5}\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/::\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,6}\z/o) {
$isv6 = 1;
}
}
return 0 unless $isv6;
}
} else {
return 0 unless $w =~ /\A(?>[A-Za-z0-9._~!\$&'()*+,;=$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z/o;
}
}
## Path
return 0 unless $v =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}os;
return 1;
} # is_iri_3987
*is_relative_iri_reference = \&is_relative_iri_reference_3987;
sub is_relative_iri_reference_3987 ($) {
my $v = $_[0]->{value};
## |ucschar| except for:
## LRM, RLM, LRE, RLE, LRO, RLO, PDF
## U+200E, U+200F, U+202A - U+202E
my $ucschar = qq{\x{00A0}-\x{200D}\x{2010}-\x{2029}\x{202F}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}};
## No scheme
return 0 if $v =~ s!^[^/?#]*:!!s;
## Fragment
if ($v =~ s/#(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}o;
}
## Query
if ($v =~ s/\?(.*)\z//s) {
my $w = $1;
return 0 unless $w =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/?$ucschar\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}o;
}
## Authority
if ($v =~ s!^//([^/]*)!!s) {
my $w = $1;
$w =~ s/^(?>[A-Za-z0-9._~!\$&'()*+,;=:$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\@//os;
$w =~ s/:[0-9]*\z//;
if ($w =~ /^\[(.*)\]\z/s) {
my $x = $1;
unless ($x =~ /\A[vV][0-9A-Fa-f]+\.[A-Za-z0-9._~!\$&'()*+,;=:-]+\z/) {
## IPv6address
my $isv6;
my $ipv4 = qr/(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)(?>\.(?>0|1[0-9]{0,2}|2(?>[0-4][0-9]?|5[0-5]?|[6-9])?|[3-9][0-9]?)){3}/;
my $h16 = qr/[0-9A-Fa-f]{1,4}/;
if ($x =~ s/(?:$ipv4|$h16)\z//o) {
if ($x =~ /\A(?>$h16:){6}\z/o or
$x =~ /\A::(?>$h16:){0,5}\z/o or
$x =~ /\A${h16}::(?>$h16:){4}\z/o or
$x =~ /\A$h16(?::$h16)?::(?>$h16:){3}\z/o or
$x =~ /\A$h16(?::$h16){0,2}::(?>$h16:){2}\z/o or
$x =~ /\A$h16(?::$h16){0,3}::$h16:\z/o or
$x =~ /\A$h16(?::$h16){0,4}::\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/$h16\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,5}\z/o) {
$isv6 = 1;
}
} elsif ($x =~ s/::\z//o) {
if ($x eq '' or $x =~ /\A$h16(?>:$h16){0,6}\z/o) {
$isv6 = 1;
}
}
return 0 unless $isv6;
}
} else {
return 0 unless $w =~ /\A(?>[A-Za-z0-9._~!\$&'()*+,;=$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z/o;
}
}
## Path
return 0 unless $v =~ m{\A(?>[A-Za-z0-9._~!\$&'()*+,;=:\@/$ucschar-]|%[0-9A-Fa-f][0-9A-Fa-f])*\z}os;
return 1;
} # is_relative_iri_reference_3987
*is_iri_reference = \&is_iri_reference_3987;
sub is_iri_reference_3987 ($) {
return $_[0]->is_iri_3987 || $_[0]->is_relative_iri_reference_3987;
} # is_iri_reference_3987
*is_absolute_iri = \&is_absolute_iri_3987;
sub is_absolute_iri_3987 ($) {
return $_[0]->{value} !~ /#/ && $_[0]->is_iri_3987;
} # is_absolute_iri_3987
sub get_uri_reference ($) {
return $_[0]->get_uri_reference_3986;
} # get_uri_reference
sub get_uri_reference_3986 ($) {
my $v = encode_web_utf8 $_[0]->{value};
$v =~ s/([<>"{}|\\\^`\x00-\x20\x7E-\xFF])/sprintf '%%%02X', ord $1/ge;
return $v;
} # get_uri_reference_3986
sub get_iri_reference ($) {
return $_[0]->get_iri_reference_3987;
} # get_iri_reference
sub get_iri_reference_3987 ($) {
my $v = encode_web_utf8 $_[0]->{value};
$v =~ s{%([2-9A-Fa-f][0-9A-Fa-f])}
{
my $ch = hex $1;
if ([
# 0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7
# 0x8 0x9 0xA 0xB 0xC 0xD 0xE 0xF
1, 1, 1, 1, 1, 1, 1, 1, # 0x00
1, 1, 1, 1, 1, 1, 1, 1, # 0x08
1, 1, 1, 1, 1, 1, 1, 1, # 0x10
1, 1, 1, 1, 1, 1, 1, 1, # 0x18
1, 1, 1, 1, 1, 1, 1, 1, # 0x20
1, 1, 1, 1, 1, 0, 0, 1, # 0x28
0, 0, 0, 0, 0, 0, 0, 0, # 0x30
0, 0, 1, 1, 1, 1, 1, 1, # 0x38
1, 0, 0, 0, 0, 0, 0, 0, # 0x40
0, 0, 0, 0, 0, 0, 0, 0, # 0x48
0, 0, 0, 0, 0, 0, 0, 0, # 0x50
0, 0, 0, 1, 1, 1, 1, 0, # 0x58
1, 0, 0, 0, 0, 0, 0, 0, # 0x60
0, 0, 0, 0, 0, 0, 0, 0, # 0x68
0, 0, 0, 0, 0, 0, 0, 0, # 0x70
0, 0, 0, 1, 1, 1, 0, 1, # 0x78
# 0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7
# 0x8 0x9 0xA 0xB 0xC 0xD 0xE 0xF
]->[$ch]) {
# PERCENT SIGN, reserved, not-allowed in ASCII
'%'.$1;
} else {
pack 'C', $ch;
}
}ge;
$v =~ s{(
[\xC2-\xDF][\x80-\xBF] | # UTF8-2
[\xE0][\xA0-\xBF][\x80-\xBF] |
[\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
[\xED][\x80-\x9F][\x80-\xBF] |
[\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] | # UTF8-4
[\x80-\xFF]
)}{
my $c = $1;
if (length ($c) == 1) {
$c =~ s/(.)/sprintf '%%%02X', ord $1/ge;
$c;
} else {
my $ch = decode_web_utf8 $c;
if ($ch =~ /^[\x{00A0}-\x{200D}\x{2010}-\x{2029}\x{202F}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}]/) {
$c;
} else {
$c =~ s/([\x80-\xFF])/sprintf '%%%02X', ord $1/ge;
$c;
}
}
}gex;
$v =~ s/([<>"{}|\\\^`\x00-\x20\x7F])/sprintf '%%%02X', ord $1/ge;
return decode_web_utf8 $v;
} # get_iri_reference_3987
sub get_absolute_reference ($$;%) {
#my ($self, $base, %opt) = @_;
return shift->get_absolute_reference_3986 (@_);
} # get_absolute_reference
sub get_absolute_reference_3986 ($$%) {
my ($self, $base, %opt) = @_;
my $r;
## Decomposition
my ($b_scheme, $b_auth, $b_path, $b_query, $b_frag);
my ($r_scheme, $r_auth, $r_path, $r_query, $r_frag);
if ($self->{value} =~ m!\A(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?\z!s) {
($r_scheme, $r_auth, $r_path, $r_query, $r_frag) = ($1, $2, $3, $4, $5);
} else { # unlikely happen
($r_scheme, $r_auth, $r_path, $r_query, $r_frag)
= (undef, undef, '', undef, undef);
}
if ($base =~ m!\A(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?\z!s) {
($b_scheme, $b_auth, $b_path, $b_query, $b_frag)
= (defined $1 ? $1 : '', $2, $3, $4, $5);
} else { # unlikely happen
($b_scheme, $b_auth, $b_path, $b_query, $b_frag)
= ('', undef, '', undef, undef);
}
## Merge
my $path_merge = sub ($$) {
my ($bpath, $rpath) = @_;
if ($bpath eq '') {
return '/'.$rpath;
}
$bpath =~ s/[^\/]*\z//;
return $bpath . $rpath;
}; # merge
## Removing Dot Segments
my $remove_dot_segments = sub ($) {
local $_ = shift;
my $buf = '';
L: while (length $_) {
next L if s/^\.\.?\///;
next L if s/^\/\.(?:\/|\z)/\//;
if (s/^\/\.\.(\/|\z)/\//) {
$buf =~ s/\/?[^\/]*$//;
next L;
}
last Z if s/^\.\.?\z//;
s/^(\/?[^\/]*)//;
$buf .= $1;
}
return $buf;
}; # remove_dot_segments
## Transformation
my ($t_scheme, $t_auth, $t_path, $t_query, $t_frag);
if ($opt{non_strict} and $r_scheme eq $b_scheme) {
undef $r_scheme;
}
if (defined $r_scheme) {
$t_scheme = $r_scheme;
$t_auth = $r_auth;
$t_path = $remove_dot_segments->($r_path);
$t_query = $r_query;
} else {
if (defined $r_auth) {
$t_auth = $r_auth;
$t_path = $remove_dot_segments->($r_path);
$t_query = $r_query;
} else {
if ($r_path =~ /\A\z/) {
$t_path = $b_path;
if (defined $r_query) {
$t_query = $r_query;
} else {
$t_query = $b_query;
}
} elsif ($r_path =~ /^\//) {
$t_path = $remove_dot_segments->($r_path);
$t_query = $r_query;
} else {
$t_path = $path_merge->($b_path, $r_path);
$t_path = $remove_dot_segments->($t_path);
$t_query = $r_query;
}
$t_auth = $b_auth;
}
$t_scheme = $b_scheme;
}
$t_frag = $r_frag;
## -- Recomposition
my $result = '' ;
$result .= $t_scheme . ':' if defined $t_scheme;
$result .= '//' . $t_auth if defined $t_auth ;
$result .= $t_path ;
$result .= '?' . $t_query if defined $t_query ;
$result .= '#' . $t_frag if defined $t_frag ;
return $result;
} # get_absolute_reference_3986
sub get_absolute_reference_3987 ($$;%) {
#my ($self, $base, %opt) = @_;
return shift->get_absolute_reference_3986 (@_);
} # get_absolute_reference_3987
sub is_same_document_reference ($$) {
#my ($self, $base) = @_;
return $_[0]->is_same_document_reference_3986 ($_[1]);
} # is_same_document_reference
sub is_same_document_reference_3986 ($$;%) {
my ($self, $base, %opt) = @_;
if (substr ($self->{value}, 0, 1) eq '#') {
return 1;
} else {
my $target = $self->get_absolute_reference_3986
($base, non_strict => $opt{non_strict});
$target =~ s/#.*\z//;
$base =~ s/#.*\z//;
return ($target eq $base);
}
} # is_same_document_reference_3986
sub get_relative_reference ($$) {
my ($self, $base) = @_;
my @base;
if ($base =~ m!\A(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?\z!) {
(@base) = (defined $1 ? $1 : '', $2, $3, $4, $5);
} else { # unlikeley happen
(@base) = ('', undef, '', undef, undef);
}
my @t;
my $t = $self->get_absolute_reference ($base);
if ($t =~ m!\A(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?\z!) {
(@t) = (defined $1 ? $1 : '', $2, $3, $4, $5);
} else { # unlikeley happen
(@t) = ('', undef, '', undef, undef);
}
my @ref;
R: {
## Scheme
if ($base[0] ne $t[0]) {
(@ref) = @t;
last R;
}
## Authority
if (not defined $base[1] and not defined $t[1]) {
(@ref) = @t;
last R;
} elsif (not defined $t[1]) {
(@ref) = @t;
last R;
} elsif (not defined $base[1]) {
(@ref) = @t;
last R;
} elsif ($base[1] ne $t[1]) {
(@ref) = @t;
last R;
}
## NOTE: Avoid uncommon references.
if (defined $t[4] and # fragment
$t[2] eq $base[2] and # path
((not defined $t[3] and not defined $base[3]) or # query
(defined $t[3] and defined $base[3] and $t[3] eq $base[3]))) {
(@ref) = (undef, undef, '', undef, $t[4]);
last R;
}
## Path
my @tpath = split m!/!, $t[2], -1;
my @bpath = split m!/!, $base[2], -1;
if (@tpath < 1 or @bpath < 1) { ## No |/|
(@ref) = @t;
last R;
}
my $bpl;
## Removes common segments
while (@tpath and @bpath and $tpath[0] eq $bpath[0]) {
shift @tpath;
$bpl = shift @bpath;
}
if (@tpath == 0) {
if (@bpath == 0) { ## Avoid empty path for backward compatibility
unshift @tpath, $bpl;
} else {
unshift @tpath, '..', $bpl;
}
} elsif (@bpath == 0) {
unshift @tpath, $bpl;
}
unshift @tpath, ('..') x (@bpath - 1) if @bpath > 1;
unshift @tpath, '.' if $tpath[0] eq '' or
$tpath[0] =~ /:/;
(@ref) = (undef, undef, (join '/', @tpath), $t[3], $t[4]);
} # R
## -- Recomposition
my $result = '' ;
$result .= $ref[0] . ':' if defined $ref[0]; # scheme;
$result .= '//' . $ref[1] if defined $ref[1]; # authority
$result .= $ref[2] ; # path
$result .= '?' . $ref[3] if defined $ref[3]; # query
$result .= '#' . $ref[4] if defined $ref[4]; # fragment
return $result;
} # get_relative_reference
our $DefaultPort = {
http => 80,
};
my $ErrorLevels = {
uri_fact => 'm',
uri_lc_must => 'm', ## Non-RFC 2119 "must" (or fact)
uri_lc_should => 'w', ## Non-RFC 2119 "should"
uri_syntax => 'm',
rdf_fact => 'm',
}; # $ErrorLevels
*check_iri = \&check_iri_3987;
sub check_iri_3987 ($) {
my $self = $_[0];
unless ($self->is_iri_3987) {
$self->onerror->(type => 'syntax error:iri3987',
level => $ErrorLevels->{uri_syntax},
value => $self->{value});
}
$self->check_iri_reference_3987;
} # check_iri
*check_iri_reference = \&check_iri_reference_3987;
sub check_iri_reference_3987 ($) {
my $self = $_[0];
## RFC 3987 4.1.
unless ($self->is_iri_reference_3987) {
$self->onerror->(type => 'syntax error:iriref3987',
level => $ErrorLevels->{uri_syntax},
value => $self->{value});
## MUST (NOTE: A requirement for bidi IRIs.)
}
## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
pos ($self->{value}) = 0;
while ($self->{value} =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
$self->onerror->(type => 'URL:lowercase hexadecimal digit',
level => $ErrorLevels->{uri_lc_should},
value => $self->{value},
pos_start => $-[0], pos_end => $+[0]);
## shoult not
}
## RFC 3986 2.2.
## URI producing applications should percent-encode ... reserved ...
## unless ... allowed by the URI scheme .... --- This is not testable.
## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
pos ($self->{value}) = 0;
while ($self->{value} =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
$self->onerror->(type => 'URL:percent-encoded unreserved',
level => $ErrorLevels->{uri_lc_should},
value => $self->{value},
pos_start => $-[0], pos_end => $+[0]);
## should
## should
}
## RFC 3986 2.4.
## ... "%" ... must be percent-encoded as "%25" ...
## --- Either syntax error or undetectable if followed by two hexadecimals
## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
my $scheme = $self->_uri_scheme;
my $scheme_canon;
if (defined $scheme) {
$scheme_canon = encode_web_utf8 $scheme;
$scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
if ($scheme_canon =~ tr/A-Z/a-z/) {
$self->onerror->(type => 'URL:uppercase scheme name',
level => $ErrorLevels->{uri_lc_should},
value => $scheme, value_mark => qr/[A-Z]+/);
## should
}
}
## Note that nothing prevent a conforming URI (if there is one)
## using an unregistered URI scheme...
## RFC 3986 3.2.1., 7.5.
my $ui = $self->_uri_userinfo;
if (defined $ui and $ui =~ /:/) {
$self->onerror->(type => 'URL:password',
level => $ErrorLevels->{uri_lc_should},
value => $ui);
# deprecated, should be considered an error
}
## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
my $host = $self->_uri_host;
if (defined $host) {
if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
$self->onerror->(type => 'URL:address format',
level => 'w',
text => $1,
value => $host, pos_start => $-[1], pos_end => $+[1]);
## NOTE: No conformance creteria is defined for new address format,
## nor is any standardization process.
}
my $hostnp = $host;
$hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
if ($hostnp =~ /[A-Z]/) {
$self->onerror->(type => 'URL:uppercase host',
level => $ErrorLevels->{uri_lc_should},
value => $hostnp, value_mark => qr/[A-Z]+/);
## should
}
if ($host =~ /^\[/) {
#
} else {
my $host_np = encode_web_utf8 $host;
$host_np =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
if ($host_np eq '') {
## NOTE: Although not explicitly mentioned, an empty host
## should be considered as an exception for the recommendation
## that a host "should" be a DNS name.
} elsif ($host_np !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
$self->onerror->(type => 'URL:non-DNS host',
level => $ErrorLevels->{uri_lc_should},
value => $host_np);
## should
## should be IDNA encoding if wish to maximize interoperability
} elsif (length $host > 255) {
## NOTE: This length might be incorrect if there were percent-encoded
## UTF-8 bytes; however, the above condition catches all non-ASCII.
$self->onerror->(type => 'URL:long host',
level => $ErrorLevels->{uri_lc_should},
value => $host_np,
pos_start => 256, pos_end => length $host);
## should
}
## FQDN should be followed by "." if necessary --- untestable
## must be UTF-8
unless ($host_np =~ /\A(?>
[\x00-\x7F] |
[\xC2-\xDF][\x80-\xBF] | # UTF8-2
[\xE0][\xA0-\xBF][\x80-\xBF] |
[\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
[\xED][\x80-\x9F][\x80-\xBF] |
[\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
)*\z/x) {
$self->onerror->(type => 'URL:non UTF-8 host',
level => $ErrorLevels->{uri_lc_must},
value => $host); # not $host_np
# must
}
}
}
## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
my $port = $self->_uri_port;
if (defined $port) {
if ($port =~ /\A([0-9]+)\z/) {
if ($DefaultPort->{$scheme_canon} == $1) {
$self->onerror->(type => 'URL:default port',
level => $ErrorLevels->{uri_lc_should},
value => $port);
## should
}
} elsif ($port eq '') {
$self->onerror->(type => 'URL:empty port',
level => $ErrorLevels->{uri_lc_should},
value => $self->_uri_authority,
value_mark_end => 1);
## should
}
}
## RFC 3986 3.4.
## ... says that "/" or "?" in query might be problematic for old
## implementations, but also suggest that for readability
## percent-encoding might not be good idea. It provides no
## recommendation on this issue. Therefore, we do no check for this
## matter.
## RFC 3986 3.5.
## ... says again that "/" or "?" in fragment might be problematic,
## without any recommendation.
## We again left this unchecked.
## RFC 3986 4.4.
## Authors should not assume ... different, though equivalent,
## URI will (or will not) be interpreted as a same-document reference ...
## This is not testable.
## RFC 3986 5.4.2.
## "scheme:relative" should be avoided
## This is not testable without scheme specific information.
## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
my $path = $self->_uri_path;
if (defined $scheme) {
if (
$path =~ m!/\.\./! or
$path =~ m!/\./! or
$path =~ m!/\.\.\z! or
$path =~ m!/\.\z! or
$path =~ m!\A\.\./! or
$path =~ m!\A\./! or
$path eq '.,' or
$path eq '.'
) {
$self->onerror->(type => 'URL:dot-segment',
level => $ErrorLevels->{uri_lc_should},
value => $path,
value_mark => qr[(?<=/)\.\.?(?=/|\z)|\A\.\.?(?=/|\z)]);
## should
}
}
## RFC 3986 6.2.3., RFC 3987 5.3.3.
my $authority = $self->_uri_authority;
if (defined $authority) {
if ($path eq '') {
$self->onerror->(type => 'URL:empty path',
level => $ErrorLevels->{uri_lc_should},
value => $self->{value}, value_mark_end => 1);
## should
}
}
## RFC 3986 6.2.3., RFC 3987 5.3.3.
## Scheme dependent default authority should be omitted
## RFC 3986 6.2.3., RFC 3987 5.3.3.
if (defined $host and $host eq '' and
(defined $ui or defined $port)) {
$self->onerror->(type => 'URL:empty host',
level => $ErrorLevels->{uri_lc_should},
value => $authority,
pos_start => defined $ui ? 1 + length $ui : 0,
pos_end => defined $ui ? 1 + length $ui : 0);
## should # when empty authority is allowed
}
## RFC 3986 7.5.
## should not ... username or password that is intended to be secret
## This is not testable.
## RFC 3987 4.1.
## MUST be in full logical order
## This is not testable.
## RFC 3987 4.1., 6.4.
## URI scheme dependent syntax
## MUST
## TODO
## RFC 3987 4.2.
## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
## SHOULD NOT use both rtl and ltr characters
## SHOULD start with rtl if using rtl characters
## TODO
## RFC 3987 5.3.2.2.
## SHOULD be NFC
## NFKC may avoid even more problems
## TODO
## RFC 3987 5.3.3.
## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
## SHOULD be normalized by Nameprep
## TODO
## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
} # check_iri_reference
sub check_rdf_uri_reference ($) {
my $self = $_[0];
pos ($self->{value}) = 0;
if ($self->{value} =~ /[\x00-\x1F\x7F-\x9F]/) {
$self->onerror->(type => 'syntax error:rdfuriref',
level => $ErrorLevels->{rdf_fact},
value => $self->{value},
position => $-[0]);
}
my $ascii_uri = $self->get_uri_reference_3986; # same as RDF spec's one
$ascii_uri = (ref $self)->new_from_string ($ascii_uri);
unless ($ascii_uri->is_uri) { ## TODO: is_uri_2396 should be used.
$self->onerror->(#type => 'syntax error:uri2396',
type => 'syntax error:uri3986',
level => $ErrorLevels->{uri_fact},
value => $ascii_uri->{value});
}
## TODO: Check against RFC 2396.
#Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
} # check_rdf_uri_reference
1;
=head1 LICENSE
Copyright 2006-2013 Wakaba <wakaba@suikawiki.org>.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
Portions of this module are derived from the example parser (April 7,
2004) available at
<http://www.gbiv.com/protocols/uri/rev-2002/uri_test.pl> that is
placed in the Public Domain by Roy T. Fielding and Day Software, Inc.
=back
You can’t perform that action at this time.