Skip to content

Commit

Permalink
Added PSL data
Browse files Browse the repository at this point in the history
  • Loading branch information
wakaba committed May 31, 2016
1 parent b1eb1ac commit 3066377
Show file tree
Hide file tree
Showing 10 changed files with 24,328 additions and 1,353 deletions.
10 changes: 8 additions & 2 deletions Makefile
Expand Up @@ -138,7 +138,7 @@ data/file-name-extensions.json: data/mime-types.json

## ------ URLs ------

all-urls: data/url-schemes.json data/tlds.json
all-urls: data/url-schemes.json data/tlds.json data/psl-tests.json
clean-urls:
rm -fr local/sw-url-schemes.*
rm -fr local/iana-url-schemes.*
Expand All @@ -165,6 +165,10 @@ data/url-schemes.json: bin/url-schemes.pl \

local/iana-tlds.txt:
$(WGET) -O $@ https://data.iana.org/TLD/tlds-alpha-by-domain.txt
local/psl.txt:
$(WGET) -O $@ https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat
local/psl-test.txt:
$(WGET) -O $@ https://raw.githubusercontent.com/publicsuffix/list/master/tests/test_psl.txt

local/mozilla-prefs.js:
$(WGET) -O $@ https://raw.githubusercontent.com/mozilla/gecko-dev/master/modules/libpref/init/all.js
Expand All @@ -176,8 +180,10 @@ local/mozilla-idn-whitelist.txt: local/mozilla-prefs.js
}' $< > $@

data/tlds.json: local/iana-tlds.txt src/tld-additional.txt bin/tlds.pl \
local/mozilla-idn-whitelist.txt
local/mozilla-idn-whitelist.txt local/psl.txt
$(PERL) bin/tlds.pl > $@
data/psl-tests.json: bin/psl-tests.pl local/psl-test.txt
$(PERL) $< > $@

## ------ Language tags ------

Expand Down
3 changes: 2 additions & 1 deletion README
Expand Up @@ -39,12 +39,13 @@ In the "data/" directory:
microdata.json Microdata vocabularies
mime-types.json MIME types
ogp.json OGP vocabulary
psl-tests.json Test data for public suffixes
rdf.json RDF
selectors.json Selectors
specs.json Relevant specifications
temma-syntax.json Temma syntax
temma-tokenizer-expanded.json A Temma tokenizer description
tlds.json TLDs (top-level domains)
tlds.json TLDs (top-level domains) and public suffixes
tls.json TLS (SSL)
url-schemes.json URL schemes
webidl.json Web IDL
Expand Down
33 changes: 33 additions & 0 deletions bin/psl-tests.pl
@@ -0,0 +1,33 @@
use strict;
use warnings;
use Path::Tiny;
use JSON::PS;

## Repository root: two levels up from this script (bin/ -> root).
my $root_path = path (__FILE__)->parent->parent;

## Turn one captured argument token of a |checkPublicSuffix| call
## into a Perl value: the bare word |null| becomes |undef|; any other
## token has one leading and one trailing quotation mark removed.
sub _decode_argument {
  my ($token) = @_;
  ## Always called in scalar context; |undef| is serialized as JSON null.
  return undef if $token eq 'null';
  $token =~ s/^["']//;
  $token =~ s/["']$//;
  return $token;
}

## List of [input, expected output] pairs, in file order.
my $Data = [];

my $test_path = $root_path->child ('local/psl-test.txt');
for my $line (split /\x0D?\x0A/, $test_path->slurp_utf8) {
  ## Lines that are not |checkPublicSuffix (..., ...);| calls are ignored.
  next unless $line =~ m{^\s*checkPublicSuffix\s*\(("[^"]*"|'[^']*'|null)\s*,\s*("[^"]*"|'[^']*'|null)\s*\);\s*$};

  ## Copy the captures before the helper's own substitutions run and
  ## reset |$1| and |$2|.
  my ($raw_input, $raw_output) = ($1, $2);
  my $input = _decode_argument ($raw_input);
  my $output = _decode_argument ($raw_output);
  push @$Data, [$input, $output];
}

print perl2json_bytes_for_record $Data;

## License: Public Domain.
53 changes: 50 additions & 3 deletions bin/tlds.pl
@@ -1,12 +1,16 @@
use strict;
use warnings;
use Path::Tiny;
use lib glob path (__FILE__)->parent->child ('modules/*/lib');
use JSON::PS;
use Web::DomainName::Canonicalize;
use Web::DomainName::Punycode;

my $root_path = path (__FILE__)->parent->parent;
my $Data = {};

{
my $path = path (__FILE__)->parent->parent->child ('local/iana-tlds.txt');
my $path = $root_path->child ('local/iana-tlds.txt');
for (split /\x0A/, $path->slurp) {
if (/^\s*#/) {
#
Expand All @@ -21,7 +25,7 @@
}

{
my $path = path (__FILE__)->parent->parent->child ('src/tld-additional.txt');
my $path = $root_path->child ('src/tld-additional.txt');
for (split /\x0A/, $path->slurp) {
if (/^\s*#/) {
#
Expand All @@ -36,7 +40,7 @@
}

{
my $path = path (__FILE__)->parent->parent->child ('local/mozilla-idn-whitelist.txt');
my $path = $root_path->child ('local/mozilla-idn-whitelist.txt');
for (split /\x0A/, $path->slurp) {
if (/^([A-Za-z0-9-]+)$/) {
my $domain = $1;
Expand All @@ -48,6 +52,49 @@
}
}

{
  ## Merge the Public Suffix List (local/psl.txt) into |$Data->{tlds}|.
  ##
  ## Each rule is split into labels and stored from the rightmost label
  ## inwards: the last label is a key of |$Data->{tlds}|, and every
  ## further label is a key of the |subdomains| hash of the label to
  ## its right.  The entry reached for the full rule gets
  ## |public_suffix| set to the current section name ("ICANN" or
  ## "PRIVATE"), or to 0 when the rule is an exception rule (prefixed
  ## by "!").
  ##
  ## Dies with "Bad line |...|" on a non-blank line that is neither a
  ## comment nor a rule starting in the first column.
  my $path = $root_path->child ('local/psl.txt');
  my $type = 'ICANN'; # section assumed until a BEGIN marker is seen
  for (split /\x0D?\x0A/, $path->slurp_utf8) {
    if (m{^// ===BEGIN (ICANN|PRIVATE) DOMAINS===$}) {
      ## Follow both section markers so that the classification does
      ## not depend on the order of the sections in the file.
      $type = $1;
    } elsif (m{^//}) {
      ## Comment line; nothing to do.
    } elsif (/^(\S+)/) {
      ## The list format defines that a line is read only up to the
      ## first white space, so anything after the rule is ignored.
      my $suffix = $1;
      my $exception = $suffix =~ s/^!//;
      ## Drop a single leading and a single trailing dot, if any.
      $suffix =~ s/^\.//;
      $suffix =~ s/\.$//;
      my @label = map { canonicalize_domain_name $_ } split /\./, $suffix;
      my $data = $Data->{tlds}->{pop @label} ||= {};
      while (@label) {
        my $label = pop @label;
        $data = $data->{subdomains}->{$label} ||= {};
      }
      $data->{public_suffix} = $exception ? 0 : $type;
    } elsif (/\S/) {
      die "Bad line |$_|";
    }
  }
}

{
  ## Walk the label tree breadth-first, starting at |$Data->{tlds}|,
  ## and annotate every label that starts with "xn--" with |u|: the
  ## result of |decode_punycode| applied to the part after that prefix.
  ## Nothing is recorded when decoding returns |undef| or a string equal
  ## to the label itself.
  my @pending = ($Data->{tlds});
  while (@pending) {
    my $labels = shift @pending;
    ## Entries without |subdomains| are queued as |undef|; skip them.
    next unless defined $labels;
    for my $a_label (keys %$labels) {
      my $u_label = $a_label =~ /^xn--/
          ? decode_punycode (substr $a_label, 4)
          : undef;
      $labels->{$a_label}->{u} = $u_label
          if defined $u_label and $a_label ne $u_label;
      push @pending, $labels->{$a_label}->{subdomains};
    }
  }
}

print perl2json_bytes_for_record $Data;

## License: Public Domain.

0 comments on commit 3066377

Please sign in to comment.