Skip to content

Commit

Permalink
Improved handling of Québec
Browse files Browse the repository at this point in the history
  • Loading branch information
nigelhorne committed Oct 20, 2020
1 parent 76d21b4 commit 63197e8
Showing 1 changed file with 203 additions and 3 deletions.
206 changes: 203 additions & 3 deletions createdatabase.PL
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ use constant MAX_INSERT_COUNT => 250; # Maximum number of CSV rows to insert in
# use constant MAX_INSERT_COUNT => 1; # Maximum number of CSV rows to insert in a single statement
use constant SQLITE_CHUNK_SIZE => 1_000; # Number of rows to read at a time

binmode(STDOUT, "encoding(UTF-8)");

my %zipcodes = (
'04350' => { city => 'Litchfield', county => 'Kennebec' },
'04410' => { city => 'Bradford', county => 'Penobscot' },
Expand Down Expand Up @@ -1118,6 +1120,199 @@ flush_queue($dbh, $redis);

my $inserts = 0;

###########
# Temporary duplicate of the OSM import below, placed here so that it runs
# first while debugging. OSM will eventually be the last data to be imported
if(my $osm = $ENV{'OSM_HOME'}) {
	# Openstreetmap
	#
	# Stream each bzip2-compressed OSM extract found under $OSM_HOME and queue
	# a row (via import()) for every <node> that carries a name, an
	# is_in/is_in:country tag, a place tag and a lat/lon pair, provided its
	# is_in value can be reduced to "<state code>, US", "<province code>, Canada"
	# or "<something>, <known country>".
	# Relies on names defined elsewhere in this file: $us, $ca, $ua, $dbh,
	# $redis, $inserts, import(), flush_queue(), DEBUG* and MAX_INSERT_COUNT.

	my @files = ('north-america-latest.osm.bz2', 'europe-latest.osm.bz2');

	foreach my $file(@files) {
		my $filename = File::Spec->catfile($osm, $file);

		# Show which file is being processed, overwriting the same terminal line
		$| = 1;
		printf "%-70s\r", $filename;
		$| = 0;
		print "\n" if(MAX_INSERT_COUNT == 1);

		# TODO: check for the presence of bzcat
		# List-form pipe open: the filename is never seen by a shell, and a
		# failure to start bzcat is reported here rather than surfacing later
		# as an obscure XML reader error
		open(my $pin, '-|', 'bzcat', $filename)
			or die "cannot run bzcat on $filename: $!";
		my $reader = XML::LibXML::Reader->new(FD => $pin)
			or die "cannot read $filename";

		# State gathered for the <node> currently being read
		my $in_node;	# true while between <node ...> and </node>
		my $name;
		my $is_in;
		my $lat;
		my $lon;
		my $place;

		while($reader->read()) {
			# These constants are not exported by default :-(
			# 1 is XML_READER_TYPE_ELEMENT, 15 is XML_READER_TYPE_END_ELEMENT
			my $node_type = $reader->nodeType();

			if($node_type == 1) {
				my $element = $reader->name();

				if($element eq 'node') {
					if($reader->hasAttributes()) {
						$lat = $reader->getAttribute('lat');
						$lon = $reader->getAttribute('lon');
						$name = undef;
						$is_in = undef;
						# Without this a place tag seen on an earlier node
						# would satisfy the defined($place) test for every
						# later node
						$place = undef;
						# A self-closing <node/> has no tags and produces no
						# end-element event, so it must not leave $in_node set
						$in_node = $reader->isEmptyElement() ? 0 : 1;
					}
				} elsif($in_node && ($element eq 'tag') && $reader->hasAttributes()) {
					my $key = $reader->getAttribute('k');
					# print "$key\n";
					if($key eq 'name:en') {
						# The English name always wins
						$name = $reader->getAttribute('v');
						# print "$name\n";
					} elsif(($key eq 'name') && !defined($name)) {
						$name = $reader->getAttribute('v');
					} elsif($key eq 'is_in') {
						$is_in = $reader->getAttribute('v');
					} elsif($key eq 'is_in:country') {
						# Append the country unless is_in already ends with it
						my $country = $reader->getAttribute('v');
						if(defined($is_in) && ($is_in !~ /\Q$country\E$/)) {
							$is_in .= ", $country";
						} elsif(!defined($is_in)) {
							$is_in = $country;
						}
					} elsif($key eq 'place') {
						$place = $reader->getAttribute('v');
					}
				}
			} elsif(($node_type == 15) && ($reader->name() eq 'node')) {
				# Only </node> finishes a record; the closing tags of <way>,
				# <relation> and so on must not emit or disturb node state
				if(defined($name) && defined($is_in) && defined($lat) && defined($lon) && defined($place)) {
					my $add_record = 1;

					# Normalise the many spellings found in is_in so that it
					# ends in GB or US where applicable.  The order of these
					# substitutions matters.
					$is_in =~ s/,(\w)/, $1/g;
					$is_in =~ s/, United Kingdom/, GB/;
					$is_in =~ s/, UK,.+$/, GB/;
					$is_in =~ s/^UK, UK$/GB/;
					$is_in =~ s/GB, GB$/GB/;
					$is_in =~ s/, UK$/, GB/;
					$is_in =~ s/(England|Scotland|Wales), GB/GB/;
					$is_in =~ s/Yorkshire, UK/Yorkshire, GB/;
					$is_in =~ s/, Europe$//;
					$is_in =~ s/;\s?/, /g;
					$is_in =~ s/United States of America.*/US/;
					$is_in =~ s/United States$/US/;
					$is_in =~ s/USA$/US/;
					# NOTE(review): looks unreachable, the line above has
					# already rewritten any trailing "USA" - confirm
					$is_in =~ s/(\w)? USA$/$1, US/;

					my $preamble;
					my $state;
					my $country;
					if($is_in =~ /(.+), (.+), US/) {
						$preamble = $1;
						$state = $2;
						$country = 'US';
					} elsif($is_in =~ /^(.+), US/) {
						$state = $1;
						$country = 'US';
					}
					if($state && (length($state) > 2)) {
						# A spelt-out US state name: replace it with its code
						if(my $code = $us->{state2code}{uc($state)}) {
							if($preamble) {
								$is_in = "$preamble, $code, US";
							} else {
								$is_in = "$code, US";
							}
							$country = 'US';
						} else {
							warn "$is_in: unknown US state $state" if(DEBUG&DEBUG_DATA_VALIDATE);
							$add_record = 0;
						}
					} elsif(my $code = $us->{state2code}{uc($is_in)}) {
						# is_in is nothing but a US state name
						$is_in = "$code, US";
					} elsif($code = $ca->{province2code}{uc($is_in)}) {
						# is_in is nothing but a Canadian province name
						$is_in = "$code, Canada";
						$country = 'Canada';
					} else {
						if($is_in =~ /(.+), (.+), Canada/) {
							$preamble = $1;
							$state = $2;
							$country = 'Canada';
						} elsif($is_in =~ /^(.+), Canada/) {
							$state = $1;
							$country = 'Canada';
						}
						if($state && (length($state) > 2)) {
							# The first test is spelt with an escape so that it
							# matches the decoded text from the XML reader
							# whether or not "use utf8" is in effect here; the
							# literal form is kept for backward compatibility
							if(($state eq "Qu\x{e9}bec") || ($state eq 'Québec')) {
								$state = 'Quebec';
							}
							if(my $code = $ca->{province2code}{uc($state)}) {
								if($preamble) {
									$is_in = "$preamble, $code, Canada";
								} else {
									$is_in = "$code, Canada";
								}
								$country = 'Canada';
							} else {
								die "$is_in: unknown Canadian province $state";
							}
						} elsif($is_in =~ /(Australia|Canada|US|GB)$/) {
							$add_record = 0;
						}
					}
					if($add_record) {
						# The last component of is_in must be GB or a country
						# that Locale::Country knows
						if($is_in !~ /,/) {
							if($is_in ne 'GB') {
								if(!Locale::Country::country2code($is_in)) {
									$add_record = 0;
								}
							}
						} elsif($is_in =~ /(.+), (.+)$/) {
							$country = $2;
							if($country ne 'GB') {
								if(!Locale::Country::country2code($country)) {
									$add_record = 0;
								}
							}
						}
						$add_record = 0 if($is_in =~ /,.+,/);	# Just towns for now
						# means it's another country
						$add_record = 0 if(!defined($country));
						if($add_record) {
							print "$name, $is_in: $lat, $lon\n" if(DEBUG&DEBUG_ALL);
							my $row = {
								'CITY' => $name,
								'STATE' => $state,
								'COUNTRY' => $country,
								'LAT' => $lat,
								'LON' => $lon,
							};
							$inserts += import(country => $country, state => $state, row => $row, file => $filename, ua => $ua, dbh => $dbh, redis => $redis);
							if($inserts >= MAX_INSERT_COUNT) {
								flush_queue($dbh, $redis);
								$inserts = 0;
							}
						}
					}
				}

				# Always forget this node, whether or not it produced a row,
				# so that nothing leaks into whatever element comes next
				$name = undef;
				$lat = undef;
				$lon = undef;
				$is_in = undef;
				$place = undef;
				$in_node = 0;
			}
		}
	}

	flush_queue($dbh, $redis);	# Check for hanging dups in current state
	$inserts = 0;
}
###########

my %address_parsers;
# my $wof_global_dbh;
if(my $whosonfirst = $ENV{'WHOSONFIRST_HOME'}) {
Expand Down Expand Up @@ -1523,7 +1718,8 @@ foreach my $country(@whosonfirst_only_countries) {
flush_queue($dbh, $redis);
%whosonfirst = ();

if(my $osm = $ENV{'OSM_HOME'}) {
# if(my $osm = $ENV{'OSM_HOME'}) {
if(0) {
# Openstreetmap

my @files = ('north-america-latest.osm.bz2', 'europe-latest.osm.bz2');
Expand Down Expand Up @@ -1625,7 +1821,8 @@ if(my $osm = $ENV{'OSM_HOME'}) {
}
$country = 'US';
} else {
die "$is_in: unknown US state $state";
warn "$is_in: unknown US state $state" if(DEBUG&DEBUG_DATA_VALIDATE);
$add_record = 0;
}
} elsif(my $code = $us->{state2code}{uc($is_in)}) {
$is_in = "$code, US";
Expand All @@ -1642,6 +1839,9 @@ if(my $osm = $ENV{'OSM_HOME'}) {
$country = 'Canada';
}
if($state && (length($state) > 2)) {
if($state eq 'Québec') {
$state = 'Quebec';
}
if(my $code = $ca->{province2code}{uc($state)}) {

if($preamble) {
Expand All @@ -1651,7 +1851,7 @@ if(my $osm = $ENV{'OSM_HOME'}) {
}
$country = 'Canada';
} else {
die "Unknown Canadian province $state";
die "$is_in: unknown Canadian province $state";
}
} elsif($is_in =~ /(Australia|Canada|US|GB)$/) {
$add_record = 0;
Expand Down

0 comments on commit 63197e8

Please sign in to comment.