bin/usgs-geonames-parse

#!/usr/bin/perl -w
#
# usgs-geonames-parse:
# Parse USGS geonames data.  For non-US countries see geonames-parse.
#
# Pass input data in on pipe, no arguments. Get Populated Places zip from here:
#   http://geonames.usgs.gov/domestic/download_data.htm
#
# Copyright (c) 2011 UK Citizens Online Democracy. All rights reserved.
# Email: matthew@mysociety.org; WWW: http://www.mysociety.org/

use strict;
require 5.8.0;

# Horrible boilerplate to set up appropriate library paths.
use FindBin;
use lib "$FindBin::Bin/../perllib";
use lib "$FindBin::Bin/../commonlib/perllib";

use Geo::Distance;
use IO::File;
use POSIX qw(acos);
use Text::CSV;
use Text::Levenshtein;

use mySociety::Config;
BEGIN {
    mySociety::Config::set_file("$FindBin::Bin/../conf/general");
}
use mySociety::DBHandle qw(dbh);

use Gaze;

# US state abbreviation -> state name, as read from data/us-states.csv
# (first column is the key, second column the value).
my %state_names;

# Grab list of US state names.
my $c = Text::CSV->new();
my $states_file = "$FindBin::Bin/../data/us-states.csv";
# Three-argument open on a lexical handle; fail loudly rather than silently
# reading nothing from an unopened handle.
open(my $states_fh, '<', $states_file)
    or die "$states_file: open: $!\n";
while (my $line = <$states_fh>) {
    # Skip rows the CSV parser rejects instead of storing undefined fields.
    if (!$c->parse($line)) {
        warn "$states_file: line $.: cannot parse CSV; skipping\n";
        next;
    }
    my ($abbr, $name) = $c->fields();
    next unless (defined($abbr) && defined($name));
    $state_names{$abbr} = $name;
}
close($states_fh);

# feature ID -> [name, state, county, lat, long]
my %place;

# canonicalised name to [feature ID, ...]
my %name;

# ambiguous names and corresponding IDs
my %ambiguous;
my %ambiguous_id;

# Read the pipe-separated USGS records from standard input, keeping only
# populated places which have a known location, and index them by feature ID
# and by canonicalised name.
binmode(STDIN, ":bytes");
my $nr = 0;
while (defined(my $line = <STDIN>)) {
    # Progress goes to STDERR, like every other progress message in this
    # script (and like the newline which terminates this one).
    print STDERR "Reading names: ", ++$nr, "\r";
    chomp($line);
    my @f = split(/\|/, $line);

    # Fields are:
    #   feature ID (0)
    #   feature name(1 - previously 2)
    #   feature type(2 - previously 3)
    #   state code (3 - previously 1)
    #   FIPS state number(4 - previously 5)
    #   county name(5 - previously 4)
    #   FIPS county number(6)
    #   latitude (DMS) (7)
    #   longitude (DMS) (8)
    #   latitude (decimal) (9)
    #   longitude (decimal) (10)
    #   source latitude (DMS)  (11) -- not sure what these are
    #   source longitude (DMS) (12)
    #   source latitude (decimal) (13)
    #   source longitude (decimal) (14)
    #   elevation in metres
    #   elevation in feet
    #   map name
    #   date created
    #   date edited

    # Only use 'populated places'.  The defined() test quietly skips short or
    # blank lines instead of warning about an uninitialised value.
    next unless (defined($f[2]) && $f[2] eq 'Populated Place');

    # Some places have no location.
    next if (!defined($f[7]) || $f[7] eq 'UNKNOWN'
                || !defined($f[9]) || !defined($f[10]));

    # Names are coded in Latin1, apparently.
    utf8::upgrade($f[1]);
    utf8::upgrade($f[5]);

    # http://geonames.usgs.gov/gnis_users_guide_descripdbs.html
    # Two special terms are used in parentheses. The word historical signifies
    # a feature that no longer exists, and the word subdivision indicates a
    # named population cluster within another populated place or civil
    # division.
    #
    # Drop historic names, but keep subdivisions.
    next if ($f[1] =~ m# \(historical\)$#);

    # Strip the '(subdivision)' note.
    $f[1] =~ s# \([a-z]+\)##;

    $place{$f[0]} = [$f[1], $f[3], $f[5], $f[9], $f[10]];

    # Canonicalise the feature *name* (field 1).  Field 2 is the feature type,
    # which is always 'Populated Place' by this point; using it would file
    # every place under the same key, and the later lookup by canonicalised
    # place name would never find anything.
    my $canon = lc(Gaze::strip_punctuation($f[1]));
    push(@{$name{$canon}}, $f[0]);
    if (@{$name{$canon}} > 1) {
        # Record how many places share this name so far.
        $ambiguous{$canon} = @{$name{$canon}};
        # NOTE(review): only the second and later places with a given name
        # are marked here; the first place seen with that name never enters
        # %ambiguous_id.  Confirm whether that is intended.
        $ambiguous_id{$f[0]} = 1;
    }
}
print STDERR "\n";

# Feature IDs ordered by ascending latitude.  find_places_near bisects this
# list to restrict a nearest-neighbour search to a band of latitudes.
my @features_in_lat_order =
    sort { $place{$a}[3] <=> $place{$b}[3] }
    keys %place;

# Feature IDs which have been dropped (for instance apparent duplicates).
# They are flagged here, rather than removed, so that the sorted list above
# never needs to be rebuilt.
my %deleted;

# find_places_near LAT LON DISTANCE
# Return in list context [feature ID, distance] for each place which has not
# been marked deleted and which lies less than DISTANCE km from (LAT, LON).
# Candidates are taken from the band of @features_in_lat_order whose latitudes
# bracket LAT +/- DISTANCE, then filtered by their computed distance.
sub find_places_near ($$$) {
    my ($lat, $lon, $dist) = @_;

    # Latitude band which can contain places within $dist of the point.
    my $lat_min = $lat - Geo::Distance::deg($dist / Geo::Distance::R_e);
    my $lat_max = $lat + Geo::Distance::deg($dist / Geo::Distance::R_e);

    my $top = $#features_in_lat_order;

    # Bisect for the lower bracketing index: move up while the midpoint lies
    # south of the band.
    my ($lo, $hi) = (0, $top);
    until ($hi <= $lo + 1) {
        my $mid = int(($hi + $lo) / 2);
        my $mid_lat = $place{$features_in_lat_order[$mid]}->[3];
        if ($mid_lat < $lat_min) {
            $lo = $mid;
        } else {
            $hi = $mid;
        }
    }
    my $first = $lo;

    # Bisect again for the upper bracketing index: move down while the
    # midpoint lies north of the band.
    ($lo, $hi) = (0, $top);
    until ($hi <= $lo + 1) {
        my $mid = int(($hi + $lo) / 2);
        my $mid_lat = $place{$features_in_lat_order[$mid]}->[3];
        if ($mid_lat > $lat_max) {
            $hi = $mid;
        } else {
            $lo = $mid;
        }
    }
    my $last = $hi;

    # Keep candidates which are close enough and not flagged as deleted.
    my @nearby;
    foreach my $fid (@features_in_lat_order[$first .. $last]) {
        my $pair = [$fid, Geo::Distance::distance($lat, $lon, $place{$fid}->[3], $place{$fid}->[4])];
        next unless ($pair->[1] < $dist);
        next if (exists($deleted{$pair->[0]}));
        push(@nearby, $pair);
    }
    return @nearby;
}

# Qualifications we use for places which have duplicate names within a county:
# feature ID -> comma-separated names of up to three nearby places.
my (%near_qualifier);

# For each place flagged as ambiguously named, look for other places of the
# same name in the same state and county.  If there are any, either record a
# "near" qualifier built from nearby, unambiguously-named places, or mark the
# place as deleted when no such neighbour exists.
my $nd = 0;
# %ambiguous_id is not modified inside the loop, so count it only once.
my $n_ambiguous = scalar(keys(%ambiguous_id));
                    # sort so that diagnostic output groups places by name
foreach my $id (sort { $place{$a}->[0] cmp $place{$b}->[0] } keys(%ambiguous_id)) {
    print STDERR "Disambiguating names: ", ++$nd, "/", $n_ambiguous, "\r";
    next if (exists($deleted{$id}));
    my ($name, $state, $county, $lat, $lon) = @{$place{$id}};
    my $canon = lc(Gaze::strip_punctuation($name));

    # Find the other places within the same state and county.
                    # sort in order of distance from this place
    my @others = sort { $a->[1] <=> $b->[1] }
                    # compute distances
                    map { [$_, Geo::Distance::distance($lat, $lon, $place{$_}->[3], $place{$_}->[4]) ] }
                    # find places with the same name in this county which
                    # have not themselves been deleted (test the candidate,
                    # $_, not $id: $id is already known not to be deleted)
                    grep { $_ != $id
                            && !exists($deleted{$_})
                            && $place{$_}->[1] eq $state
                            && $place{$_}->[2] eq $county }
                    # tolerate a name with no index entry
                    @{$name{$canon} || []};

    # Possibly the name is not or no longer ambiguous with others in the same
    # county.
    next if (@others == 0);

    # The largest distance away that a disambiguating place may lie. Obviously
    # it has to be closer to this place than it is to any of the others.
    # It is also capped at 20km.
    my $maxdist = $others[0]->[1] / 2;
    $maxdist = 20 if ($maxdist > 20);

                    # sort in order of distance from this place
    my @disambig = sort { $a->[1] <=> $b->[1] }
                    # places within $maxdist of here which are not deleted and
                    # are not themselves ambiguously named, and whose names are
                    # not very similar to this place's (there are typos in the
                    # USGS data)
                    grep {
                        $_->[0] != $id
                        && !exists($deleted{$_->[0]})
                        && !exists($ambiguous_id{$_->[0]})
                        && Text::Levenshtein::distance($name, $place{$_->[0]}->[0]) > (length($name) / 4.)
                    } find_places_near($lat, $lon, $maxdist);

    if (@disambig == 0) {
        print STDERR "#$id $name, $county, $state: no disambiguating names within ${maxdist}km; skipping\n";
        $deleted{$id} = 1;
    } else {
        # Use at most the three nearest disambiguating places.
        my $n = @disambig > 3 ? 3 : @disambig;
        $near_qualifier{$id} = join(', ', map { $place{$_->[0]}->[0] } @disambig[0 .. $n - 1]);
#        print STDERR "#$id $name, $county, $state: 'near $near_qualifier{$id}'\n";
    }
}
print STDERR "\n";

# Report how many places were dropped, then write every surviving place to the
# database as one row in "feature" and one row in "name", and commit.
my $n_places = scalar(keys(%place));
print STDERR "Will skip ", scalar(keys(%deleted)), "/", $n_places, " names\n";

my $feature_insert = "insert into feature (ufi, country, state, lat, lon, in_qualifier, near_qualifier) values (?, 'US', ?, ?, ?, ?, ?)";
my $name_insert = "insert into name (ufi, uni, is_primary, full_name, name_type) values (?, ?, true, ?, 'C')";

my $nw = 0;
for my $id (keys(%place)) {
    ++$nw;
    print STDERR "Storing names: ", $nw, "/", $n_places, "\r";
    if (exists($deleted{$id})) {
        next;
    }

    my ($name, $state, $county, $lat, $lon) = @{$place{$id}};

    # The stored identifier is the feature ID offset by 100,000,000; the same
    # value is used for both the ufi and uni columns of the name row.
    my $ufi = $id + 100_000_000;

    # The near qualifier is undefined for places which needed no
    # disambiguation.
    my @feature_values = ($ufi, $state, $lat, $lon, "$county County", $near_qualifier{$id});
    dbh()->do($feature_insert, {}, @feature_values);
    dbh()->do($name_insert, {}, $ufi, $ufi, $name);
}
print STDERR "\n";

dbh()->commit();