Changes so that import of modern placename data works.

mysociety · Dec 16, 2011 · 17565a5 · 17565a5
1 parent 17120ef
commit 17565a5
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 38 deletions.
diff --git a/README b/README
@@ -12,16 +12,32 @@ for a particular map, or on PledgeBank for local search.
 Installation
 ============
 
-Set up a PostgreSQL database using the schema in db/schema.sql
+$ git clone git://github.com/mysociety/gaze.git
+$ mkdir gaze-data
+$ cd gaze
+$ cp conf/general-example conf/general
 
-Fetch the single compressed zip that contains the entire country files dataset
-from http://earth-info.nga.mil/gns/html/cntry_files.html and unzip it in an
-empty directory.
+Edit conf/general to set the database connection parameters and the locations
+of the two directories that will store the Xapian and GPW data.
 
-In that directory, run geonames-split with that file as input. Run load-all.
-This will import all the placename data for non-US countries. For the US,
-fetch the Populated Places dataset from http://geonames.usgs.gov/ and
-run usgs-geonames-parse.
+Set up a PostgreSQL database called gaze, with a database user also called gaze.
 
-gpw-parse - convert Gridded Population of the World data into the format Gaze uses.
+Fetch the single compressed zip that contains the entire country files dataset
+from http://earth-info.nga.mil/gns/html/cntry_files.html and unzip it in the
+gaze-data directory. For the US, fetch and unzip the Populated Places dataset
+from http://geonames.usgs.gov/ in the same place.
+
+Then run the following (replacing the input filenames with whatever you've just
+downloaded):
+
+$ psql gaze gaze < db/schema.sql
+$ cd ../gaze-data
+$ ../gaze/bin/usgs-geonames-parse < P_PLACES_20111204.txt
+$ ../gaze/bin/xapian-index US
+$ ../gaze/bin/geonames-split < geonames_dd_dms_date_20111212.txt
+$ ../gaze/bin/load-all
+
+This sets up all the place name data. TODO: Document gpw-parse, which converts
+Gridded Population of the World data into the format Gaze uses, beyond the
+comments in the script itself.
 
diff --git a/bin/geonames-parse b/bin/geonames-parse
@@ -12,7 +12,7 @@
 #
 # For US see usgs-geonames-parse
 #
-# Copyright (c) 2005 UK Citizens Online Democracy. All rights reserved.
+# Copyright (c) 2011 UK Citizens Online Democracy. All rights reserved.
 # Email: matthew@mysociety.org; WWW: http://www.mysociety.org/
 
 use strict;
@@ -95,10 +95,18 @@ sub process_one_country($$) {
     # select the preferred "primary" name where names of several types are
     # available.
     my %name_type_preference = (
-            C => 0,         # Conventional
-            N => 1,         # Native
-            V => 2,         # Variant or alternate
-            D => 10         # Not verified
+            C  => 0,         # Conventional
+            NS => 1,         # Native
+            N  => 2,         # Native
+            VS => 3,         # Variant or alternate
+            V  => 4,         # Variant or alternate
+            VA => 5,         # Variant or alternate
+            DS => 6,         # Not verified
+            D  => 7,         # Not verified
+            HS => 8,         # Historic
+            H  => 9,         # Historic
+            PS => 10,        # Provisional
+            P  => 11,        # Provisional
         );
 
     # Hash which maps UFI to reference to hash of name to name-type and UNI.
@@ -279,9 +287,6 @@ sub trim ($) {
     return $x;
 }
 
-# Read in names
-<STDIN>;    # Skip header
-
 # Each record describes a name and a feature. Each feature may have multiple
 # names. 
 my $nf = 0;
@@ -307,7 +312,7 @@ while (my $line = <STDIN>) {
     if ($fields{NT} =~ m/^\s*$/) {
         warn "Name type blank, assuming 'C'. Line:\n$line";
         $fields{NT} = 'C';
-    } elsif ($fields{NT} !~ m/^[CDNV]$/) {
+    } elsif ($fields{NT} !~ m/^(C|[DNHP]S?|V[AS]?)$/) {
         die "Name type $fields{NT} not known. Line:\n$line";
     }
 

diff --git a/bin/usgs-geonames-parse b/bin/usgs-geonames-parse
@@ -3,11 +3,10 @@
 # usgs-geonames-parse:
 # Parse USGS geonames data.  For non-US countries see geonames-parse.
 #
-# Pass input data in on pipe, no argumens.  Get files from here:
-#   http://geonames.usgs.gov/stategaz/POP_PLACES_DECI.zip
-# (use the DECI one for decimal, rather than the degrees/mins/secs one)
+# Pass input data in on pipe, no arguments. Get Populated Places zip from here:
+#   http://geonames.usgs.gov/domestic/download_data.htm
 #
-# Copyright (c) 2005 UK Citizens Online Democracy. All rights reserved.
+# Copyright (c) 2011 UK Citizens Online Democracy. All rights reserved.
 # Email: matthew@mysociety.org; WWW: http://www.mysociety.org/
 
 use strict;
@@ -22,9 +21,7 @@ use Geo::Distance;
 use IO::File;
 use POSIX qw(acos);
 use Text::CSV;
-use Text::LevenshteinXS;    # Levenshtein edit distance -- use XS version
-                            # because of bugs in pure perl version:
-                            # http://rt.cpan.org/NoAuth/Bug.html?id=13873
+use Text::Levenshtein;
 
 use mySociety::Config;
 BEGIN {
@@ -79,10 +76,11 @@ while (defined(my $line = <STDIN>)) {
     #   source longitude (DMS) (12)
     #   source latitude (decimal) (13)
     #   source longitude (decimal) (14)
-    #   elevation
-    #   estimated population
-    #   Federal status
-    #   cell name   -- which map sheet it's on, I think
+    #   elevation in metres
+    #   elevation in feet
+    #   map name
+    #   date created
+    #   date edited
 
     # Only use 'populated places'
     next unless ($f[2] eq 'Populated Place');
@@ -207,7 +205,7 @@ foreach my $id (sort { $place{$a}->[0] cmp $place{$b}->[0] } keys(%ambiguous_id)
                         $_->[0] != $id
                         && !exists($deleted{$_->[0]})
                         && !exists($ambiguous_id{$_->[0]})
-                        && Text::LevenshteinXS::distance($name, $place{$_->[0]}->[0]) > (length($name) / 4.)
+                        && Text::Levenshtein::distance($name, $place{$_->[0]}->[0]) > (length($name) / 4.)
                     } find_places_near($lat, $lon, $maxdist);
 
     if (@disambig == 0) {

diff --git a/bin/xapian-compact b/bin/xapian-compact
@@ -24,7 +24,7 @@ do
     echo "compacting $X"
     rm -fr $X-compacted
     rm -fr $X-old
-    quartzcompact --fuller $X $X-compacted >/dev/null
+    xapian-compact --fuller $X $X-compacted >/dev/null
     BEFORE=`du -k $X | cut -f 1`
     AFTER=`du -k $X-compacted | cut -f 1`
     TOTAL_SAVED=$((TOTAL_SAVED + BEFORE - AFTER))

diff --git a/conf/packages b/conf/packages
@@ -5,3 +5,5 @@ libgeo-ip-perl
 libjson-perl
 libxapian-dev
 libsearch-xapian-perl
+libtext-levenshtein-perl
+libtext-csv-perl
diff --git a/db/schema.sql b/db/schema.sql
@@ -28,15 +28,15 @@ create table name (
     uni integer not null primary key,
     ufi integer not null references feature(ufi),
     is_primary boolean not null default false,
-    -- Name of the place, in UTF-8. But note that these are transcribed
-    -- into the Latin alphabet and so are no good to us in, e.g., China or
-    -- Russia.
+    -- Name of the place, in UTF-8.
     full_name text not null,
     -- C - Conventional
-    -- N - Native
-    -- V - Variant or alternate
-    -- D - Not verified
-    name_type char(1) check (name_type in ('C', 'D', 'N', 'V'))
+    -- N - Native (NS non-roman)
+    -- V - Variant or alternate (VA Anglicized, VS non-roman)
+    -- D - Not verified (DS non-roman)
+    -- H - Historic (HS non-roman)
+    -- P - Provisional (PS non-roman)
+    name_type char(2) check (name_type in ('C', 'D', 'DS', 'N', 'NS', 'V', 'VA', 'VS', 'H', 'HS', 'P', 'PS'))
 --    language_code char(2) -- references language(code) ?
 );