Merge pull request #4 from openrural/featnames-paflag

Use primary feature name during block import; log alternate names. Refs #278. Thanks to Colin and Kim at Caktus!
openplans · Apr 23, 2012 · 7c0ada7 · 7c0ada7
2 parents 303df0f + 89baab5
commit 7c0ada7
Showing 1 changed file with 42 additions and 23 deletions.
diff --git a/ebpub/ebpub/streets/blockimport/tiger/import_blocks.py b/ebpub/ebpub/streets/blockimport/tiger/import_blocks.py
@@ -18,7 +18,9 @@
 #
 
 import sys
+import pprint
 import optparse
+from collections import defaultdict
 from django.contrib.gis.gdal import DataSource
 from django.contrib.gis.gdal.error import OGRIndexError
 from ebdata.parsing import dbf
@@ -121,24 +123,7 @@ def __init__(self, edges_shp, featnames_dbf, faces_dbf, place_shp,
         BlockImporter.__init__(self, shapefile=edges_shp, layer_id=0,
                                verbose=verbose, encoding=encoding)
         self.fix_cities = fix_cities
-        self.featnames_db = featnames_db = {}
-        for tlid, row in self._load_rel_db(featnames_dbf, 'TLID').iteritems():
-            # TLID is Tiger/Line ID, unique per edge.
-            # We use TLID instead of LINEARID as the key because
-            # LINEARID is only unique per 'linear feature', which is
-            # an implicit union of some edges. So if we used LINEARID,
-            # we'd clobber a lot of keys in the call to
-            # _load_rel_db().
-            # Fixes #14 ("missing blocks").
-            if row['MTFCC'] not in VALID_MTFCC:
-                continue
-            if not row.get('FULLNAME'):
-                self.log("skipping tlid %r, no fullname" % tlid)
-                continue
-
-            featnames_db.setdefault(tlid, [])
-            featnames_db[tlid].append(row)
-
+        self.featnames_db = self._clean_featnames(featnames_dbf)
         self.faces_db = self._load_rel_db(faces_dbf, 'TFID')
         # Load places keyed by FIPS code
         places_layer = DataSource(place_shp)[0]
@@ -157,18 +142,17 @@ def __init__(self, edges_shp, featnames_dbf, faces_dbf, place_shp,
         self.filter_bounds = filter_bounds
         self.tlids_with_blocks = set()
 
-
     def _load_rel_db(self, dbf_file, rel_key):
         """
         Reads rows as dicts from a .dbf file.
         Returns a mapping of rel_key -> row dict.
         """
         f = open(dbf_file, 'rb')
-        db = {}
+        db = defaultdict(list)
         rowcount = 0
         try:
             for row in dbf.dict_reader(f, strip_values=True):
-                db[row[rel_key]] = row
+                db[row[rel_key]].append(row)
                 rowcount += 1
                 self.log(
                     " GOT DBF ROW %s for %s" % (row[rel_key], row.get('FULLNAME', 'unknown')))
@@ -178,6 +162,41 @@ def _load_rel_db(self, dbf_file, rel_key):
         self.log("Unique keys for %r: %d" % (rel_key, len(db)))
         return db
 
+    def _clean_featnames(self, featnames_dbf):
+        """
+        Returns a mapping of TLID -> list of primary (PAFLAG == 'P') rows
+        from featnames_dbf; other valid rows are only logged as alternates.
+        """
+        # Keyed by TLID (unique per edge), not LINEARID, which is shared by
+        # all edges of a 'linear feature' and would clobber keys (fixes #14).
+        rel_db = self._load_rel_db(featnames_dbf, 'TLID')
+        featnames_db = defaultdict(list)
+        for tlid, rows in rel_db.iteritems():
+            primary = None
+            alternates = []
+            for row in rows:
+                if row['MTFCC'] not in VALID_MTFCC:
+                    continue
+                if not row.get('FULLNAME'):
+                    self.log("skipping tlid %r, no fullname" % tlid)
+                    continue
+                if row['PAFLAG'] == 'P':
+                    primary = row
+                    featnames_db[tlid].append(row)
+                else:
+                    alternates.append(row)
+            if alternates and primary is None:
+                self.log("tlid %r has only alternate names" % tlid)
+                continue  # primary['NAME'] below would raise on None
+            # Alternates are only logged for now; no good way to save aliases.
+            for alternate in alternates:
+                msg = 'Found alternate name for {0} ({1}): {2}\n{3}\n{4}'
+                logger.debug(msg.format(
+                    primary['NAME'].upper(), primary['TLID'],
+                    alternate['NAME'].upper(), pprint.pformat(primary),
+                    pprint.pformat(alternate)))
+        return featnames_db
+
     def _get_city(self, feature, side):
         city = ''
         if self.fix_cities:
@@ -189,7 +208,7 @@ def _get_city(self, feature, side):
         else:
             fid = feature.get('TFID' + side)
             if fid in self.faces_db:
-                face = self.faces_db[fid]
+                face = self.faces_db[fid][0]
                 # Handle both 2010 and older census files.
                 # If none of these work, we simply get no city.
                 pid = face.get('PLACEFP10') or face.get('PLACEFP00') or face.get('PLACEFP')
@@ -203,7 +222,7 @@ def _get_city(self, feature, side):
     def _get_state(self, feature, side):
         fid = feature.get('TFID' + side)
         if fid in self.faces_db:
-            face = self.faces_db[fid]
+            face = self.faces_db[fid][0]
             # Handle both 2010 and older census files.
             state_fip = STATE_FIPS[face.get('STATEFP10') or face['STATEFP']]
             return state_fip[0]