Permalink
Browse files

Compare newly generated KML files with the current generation in MapIt Global

This script, when given a directory that contains a newly generated
set of KML files, will check every KML file against the database to
see whether the boundary has changed, just appeared, is invalid, etc.
and produce a CSV file with a row for each KML file.

This is useful for working out how many of the boundaries have
changed since the last import, and so how much of an increase
in database size one might expect on importing it.
  • Loading branch information...
1 parent d4c45ac commit 570655fa17d8c614ff98a2bbc2246cd2062e2089 @mhl mhl committed Nov 9, 2012
Showing with 186 additions and 0 deletions.
  1. +186 −0 mapit/management/commands/mapit_global_check_new.py
@@ -0,0 +1,186 @@
+# mapit_global_check_new.py:
+#
+# This script compares a newly generated set of administrative
+# boundary KML files from OpenStreetMap with the boundaries already
+# stored in MapIt, and writes a CSV file with one row per KML file.
+#
+# It takes KML data generated by get-boundaries-by-admin-level.py, so
+# you need to have run that first.
+#
+# This script is heavily based on import_norway_osm.py by Matthew
+# Somerville.
+# Copyright (c) 2011, 2012 UK Citizens Online Democracy. All rights reserved.
+# Email: mark@mysociety.org; WWW: http://www.mysociety.org
+
+import os
+import re
+import xml.sax
+from optparse import make_option
+from django.core.management.base import LabelCommand
+from django.contrib.gis.gdal import *
+from mapit.models import Area, Generation, Country, Type, Code, CodeType, NameType
+from mapit.management.command_utils import save_polygons, KML
+from glob import glob
+import urllib2
+from BeautifulSoup import BeautifulSoup
+from collections import namedtuple
+import json
+import csv
+
def empty_if_none(o):
    """Return an empty string when *o* is None, otherwise *o* unchanged.

    Used so that unknown values appear as blank cells in the CSV output
    rather than as the text "None".
    """
    if o is None:
        return ''
    return o
+
class Command(LabelCommand):
    """Report how newly generated OSM boundary KML files differ from the database.

    For every ``.kml`` file found in the ``alNN`` admin-level
    subdirectories of the directory given on the command line, the
    boundary in the file is compared with the polygons of the area that
    carries the same OSM code in the database, and one row per file is
    written to a CSV report.  Nothing is written to the database.
    """

    help = ('Compare newly generated OSM boundary KML files with the '
            'boundaries already in the database and write a CSV report')
    args = '<KML-DIRECTORY>'

    # NOTE(review): the report location is a hard-coded, machine-specific
    # path kept from the original script; override this attribute before
    # running the command anywhere else.
    output_filename = "/home/mark/difference-results.csv"

    # OSM element type (as it appears in a KML filename) -> CodeType.code
    osm_code_types = {'relation': 'osm_rel', 'way': 'osm_way'}

    # Column headings of the CSV report, in the order the rows are written.
    csv_header = ["ElementType",
                  "ElementID",
                  "ExistedPreviously",
                  "PreviousEmpty",
                  "PreviousArea",
                  "NewEmpty",
                  "NewArea",
                  "SymmetricDifferenceArea",
                  "GEOSEquals",
                  "GEOSEqualsExact"]

    def handle_label(self, directory_name, **options):
        """Compare every KML file under *directory_name* with the database.

        *directory_name* must contain admin-level subdirectories named
        ``al02`` ... ``al11``; files in them are expected to be named
        ``way-<id>-...`` or ``relation-<id>-...``.  One CSV row is written
        to ``self.output_filename`` for each ``.kml`` file.

        Note that this changes the process's working directory to
        *directory_name*.

        Raises Exception if *directory_name* is not a directory or holds
        no admin-level directories, if a KML filename does not have the
        expected form, if a KML file does not yield exactly one useful
        name, or if its DataSource does not contain exactly one layer
        with exactly one feature.
        """
        if not os.path.isdir(directory_name):
            raise Exception("'%s' is not a directory" % (directory_name,))

        # All paths used below are relative to the KML directory.
        os.chdir(directory_name)

        if not glob("al[0-1][0-9]"):
            raise Exception("'%s' did not contain any admin level directories (e.g. al02, al03, etc.)" % (directory_name,))

        # To resume an interrupted run, set this to a substring of the
        # filename to restart from, e.g. 'relation-80370'; every file
        # before the first match is then skipped.
        skip_up_to = None
        skipping = bool(skip_up_to)

        # Collected for every file examined; not read anywhere in this
        # command yet.
        osm_elements_seen_in_new_data = set([])

        with open(self.output_filename, 'w') as fp:
            csv_writer = csv.writer(fp)
            csv_writer.writerow(self.csv_header)

            for admin_level in range(2, 12):

                admin_directory = "al%02d" % (admin_level,)

                if not os.path.exists(admin_directory):
                    continue

                for filename in sorted(os.listdir(admin_directory)):

                    if skipping:
                        if skip_up_to in filename:
                            skipping = False
                        else:
                            continue

                    if not filename.endswith('.kml'):
                        continue

                    m = re.search(r'^(way|relation)-(\d+)-', filename)
                    if not m:
                        raise Exception(u"Couldn't extract OSM element type and ID from: " + filename)

                    osm_type, osm_id = m.groups()

                    osm_elements_seen_in_new_data.add((osm_type, osm_id))

                    kml_filename = os.path.join(admin_directory, filename)

                    # Need to parse the KML manually to get the ExtendedData
                    kml_data = KML()
                    print("parsing %s" % (kml_filename,))
                    xml.sax.parse(kml_filename, kml_data)

                    useful_names = [n for n in kml_data.data.keys() if not n.startswith('Boundaries for')]
                    if len(useful_names) == 0:
                        raise Exception("No useful names found in KML data")
                    elif len(useful_names) > 1:
                        raise Exception("Multiple useful names found in KML data")
                    name = useful_names[0]
                    print("   %s" % (name.encode('utf-8'),))

                    # The original used the three-expression raise form
                    # here, which fails with a TypeError instead of
                    # reporting the unknown type.
                    code_type_code = self.osm_code_types.get(osm_type)
                    if code_type_code is None:
                        raise Exception("Unknown OSM element type: %s" % (osm_type,))
                    code_type_osm = CodeType.objects.get(code=code_type_code)

                    ds = DataSource(kml_filename)
                    if len(ds) != 1:
                        raise Exception("We only expect one layer in a DataSource")

                    layer = ds[0]
                    if len(layer) != 1:
                        raise Exception("We only expect one feature in each layer")

                    feat = layer[0]

                    # All codes recorded for this OSM element, ordered so
                    # that the one whose area has the most recently
                    # created generation_high comes last.
                    osm_codes = sorted(
                        Code.objects.filter(type=code_type_osm, code=osm_id),
                        key=lambda code: code.area.generation_high.created)

                    new_area = None
                    new_empty = None

                    previous_area = None
                    previous_empty = None

                    symmetric_difference_area = None
                    geos_equals = None
                    geos_equals_exact = None

                    g = feat.geom.transform(4326, clone=True)

                    # If any component of the new geometry has fewer than
                    # four points the whole boundary is reported as empty
                    # and no area or comparison is computed for it.
                    if any(part.point_count < 4 for part in g):
                        new_empty = True
                    else:
                        new_geos_geometry = g.geos.simplify(tolerance=0)
                        new_area = new_geos_geometry.area
                        new_empty = new_geos_geometry.empty

                    if osm_codes:
                        most_recent_osm_code = osm_codes[-1]
                        previous_geos_geometry = most_recent_osm_code.area.polygons.collect()
                        # collect() gives None when the area has no polygons
                        previous_empty = previous_geos_geometry is None

                        if not previous_empty:
                            previous_geos_geometry = previous_geos_geometry.simplify(tolerance=0)
                            previous_area = previous_geos_geometry.area

                            # Only compare when both sides have geometry
                            if not new_empty:
                                symmetric_difference_area = previous_geos_geometry.sym_difference(new_geos_geometry).area
                                geos_equals = previous_geos_geometry.equals(new_geos_geometry)
                                geos_equals_exact = previous_geos_geometry.equals_exact(new_geos_geometry)

                    csv_writer.writerow([osm_type,
                                         osm_id,
                                         bool(osm_codes), # ExistedPreviously
                                         empty_if_none(previous_empty),
                                         empty_if_none(previous_area),
                                         empty_if_none(new_empty),
                                         empty_if_none(new_area),
                                         empty_if_none(symmetric_difference_area),
                                         empty_if_none(geos_equals),
                                         empty_if_none(geos_equals_exact)])

0 comments on commit 570655f

Please sign in to comment.