Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Geozones 2019 #2140

Merged
merged 6 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

- Rename og:image target :warning: this will break your custom theme, please rename your logo image file to `logo-social.png` instead of `logo-600x600.png` [#2217](https://github.com/opendatateam/udata/pull/2217)
- Don't automatically overwrite `last_update` field if manually set [#2220](https://github.com/opendatateam/udata/pull/2220)
- Spatial completion: only index last version of each zone and prevent completion cluttering [#2140](https://github.com/opendatateam/udata/pull/2140)
- Init: prompt to load countries [#2140](https://github.com/opendatateam/udata/pull/2140)
- Handle UTF-8 filenames in `spatial load_logos` command [#2223](https://github.com/opendatateam/udata/pull/2223)

## 1.6.12 (2019-06-26)

Expand Down Expand Up @@ -66,6 +69,7 @@
- Add cache directives to dataset display blocks [#2129](https://github.com/opendatateam/udata/pull/2129)
- Export multiple models objects to CSV (dataset of datasets) [#2124](https://github.com/opendatateam/udata/pull/2124)


## 1.6.6 (2019-03-27)

- Automatically loads default settings from plugins (if `plugin.settings` module exists) [#2058](https://github.com/opendatateam/udata/pull/2058)
Expand Down
5 changes: 5 additions & 0 deletions udata/commands/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from udata.commands import cli, success, IS_TTY
from udata.core.dataset.commands import licenses
from udata.core.spatial.commands import load as spatial_load
from udata.core.user import commands as user_commands
from udata.i18n import gettext as _
from udata.search.commands import index
Expand Down Expand Up @@ -37,6 +38,10 @@ def init(ctx):
if click.confirm(text, default=True):
ctx.invoke(licenses)

text = _('Do you want to import some spatial zones (countries)?')
if click.confirm(text, default=True):
ctx.invoke(spatial_load)

text = _('Do you want to create some sample data?')
if click.confirm(text, default=True):
ctx.invoke(generate_fixtures)
Expand Down
120 changes: 44 additions & 76 deletions udata/core/spatial/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
import shutil

from collections import Counter
from datetime import date
from string import Formatter
from textwrap import dedent
from urllib import urlretrieve

import click
Expand All @@ -22,12 +21,16 @@

from udata.commands import cli
from udata.core.dataset.models import Dataset
from udata.core.spatial import geoids
from udata.core.spatial.models import GeoLevel, GeoZone, SpatialCoverage
from udata.core.storages import logos, tmp

log = logging.getLogger(__name__)


DEFAULT_GEOZONES_FILE = 'https://github.com/etalab/geozones/releases/download/2019.0/geozones-countries-2019-0-msgpack.tar.xz'


def level_ref(level):
    '''Build a `DBRef` pointing at `level` in the GeoLevel collection.'''
    collection = GeoLevel._get_collection_name()
    return DBRef(collection, level)

Expand All @@ -39,9 +42,9 @@ def grp():


@grp.command()
@click.argument('filename', metavar='<filename>')
@click.argument('filename', metavar='<filename>', default=DEFAULT_GEOZONES_FILE)
@click.option('-d', '--drop', is_flag=True, help='Drop existing data')
def load(filename, drop=False):
def load(filename=DEFAULT_GEOZONES_FILE, drop=False):
'''
Load a geozones archive from <filename>

Expand Down Expand Up @@ -100,6 +103,7 @@ def load(filename, drop=False):
'dbpedia': geozone.get('dbpedia'),
'flag': geozone.get('flag'),
'blazon': geozone.get('blazon'),
'wikidata': geozone.get('wikidata'),
'wikipedia': geozone.get('wikipedia'),
'area': geozone.get('area'),
}
Expand All @@ -121,6 +125,12 @@ def load(filename, drop=False):
shutil.rmtree(tmp.path('translations')) # Not in use for now.


def safe_tarinfo(tarinfo):
    '''
    Make a tarinfo utf8-compatible.

    The member name is decoded from UTF-8 when it is a byte string.
    A name which is already text is left untouched: decoding it again
    would fail (implicit ASCII encoding on Python 2, no `decode`
    method on Python 3).

    :param tarinfo: the archive member to fix, mutated in place
    :returns: the same `tarinfo` object, to allow use in generators
    '''
    name = tarinfo.name
    # `bytes` is an alias of `str` on Python 2, so this covers both versions
    if isinstance(name, bytes):
        tarinfo.name = name.decode('utf8')
    return tarinfo


@grp.command()
@click.argument('filename', metavar='<filename>')
def load_logos(filename):
Expand All @@ -135,8 +145,9 @@ def load_logos(filename):

log.info('Extracting GeoLogos bundle')
with contextlib.closing(lzma.LZMAFile(filename)) as xz:
with tarfile.open(fileobj=xz) as f:
f.extractall(tmp.root)
with tarfile.open(fileobj=xz, encoding='utf8') as tar:
decoded = (safe_tarinfo(t) for t in tar.getmembers())
tar.extractall(tmp.root, members=decoded)

log.info('Moving to the final location and cleaning up')
if os.path.exists(logos.root):
Expand All @@ -152,83 +163,40 @@ def migrate():

Should only be run once with the new version of geozones w/ geohisto.
'''
counter = Counter()
drom_zone = GeoZone.objects(id='country-subset:fr:drom').first()
dromcom_zone = GeoZone.objects(id='country-subset:fr:dromcom').first()
counter = Counter(['zones', 'datasets'])
qs = GeoZone.objects.only('id', 'level', 'successors')
# Iter over datasets with zones
for dataset in Dataset.objects(spatial__zones__gt=[]):
counter['datasets'] += 1
new_zones = []
for zone in dataset.spatial.zones:
if zone.id.startswith('fr/'):
counter['zones'] += 1
country, kind, zone_id = zone.id.split('/')
zone_id = zone_id.upper() # Corsica 2a/b case.
if kind == 'town':
counter['towns'] += 1
new_zones.append(
GeoZone
.objects(code=zone_id, level='fr:commune')
.valid_at(date.today())
.first())
elif kind == 'county':
counter['counties'] += 1
new_zones.append(
GeoZone
.objects(code=zone_id, level='fr:departement')
.valid_at(date.today())
.first())
elif kind == 'region':
counter['regions'] += 1
# Only link to pre-2016 regions which kept the same id.
new_zones.append(
GeoZone
.objects(code=zone_id, level='fr:region')
.first())
elif kind == 'epci':
counter['epcis'] += 1
new_zones.append(
GeoZone
.objects(code=zone_id, level='fr:epci')
.valid_at(dataset.created_at.date())
.first())
else:
new_zones.append(zone)
elif zone.id.startswith('country-subset/fr'):
counter['zones'] += 1
subset, country, kind = zone.id.split('/')
if kind == 'dom':
counter['drom'] += 1
new_zones.append(drom_zone)
elif kind == 'domtom':
counter['dromcom'] += 1
new_zones.append(dromcom_zone)
elif zone.id.startswith('country/'):
counter['zones'] += 1
counter['countries'] += 1
new_zones.append(zone.id.replace('/', ':'))
elif zone.id.startswith('country-group/'):
counter['zones'] += 1
counter['countrygroups'] += 1
new_zones.append(zone.id.replace('/', ':'))
else:
new_zones.append(zone)
for current_zone in dataset.spatial.zones:
counter['zones'] += 1
level, code, validity = geoids.parse(current_zone.id)
zone = qs(level=level, code=code).valid_at(validity).first()
if not zone:
log.warning('No match for %s: skipped', current_zone.id)
counter['skipped'] += 1
continue
previous = None
while not zone.is_current and len(zone.successors) == 1 and zone.id != previous:
previous = zone.id
zone = qs(id=zone.successors[0]).first() or zone
new_zones.append(zone.id)
counter[zone.level] += 1
dataset.update(
spatial=SpatialCoverage(
granularity=dataset.spatial.granularity,
zones=[getattr(z, 'id', z) for z in new_zones if z]
zones=list(new_zones)
)
)
log.info(Formatter().vformat('''Summary
Processed {zones} zones in {datasets} datasets:
- {countrygroups} country groups (World/UE)
- {countries} countries
- France:
- {regions} regions
- {counties} counties
- {epcis} EPCIs
- {towns} towns
- {drom} DROM
- {dromcom} DROM-COM
''', (), counter))
level_summary = '\n'.join([
' - {0}: {1}'.format(l.id, counter[l.id])
for l in GeoLevel.objects.order_by('admin_level')
])
summary = '\n'.join([dedent('''\
Summary
=======
Processed {zones} zones in {datasets} datasets:\
'''.format(level_summary, **counter)), level_summary])
log.info(summary)
log.info('Done')
5 changes: 5 additions & 0 deletions udata/core/spatial/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ class Meta:
geom = factory.Faker('multipolygon')
validity = factory.SubFactory(DateRangeFactory)

class Params:
is_current = factory.Trait(
validity=factory.SubFactory(DateRangeFactory, end=None)
)


class SpatialCoverageFactory(ModelFactory):
class Meta:
Expand Down
1 change: 1 addition & 0 deletions udata/core/spatial/geoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def parse(text):
else:
spatial = text
validity = 'latest'
spatial = spatial.lower().replace('/', ':') # Backward compatibility
if ':' not in spatial:
raise GeoIDError('Bad GeoID format: {0}'.format(text))
# country-subset is a special case:
Expand Down
25 changes: 18 additions & 7 deletions udata/core/spatial/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ class GeoLevel(db.Document):


class GeoZoneQuerySet(db.BaseQuerySet):
def valid_at(self, valid_date):
def valid_at(self, at):
'''Limit current QuerySet to zone valid at a given date'''
is_valid = db.Q(validity__end__gt=valid_date,
validity__start__lte=valid_date)
no_validity = db.Q(validity=None)
return self(is_valid | no_validity)
only_start = db.Q(validity__start__lte=at, validity__end=None)
only_end = db.Q(validity__start=None, validity__end__gt=at)
both = db.Q(validity__end__gt=at, validity__start__lte=at)
no_validity = db.Q(validity=None) | db.Q(validity__start=None, validity__end=None)
return self(no_validity | both | only_start | only_end)

def latest(self):
'''
Expand Down Expand Up @@ -98,6 +99,7 @@ class GeoZone(db.Document):
population = db.IntField()
area = db.FloatField()
wikipedia = db.StringField()
wikidata = db.StringField()
dbpedia = db.StringField()
flag = db.ImageField(fs=logos)
blazon = db.ImageField(fs=logos)
Expand Down Expand Up @@ -248,9 +250,18 @@ def handled_level(self):
return self.level in current_app.config.get('HANDLED_LEVELS')

def valid_at(self, valid_date):
if not self.validity:
if not self.validity or not (self.validity.start or self.validity.end):
return True
return self.validity.start <= valid_date <= self.validity.end
if self.validity.start and self.validity.end:
return self.validity.start <= valid_date < self.validity.end
elif self.validity.start:
return self.validity.start <= valid_date
else:
return self.validity.end > valid_date

@property
def is_current(self):
    '''Tell whether this zone is valid as of today.'''
    today = date.today()
    return self.valid_at(today)

def toGeoJSON(self):
return {
Expand Down
40 changes: 23 additions & 17 deletions udata/core/spatial/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,6 @@
MAX_POPULATION = 1E8


def labels_for_zone(zone):
'''
Extract all known zone labels
- main code
- keys (postal...)
- name translation in every supported languages
'''
labels = set([zone.name, zone.code] + zone.keys_values)
for lang in current_app.config['LANGUAGES'].keys():
with language(lang):
labels.add(_(zone.name))
return list(labels)


@register
class GeoZoneSearch(ModelSearchAdapter):
model = GeoZone
Expand Down Expand Up @@ -73,14 +59,34 @@ def compute_weight(cls, zone):

@classmethod
def is_indexable(cls, zone):
excluded = current_app.config['SPATIAL_SEARCH_EXCLUDE_LEVELS']
return zone.level not in excluded
return (
# Only index non-excluded levels
zone.level not in current_app.config['SPATIAL_SEARCH_EXCLUDE_LEVELS']
# Only index latest zone
and zone.is_current
)

@classmethod
def labels_for_zone(cls, zone):
    '''
    Collect every label a zone can be suggested from:
    - the tokens of its name
    - its main code
    - its keys values (postal...)
    - the tokens of its name translated in every supported language
    '''
    tokenize = cls.completer_tokenize
    labels = set(tokenize(zone.name))
    labels.add(zone.code)
    labels.update(zone.keys_values)
    for lang in current_app.config['LANGUAGES'].keys():
        # Translation lookup depends on the active language
        with language(lang):
            labels.update(tokenize(_(zone.name)))
    return list(labels)

@classmethod
def serialize(cls, zone):
return {
'zone_suggest': {
'input': labels_for_zone(zone),
'input': cls.labels_for_zone(zone),
'output': zone.id,
'payload': {
'name': zone.name,
Expand Down