Refactor import-wikidata (#118)
* Use openmaptiles-tools as the base
* Use standard PG* env vars in addition to the legacy POSTGRES_* ones
* Minor python syntax formatting
nyurik committed Nov 25, 2019
1 parent be4ad3b commit a9f04b8
Showing 6 changed files with 40 additions and 33 deletions.
9 changes: 4 additions & 5 deletions docker/import-wikidata/Dockerfile
@@ -1,8 +1,7 @@
-FROM python:3.6
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    wget \
-    && rm -rf /var/lib/apt/lists/
+# Use a separate docker for downloading to minimize final docker image
+# BASE_TAG will be injected by the dockerhub auto-build environment
+ARG BASE_TAG=latest
+FROM openmaptiles/openmaptiles-tools:${BASE_TAG}
 
 RUN mkdir -p /usr/src/app
 WORKDIR /usr/src/app
4 changes: 1 addition & 3 deletions docker/import-wikidata/bin/import-wikidata
@@ -25,7 +25,7 @@ if __name__ == '__main__':
     etl.empty_table(TABLE_NAME, cur)
     conn.commit()
 
-    if(os.path.exists(DUMP)):
+    if os.path.exists(DUMP):
         print('Scanning following tables:')
         print(' ', '\n '.join(sorted(OSM_TABLES)))
 
@@ -37,11 +37,9 @@ if __name__ == '__main__':
 
         print('Parsing Wikidata dump {} ...'.format(DUMP))
         etl.multi_parse(DUMP, ids, pages, cur, conn, TABLE_NAME, LIMIT)
-
     else:
         print('File {} not found, no Wikidata imported!'.format(DUMP))
 
     cur.close()
     conn.close()
     print()
-
7 changes: 7 additions & 0 deletions docker/import-wikidata/hooks/build
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+docker build \
+    --build-arg "BASE_TAG=$DOCKER_TAG" \
+    -t "$IMAGE_NAME" \
+    -f "$DOCKERFILE_PATH" \
+    .
13 changes: 6 additions & 7 deletions docker/import-wikidata/wikidata/cfg.py
@@ -1,11 +1,11 @@
 import os
 
-POSTGRES_DB=os.environ['POSTGRES_DB']
-POSTGRES_USER=os.environ['POSTGRES_USER']
-POSTGRES_PASSWORD=os.environ['POSTGRES_PASSWORD']
-POSTGRES_HOST=os.environ['POSTGRES_HOST']
-POSTGRES_PORT=os.environ['POSTGRES_PORT']
-
+# Backward compatibility - for now, allow POSTGRES_* env if set, or use standard PG*
+POSTGRES_DB = os.getenv('POSTGRES_DB') or os.environ['PGDATABASE']
+POSTGRES_USER = os.getenv('POSTGRES_USER') or os.environ['PGUSER']
+POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') or os.environ['PGPASSWORD']
+POSTGRES_HOST = os.getenv('POSTGRES_HOST') or os.environ['PGHOST']
+POSTGRES_PORT = os.getenv('POSTGRES_PORT') or os.getenv('PGPORT') or '5432'
 
 '''Path to Wikidata dump from /import folder'''
 DUMP = 'latest-all.json.gz'
@@ -33,4 +33,3 @@
 
 '''Table with imported wikidata'''
 TABLE_NAME = 'wd_names'
-
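Note on the cfg.py change above: the legacy POSTGRES_* variables still take precedence when they are set; otherwise the standard PG* variables are read, with PGPORT falling back to 5432. A minimal sketch of that precedence, using made-up values (not part of the commit):

import os

os.environ.pop('POSTGRES_HOST', None)       # legacy variable unset
os.environ['PGHOST'] = 'db'                 # only the standard variable is set
assert (os.getenv('POSTGRES_HOST') or os.environ['PGHOST']) == 'db'

os.environ['POSTGRES_HOST'] = 'legacy-db'   # legacy variable wins when present
assert (os.getenv('POSTGRES_HOST') or os.environ['PGHOST']) == 'legacy-db'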
35 changes: 20 additions & 15 deletions docker/import-wikidata/wikidata/etl.py
@@ -16,9 +16,11 @@
 TRUNCATE {table};
 """
 
+
 def empty_table(table, cur):
     cur.execute(EMPTY_TABLE.format(table=table))
 
+
 def get_json(line):
     if line != "[\n" and line != "]" and line != "]\n" and len(line) > 2:
         try:
@@ -32,39 +34,41 @@ def get_json(line):
             print(traceback.format_exc())
             print(line)
 
 
 def get_id(line):
     prefix = '{"type":"item","id":"'
     prefix_len = len(prefix)
-    part = line[prefix_len:(prefix_len+20)]
-    if(part and line.startswith(prefix)):
+    part = line[prefix_len:(prefix_len + 20)]
+    if part and line.startswith(prefix):
         m = re.search('^(Q[0-9]+)"', part)
-        if(m is not None):
+        if m is not None:
             return m.group(1)
 
 
 def to_osm_names(wd_names):
     res = {}
     for lang in wd_names:
         name = wd_names[lang]
-        if (lang != name['language']):
+        if lang != name['language']:
             continue
-        res['name:'+lang] = name['value']
+        res['name:' + lang] = name['value']
     return res
 
 
 def remove_duplicate_ids_and_pages(ids, pages):
     orig_ids_len = len(ids)
     ids = list(set(ids))
-    if(len(ids) != orig_ids_len):
+    if len(ids) != orig_ids_len:
         print('ignoring {} duplicate ids'.format(orig_ids_len - len(ids)))
 
     orig_pages_len = len(pages)
     pages = list(set(pages))
-    if(len(pages) != orig_pages_len):
+    if len(pages) != orig_pages_len:
         print('ignoring {} duplicate pages'.format(orig_pages_len - len(
             pages)))
 
-    return (ids, pages)
+    return ids, pages
 
 
 def simple_parse(file, ids, pages, cur, conn, table_name, limit):
     ids, pages = remove_duplicate_ids_and_pages(ids, pages)
@@ -87,7 +91,7 @@ def simple_parse(file, ids, pages, cur, conn, table_name, limit):
                             "%s)".format(table=table_name), (id, osm_labels))
                 found_ids.append(id)
                 ids.remove(id)
-                if(len(ids) == 0):
+                if len(ids) == 0:
                     break
 
         if i % 100000 == 0:
@@ -109,13 +113,14 @@ def get_page(item, pages):
     if 'sitelinks' not in item:
         return None
     for lang in pages:
-        key = lang+'wiki'
+        key = lang + 'wiki'
         if key in item['sitelinks']:
             title = item['sitelinks'][key]['title']
             if title in pages[lang]:
-                return (lang, title)
+                return lang, title
     return None
 
+
 def multi_parse(file, ids, pages, cur, conn, table_name, limit):
     ids, pages = remove_duplicate_ids_and_pages(ids, pages)
     ids = SortedList(ids)
@@ -128,7 +133,7 @@ def multi_parse(file, ids, pages, cur, conn, table_name, limit):
     pages_bucket = defaultdict(SortedList)
     for page in pages:
         page_parts = page.split(':')
-        if(len(page_parts)==2):
+        if len(page_parts) == 2:
             pages_bucket[page_parts[0]].add(page_parts[1])
             # pages_bucket[page_parts[0]].add(page_parts[1].decode('utf8'))
     pool = Pool()
@@ -140,7 +145,7 @@ def process_json(item):
     def process_json(item):
         try:
             parsed_lines[0] += 1
-            if(item is not None):
+            if item is not None:
                 id = item['id']
                 if id in ids:
                     osm_labels = to_osm_names(item['labels'])
@@ -150,7 +155,7 @@ def process_json(item):
                     ids.remove(id)
                 else:
                     page_tuple = get_page(item, pages_bucket)
-                    if(page_tuple is not None):
+                    if page_tuple is not None:
                         page = ':'.join(page_tuple)
                         osm_labels = to_osm_names(item['labels'])
                         cur.execute("INSERT INTO {table} (page, labels) VALUES ("
@@ -160,7 +165,7 @@ def process_json(item):
                         lang = page_tuple[0]
                         title = page_tuple[1]
                         pages_bucket[lang].remove(title)
-                        if(len(pages_bucket[lang])==0):
+                        if len(pages_bucket[lang]) == 0:
                             print('Deleting lang', lang)
                             del pages_bucket[lang]
             if parsed_lines[0] % 10000 == 0:
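For reference, the to_osm_names() helper touched in this diff turns a Wikidata labels object into OSM-style name:* tags, keeping only entries whose dictionary key matches the label's own language field. An illustration with made-up input, assuming the function above is in scope (not part of the commit):

labels = {
    'en': {'language': 'en', 'value': 'London'},
    'de': {'language': 'de', 'value': 'London'},
    'fr': {'language': 'en', 'value': 'Londres'},  # key/language mismatch, skipped
}
assert to_osm_names(labels) == {'name:en': 'London', 'name:de': 'London'}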
5 changes: 2 additions & 3 deletions docker/import-wikidata/wikidata/osm.py
@@ -1,6 +1,3 @@
-
-
-
 ID_SELECT = '''
 select distinct tags->'wikidata' AS id
 from {table}
@@ -13,6 +10,7 @@
 where tags ? 'wikipedia' and not tags ? 'wikidata'
 '''
 
+
 def get_ids(tables, cur):
     parts = map(lambda t: ID_SELECT.format(table=t), tables)
     q = 'select t.*'
@@ -23,6 +21,7 @@ def get_ids(tables, cur):
     ids = list(map(lambda t: t[0], cur.fetchall()))
     return ids
 
+
 def get_pages(tables, cur):
     parts = map(lambda t: PAGE_SELECT.format(table=t), tables)
     q = 'select t.*'
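For context, get_ids() and get_pages() each format one ID_SELECT or PAGE_SELECT fragment per OSM table and run the combined query on an open cursor; get_ids() returns the distinct wikidata tag values, while get_pages() covers features that have a wikipedia tag but no wikidata tag. A hypothetical usage sketch (the table names and connection settings are made up, not from the commit):

import psycopg2

conn = psycopg2.connect(host='db', dbname='openmaptiles',
                        user='openmaptiles', password='openmaptiles')
cur = conn.cursor()
ids = get_ids(['osm_city_point', 'osm_island_point'], cur)      # e.g. ['Q64', 'Q84', ...]
pages = get_pages(['osm_city_point', 'osm_island_point'], cur)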
