Refactor import-wikidata (#118)
* Use openmaptiles-tools as the base
* Use standard PG* env vars in addition to the legacy POSTGRES_* ones
* Minor python syntax formatting
nyurik committed Nov 25, 2019
1 parent be4ad3b commit a9f04b8
Showing 6 changed files with 40 additions and 33 deletions.
9 changes: 4 additions & 5 deletions docker/import-wikidata/Dockerfile
@@ -1,8 +1,7 @@
-FROM python:3.6
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    wget \
-    && rm -rf /var/lib/apt/lists/
+# Use a separate docker for downloading to minimize final docker image
+# BASE_TAG will be injected by the dockerhub auto-build environment
+ARG BASE_TAG=latest
+FROM openmaptiles/openmaptiles-tools:${BASE_TAG}
 
 RUN mkdir -p /usr/src/app
 WORKDIR /usr/src/app
4 changes: 1 addition & 3 deletions docker/import-wikidata/bin/import-wikidata
@@ -25,7 +25,7 @@ if __name__ == '__main__':
     etl.empty_table(TABLE_NAME, cur)
     conn.commit()
 
-    if(os.path.exists(DUMP)):
+    if os.path.exists(DUMP):
         print('Scanning following tables:')
         print(' ', '\n '.join(sorted(OSM_TABLES)))
 
@@ -37,11 +37,9 @@ if __name__ == '__main__':
 
         print('Parsing Wikidata dump {} ...'.format(DUMP))
         etl.multi_parse(DUMP, ids, pages, cur, conn, TABLE_NAME, LIMIT)
-
     else:
         print('File {} not found, no Wikidata imported!'.format(DUMP))
 
     cur.close()
     conn.close()
     print()
-
7 changes: 7 additions & 0 deletions docker/import-wikidata/hooks/build
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+docker build \
+    --build-arg "BASE_TAG=$DOCKER_TAG" \
+    -t "$IMAGE_NAME" \
+    -f "$DOCKERFILE_PATH" \
+    .
13 changes: 6 additions & 7 deletions docker/import-wikidata/wikidata/cfg.py
@@ -1,11 +1,11 @@
 import os
 
-POSTGRES_DB=os.environ['POSTGRES_DB']
-POSTGRES_USER=os.environ['POSTGRES_USER']
-POSTGRES_PASSWORD=os.environ['POSTGRES_PASSWORD']
-POSTGRES_HOST=os.environ['POSTGRES_HOST']
-POSTGRES_PORT=os.environ['POSTGRES_PORT']
-
+# Backward compatibility - for now, allow POSTGRES_* env if set, or use standard PG*
+POSTGRES_DB = os.getenv('POSTGRES_DB') or os.environ['PGDATABASE']
+POSTGRES_USER = os.getenv('POSTGRES_USER') or os.environ['PGUSER']
+POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD') or os.environ['PGPASSWORD']
+POSTGRES_HOST = os.getenv('POSTGRES_HOST') or os.environ['PGHOST']
+POSTGRES_PORT = os.getenv('POSTGRES_PORT') or os.getenv('PGPORT') or '5432'
 
 '''Path to Wikidata dump from /import folder'''
 DUMP = 'latest-all.json.gz'
@@ -33,4 +33,3 @@
 
 '''Table with imported wikidata'''
 TABLE_NAME = 'wd_names'
-
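Note on the cfg.py change above: the legacy POSTGRES_* variables still take precedence when they are set; otherwise the standard PG* variables are read, with PGPORT falling back to 5432. A minimal sketch of that precedence, using made-up values (not part of the commit):

import os

os.environ.pop('POSTGRES_HOST', None)       # legacy variable unset
os.environ['PGHOST'] = 'db'                 # only the standard variable is set
assert (os.getenv('POSTGRES_HOST') or os.environ['PGHOST']) == 'db'

os.environ['POSTGRES_HOST'] = 'legacy-db'   # legacy variable wins when present
assert (os.getenv('POSTGRES_HOST') or os.environ['PGHOST']) == 'legacy-db'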
35 changes: 20 additions & 15 deletions docker/import-wikidata/wikidata/etl.py
@@ -16,9 +16,11 @@
 TRUNCATE {table};
 """
 
+
 def empty_table(table, cur):
     cur.execute(EMPTY_TABLE.format(table=table))
 
+
 def get_json(line):
     if line != "[\n" and line != "]" and line != "]\n" and len(line) > 2:
         try:
@@ -32,39 +34,41 @@ def get_json(line):
             print(traceback.format_exc())
             print(line)
 
 
 def get_id(line):
     prefix = '{"type":"item","id":"'
     prefix_len = len(prefix)
-    part = line[prefix_len:(prefix_len+20)]
-    if(part and line.startswith(prefix)):
+    part = line[prefix_len:(prefix_len + 20)]
+    if part and line.startswith(prefix):
         m = re.search('^(Q[0-9]+)"', part)
-        if(m is not None):
+        if m is not None:
             return m.group(1)
 
 
 def to_osm_names(wd_names):
     res = {}
     for lang in wd_names:
         name = wd_names[lang]
-        if (lang != name['language']):
+        if lang != name['language']:
             continue
-        res['name:'+lang] = name['value']
+        res['name:' + lang] = name['value']
     return res
 
 
 def remove_duplicate_ids_and_pages(ids, pages):
     orig_ids_len = len(ids)
     ids = list(set(ids))
-    if(len(ids) != orig_ids_len):
+    if len(ids) != orig_ids_len:
         print('ignoring {} duplicate ids'.format(orig_ids_len - len(ids)))
 
     orig_pages_len = len(pages)
     pages = list(set(pages))
-    if(len(pages) != orig_pages_len):
+    if len(pages) != orig_pages_len:
         print('ignoring {} duplicate pages'.format(orig_pages_len - len(
             pages)))
 
-    return (ids, pages)
+    return ids, pages
 
 
 def simple_parse(file, ids, pages, cur, conn, table_name, limit):
     ids, pages = remove_duplicate_ids_and_pages(ids, pages)
@@ -87,7 +91,7 @@ def simple_parse(file, ids, pages, cur, conn, table_name, limit):
                             "%s)".format(table=table_name), (id, osm_labels))
                 found_ids.append(id)
                 ids.remove(id)
-                if(len(ids) == 0):
+                if len(ids) == 0:
                     break
 
         if i % 100000 == 0:
@@ -109,13 +113,14 @@ def get_page(item, pages):
     if 'sitelinks' not in item:
         return None
     for lang in pages:
-        key = lang+'wiki'
+        key = lang + 'wiki'
         if key in item['sitelinks']:
             title = item['sitelinks'][key]['title']
             if title in pages[lang]:
-                return (lang, title)
+                return lang, title
     return None
 
+
 def multi_parse(file, ids, pages, cur, conn, table_name, limit):
     ids, pages = remove_duplicate_ids_and_pages(ids, pages)
     ids = SortedList(ids)
@@ -128,7 +133,7 @@ def multi_parse(file, ids, pages, cur, conn, table_name, limit):
     pages_bucket = defaultdict(SortedList)
     for page in pages:
         page_parts = page.split(':')
-        if(len(page_parts)==2):
+        if len(page_parts) == 2:
             pages_bucket[page_parts[0]].add(page_parts[1])
             # pages_bucket[page_parts[0]].add(page_parts[1].decode('utf8'))
     pool = Pool()
@@ -140,7 +145,7 @@ def process_json(item):
     def process_json(item):
         try:
             parsed_lines[0] += 1
-            if(item is not None):
+            if item is not None:
                 id = item['id']
                 if id in ids:
                     osm_labels = to_osm_names(item['labels'])
@@ -150,7 +155,7 @@ def process_json(item):
                     ids.remove(id)
                 else:
                     page_tuple = get_page(item, pages_bucket)
-                    if(page_tuple is not None):
+                    if page_tuple is not None:
                         page = ':'.join(page_tuple)
                         osm_labels = to_osm_names(item['labels'])
                         cur.execute("INSERT INTO {table} (page, labels) VALUES ("
@@ -160,7 +165,7 @@ def process_json(item):
                         lang = page_tuple[0]
                         title = page_tuple[1]
                         pages_bucket[lang].remove(title)
-                        if(len(pages_bucket[lang])==0):
+                        if len(pages_bucket[lang]) == 0:
                             print('Deleting lang', lang)
                             del pages_bucket[lang]
             if parsed_lines[0] % 10000 == 0:
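For reference, the to_osm_names() helper touched in this diff turns a Wikidata labels object into OSM-style name:* tags, keeping only entries whose dictionary key matches the label's own language field. An illustration with made-up input, assuming the function above is in scope (not part of the commit):

labels = {
    'en': {'language': 'en', 'value': 'London'},
    'de': {'language': 'de', 'value': 'London'},
    'fr': {'language': 'en', 'value': 'Londres'},  # key/language mismatch, skipped
}
assert to_osm_names(labels) == {'name:en': 'London', 'name:de': 'London'}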
5 changes: 2 additions & 3 deletions docker/import-wikidata/wikidata/osm.py
@@ -1,6 +1,3 @@
-
-
-
 ID_SELECT = '''
 select distinct tags->'wikidata' AS id
 from {table}
@@ -13,6 +10,7 @@
 where tags ? 'wikipedia' and not tags ? 'wikidata'
 '''
 
+
 def get_ids(tables, cur):
     parts = map(lambda t: ID_SELECT.format(table=t), tables)
     q = 'select t.*'
@@ -23,6 +21,7 @@ def get_ids(tables, cur):
     ids = list(map(lambda t: t[0], cur.fetchall()))
     return ids
 
+
 def get_pages(tables, cur):
     parts = map(lambda t: PAGE_SELECT.format(table=t), tables)
     q = 'select t.*'
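For context, get_ids() and get_pages() each format one ID_SELECT or PAGE_SELECT fragment per OSM table and run the combined query on an open cursor; get_ids() returns the distinct wikidata tag values, while get_pages() covers features that have a wikipedia tag but no wikidata tag. A hypothetical usage sketch (the table names and connection settings are made up, not from the commit):

import psycopg2

conn = psycopg2.connect(host='db', dbname='openmaptiles',
                        user='openmaptiles', password='openmaptiles')
cur = conn.cursor()
ids = get_ids(['osm_city_point', 'osm_island_point'], cur)      # e.g. ['Q64', 'Q84', ...]
pages = get_pages(['osm_city_point', 'osm_island_point'], cur)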
