Implement a lighter format for wikipedia importance tables #3424

Merged: 3 commits, May 29, 2024
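
In brief, the change replaces the heavy wikipedia_article/wikipedia_redirect SQL dump with a single wikimedia_importance table (language, title, importance, wikidata) that can be loaded from a gzipped, tab-separated CSV. A minimal sketch of reading that format, assuming a local wikimedia-importance.csv.gz with a header row, using the same csv.DictReader settings as the new importer below:

import csv
import gzip

# Read the gzipped, tab-separated importance file the way the new
# import_importance_csv() does: tab delimiter, '|' as quote character.
# The file name matches the diff; the sample loop is illustrative only.
with gzip.open('wikimedia-importance.csv.gz', 'rt') as fd:
    for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
        # Columns consumed by the importer; extra columns are ignored.
        print(row['language'], row['title'],
              float(row['importance']), row['wikidata_id'])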
Changes from all commits:
62 changes: 59 additions & 3 deletions lib-sql/functions/importance.sql
@@ -20,6 +20,54 @@ CREATE TYPE place_importance as (
wikipedia TEXT
);

{% if 'wikimedia_importance' in db.tables %}

CREATE OR REPLACE FUNCTION get_wikipedia_match(extratags HSTORE, country_code varchar(2))
RETURNS wikipedia_article_match
AS $$
DECLARE
i INT;
wiki_article_title TEXT;
wiki_article_language TEXT;
result wikipedia_article_match;
entry RECORD;
BEGIN
IF extratags ? 'wikipedia' and strpos(extratags->'wikipedia', ':') IN (3,4) THEN
wiki_article_language := lower(trim(split_part(extratags->'wikipedia', ':', 1)));
wiki_article_title := trim(substr(extratags->'wikipedia',
strpos(extratags->'wikipedia', ':') + 1));

FOR result IN
SELECT language, title, importance FROM wikimedia_importance
WHERE language = wiki_article_language
and title = replace(wiki_article_title, ' ', '_')
LOOP
RETURN result;
END LOOP;
END IF;

FOREACH wiki_article_language IN ARRAY ARRAY['ar','bg','ca','cs','da','de','en','es','eo','eu','fa','fr','ko','hi','hr','id','it','he','lt','hu','ms','nl','ja','no','pl','pt','kk','ro','ru','sk','sl','sr','fi','sv','tr','uk','vi','vo','war','zh']
LOOP
IF extratags ? ('wikipedia:' || wiki_article_language) THEN
wiki_article_title := extratags->('wikipedia:' || wiki_article_language);

FOR result IN
SELECT language, title, importance FROM wikimedia_importance
WHERE language = wiki_article_language
and title = replace(wiki_article_title, ' ', '_')
LOOP
RETURN result;
END LOOP;
END IF;

END LOOP;

RETURN NULL;
END;
$$
LANGUAGE plpgsql IMMUTABLE;

{% else %}

-- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql
CREATE OR REPLACE FUNCTION decode_url_part(p varchar)
@@ -93,6 +141,7 @@ END;
$$
LANGUAGE plpgsql STABLE;

{% endif %}

CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
country_code varchar(2),
@@ -118,9 +167,16 @@ BEGIN

-- Nothing? Then try with the wikidata tag.
IF result.importance is null AND extratags ? 'wikidata' THEN
-    FOR match IN SELECT * FROM wikipedia_article
-                 WHERE wd_page_title = extratags->'wikidata'
-                 ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+    FOR match IN
+      {% if 'wikimedia_importance' in db.tables %}
+        SELECT * FROM wikimedia_importance
+          WHERE wikidata = extratags->'wikidata'
+          LIMIT 1
+      {% else %}
+        SELECT * FROM wikipedia_article
+          WHERE wd_page_title = extratags->'wikidata'
+          ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+      {% endif %}
LOOP
result.importance := match.importance;
result.wikipedia := match.language || ':' || match.title;
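When the wikimedia_importance table exists, the templated get_wikipedia_match() above reduces to a straight indexed lookup: the language comes from the 2- or 3-letter prefix of the wikipedia tag and spaces in the title are folded to underscores. A rough Python equivalent of that lookup path (the DSN and sample tag value are assumptions, not part of the change):

import psycopg2

def lookup_importance(conn, wikipedia_tag):
    # Same normalisation as the SQL above: lower-cased language prefix,
    # spaces in the title replaced by underscores.
    lang, _, title = wikipedia_tag.partition(':')
    with conn.cursor() as cur:
        cur.execute("""SELECT language, title, importance
                         FROM wikimedia_importance
                        WHERE language = %s AND title = %s""",
                    (lang.strip().lower(), title.strip().replace(' ', '_')))
        return cur.fetchone()

conn = psycopg2.connect('dbname=nominatim')   # assumed DSN
print(lookup_importance(conn, 'en:Berlin'))   # sample tag, illustrative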
31 changes: 9 additions & 22 deletions lib-sql/tables.sql
@@ -273,28 +273,15 @@ GRANT SELECT ON import_polygon_delete TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS file;
CREATE SEQUENCE file start 1;

--- null table so it won't error
--- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
-CREATE TABLE IF NOT EXISTS wikipedia_article (
-  language text NOT NULL,
-  title text NOT NULL,
-  langcount integer,
-  othercount integer,
-  totalcount integer,
-  lat double precision,
-  lon double precision,
-  importance double precision,
-  osm_type character(1),
-  osm_id bigint,
-  wd_page_title text,
-  instance_of text
-);
-
-CREATE TABLE IF NOT EXISTS wikipedia_redirect (
-  language text,
-  from_title text,
-  to_title text
-);
+{% if 'wikimedia_importance' not in db.tables and 'wikipedia_article' not in db.tables %}
+-- create dummy tables here, if nothing was imported
+CREATE TABLE wikimedia_importance (
+  language TEXT NOT NULL,
+  title TEXT NOT NULL,
+  importance double precision NOT NULL,
+  wikidata TEXT
+) {{db.tablespace.address_data}};
+{% endif %}

-- osm2pgsql does not create indexes on the middle tables for Nominatim
-- Add one for lookup of associated street relations.
19 changes: 11 additions & 8 deletions nominatim/clicmd/refresh.py
@@ -89,6 +89,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
from ..tools import refresh, postcodes
from ..indexer.indexer import Indexer

need_function_refresh = args.functions

if args.postcodes:
if postcodes.can_compute(args.config.get_libpq_dsn()):
@@ -131,13 +132,7 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
args.project_dir) > 0:
LOG.fatal('FATAL: Cannot update secondary importance raster data')
return 1

-        if args.functions:
-            LOG.warning('Create functions')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                refresh.create_functions(conn, args.config,
-                                         args.diffs, args.enable_debug_statements)
-                self._get_tokenizer(args.config).update_sql_functions(args.config)
+            need_function_refresh = True

if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
@@ -147,8 +142,16 @@ def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
return 1
need_function_refresh = True

if need_function_refresh:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)

-        # Attention: importance MUST come after wiki data import.
+        # Attention: importance MUST come after wiki data import and after functions.
if args.importance:
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
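Net effect of the refactoring above: --wiki-data now also schedules function recreation (so compute_importance() is rebuilt against the new table layout), and --importance still runs last. A combined invocation, the same flag combination the updated test further down exercises, therefore runs import, function refresh, and recomputation in the right order:

    nominatim refresh --wiki-data --importance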
8 changes: 8 additions & 0 deletions nominatim/db/utils.py
@@ -92,6 +92,11 @@ def __enter__(self) -> 'CopyBuffer':
return self


def size(self) -> int:
""" Return the number of bytes the buffer currently contains.
"""
return self.buffer.tell()

def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
if self.buffer is not None:
self.buffer.close()
@@ -115,7 +120,10 @@ def add(self, *data: Any) -> None:

def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
""" Copy all collected data into the given table.

The buffer is empty and reusable after this operation.
"""
if self.buffer.tell() > 0:
self.buffer.seek(0)
cur.copy_from(self.buffer, table, columns=columns)
self.buffer = io.StringIO()
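
The new size() accessor is what makes chunked COPYs possible: callers can flush the buffer whenever it grows past a threshold and keep reusing it, since copy_out() now resets it afterwards. A sketch of that usage pattern as import_importance_csv() applies it; the connection, DSN, and sample rows here are assumptions for illustration:

import psycopg2
from nominatim.db.utils import CopyBuffer

conn = psycopg2.connect('dbname=nominatim')   # assumed DSN
rows = [('en', 'Berlin', 0.8, 'Q64')]         # sample rows, illustrative only

with CopyBuffer() as buf:
    for language, title, importance, wikidata in rows:
        buf.add(language, title, importance, wikidata)
        # Flush early once the buffered text passes ~10MB, as the new
        # import_importance_csv() does, so memory use stays bounded.
        if buf.size() > 10000000:
            with conn.cursor() as cur:
                buf.copy_out(cur, 'wikimedia_importance',
                             columns=['language', 'title', 'importance', 'wikidata'])
    # Final flush of whatever remains in the buffer.
    with conn.cursor() as cur:
        buf.copy_out(cur, 'wikimedia_importance',
                     columns=['language', 'title', 'importance', 'wikidata'])
conn.commit()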
5 changes: 4 additions & 1 deletion nominatim/tools/check_database.py
@@ -248,7 +248,10 @@ def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult
return CheckState.NOT_APPLICABLE

with conn.cursor() as cur:
-        cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
+        if conn.table_exists('wikimedia_importance'):
+            cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
+        else:
+            cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')

return CheckState.WARN if cnt == 0 else CheckState.OK

80 changes: 75 additions & 5 deletions nominatim/tools/refresh.py
@@ -8,6 +8,8 @@
Functions for bringing auxiliary data in the database up-to-date.
"""
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
import csv
import gzip
import logging
from textwrap import dedent
from pathlib import Path
@@ -16,7 +18,7 @@

from nominatim.config import Configuration
from nominatim.db.connection import Connection, connect
-from nominatim.db.utils import execute_file
+from nominatim.db.utils import execute_file, CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.version import NOMINATIM_VERSION

@@ -132,21 +134,89 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file.
"""
-    datafile = data_path / 'wikimedia-importance.sql.gz'
+    if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
+       or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
+                                ignore_errors) == 0:
+        return 0

-    if not datafile.exists():
-        return 1
+    return 1


def import_importance_csv(dsn: str, data_file: Path) -> int:
""" Replace wikipedia importance table with data from a
single CSV file.

The file must be a gzipped CSV and have the following columns:
language, title, importance, wikidata_id

Other columns may be present but will be ignored.
"""
if not data_file.exists():
return 1

# Only import the first occurrence of a wikidata ID.
# This keeps indexes and table small.
wd_done = set()

with connect(dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('wikipedia_article')
cur.drop_table('wikipedia_redirect')
cur.drop_table('wikimedia_importance')
cur.execute("""CREATE TABLE wikimedia_importance (
language TEXT NOT NULL,
title TEXT NOT NULL,
importance double precision NOT NULL,
wikidata TEXT
) """)

with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
wd_id = int(row['wikidata_id'][1:])
buf.add(row['language'], row['title'], row['importance'],
None if wd_id in wd_done else row['wikidata_id'])
wd_done.add(wd_id)

if buf.size() > 10000000:
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance',
'wikidata'])

with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance', 'wikidata'])

with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
ON wikimedia_importance (title)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
ON wikimedia_importance (wikidata)
WHERE wikidata is not null""")

conn.commit()

return 0


def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
""" Replace wikipedia importance table with data from an SQL file.
"""
if not data_file.exists():
return 1

pre_code = """BEGIN;
DROP TABLE IF EXISTS "wikipedia_article";
DROP TABLE IF EXISTS "wikipedia_redirect"
DROP TABLE IF EXISTS "wikipedia_redirect";
DROP TABLE IF EXISTS "wikipedia_importance";
"""
post_code = "COMMIT"
-    execute_file(dsn, datafile, ignore_errors=ignore_errors,
+    execute_file(dsn, data_file, ignore_errors=ignore_errors,
pre_code=pre_code, post_code=post_code)

return 0


def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the secondary importance raster data table with new data.

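import_wikipedia_articles() keeps its signature and return convention but now dispatches to the two loaders above in order of preference: CSV first, SQL dump as fallback. A short usage sketch; the DSN and data directory are assumptions for illustration:

from pathlib import Path

from nominatim.tools import refresh

# Returns 0 if either wikimedia-importance.csv.gz or
# wikimedia-importance.sql.gz was imported, 1 if neither file exists.
status = refresh.import_wikipedia_articles('dbname=nominatim',
                                           Path('/srv/nominatim/data'))
if status != 0:
    print('No wikimedia-importance file found')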
6 changes: 5 additions & 1 deletion test/python/cli/test_cmd_refresh.py
@@ -28,6 +28,7 @@ def setup_cli_call(self, cli_call, temp_db, cli_tokenizer_mock):
('website', 'setup_website'),
])
def test_refresh_command(self, mock_func_factory, command, func):
mock_func_factory(nominatim.tools.refresh, 'create_functions')
func_mock = mock_func_factory(nominatim.tools.refresh, func)

assert self.call_nominatim('refresh', '--' + command) == 0
@@ -71,6 +72,7 @@ def test_refresh_wikidata_file_not_found(self, monkeypatch):

assert self.call_nominatim('refresh', '--wiki-data') == 1


def test_refresh_secondary_importance_file_not_found(self):
assert self.call_nominatim('refresh', '--secondary-importance') == 1

@@ -84,16 +86,18 @@ def test_refresh_secondary_importance_new_table(self, mock_func_factory):
assert mocks[1].called == 1


-    def test_refresh_importance_computed_after_wiki_import(self, monkeypatch):
+    def test_refresh_importance_computed_after_wiki_import(self, monkeypatch, mock_func_factory):
calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
lambda *args, **kwargs: calls.append('import') or 0)
monkeypatch.setattr(nominatim.tools.refresh, 'recompute_importance',
lambda *args, **kwargs: calls.append('update'))
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')

assert self.call_nominatim('refresh', '--importance', '--wiki-data') == 0

assert calls == ['import', 'update']
assert func_mock.called == 1

@pytest.mark.parametrize('params', [('--data-object', 'w234'),
('--data-object', 'N23', '--data-object', 'N24'),
9 changes: 5 additions & 4 deletions test/python/mocks.py
@@ -54,16 +54,17 @@ class text,

def add(self, osm_type='N', osm_id=None, cls='amenity', typ='cafe', names=None,
admin_level=None, address=None, extratags=None, geom='POINT(10 4)',
-            country=None, housenumber=None):
+            country=None, housenumber=None, rank_search=30):
with self.conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("""INSERT INTO placex (place_id, osm_type, osm_id, class,
type, name, admin_level, address,
-                           housenumber,
+                           housenumber, rank_search,
                            extratags, geometry, country_code)
-            VALUES(nextval('seq_place'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
+            VALUES(nextval('seq_place'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
             (osm_type, osm_id or next(self.idseq), cls, typ, names,
-             admin_level, address, housenumber, extratags, 'SRID=4326;' + geom,
+             admin_level, address, housenumber, rank_search,
+             extratags, 'SRID=4326;' + geom,
country))
self.conn.commit()

3 changes: 1 addition & 2 deletions test/python/tools/test_refresh.py
@@ -35,8 +35,7 @@ def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn,
@pytest.mark.parametrize("replace", (True, False))
def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace):
if replace:
-        table_factory('wikipedia_article')
-        table_factory('wikipedia_redirect')
+        table_factory('wikimedia_importance')

# use the small wikipedia file for the API testdb
assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0