diff --git a/mypy.ini b/mypy.ini index f17ceaed6..f5b4fd823 100644 --- a/mypy.ini +++ b/mypy.ini @@ -40,3 +40,5 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-requests.packages.urllib3.util.retry] ignore_missing_imports = True +[mypy-wikitextparser.*] +ignore_missing_imports = True diff --git a/plugins/TagFix_Deprecated.py b/plugins/TagFix_Deprecated.py index 2f816d53f..8760b9cf2 100644 --- a/plugins/TagFix_Deprecated.py +++ b/plugins/TagFix_Deprecated.py @@ -23,7 +23,7 @@ from plugins.Plugin import Plugin from modules.downloader import urlread from modules.Stablehash import stablehash -import re +from plugins.modules.wikiReader import read_wiki_templates,wikitag2text class TagFix_Deprecated(Plugin): @@ -31,55 +31,31 @@ def deprecated_list(self): wikiRoot = 'https://wiki.openstreetmap.org/wiki' data = urlread(wikiRoot + '/Template:Deprecated_features?action=raw', 1) - # Tidy data up for processing - # Eliminate wiki bold formatting - data = data.replace("'''", "") - - # Remove HTML newlines - data = re.sub(r'', ' ', data) - # Remove excess whitespace (also removes all newlines) data = " ".join(data.split()) - # Eliminate any whitespace around pipe characters - # This makes reading the template parameters simple - data = re.sub(r'\s?\|\s?', '|', data) - - # Eliminate templates to prevent unexpected pipe characters - data = re.sub(r'{{{\s?lang\s?\|?\s?}}}', '', data, flags=re.I) - # Tag template can take one or two params, with trailing | possible - data = re.sub( - r'{{(?:Tag|Key)\s?\|(.+?)\|?\s?}}', - lambda x: '`{}`'.format(x.group(1).replace("||", "=").replace("|", "=")), - data, - flags=re.I - ) - - # Resolve interwiki links now - data = re.sub( - r'\[\[(.+?)\]\]', - lambda x: '[{}]({}/{})'.format(x.group(1), wikiRoot, x.group(1).replace(" ", "_")), - data - ) + data = read_wiki_templates(data, "Deprecated features/item") deprecated = {} - for feature in data.split(r'{{Deprecated features/item')[1:]: - # Unaccounted for template present in 
this feature - if r'{{' in feature: - continue - + for feature in data: src_key, src_val, dest = None, None, None - for param in feature.split('|'): + for param in feature[2:]: + # Convert {{Tag|k|v}} to k=v + param = wikitag2text(param, quote = True, star_value = False) if '=' not in param: continue + if '{{' in param: + # Unaccounted for template present in this feature + src_key, src_val, dest = None, None, None + break k, v = param.split('=', 1) - # k will always start with the param because we removed whitespace around | earlier - if (k.rstrip() == 'dkey'): + k = k.rstrip() + if k == 'dkey': src_key = v - elif (k.rstrip() == 'dvalue'): + elif k == 'dvalue': src_val = v - elif (k.rstrip() == 'suggestion'): + elif k == 'suggestion': dest = v # Sanity check in case formatting changes or something diff --git a/plugins/TagFix_Postcode.py b/plugins/TagFix_Postcode.py index 896598f8e..36da4dea5 100644 --- a/plugins/TagFix_Postcode.py +++ b/plugins/TagFix_Postcode.py @@ -22,6 +22,7 @@ from modules.OsmoseTranslation import T_ from plugins.Plugin import Plugin from modules.downloader import urlread +from plugins.modules.wikiReader import read_wiki_table import re @@ -50,21 +51,18 @@ def parse_format(self, reline, format): elif len(regexs) == 1: return "^"+regexs[0]+"$" - def clean_line(self, line): - # Clean wiki templates and links - line = re.sub(self.reWikiTemplate, "", line) # Remove all templates, e.g. 
{{Date|2000-01-01}}, may contain pipes - return re.sub(self.reWikiPageLink, "\\1", line) # Replace all links by their text value, so [[x|y]] and [[y]] both become y - def list_postcode(self): reline = re.compile("^[-CAN ]+$") - # remline = re.compile("^[-CAN ]+ *\([-CAN ]+\)$") data = urlread(u"https://en.wikipedia.org/wiki/List_of_postal_codes?action=raw", 1) - data = filter(lambda t: len(t) > 2 and ("no codes" not in t[1].lower() or t[2] != ""), map(lambda x: list(map(lambda y: y.strip(), self.clean_line(x).split("|")))[3:6], data.split("|-")[1:-1])) + data = read_wiki_table(data) + postcode = {} - for line in data: - iso = line[0][0:2] - format_area = line[1] - format_street = line[2] + for row in data: + iso = row[2] + format_area = row[3] + format_street = row[4] + if (not format_area or "no codes" in format_area.lower()) and not format_street: + continue postcode[iso] = {} if format_area != '': @@ -91,8 +89,6 @@ def init(self, logger): [Wikipedia](https://en.wikipedia.org/wiki/List_of_postal_codes)'''), resource = 'https://en.wikipedia.org/wiki/List_of_postal_codes') - self.reWikiTemplate = re.compile(r'\{\{[^}]+\}\}') - self.reWikiPageLink = re.compile(r'\[\[[^]]*?([^]|]+)\]\]') self.Country = None if self.father.config.options.get("country"): self.Country = self.father.config.options.get("country").split("-")[0] diff --git a/plugins/TagFix_Tree.py b/plugins/TagFix_Tree.py index 6bfad7fa2..a00439a1c 100644 --- a/plugins/TagFix_Tree.py +++ b/plugins/TagFix_Tree.py @@ -22,6 +22,7 @@ from modules.OsmoseTranslation import T_ from plugins.Plugin import Plugin from modules.downloader import urlread +from plugins.modules.wikiReader import read_wiki_table class TagFix_Tree(Plugin): @@ -34,7 +35,7 @@ def _read_leaf_properties_table(self): allowed_leaf_cycle = ("evergreen", "deciduous") data = urlread(u"https://wiki.openstreetmap.org/w/index.php?title=Tag:natural%3Dtree/List_of_Species&action=raw", 1) - data = list(map(lambda x: list(filter(lambda z: len(z) > 0, 
map(lambda y: y.strip(), x.split("|")))), data.split("|-")[1:-1])) + data = read_wiki_table(data) species_map = {} for row in data: # data: list of [species, species:wikidata, leaf_cycle, leaf_type] this_species = {} diff --git a/plugins/TagWatchFrViPofm.py b/plugins/TagWatchFrViPofm.py index f13ddc01b..3244f88a3 100644 --- a/plugins/TagWatchFrViPofm.py +++ b/plugins/TagWatchFrViPofm.py @@ -25,6 +25,7 @@ from modules.Stablehash import stablehash, stablehash64 import re from collections import defaultdict +from plugins.modules.wikiReader import read_wiki_table, wikitag2text class TagWatchFrViPofm(Plugin): @@ -52,43 +53,43 @@ def init(self, logger): self._update_ks_vr = defaultdict(dict) self._update_kr_vr = defaultdict(dict) - reline = re.compile(r"^\|([^|]*)\|\|([^|]*)\|\|([^|]*)\|\|([^|]*).*") - # Obtain the info from https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes data = urlread(u"https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes&action=raw", 1) - data = data.split("\n") - for line in data: - for res in reline.findall(line): - only_for = res[3].strip() - if only_for in (None, '', country, language) or (country and country.startswith(only_for)): - r = res[1].strip() - c0 = res[2].strip() - tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"] - c = stablehash(c0) - self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags, - title = {'en': c0}, - detail = T_( + data = read_wiki_table(data, skip_headers = False)[1:] # Headers in the middle of the table, not supported yet in read_wiki_table + + for row in data: + only_for = row[3] + if only_for in (None, '', country, language) or (country and country.startswith(only_for)) or only_for.lower().startswith("{{taginfo"): # This also filters out the alphabetical headers + r = wikitag2text(row[1]) # replace-value + f = wikitag2text(row[0]) # to-be-replaced value + c0 = row[2] # the Osmose issue tag and issue title + tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"] + c = 
stablehash(c0) + self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags, + title = {'en': c0}, + detail = T_( '''Simple and frequent errors, can be updated by anyone on the wiki.'''), - resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes') - if u"=" in res[0]: - k = res[0].split(u"=")[0].strip() - v = res[0].split(u"=")[1].strip() - if self.quoted(k): - k = self.quoted2re(k) - if self.quoted(v): - self._update_kr_vr[k][self.quoted2re(v)] = [r, c] - else: - self._update_kr_vs[k][v] = [r, c] + resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes') + + if "=" in f: + k = f.split("=")[0].strip() + v = f.split("=")[1].strip() + if self.quoted(k): + k = self.quoted2re(k) + if self.quoted(v): + self._update_kr_vr[k][self.quoted2re(v)] = [r, c] else: - if self.quoted(v): - self._update_ks_vr[k][self.quoted2re(v)] = [r, c] - else: - self._update_ks_vs[k][v] = [r, c] + self._update_kr_vs[k][v] = [r, c] else: - if self.quoted(res[0]): - self._update_kr[self.quoted2re(res[0])] = [r, c] + if self.quoted(v): + self._update_ks_vr[k][self.quoted2re(v)] = [r, c] else: - self._update_ks[res[0]] = [r, c] + self._update_ks_vs[k][v] = [r, c] + else: + if self.quoted(f): + self._update_kr[self.quoted2re(f)] = [r, c] + else: + self._update_ks[f] = [r, c] def node(self, data, tags): err = [] @@ -142,6 +143,7 @@ class father: self.check_err(a.node(None, {"administrative": "boundary"})) self.check_err(a.node(None, {"name": "FIXME"})) self.check_err(a.node(None, {"Area": "plop"})) + self.check_err(a.node(None, {"access": "public"})) self.check_err(a.node(None, {"Fixme": "yes"})) self.check_err(a.node(None, {"voltage": "10kV"})) assert not a.node(None, {"area": "plop"}) diff --git a/plugins/modules/wikiReader.py b/plugins/modules/wikiReader.py new file mode 100644 index 000000000..d9532b9db --- /dev/null +++ b/plugins/modules/wikiReader.py @@ -0,0 +1,136 @@ +#-*- coding: utf-8 -*- + 
###########################################################################
##                                                                       ##
## Copyrights Osmose project 2024                                        ##
##                                                                       ##
## This program is free software: you can redistribute it and/or modify  ##
## it under the terms of the GNU General Public License as published by  ##
## the Free Software Foundation, either version 3 of the License, or     ##
## (at your option) any later version.                                   ##
##                                                                       ##
## This program is distributed in the hope that it will be useful,       ##
## but WITHOUT ANY WARRANTY; without even the implied warranty of        ##
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         ##
## GNU General Public License for more details.                          ##
##                                                                       ##
## You should have received a copy of the GNU General Public License     ##
## along with this program.  If not, see <http://www.gnu.org/licenses/>. ##
##                                                                       ##
###########################################################################


# This module contains functions to read MediaWiki markup: tables, templates, lists, ...

import wikitextparser
from itertools import takewhile
from typing import Union, Optional


# Get a list of lists containing all cells of a table.
# Parameters:
#   wikitext - the text of a wiki page
#   tab_index - the index of the table (if there are multiple tables on the page)
#   keep_markup - if False, everything (except templates) is converted to plain text
#   skip_headers - if True, the header rows at the top of the table are removed.
#                  Header rows further down the table are NOT removed
# Returns:
#   The cell contents, specified as a list in a list.
#   The outer list holds the rows, the inner lists hold the cells of each row.
#   A cell that is missing from a row is reported as None.
# Throws:
#   IndexError if there is no table at the specified index
def read_wiki_table(wikitext: str, tab_index: int = 0, keep_markup: bool = False, skip_headers: bool = True) -> list[list[Optional[str]]]:
    # Drop all markup, such as italics, hyperlinks, ... but keep tables and templates
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_tables=False, replace_templates=False)

    table = wikitextparser.parse(wikitext).tables[tab_index]
    rows = table.data()

    if skip_headers:
        # Only count the uninterrupted run of header rows at the top. Counting every
        # header cell of the first column would also count headers in the middle of
        # the table and then cut that many rows (including real data) off the top.
        first_column = table.cells(column=0)
        leading_headers = sum(1 for _ in takewhile(lambda c: c is not None and c.is_header, first_column))
        rows = rows[leading_headers:]

    # Remove whitespace around the cells; missing cells (None) are passed through
    return [[cell.strip() if isinstance(cell, str) else cell for cell in row] for row in rows]


# Get all instances of a certain wiki template within wikitext
# Parameters:
#   wikitext - the text of a wiki page
#   template_name - the name or names of the template to locate, e.g. 'Deprecated features/item'.
#                   Names are compared case-insensitively
#   keep_markup - if False, everything (except templates) is converted to plain text
# Returns:
#   A list containing lists of strings with values
#   [template_string, template_name, argument1, argument2, argument3, ...]
#   Example: ["{{Tag | key | value}}", "Tag", "key", "value"]
#   (Note that the template_string is affected by the markup removal, so for string
#   replace purposes, use keep_markup=True)
def read_wiki_templates(wikitext: str, template_name: Union[str, list[str]], keep_markup: bool = False) -> list[list[str]]:
    if isinstance(template_name, str):
        template_name = [template_name]
    wanted_names = {name.lower() for name in template_name}

    # Drop all markup, such as italics, hyperlinks, ... but keep templates
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)

    found = []
    for template in wikitextparser.parse(wikitext).templates:
        name = template.name.strip()
        if name.lower() not in wanted_names:
            continue
        # str(argument) starts with the '|' separator, hence the [1:]
        found.append([template.string, name] + [str(a)[1:].strip() for a in template.arguments])
    return found


# Get the stripped items of one parsed wiki list.
# With include_sublists the full items are used, which contain the list item
# symbol (*, **, #, ##, :, ...) and the subitems.
def _wiki_list_items(wiki_list, include_sublists: bool) -> list[str]:
    entries = wiki_list.fullitems if include_sublists else wiki_list.items
    return [entry.strip() for entry in entries]


# Get all entries in a list within wikitext
# Parameters:
#   wikitext - the text of a wiki page
#   list_index - the index of the list (if there are multiple lists on the page)
#   keep_markup - if False, everything (except templates) is converted to plain text
#   include_sublists - if True, include subitems. If False, only include the highest level items.
#                      When True, the list item symbol (*, **, #, ##, :, ...) is also included in the output
# Returns:
#   A list with all list items
# Throws:
#   IndexError if the list at index list_index doesn't exist
def read_wiki_list(wikitext: str, list_index: int = 0, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)

    wiki_list = wikitextparser.parse(wikitext).get_lists()[list_index]
    return _wiki_list_items(wiki_list, include_sublists)


# Get all list entries within wikitext, of all lists, concatenated in page order
# See read_wiki_list for details (excluding list_index)
def read_all_wiki_lists(wikitext: str, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)

    # Parse once and walk over every list, rather than re-parsing the whole
    # text for each list index until an IndexError signals the end
    res = []
    for wiki_list in wikitextparser.parse(wikitext).get_lists():
        res.extend(_wiki_list_items(wiki_list, include_sublists))
    return res


# Convert all instances of Tag-templates to textual tags, e.g. {{Tag|oneway|yes}} -> "oneway=yes"
# Parameters:
#   wikitext - the text of a wiki page
#   quote - whether the tag should be wrapped in ``
#   star_value - whether empty tag values should be represented by *
# Returns:
#   The wikitext with {{Tag|*}} and {{Key|*}} replaced by the textual tag.
#   Templates without any argument (e.g. a bare {{Tag}}) are left untouched.
def wikitag2text(wikitext: str, quote: bool = False, star_value: bool = True) -> str:
    wrapper = "`" if quote else ""
    empty_value = "*" if star_value else ""

    for t in read_wiki_templates(wikitext, ["Tag", "Key"], keep_markup = True):
        # t = [template_string, template_name, key, value parts...]
        if len(t) < 3:
            # No key given: nothing sensible to convert to
            continue
        k = t[2]
        # This part isn't perfect yet, there's special syntax for ;-separated, :-subkeys, :-subvalues, languages, ...
        v = "".join(t[3:]) or empty_value
        if v:
            v = "=" + v
        wikitext = wikitext.replace(t[0], f"{wrapper}{k}{v}{wrapper}")
    return wikitext


# diff --git a/plugins/tests/wikireader_test.py b/plugins/tests/wikireader_test.py (new file, index 000000000..c6fe0c5ab)
#-*- coding: utf-8 -*-
from plugins.Plugin import TestPluginCommon
from plugins.modules.wikiReader import read_wiki_table, read_wiki_templates, wikitag2text

class Test(TestPluginCommon):
    # Tag/Key templates are converted to key=value text
    def test_wikitag2text(self):
        for k in ["{{tag|abc|def}}", "{{Tag|abc|def}}", "{{ Tag | abc | def }}", "{{Key|abc|def}}", "{{Tag|abc||def}}", ]:
            assert wikitag2text(k) == "abc=def"

        for k in ["{{Tag|abc|}}", "{{tag|abc}}", "{{Key|abc}}"]:
            assert wikitag2text(k) == "abc=*"
            assert wikitag2text(k, star_value=False) == "abc"

        assert wikitag2text("{{tag|abc|def}} and {{tag|ghi|jkl}}", quote=True) == "`abc=def` and `ghi=jkl`"

        # A template without a key cannot be converted and must not raise
        assert wikitag2text("{{Tag}}") == "{{Tag}}"


    # Tables are returned as rows of stripped cells
    def test_wikitable(self):
        t = """
{| class="wikitable"
! species || species:wikidata || {{key|leaf_cycle}} || {{key|leaf_type}}
|-
| Abies alba || [[:d:Q146992|Q146992]] || evergreen || '''needleleaved'''
|-
|Abies pinsapo
|[[:d:Q849381|Q849381]]
|evergreen
|needleleaved
|-
| Ziziphus jujuba || [[:d:Q11181633|Q11181633]] || deciduous
|}"""
        # Basic table reading + missing cell
        assert read_wiki_table(t) == [
            ["Abies alba", "Q146992", "evergreen", "needleleaved"],
            ["Abies pinsapo", "Q849381", "evergreen", "needleleaved"],
            ["Ziziphus jujuba", "Q11181633", "deciduous", None]]

        # Header retention and ensuring templates like {{key|*}} are retained
        assert read_wiki_table(t, skip_headers=False) == [
            ["species", "species:wikidata", "{{key|leaf_cycle}}", "{{key|leaf_type}}"],
            ["Abies alba", "Q146992", "evergreen", "needleleaved"],
            ["Abies pinsapo", "Q849381", "evergreen", "needleleaved"],
            ["Ziziphus jujuba", "Q11181633", "deciduous", None]]

        # Ensure we can use markup if needed
        assert read_wiki_table(t, keep_markup=True) == [
            ["Abies alba", "[[:d:Q146992|Q146992]]", "evergreen", "'''needleleaved'''"],
            ["Abies pinsapo", "[[:d:Q849381|Q849381]]", "evergreen", "needleleaved"],
            ["Ziziphus jujuba", "[[:d:Q11181633|Q11181633]]", "deciduous", None]]

        # Only the header rows on top are skipped: a header in the middle of the
        # table must not cause a data row to be dropped
        t2 = """
{| class="wikitable"
! A || B
|-
| 1 || 2
|-
! C || D
|-
| 3 || 4
|}"""
        assert read_wiki_table(t2) == [
            ["1", "2"],
            ["C", "D"],
            ["3", "4"]]


    # Templates are returned as [template_string, name, arguments...]
    def test_wikitemplate(self):
        t = """
{{Deprecated features/item|lang={{{lang|}}}
|suggestion={{Tag|leaf_type}} '''or''' {{Tag|leaf_cycle}}
| 22 }}
"""
        assert read_wiki_templates(t, "Deprecated features/item")[0] == [
            "{{Deprecated features/item|lang=\n|suggestion={{Tag|leaf_type}} or {{Tag|leaf_cycle}}\n| 22 }}",
            "Deprecated features/item",
            "lang=",
            "suggestion={{Tag|leaf_type}} or {{Tag|leaf_cycle}}",
            "22"]
        assert read_wiki_templates(t, "Deprecated features/item", keep_markup = True)[0] == [
            t.strip(),
            "Deprecated features/item",
            "lang={{{lang|}}}",
            "suggestion={{Tag|leaf_type}} '''or''' {{Tag|leaf_cycle}}",
            "22"]

# diff --git a/requirements.txt b/requirements.txt index 0e51994ab..3445e1a77 100644 ---
a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ protobuf < 4 # 4.x binary not yet compatible with system package, deps of vt2geo vt2geojson tiletanic sentry-sdk +wikitextparser # Tests pytest == 7.4.4 # In v8 it skips the plugins folder, see our issue #2266 and https://github.com/pytest-dev/pytest/issues/12605