diff --git a/mypy.ini b/mypy.ini
index f17ceaed6..f5b4fd823 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -40,3 +40,5 @@ ignore_missing_imports = True
ignore_missing_imports = True
[mypy-requests.packages.urllib3.util.retry]
ignore_missing_imports = True
+[mypy-wikitextparser.*]
+ignore_missing_imports = True
diff --git a/plugins/TagFix_Deprecated.py b/plugins/TagFix_Deprecated.py
index 2f816d53f..8760b9cf2 100644
--- a/plugins/TagFix_Deprecated.py
+++ b/plugins/TagFix_Deprecated.py
@@ -23,7 +23,7 @@
from plugins.Plugin import Plugin
from modules.downloader import urlread
from modules.Stablehash import stablehash
-import re
+from plugins.modules.wikiReader import read_wiki_templates,wikitag2text
class TagFix_Deprecated(Plugin):
@@ -31,55 +31,31 @@ def deprecated_list(self):
wikiRoot = 'https://wiki.openstreetmap.org/wiki'
data = urlread(wikiRoot + '/Template:Deprecated_features?action=raw', 1)
- # Tidy data up for processing
- # Eliminate wiki bold formatting
- data = data.replace("'''", "")
-
- # Remove HTML newlines
- data = re.sub(r'
', ' ', data)
-
# Remove excess whitespace (also removes all newlines)
data = " ".join(data.split())
- # Eliminate any whitespace around pipe characters
- # This makes reading the template parameters simple
- data = re.sub(r'\s?\|\s?', '|', data)
-
- # Eliminate templates to prevent unexpected pipe characters
- data = re.sub(r'{{{\s?lang\s?\|?\s?}}}', '', data, flags=re.I)
- # Tag template can take one or two params, with trailing | possible
- data = re.sub(
- r'{{(?:Tag|Key)\s?\|(.+?)\|?\s?}}',
- lambda x: '`{}`'.format(x.group(1).replace("||", "=").replace("|", "=")),
- data,
- flags=re.I
- )
-
- # Resolve interwiki links now
- data = re.sub(
- r'\[\[(.+?)\]\]',
- lambda x: '[{}]({}/{})'.format(x.group(1), wikiRoot, x.group(1).replace(" ", "_")),
- data
- )
+ data = read_wiki_templates(data, "Deprecated features/item")
deprecated = {}
- for feature in data.split(r'{{Deprecated features/item')[1:]:
- # Unaccounted for template present in this feature
- if r'{{' in feature:
- continue
-
+ for feature in data:
src_key, src_val, dest = None, None, None
- for param in feature.split('|'):
+ for param in feature[2:]:
+ # Convert {{Tag|k|v}} to k=v
+ param = wikitag2text(param, quote = True, star_value = False)
if '=' not in param:
continue
+ if '{{' in param:
+ # Unaccounted for template present in this feature
+ src_key, src_val, dest = None, None, None
+ break
k, v = param.split('=', 1)
- # k will always start with the param because we removed whitespace around | earlier
- if (k.rstrip() == 'dkey'):
+ k = k.rstrip()
+ if k == 'dkey':
src_key = v
- elif (k.rstrip() == 'dvalue'):
+ elif k == 'dvalue':
src_val = v
- elif (k.rstrip() == 'suggestion'):
+ elif k == 'suggestion':
dest = v
# Sanity check in case formatting changes or something
diff --git a/plugins/TagFix_Postcode.py b/plugins/TagFix_Postcode.py
index 896598f8e..36da4dea5 100644
--- a/plugins/TagFix_Postcode.py
+++ b/plugins/TagFix_Postcode.py
@@ -22,6 +22,7 @@
from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin
from modules.downloader import urlread
+from plugins.modules.wikiReader import read_wiki_table
import re
@@ -50,21 +51,18 @@ def parse_format(self, reline, format):
elif len(regexs) == 1:
return "^"+regexs[0]+"$"
- def clean_line(self, line):
- # Clean wiki templates and links
- line = re.sub(self.reWikiTemplate, "", line) # Remove all templates, e.g. {{Date|2000-01-01}}, may contain pipes
- return re.sub(self.reWikiPageLink, "\\1", line) # Replace all links by their text value, so [[x|y]] and [[y]] both become y
-
def list_postcode(self):
reline = re.compile("^[-CAN ]+$")
- # remline = re.compile("^[-CAN ]+ *\([-CAN ]+\)$")
data = urlread(u"https://en.wikipedia.org/wiki/List_of_postal_codes?action=raw", 1)
- data = filter(lambda t: len(t) > 2 and ("no codes" not in t[1].lower() or t[2] != ""), map(lambda x: list(map(lambda y: y.strip(), self.clean_line(x).split("|")))[3:6], data.split("|-")[1:-1]))
+ data = read_wiki_table(data)
+
postcode = {}
- for line in data:
- iso = line[0][0:2]
- format_area = line[1]
- format_street = line[2]
+ for row in data:
+ iso = row[2]
+ format_area = row[3]
+ format_street = row[4]
+ if (not format_area or "no codes" in format_area.lower()) and not format_street:
+ continue
postcode[iso] = {}
if format_area != '':
@@ -91,8 +89,6 @@ def init(self, logger):
[Wikipedia](https://en.wikipedia.org/wiki/List_of_postal_codes)'''),
resource = 'https://en.wikipedia.org/wiki/List_of_postal_codes')
- self.reWikiTemplate = re.compile(r'\{\{[^}]+\}\}')
- self.reWikiPageLink = re.compile(r'\[\[[^]]*?([^]|]+)\]\]')
self.Country = None
if self.father.config.options.get("country"):
self.Country = self.father.config.options.get("country").split("-")[0]
diff --git a/plugins/TagFix_Tree.py b/plugins/TagFix_Tree.py
index 6bfad7fa2..a00439a1c 100644
--- a/plugins/TagFix_Tree.py
+++ b/plugins/TagFix_Tree.py
@@ -22,6 +22,7 @@
from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin
from modules.downloader import urlread
+from plugins.modules.wikiReader import read_wiki_table
class TagFix_Tree(Plugin):
@@ -34,7 +35,7 @@ def _read_leaf_properties_table(self):
allowed_leaf_cycle = ("evergreen", "deciduous")
data = urlread(u"https://wiki.openstreetmap.org/w/index.php?title=Tag:natural%3Dtree/List_of_Species&action=raw", 1)
- data = list(map(lambda x: list(filter(lambda z: len(z) > 0, map(lambda y: y.strip(), x.split("|")))), data.split("|-")[1:-1]))
+ data = read_wiki_table(data)
species_map = {}
for row in data: # data: list of [species, species:wikidata, leaf_cycle, leaf_type]
this_species = {}
diff --git a/plugins/TagWatchFrViPofm.py b/plugins/TagWatchFrViPofm.py
index f13ddc01b..3244f88a3 100644
--- a/plugins/TagWatchFrViPofm.py
+++ b/plugins/TagWatchFrViPofm.py
@@ -25,6 +25,7 @@
from modules.Stablehash import stablehash, stablehash64
import re
from collections import defaultdict
+from plugins.modules.wikiReader import read_wiki_table, wikitag2text
class TagWatchFrViPofm(Plugin):
@@ -52,43 +53,43 @@ def init(self, logger):
self._update_ks_vr = defaultdict(dict)
self._update_kr_vr = defaultdict(dict)
- reline = re.compile(r"^\|([^|]*)\|\|([^|]*)\|\|([^|]*)\|\|([^|]*).*")
-
# Obtain the info from https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes
data = urlread(u"https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes&action=raw", 1)
- data = data.split("\n")
- for line in data:
- for res in reline.findall(line):
- only_for = res[3].strip()
- if only_for in (None, '', country, language) or (country and country.startswith(only_for)):
- r = res[1].strip()
- c0 = res[2].strip()
- tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
- c = stablehash(c0)
- self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
- title = {'en': c0},
- detail = T_(
+ data = read_wiki_table(data, skip_headers = False)[1:] # Headers in the middle of the table, not supported yet in read_wiki_table
+
+ for row in data:
+ only_for = row[3]
+ if only_for in (None, '', country, language) or (country and country.startswith(only_for)) or only_for.lower().startswith("{{taginfo"): # This also filters out the alphabetical headers
+ r = wikitag2text(row[1]) # replace-value
+ f = wikitag2text(row[0]) # to-be-replaced value
+ c0 = row[2] # the Osmose issue tag and issue title
+ tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
+ c = stablehash(c0)
+ self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
+ title = {'en': c0},
+ detail = T_(
'''Simple and frequent errors, can be updated by anyone on the wiki.'''),
- resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')
- if u"=" in res[0]:
- k = res[0].split(u"=")[0].strip()
- v = res[0].split(u"=")[1].strip()
- if self.quoted(k):
- k = self.quoted2re(k)
- if self.quoted(v):
- self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
- else:
- self._update_kr_vs[k][v] = [r, c]
+ resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')
+
+ if "=" in f:
+ k = f.split("=")[0].strip()
+ v = f.split("=")[1].strip()
+ if self.quoted(k):
+ k = self.quoted2re(k)
+ if self.quoted(v):
+ self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
else:
- if self.quoted(v):
- self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
- else:
- self._update_ks_vs[k][v] = [r, c]
+ self._update_kr_vs[k][v] = [r, c]
else:
- if self.quoted(res[0]):
- self._update_kr[self.quoted2re(res[0])] = [r, c]
+ if self.quoted(v):
+ self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
else:
- self._update_ks[res[0]] = [r, c]
+ self._update_ks_vs[k][v] = [r, c]
+ else:
+ if self.quoted(f):
+ self._update_kr[self.quoted2re(f)] = [r, c]
+ else:
+ self._update_ks[f] = [r, c]
def node(self, data, tags):
err = []
@@ -142,6 +143,7 @@ class father:
self.check_err(a.node(None, {"administrative": "boundary"}))
self.check_err(a.node(None, {"name": "FIXME"}))
self.check_err(a.node(None, {"Area": "plop"}))
+ self.check_err(a.node(None, {"access": "public"}))
self.check_err(a.node(None, {"Fixme": "yes"}))
self.check_err(a.node(None, {"voltage": "10kV"}))
assert not a.node(None, {"area": "plop"})
diff --git a/plugins/modules/wikiReader.py b/plugins/modules/wikiReader.py
new file mode 100644
index 000000000..d9532b9db
--- /dev/null
+++ b/plugins/modules/wikiReader.py
@@ -0,0 +1,136 @@
+#-*- coding: utf-8 -*-
+
+###########################################################################
+## ##
+## Copyrights Osmose project 2024 ##
+## ##
+## This program is free software: you can redistribute it and/or modify ##
+## it under the terms of the GNU General Public License as published by ##
+## the Free Software Foundation, either version 3 of the License, or ##
+## (at your option) any later version. ##
+## ##
+## This program is distributed in the hope that it will be useful, ##
+## but WITHOUT ANY WARRANTY; without even the implied warranty of ##
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ##
+## GNU General Public License for more details. ##
+## ##
+## You should have received a copy of the GNU General Public License ##
+## along with this program. If not, see <http://www.gnu.org/licenses/>. ##
+## ##
+###########################################################################
+
+
+# This module file contains functions to read MediaWiki markup tables, templates, lists, ...
+
+import wikitextparser
+from typing import Union, Optional
+
+# Get a list of lists containing all cells of a table.
+# Parameters:
+#   wikitext - the text of a wikipedia page
+#   tab_index - the index of the table (if there's multiple tables on the wiki)
+#   keep_markup - if False, everything (except Templates) will be converted to plain text
+#   skip_headers - if True, the uninterrupted run of header rows at the top of the table is removed.
+#                  A row counts as a header row when its first cell is a header cell.
+#                  Header rows further down the table are kept, so no data rows get lost.
+# Returns:
+#   The cell contents, specified as a list in a list.
+#   The outer list is the rows, the inner list are the cells in that row.
+#   Cells that are missing in the wikitext are represented by None.
+# Throws:
+#   IndexError if the table at the specified index isn't found
+def read_wiki_table(wikitext: str, tab_index: int = 0, keep_markup: bool = False, skip_headers: bool = True) -> list[list[Optional[str]]]:
+    # Drops all markup, such as italics, hyperlinks, ...
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_tables=False, replace_templates=False)
+
+    table = wikitextparser.parse(wikitext).tables[tab_index]
+
+    # Count only the header rows at the very top. Counting every header cell of the
+    # first column would also count headers in the middle of the table, and slicing
+    # that many rows off the top would then throw away real data rows.
+    leading_header_rows = 0
+    if skip_headers:
+        for first_cell in table.cells(column=0):
+            # NOTE(review): a row without a first cell is presumably reported as None - treat it as data
+            if first_cell is None or not first_cell.is_header:
+                break
+            leading_header_rows += 1
+
+    rows = table.data()[leading_header_rows:]
+
+    # Remove whitespace around the cells, missing cells (None) are passed through unchanged
+    return [[cell.strip() if isinstance(cell, str) else cell for cell in row] for row in rows]
+
+
+# Collect every occurrence of one or more wiki templates within wikitext
+# Parameters:
+#   wikitext - the text of a wikipedia page
+#   template_name - a single template name or a list of names to look for, e.g. 'Deprecated features/item'
+#                   (names are compared case-insensitively)
+#   keep_markup - if False, everything (except Templates) will be converted to plain text
+# Returns:
+#   One list per matching template: [template_string, template_name, argument1, argument2, argument3, ...]
+#   Example: ["{{Tag | key | value}}", "Tag", "key", "value"]
+#   (The template_string is taken after markup removal, so use keep_markup=True when it is needed for string replacement)
+def read_wiki_templates(wikitext: str, template_name: Union[str, list[str]], keep_markup: bool = False) -> list[list[str]]:
+    wanted_names = [template_name] if isinstance(template_name, str) else template_name
+    wanted_names = [str.lower(name) for name in wanted_names]
+
+    # Drops all markup, such as italics, hyperlinks, ...
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)
+
+    found = []
+    for template in wikitextparser.parse(wikitext).templates:
+        name = template.name.strip()
+        if name.lower() not in wanted_names:
+            continue
+        # The string form of an argument starts with its pipe character, drop it
+        arguments = [str(argument)[1:].strip() for argument in template.arguments]
+        found.append([template.string, name] + arguments)
+    return found
+
+
+# Read the entries of a single list within wikitext
+# Parameters:
+#   wikitext - the text of a wikipedia page
+#   list_index - the index of the list (if there's multiple lists on the wiki)
+#   keep_markup - if False, everything (except Templates) will be converted to plain text
+#   include_sublists - if True, subitems are included and every entry keeps its
+#                      list item symbol (*, **, #, ##, :, ...). If False, only the
+#                      highest level items are returned
+# Returns:
+#   A list with all list items, stripped of surrounding whitespace
+# Throws:
+#   If the list at index list_index doesn't exist
+def read_wiki_list(wikitext: str, list_index: int = 0, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
+    source = wikitext if keep_markup else wikitextparser.remove_markup(wikitext, replace_templates=False)
+    wiki_list = wikitextparser.parse(source).get_lists()[list_index]
+
+    # fullitems contains the list identifier, e.g. *, **, #, ##
+    entries = wiki_list.fullitems if include_sublists else wiki_list.items
+    return [str.strip(entry) for entry in entries]
+
+
+# Get the entries of all lists within wikitext, concatenated in the order the lists are found
+# Parameters:
+#   wikitext - the text of a wikipedia page
+#   keep_markup - if False, everything (except Templates) will be converted to plain text
+#   include_sublists - if True, subitems are included and every entry keeps its
+#                      list item symbol (*, **, #, ##, :, ...). If False, only the
+#                      highest level items are returned
+# Returns:
+#   A single flat list with the items of every list, stripped of surrounding whitespace.
+#   Empty if the wikitext contains no lists
+def read_all_wiki_lists(wikitext: str, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)
+
+    # Parse once and walk over the lists, rather than re-parsing the whole page for
+    # every list index and relying on an IndexError to stop (which would also
+    # silently hide an unrelated IndexError)
+    res = []
+    for lst in wikitextparser.parse(wikitext).get_lists():
+        # fullitems contains the list identifier, e.g. *, **, #, ##
+        entries = lst.fullitems if include_sublists else lst.items
+        res.extend(map(str.strip, entries))
+    return res
+
+
+# Convert all instances of Tag/Key-templates to textual tags, e.g. {{Tag|oneway|yes}} -> "oneway=yes"
+# Parameters:
+#   wikitext - the text of a wikipedia page
+#   quote - whether the tag should be wrapped in ``
+#   star_value - whether empty tag values should be represented by *
+# Returns:
+#   The wikitext with {{Tag|*}} and {{Key|*}} replaced by the textual tag.
+#   Templates without a key argument (e.g. {{Tag}}) are left untouched
+def wikitag2text(wikitext: str, quote: bool = False, star_value: bool = True) -> str:
+    wrapper = "`" if quote else ""
+    empty_value = "*" if star_value else ""
+
+    # keep_markup=True so the template string matches the input text literally for the replace below
+    for t in read_wiki_templates(wikitext, ["Tag", "Key"], keep_markup = True):
+        if len(t) < 3:
+            # No key given: there is nothing to convert, and indexing t[2] would raise IndexError
+            continue
+        k = t[2]
+        # This part isn't perfect yet, there's special syntax for ;-separated, :-subkeys, :-subvalues, languages, ...
+        v = "".join(t[3:]) or empty_value
+        if v:
+            v = "=" + v
+        wikitext = wikitext.replace(t[0], wrapper + k + v + wrapper)
+    return wikitext
diff --git a/plugins/tests/wikireader_test.py b/plugins/tests/wikireader_test.py
new file mode 100644
index 000000000..c6fe0c5ab
--- /dev/null
+++ b/plugins/tests/wikireader_test.py
@@ -0,0 +1,68 @@
+#-*- coding: utf-8 -*-
+from plugins.Plugin import TestPluginCommon
+from plugins.modules.wikiReader import read_wiki_table, read_wiki_templates, wikitag2text
+
+class Test(TestPluginCommon):
+ # Tests for the wikiReader helpers: wikitag2text, read_wiki_table and read_wiki_templates
+
+ # wikitag2text: Tag/Key templates are converted to key=value text
+ def test_wikitag2text(self):
+ # Template name casing, padding whitespace, the Key template and an empty middle argument all give the same tag
+ for k in ["{{tag|abc|def}}", "{{Tag|abc|def}}", "{{ Tag | abc | def }}", "{{Key|abc|def}}", "{{Tag|abc||def}}", ]:
+ assert wikitag2text(k) == "abc=def"
+
+ # Without a value the tag gets =* by default, or only the key with star_value=False
+ for k in ["{{Tag|abc|}}", "{{tag|abc}}", "{{Key|abc}}"]:
+ assert wikitag2text(k) == "abc=*"
+ assert wikitag2text(k, star_value=False) == "abc"
+
+ # Multiple templates in one text are all replaced, text in between is kept, quote=True wraps each tag in backticks
+ assert wikitag2text("{{tag|abc|def}} and {{tag|ghi|jkl}}", quote=True) == "`abc=def` and `ghi=jkl`"
+
+
+ # read_wiki_table: one header row, rows written both with || and with one cell per line, and a row missing its last cell
+ def test_wikitable(self):
+ t = """
+{| class="wikitable"
+! species || species:wikidata || {{key|leaf_cycle}} || {{key|leaf_type}}
+|-
+| Abies alba || [[:d:Q146992|Q146992]] || evergreen || '''needleleaved'''
+|-
+|Abies pinsapo
+|[[:d:Q849381|Q849381]]
+|evergreen
+|needleleaved
+|-
+| Ziziphus jujuba || [[:d:Q11181633|Q11181633]] || deciduous
+|}"""
+ # Basic table reading + missing cell
+ assert read_wiki_table(t) == [
+ ["Abies alba", "Q146992", "evergreen", "needleleaved"],
+ ["Abies pinsapo", "Q849381", "evergreen", "needleleaved"],
+ ["Ziziphus jujuba", "Q11181633", "deciduous", None]]
+
+ # Header retention and ensuring templates like {{key|*}} are retained
+ assert read_wiki_table(t, skip_headers=False) == [
+ ["species", "species:wikidata", "{{key|leaf_cycle}}", "{{key|leaf_type}}"],
+ ["Abies alba", "Q146992", "evergreen", "needleleaved"],
+ ["Abies pinsapo", "Q849381", "evergreen", "needleleaved"],
+ ["Ziziphus jujuba", "Q11181633", "deciduous", None]]
+
+ # Ensure we can use markup if needed
+ assert read_wiki_table(t, keep_markup=True) == [
+ ["Abies alba", "[[:d:Q146992|Q146992]]", "evergreen", "'''needleleaved'''"],
+ ["Abies pinsapo", "[[:d:Q849381|Q849381]]", "evergreen", "needleleaved"],
+ ["Ziziphus jujuba", "[[:d:Q11181633|Q11181633]]", "deciduous", None]]
+
+
+ # read_wiki_templates: a template with a named argument, nested Tag templates, bold markup and a positional argument
+ def test_wikitemplate(self):
+ t = """
+{{Deprecated features/item|lang={{{lang|}}}
+|suggestion={{Tag|leaf_type}} '''or''' {{Tag|leaf_cycle}}
+| 22 }}
+"""
+ # Default (keep_markup=False): the {{{lang|}}} parameter and the bold quotes are gone, nested templates are kept
+ assert read_wiki_templates(t, "Deprecated features/item")[0] == [
+ "{{Deprecated features/item|lang=\n|suggestion={{Tag|leaf_type}} or {{Tag|leaf_cycle}}\n| 22 }}",
+ "Deprecated features/item",
+ "lang=",
+ "suggestion={{Tag|leaf_type}} or {{Tag|leaf_cycle}}",
+ "22"]
+ # keep_markup=True: the template string equals the input text and the arguments keep their markup
+ assert read_wiki_templates(t, "Deprecated features/item", keep_markup = True)[0] == [
+ t.strip(),
+ "Deprecated features/item",
+ "lang={{{lang|}}}",
+ "suggestion={{Tag|leaf_type}} '''or''' {{Tag|leaf_cycle}}",
+ "22"]
diff --git a/requirements.txt b/requirements.txt
index 0e51994ab..3445e1a77 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ protobuf < 4 # 4.x binary not yet compatible with system package, deps of vt2geo
vt2geojson
tiletanic
sentry-sdk
+wikitextparser
# Tests
pytest == 7.4.4 # In v8 it skips the plugins folder, see our issue #2266 and https://github.com/pytest-dev/pytest/issues/12605