osmose-qa · frodrigo · Oct 20, 2024 · Oct 13, 2024 · Oct 13, 2024 · Oct 14, 2024
diff --git a/mypy.ini b/mypy.ini
@@ -40,3 +40,5 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-requests.packages.urllib3.util.retry]
 ignore_missing_imports = True
+[mypy-wikitextparser.*]
+ignore_missing_imports = True
diff --git a/plugins/TagFix_Deprecated.py b/plugins/TagFix_Deprecated.py
@@ -23,63 +23,39 @@
 from plugins.Plugin import Plugin
 from modules.downloader import urlread
 from modules.Stablehash import stablehash
-import re
+from plugins.modules.wikiReader import read_wiki_templates,wikitag2text
 
 
 class TagFix_Deprecated(Plugin):
     def deprecated_list(self):
         wikiRoot = 'https://wiki.openstreetmap.org/wiki'
         data = urlread(wikiRoot + '/Template:Deprecated_features?action=raw', 1)
 
-        # Tidy data up for processing
-        # Eliminate wiki bold formatting
-        data = data.replace("'''", "")
-
-        # Remove HTML newlines
-        data = re.sub(r'<br\s*/>', ' ', data)
-
         # Remove excess whitespace (also removes all newlines)
         data = " ".join(data.split())
 
-        # Eliminate any whitespace around pipe characters
-        # This makes reading the template parameters simple
-        data = re.sub(r'\s?\|\s?', '|', data)
-
-        # Eliminate templates to prevent unexpected pipe characters
-        data = re.sub(r'{{{\s?lang\s?\|?\s?}}}', '', data, flags=re.I)
-        # Tag template can take one or two params, with trailing | possible
-        data = re.sub(
-            r'{{(?:Tag|Key)\s?\|(.+?)\|?\s?}}',
-            lambda x: '`{}`'.format(x.group(1).replace("||", "=").replace("|", "=")),
-            data,
-            flags=re.I
-        )
-
-        # Resolve interwiki links now
-        data = re.sub(
-            r'\[\[(.+?)\]\]',
-            lambda x: '[{}]({}/{})'.format(x.group(1), wikiRoot, x.group(1).replace(" ", "_")),
-            data
-        )
+        data = read_wiki_templates(data, "Deprecated features/item")
 
         deprecated = {}
-        for feature in data.split(r'{{Deprecated features/item')[1:]:
-            # Unaccounted for template present in this feature
-            if r'{{' in feature:
-                continue
-
+        for feature in data:
             src_key, src_val, dest = None, None, None
-            for param in feature.split('|'):
+            for param in feature[2:]:
+                # Convert {{Tag|k|v}} to k=v
+                param = wikitag2text(param, quote = True, star_value = False)
                 if '=' not in param:
                     continue
+                if '{{' in param:
+                    # Unaccounted for template present in this feature
+                    src_key, src_val, dest = None, None, None
+                    break
 
                 k, v = param.split('=', 1)
-                # k will always start with the param because we removed whitespace around | earlier
-                if (k.rstrip() == 'dkey'):
+                k = k.rstrip()
+                if k == 'dkey':
                     src_key = v
-                elif (k.rstrip() == 'dvalue'):
+                elif k == 'dvalue':
                     src_val = v
-                elif (k.rstrip() == 'suggestion'):
+                elif k == 'suggestion':
                     dest = v
 
             # Sanity check in case formatting changes or something

diff --git a/plugins/TagFix_Postcode.py b/plugins/TagFix_Postcode.py
@@ -22,6 +22,7 @@
 from modules.OsmoseTranslation import T_
 from plugins.Plugin import Plugin
 from modules.downloader import urlread
+from plugins.modules.wikiReader import read_wiki_table
 import re
 
 
@@ -50,21 +51,18 @@ def parse_format(self, reline, format):
         elif len(regexs) == 1:
             return "^"+regexs[0]+"$"
 
-    def clean_line(self, line):
-        # Clean wiki templates and links
-        line = re.sub(self.reWikiTemplate, "", line) # Remove all templates, e.g. {{Date|2000-01-01}}, may contain pipes
-        return re.sub(self.reWikiPageLink, "\\1", line) # Replace all links by their text value, so [[x|y]] and [[y]] both become y
-
     def list_postcode(self):
         reline = re.compile("^[-CAN ]+$")
-        # remline = re.compile("^[-CAN ]+ *\([-CAN ]+\)$")
         data = urlread(u"https://en.wikipedia.org/wiki/List_of_postal_codes?action=raw", 1)
-        data = filter(lambda t: len(t) > 2 and ("no codes" not in t[1].lower() or t[2] != ""), map(lambda x: list(map(lambda y: y.strip(), self.clean_line(x).split("|")))[3:6], data.split("|-")[1:-1]))
+        data = read_wiki_table(data)
+
         postcode = {}
-        for line in data:
-            iso = line[0][0:2]
-            format_area = line[1]
-            format_street = line[2]
+        for row in data:
+            iso = row[2]
+            format_area = row[3]
+            format_street = row[4]
+            if (not format_area or "no codes" in format_area.lower()) and not format_street:
+                continue
 
             postcode[iso] = {}
             if format_area != '':
@@ -91,8 +89,6 @@ def init(self, logger):
 [Wikipedia](https://en.wikipedia.org/wiki/List_of_postal_codes)'''),
             resource = 'https://en.wikipedia.org/wiki/List_of_postal_codes')
 
-        self.reWikiTemplate = re.compile(r'\{\{[^}]+\}\}')
-        self.reWikiPageLink = re.compile(r'\[\[[^]]*?([^]|]+)\]\]')
         self.Country = None
         if self.father.config.options.get("country"):
             self.Country = self.father.config.options.get("country").split("-")[0]

diff --git a/plugins/TagFix_Tree.py b/plugins/TagFix_Tree.py
@@ -22,6 +22,7 @@
 from modules.OsmoseTranslation import T_
 from plugins.Plugin import Plugin
 from modules.downloader import urlread
+from plugins.modules.wikiReader import read_wiki_table
 
 
 class TagFix_Tree(Plugin):
@@ -34,7 +35,7 @@ def _read_leaf_properties_table(self):
         allowed_leaf_cycle = ("evergreen", "deciduous")
 
         data = urlread(u"https://wiki.openstreetmap.org/w/index.php?title=Tag:natural%3Dtree/List_of_Species&action=raw", 1)
-        data = list(map(lambda x: list(filter(lambda z: len(z) > 0, map(lambda y: y.strip(), x.split("|")))), data.split("|-")[1:-1]))
+        data = read_wiki_table(data)
         species_map = {}
         for row in data: # data: list of [species, species:wikidata, leaf_cycle, leaf_type]
             this_species = {}

diff --git a/plugins/TagWatchFrViPofm.py b/plugins/TagWatchFrViPofm.py
@@ -25,6 +25,7 @@
 from modules.Stablehash import stablehash, stablehash64
 import re
 from collections import defaultdict
+from plugins.modules.wikiReader import read_wiki_table, wikitag2text
 
 
 class TagWatchFrViPofm(Plugin):
@@ -52,43 +53,43 @@ def init(self, logger):
         self._update_ks_vr = defaultdict(dict)
         self._update_kr_vr = defaultdict(dict)
 
-        reline = re.compile(r"^\|([^|]*)\|\|([^|]*)\|\|([^|]*)\|\|([^|]*).*")
-
         # Obtain the info from https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes
         data = urlread(u"https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes&action=raw", 1)
-        data = data.split("\n")
-        for line in data:
-            for res in reline.findall(line):
-                only_for = res[3].strip()
-                if only_for in (None, '', country, language) or (country and country.startswith(only_for)):
-                    r = res[1].strip()
-                    c0 = res[2].strip()
-                    tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
-                    c = stablehash(c0)
-                    self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
-                        title = {'en': c0},
-                        detail = T_(
+        data = read_wiki_table(data, skip_headers = False)[1:] # Headers in the middle of the table, not supported yet in read_wiki_table
+
+        for row in data:
+            only_for = row[3]
+            if only_for in (None, '', country, language) or (country and country.startswith(only_for)) or only_for.lower().startswith("{{taginfo"): # This also filters out the alphabetical headers
+                r = wikitag2text(row[1]) # replace-value
+                f = wikitag2text(row[0]) # to-be-replaced value
+                c0 = row[2] # the Osmose issue tag and issue title
+                tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
+                c = stablehash(c0)
+                self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
+                    title = {'en': c0},
+                    detail = T_(
 '''Simple and frequent errors, can be updated by anyone on the wiki.'''),
-                        resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')
-                    if u"=" in res[0]:
-                        k = res[0].split(u"=")[0].strip()
-                        v = res[0].split(u"=")[1].strip()
-                        if self.quoted(k):
-                            k = self.quoted2re(k)
-                            if self.quoted(v):
-                                self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
-                            else:
-                                self._update_kr_vs[k][v] = [r, c]
+                    resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')
+
+                if "=" in f:
+                    k = f.split("=")[0].strip()
+                    v = f.split("=")[1].strip()
+                    if self.quoted(k):
+                        k = self.quoted2re(k)
+                        if self.quoted(v):
+                            self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
                         else:
-                            if self.quoted(v):
-                                self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
-                            else:
-                                self._update_ks_vs[k][v] = [r, c]
+                            self._update_kr_vs[k][v] = [r, c]
                     else:
-                        if self.quoted(res[0]):
-                            self._update_kr[self.quoted2re(res[0])] = [r, c]
+                        if self.quoted(v):
+                            self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
                         else:
-                            self._update_ks[res[0]] = [r, c]
+                            self._update_ks_vs[k][v] = [r, c]
+                else:
+                    if self.quoted(f):
+                        self._update_kr[self.quoted2re(f)] = [r, c]
+                    else:
+                        self._update_ks[f] = [r, c]
 
     def node(self, data, tags):
         err = []
@@ -142,6 +143,7 @@ class father:
         self.check_err(a.node(None, {"administrative": "boundary"}))
         self.check_err(a.node(None, {"name": "FIXME"}))
         self.check_err(a.node(None, {"Area": "plop"}))
+        self.check_err(a.node(None, {"access": "public"}))
         self.check_err(a.node(None, {"Fixme": "yes"}))
         self.check_err(a.node(None, {"voltage": "10kV"}))
         assert not a.node(None, {"area": "plop"})

diff --git a/plugins/modules/wikiReader.py b/plugins/modules/wikiReader.py
@@ -0,0 +1,136 @@
+#-*- coding: utf-8 -*-
+
+###########################################################################
+##                                                                       ##
+## Copyrights Osmose project 2024                                        ##
+##                                                                       ##
+## This program is free software: you can redistribute it and/or modify  ##
+## it under the terms of the GNU General Public License as published by  ##
+## the Free Software Foundation, either version 3 of the License, or     ##
+## (at your option) any later version.                                   ##
+##                                                                       ##
+## This program is distributed in the hope that it will be useful,       ##
+## but WITHOUT ANY WARRANTY; without even the implied warranty of        ##
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         ##
+## GNU General Public License for more details.                          ##
+##                                                                       ##
+## You should have received a copy of the GNU General Public License     ##
+## along with this program.  If not, see <http://www.gnu.org/licenses/>. ##
+##                                                                       ##
+###########################################################################
+
+
+# This module file contains functions to read MediaWiki markup tables, templates, lists, ...
+
+import wikitextparser
+from typing import Union, Optional
+
+# Get a list of lists containing all cells of a table.
+# Parameters:
+#   wikitext - the wikitext (MediaWiki markup) of a wiki page
+#   tab_index - the index of the table (if there are multiple tables on the page)
+#   keep_markup - if False, everything (except templates) will be converted to plain text
+#   skip_headers - if True, header rows are removed. Assumes all header rows are at the top of the table
+# Returns:
+#   The cell contents, as a list of lists.
+#   The outer list holds the rows; each inner list holds the cells of that row
+# Throws:
+#   If the table at the specified index isn't found
+def read_wiki_table(wikitext: str, tab_index: int = 0, keep_markup: bool = False, skip_headers: bool = True) -> list[list[Optional[str]]]:
+    # Drops all markup, such as italics, hyperlinks, ...
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_tables=False, replace_templates=False)
+
+    t = wikitextparser.parse(wikitext).tables[tab_index]
+
+    # Remove header rows if desired
+    removable_header_rows = 0
+    if skip_headers:
+        removable_header_rows = len(list(filter(lambda c: c.is_header, t.cells(column=0))))
+    t = t.data()[removable_header_rows:]
+
+    # Remove whitespace around the cells
+    return list(map(lambda row: list(map(lambda c: c.strip() if isinstance(c, str) else c, row)), t))
+
+
+# Get all instances of a certain wiki template within wikitext
+# Parameters:
+#   wikitext - the wikitext (MediaWiki markup) of a wiki page
+#   template_name - the name or names of the template to locate (matched case-insensitively), e.g. 'Deprecated features/item'
+#   keep_markup - if False, everything (except templates) will be converted to plain text
+# Returns:
+#   A list containing lists of strings with values [template_string, template_name, argument1, argument2, argument3, ...]
+#   Example: ["{{Tag | key | value}}", "Tag", "key", "value"]
+#   (Note that the template_string is affected by the markup removal, so for string replace purposes, use keep_markup=True)
+def read_wiki_templates(wikitext: str, template_name: Union[str, list[str]], keep_markup: bool = False) -> list[list[str]]:
+    if isinstance(template_name, str):
+        template_name = [template_name]
+    template_name = list(map(str.lower, template_name))
+
+    # Drops all markup, such as italics, hyperlinks, ...
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)
+
+    # Get all templates that match the filter
+    template_objects = list(filter(lambda t: t.name.strip().lower() in template_name, wikitextparser.parse(wikitext).templates))
+
+    return list(map(lambda t: [t.string, t.name.strip()] + [str(a)[1:].strip() for a in t.arguments], template_objects))
+
+
+# Get all entries in a list within wikitext
+# Parameters:
+#   wikitext - the wikitext (MediaWiki markup) of a wiki page
+#   list_index - the index of the list (if there are multiple lists on the page)
+#   keep_markup - if False, everything (except templates) will be converted to plain text
+#   include_sublists - if True, include subitems. If False, only include the highest-level items
+#       When True, the list item symbol (*, **, #, ##, :, ...) will also be included in the output
+# Returns:
+#   A list with all list items
+# Throws:
+#   IndexError if the list at index list_index doesn't exist
+def read_wiki_list(wikitext: str, list_index: int = 0, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)
+
+    lst = wikitextparser.parse(wikitext).get_lists()[list_index]
+    if include_sublists:
+        # Note this contains the list identifier, e.g. *, **, #, ##
+        return list(map(str.strip, lst.fullitems))
+    return list(map(str.strip, lst.items))
+
+
+# Get all list entries within wikitext
+# See read_wiki_list for details (excluding list_index)
+def read_all_wiki_lists(wikitext: str, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
+    res = []
+    if not keep_markup:
+        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)
+
+    try:
+        list_index = 0
+        while True:
+            res.extend(read_wiki_list(wikitext, list_index=list_index, keep_markup=True, include_sublists=include_sublists))
+            list_index += 1
+    except IndexError:
+        return res
+
+
+# Convert all instances of Tag and Key templates to textual tags, e.g. {{Tag|oneway|yes}} -> "oneway=yes"
+# Parameters:
+#   wikitext - the wikitext (MediaWiki markup) of a wiki page
+#   quote - whether the tag should be wrapped in ``
+#   star_value - whether missing or empty tag values should be represented by *
+# Returns:
+#   The wikitext with every {{Tag|...}} and {{Key|...}} template replaced by the textual tag
+def wikitag2text(wikitext: str, quote: bool = False, star_value: bool = True) -> str:
+    tag_templates = read_wiki_templates(wikitext, ["Tag", "Key"], keep_markup = True)
+    for t in tag_templates:
+        k = t[2]
+        # This part isn't perfect yet, there's special syntax for ;-separated, :-subkeys, :-subvalues, languages, ...
+        v = "*" if star_value else ""
+        if len(t) > 3:
+            v = "".join(t[3:]) or v
+        if v:
+            v = "=" + v
+        wikitext = wikitext.replace(t[0], "{2}{0}{1}{2}".format(k, v, "`" if quote else ""))
+    return wikitext