Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ ignore_missing_imports = True
ignore_missing_imports = True
[mypy-requests.packages.urllib3.util.retry]
ignore_missing_imports = True
[mypy-wikitextparser.*]
ignore_missing_imports = True
52 changes: 14 additions & 38 deletions plugins/TagFix_Deprecated.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,63 +23,39 @@
from plugins.Plugin import Plugin
from modules.downloader import urlread
from modules.Stablehash import stablehash
import re
from plugins.modules.wikiReader import read_wiki_templates,wikitag2text


class TagFix_Deprecated(Plugin):
def deprecated_list(self):
wikiRoot = 'https://wiki.openstreetmap.org/wiki'
data = urlread(wikiRoot + '/Template:Deprecated_features?action=raw', 1)

# Tidy data up for processing
# Eliminate wiki bold formatting
data = data.replace("'''", "")

# Remove HTML newlines
data = re.sub(r'<br\s*/>', ' ', data)

# Remove excess whitespace (also removes all newlines)
data = " ".join(data.split())

# Eliminate any whitespace around pipe characters
# This makes reading the template parameters simple
data = re.sub(r'\s?\|\s?', '|', data)

# Eliminate templates to prevent unexpected pipe characters
data = re.sub(r'{{{\s?lang\s?\|?\s?}}}', '', data, flags=re.I)
# Tag template can take one or two params, with trailing | possible
data = re.sub(
r'{{(?:Tag|Key)\s?\|(.+?)\|?\s?}}',
lambda x: '`{}`'.format(x.group(1).replace("||", "=").replace("|", "=")),
data,
flags=re.I
)

# Resolve interwiki links now
data = re.sub(
r'\[\[(.+?)\]\]',
lambda x: '[{}]({}/{})'.format(x.group(1), wikiRoot, x.group(1).replace(" ", "_")),
data
)
data = read_wiki_templates(data, "Deprecated features/item")

deprecated = {}
for feature in data.split(r'{{Deprecated features/item')[1:]:
# Unaccounted for template present in this feature
if r'{{' in feature:
continue

for feature in data:
src_key, src_val, dest = None, None, None
for param in feature.split('|'):
for param in feature[2:]:
# Convert {{Tag|k|v}} to k=v
param = wikitag2text(param, quote = True, star_value = False)
if '=' not in param:
continue
if '{{' in param:
# Unaccounted for template present in this feature
src_key, src_val, dest = None, None, None
break

k, v = param.split('=', 1)
# k will always start with the param because we removed whitespace around | earlier
if (k.rstrip() == 'dkey'):
k = k.rstrip()
if k == 'dkey':
src_key = v
elif (k.rstrip() == 'dvalue'):
elif k == 'dvalue':
src_val = v
elif (k.rstrip() == 'suggestion'):
elif k == 'suggestion':
dest = v

# Sanity check in case formatting changes or something
Expand Down
22 changes: 9 additions & 13 deletions plugins/TagFix_Postcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin
from modules.downloader import urlread
from plugins.modules.wikiReader import read_wiki_table
import re


Expand Down Expand Up @@ -50,21 +51,18 @@ def parse_format(self, reline, format):
elif len(regexs) == 1:
return "^"+regexs[0]+"$"

def clean_line(self, line):
# Clean wiki templates and links
line = re.sub(self.reWikiTemplate, "", line) # Remove all templates, e.g. {{Date|2000-01-01}}, may contain pipes
return re.sub(self.reWikiPageLink, "\\1", line) # Replace all links by their text value, so [[x|y]] and [[y]] both become y

def list_postcode(self):
reline = re.compile("^[-CAN ]+$")
# remline = re.compile("^[-CAN ]+ *\([-CAN ]+\)$")
data = urlread(u"https://en.wikipedia.org/wiki/List_of_postal_codes?action=raw", 1)
data = filter(lambda t: len(t) > 2 and ("no codes" not in t[1].lower() or t[2] != ""), map(lambda x: list(map(lambda y: y.strip(), self.clean_line(x).split("|")))[3:6], data.split("|-")[1:-1]))
data = read_wiki_table(data)

postcode = {}
for line in data:
iso = line[0][0:2]
format_area = line[1]
format_street = line[2]
for row in data:
iso = row[2]
format_area = row[3]
format_street = row[4]
if (not format_area or "no codes" in format_area.lower()) and not format_street:
continue

postcode[iso] = {}
if format_area != '':
Expand All @@ -91,8 +89,6 @@ def init(self, logger):
[Wikipedia](https://en.wikipedia.org/wiki/List_of_postal_codes)'''),
resource = 'https://en.wikipedia.org/wiki/List_of_postal_codes')

self.reWikiTemplate = re.compile(r'\{\{[^}]+\}\}')
self.reWikiPageLink = re.compile(r'\[\[[^]]*?([^]|]+)\]\]')
self.Country = None
if self.father.config.options.get("country"):
self.Country = self.father.config.options.get("country").split("-")[0]
Expand Down
3 changes: 2 additions & 1 deletion plugins/TagFix_Tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin
from modules.downloader import urlread
from plugins.modules.wikiReader import read_wiki_table


class TagFix_Tree(Plugin):
Expand All @@ -34,7 +35,7 @@ def _read_leaf_properties_table(self):
allowed_leaf_cycle = ("evergreen", "deciduous")

data = urlread(u"https://wiki.openstreetmap.org/w/index.php?title=Tag:natural%3Dtree/List_of_Species&action=raw", 1)
data = list(map(lambda x: list(filter(lambda z: len(z) > 0, map(lambda y: y.strip(), x.split("|")))), data.split("|-")[1:-1]))
data = read_wiki_table(data)
species_map = {}
for row in data: # data: list of [species, species:wikidata, leaf_cycle, leaf_type]
this_species = {}
Expand Down
64 changes: 33 additions & 31 deletions plugins/TagWatchFrViPofm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from modules.Stablehash import stablehash, stablehash64
import re
from collections import defaultdict
from plugins.modules.wikiReader import read_wiki_table, wikitag2text


class TagWatchFrViPofm(Plugin):
Expand Down Expand Up @@ -52,43 +53,43 @@ def init(self, logger):
self._update_ks_vr = defaultdict(dict)
self._update_kr_vr = defaultdict(dict)

reline = re.compile(r"^\|([^|]*)\|\|([^|]*)\|\|([^|]*)\|\|([^|]*).*")

# Obtain the info from https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes
data = urlread(u"https://wiki.openstreetmap.org/index.php?title=Tagging_mistakes&action=raw", 1)
data = data.split("\n")
for line in data:
for res in reline.findall(line):
only_for = res[3].strip()
if only_for in (None, '', country, language) or (country and country.startswith(only_for)):
r = res[1].strip()
c0 = res[2].strip()
tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
c = stablehash(c0)
self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
title = {'en': c0},
detail = T_(
data = read_wiki_table(data, skip_headers = False)[1:] # Headers in the middle of the table, not supported yet in read_wiki_table

for row in data:
only_for = row[3]
if only_for in (None, '', country, language) or (country and country.startswith(only_for)) or only_for.lower().startswith("{{taginfo"): # This also filters out the alphabetical headers
r = wikitag2text(row[1]) # replace-value
f = wikitag2text(row[0]) # to-be-replaced value
c0 = row[2] # the Osmose issue tag and issue title
tags = ["fix:chair"] if c0 == "" else [c0, "fix:chair"]
c = stablehash(c0)
self.errors[c] = self.def_class(item = 3030, level = 2, tags = tags,
title = {'en': c0},
detail = T_(
'''Simple and frequent errors, can be updated by anyone on the wiki.'''),
resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')
if u"=" in res[0]:
k = res[0].split(u"=")[0].strip()
v = res[0].split(u"=")[1].strip()
if self.quoted(k):
k = self.quoted2re(k)
if self.quoted(v):
self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
else:
self._update_kr_vs[k][v] = [r, c]
resource = 'https://wiki.openstreetmap.org/wiki/Tagging_mistakes')

if "=" in f:
k = f.split("=")[0].strip()
v = f.split("=")[1].strip()
if self.quoted(k):
k = self.quoted2re(k)
if self.quoted(v):
self._update_kr_vr[k][self.quoted2re(v)] = [r, c]
else:
if self.quoted(v):
self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
else:
self._update_ks_vs[k][v] = [r, c]
self._update_kr_vs[k][v] = [r, c]
else:
if self.quoted(res[0]):
self._update_kr[self.quoted2re(res[0])] = [r, c]
if self.quoted(v):
self._update_ks_vr[k][self.quoted2re(v)] = [r, c]
else:
self._update_ks[res[0]] = [r, c]
self._update_ks_vs[k][v] = [r, c]
else:
if self.quoted(f):
self._update_kr[self.quoted2re(f)] = [r, c]
else:
self._update_ks[f] = [r, c]

def node(self, data, tags):
err = []
Expand Down Expand Up @@ -142,6 +143,7 @@ class father:
self.check_err(a.node(None, {"administrative": "boundary"}))
self.check_err(a.node(None, {"name": "FIXME"}))
self.check_err(a.node(None, {"Area": "plop"}))
self.check_err(a.node(None, {"access": "public"}))
self.check_err(a.node(None, {"Fixme": "yes"}))
self.check_err(a.node(None, {"voltage": "10kV"}))
assert not a.node(None, {"area": "plop"})
Expand Down
136 changes: 136 additions & 0 deletions plugins/modules/wikiReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#-*- coding: utf-8 -*-

###########################################################################
## ##
## Copyrights Osmose project 2024 ##
## ##
## This program is free software: you can redistribute it and/or modify ##
## it under the terms of the GNU General Public License as published by ##
## the Free Software Foundation, either version 3 of the License, or ##
## (at your option) any later version. ##
## ##
## This program is distributed in the hope that it will be useful, ##
## but WITHOUT ANY WARRANTY; without even the implied warranty of ##
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ##
## GNU General Public License for more details. ##
## ##
## You should have received a copy of the GNU General Public License ##
## along with this program. If not, see <http://www.gnu.org/licenses/>. ##
## ##
###########################################################################


# This module contains helper functions to read MediaWiki markup: tables, templates and lists.

import wikitextparser
from typing import Union, Optional

# Read one table from MediaWiki markup and return its cells, row by row.
# Parameters:
#   wikitext     - the text of a wiki page
#   tab_index    - the index of the table (if there are multiple tables on the page)
#   keep_markup  - if False, the text is first passed through wikitextparser.remove_markup
#                  (tables and templates are kept, as they are needed afterwards)
#   skip_headers - if True, the header rows at the top of the table are removed.
#                  Header rows further down the table are left in place.
# Returns:
#   The cell contents, specified as a list in a list.
#   The outer list is the rows, the inner list are the cells in that row.
#   String cells have their surrounding whitespace removed; non-string cells are passed through unchanged.
# Throws:
#   IndexError if there is no table at tab_index
def read_wiki_table(wikitext: str, tab_index: int = 0, keep_markup: bool = False, skip_headers: bool = True) -> list[list[Optional[str]]]:
    # Drops all markup, such as italics, hyperlinks, ...
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_tables=False, replace_templates=False)

    table = wikitextparser.parse(wikitext).tables[tab_index]

    # Count only the uninterrupted run of header cells at the top of the first column.
    # Counting every header cell in the column would also count headers that appear
    # further down the table and would then cut real data rows off the top.
    first_data_row = 0
    if skip_headers:
        for cell in table.cells(column=0):
            # A missing cell (None) is not a header; it also ends the header run
            if cell is None or not cell.is_header:
                break
            first_data_row += 1

    rows = table.data()[first_data_row:]

    # Remove whitespace around the cells
    return [[cell.strip() if isinstance(cell, str) else cell for cell in row] for row in rows]


# Collect every use of the given template(s) in a piece of wikitext.
# Parameters:
#   wikitext      - the text of a wiki page
#   template_name - a single template name or a list of names, e.g. 'Deprecated features/item'.
#                   Names are compared case-insensitively.
#   keep_markup   - if False, the text is first passed through wikitextparser.remove_markup
#                   (templates are kept, as they are needed afterwards)
# Returns:
#   One list per matching template: [template_string, template_name, argument1, argument2, ...]
#   Example: ["{{Tag | key | value}}", "Tag", "key", "value"]
#   (template_string is taken after the markup removal, so for string replace purposes, use keep_markup=True)
def read_wiki_templates(wikitext: str, template_name: Union[str, list[str]], keep_markup: bool = False) -> list[list[str]]:
    wanted = [template_name] if isinstance(template_name, str) else template_name
    wanted = [str.lower(name) for name in wanted]

    # Drops all markup, such as italics, hyperlinks, ...
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)

    found = []
    for template in wikitextparser.parse(wikitext).templates:
        name = template.name.strip()
        if name.lower() not in wanted:
            continue
        # The first character of each argument string is dropped (the '|' separator in
        # front of the argument, see the example above), then whitespace is trimmed
        arguments = [str(argument)[1:].strip() for argument in template.arguments]
        found.append([template.string, name, *arguments])
    return found


# Return the items of one list found in a piece of wikitext.
# Parameters:
#   wikitext         - the text of a wiki page
#   list_index       - the index of the list (if there are multiple lists on the page)
#   keep_markup      - if False, the text is first passed through wikitextparser.remove_markup
#                      (templates are kept)
#   include_sublists - if True, return the list's "fullitems" (items including their subitems,
#                      with the list item symbol such as *, **, #, ##, : in the output).
#                      If False, return only the list's own "items".
# Returns:
#   A list with all list items, with surrounding whitespace removed
# Throws:
#   IndexError if there is no list at list_index
def read_wiki_list(wikitext: str, list_index: int = 0, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
    source = wikitext if keep_markup else wikitextparser.remove_markup(wikitext, replace_templates=False)

    wiki_list = wikitextparser.parse(source).get_lists()[list_index]

    # Note that fullitems contains the list identifier, e.g. *, **, #, ##
    entries = wiki_list.fullitems if include_sublists else wiki_list.items
    return [str.strip(entry) for entry in entries]


# Return the items of every list found in a piece of wikitext, concatenated in document order.
# Parameters:
#   wikitext         - the text of a wiki page
#   keep_markup      - if False, the text is first passed through wikitextparser.remove_markup
#                      (templates are kept)
#   include_sublists - if True, use each list's "fullitems" (subitems included, list item
#                      symbol kept); if False, use only each list's own "items"
# Returns:
#   A single flat list with the whitespace-trimmed items of all lists.
#   Empty if the text contains no lists.
# The per-list output is the same as read_wiki_list gives for each list index in turn.
def read_all_wiki_lists(wikitext: str, keep_markup: bool = False, include_sublists: bool = False) -> list[str]:
    if not keep_markup:
        wikitext = wikitextparser.remove_markup(wikitext, replace_templates=False)

    # Parse once and walk over the lists directly. Probing increasing indexes until an
    # IndexError shows up would re-parse the whole text for every list, and would also
    # hide an IndexError raised for any other reason.
    res = []
    for wiki_list in wikitextparser.parse(wikitext).get_lists():
        entries = wiki_list.fullitems if include_sublists else wiki_list.items
        res.extend(map(str.strip, entries))
    return res


# Convert all instances of Tag/Key templates to textual tags, e.g. {{Tag|oneway|yes}} -> "oneway=yes"
# Parameters:
#   wikitext   - the text of a wiki page
#   quote      - whether the resulting tag should be wrapped in backticks (`)
#   star_value - whether a missing/empty tag value should be represented by * (giving "key=*").
#                If False, a tag without value becomes just "key".
# Returns:
#   The wikitext with each {{Tag|...}} / {{Key|...}} replaced by the textual tag.
#   Templates without any argument (e.g. {{Tag}}) are left untouched.
def wikitag2text(wikitext: str, quote: bool = False, star_value: bool = True) -> str:
    wrapper = "`" if quote else ""
    default_value = "*" if star_value else ""

    # keep_markup=True so that the template string matches the input text literally for the replace below
    for template in read_wiki_templates(wikitext, ["Tag", "Key"], keep_markup = True):
        # template is [template_string, template_name, argument1, ...]
        if len(template) < 3:
            # No key argument at all: nothing sensible to convert, so keep the template as it is
            continue

        k = template[2]
        # This part isn't perfect yet, there's special syntax for ;-separated, :-subkeys, :-subvalues, languages, ...
        # All arguments after the key are glued together; this makes {{Tag|k||v}} result in k=v
        v = "".join(template[3:]) or default_value
        if v:
            v = "=" + v
        wikitext = wikitext.replace(template[0], "{2}{0}{1}{2}".format(k, v, wrapper))
    return wikitext
Loading
Loading