# Extract vocabulary from Japanese Wiktionary

In [407]:
from pathlib import Path
from tqdm.notebook import tqdm

LATEST_WIKITIONARY_DUMP_URL = "https://dumps.wikimedia.org/jawiktionary/latest/jawiktionary-latest-pages-articles-multistream.xml.bz2"

OUTPUT_DIR = Path("../output")
wiktionary_ainu_entries_json_path = OUTPUT_DIR / "wiktionary_ainu_entries.json"

## Extract Ainu entries from Japanese Wiktionary dump data

In [408]:
# import tempfile
# import wiktionary_dump_extractor
# import bz2
# import requests


# with tempfile.TemporaryDirectory() as temp_dir:
#     downloaded_path = Path(temp_dir) / "jawiktionary-latest-pages-articles-multistream.xml.bz2"

#     decompressed_path = Path(temp_dir) / "jawiktionary-latest-pages-articles-multistream.xml"

#     # Only download and process if dump is newer or output doesn't exist
#     with requests.get(LATEST_WIKITIONARY_DUMP_URL, stream=True) as r:
#         r.raise_for_status()
#         total_size = int(r.headers.get('content-length', 0))
#         progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True, desc="Downloading")
#         with open(downloaded_path, "wb") as f:
#             for chunk in r.iter_content(chunk_size=8192):
#                 f.write(chunk)
#                 progress_bar.update(len(chunk))
#         progress_bar.close()

#     print("Finished downloading")

#     with open(decompressed_path, "wb") as f:
#         f.write(bz2.open(downloaded_path, "rb").read())

#     print("Finished decompressing")

#     wiktionary_dump_extractor.extract_ainu_entries(
#         str(decompressed_path), str(wiktionary_ainu_entries_json_path)
#     )

#     print("Finished extracting")

In [409]:
from typing import TypedDict

class Entry(TypedDict):
    title: str
    text: str

## Filter out non-Ainu sections


In [410]:
import json
import regex as re

wiktionary_ainu_entries_filtered_json_path = OUTPUT_DIR / "wiktionary_ainu_entries_filtered.json"

with open(wiktionary_ainu_entries_json_path, "r") as f:
    wiktionary_ainu_entries: list[Entry] = json.load(f)

filtered_entries = [
    entry for entry in wiktionary_ainu_entries
    if re.match(r"^[\p{sc=Latn}=\- '’]+$", entry["title"])
]
# wiktionary_ainu_entries = [
#     entry for entry in wiktionary_ainu_entries if entry["title"]
# ]

In [411]:
from wiktionary.document import Document

filtered_ainu_entries: dict[str, str] = {}
for entry in wiktionary_ainu_entries:
    doc = Document.from_wikitext(entry["text"])
    for section in doc.sections:
        if section.title not in [
            "{{ain}}",
            "{{L|ain}}",
            "アイヌ語",
            "[[:Category:{{ain}}|{{ain}}]]",
            "[[:Category:{{ain}}|{{ain}}]]== <!--これは標準の内容の展開です。書き換えないでください-->",
        ]:
            continue
        filtered_ainu_entries[entry["title"]] = str(object=section)
        break
    # except Exception as e:
    #     print(e)
    #     print(entry["text"])

In [412]:
with open(wiktionary_ainu_entries_filtered_json_path, "w", encoding="utf-8") as f:
    json.dump(filtered_ainu_entries, f, ensure_ascii=False, indent=4)

## Extract Part of Speech information from extracted Ainu entries

In [413]:
import json
import regex as re

with open(wiktionary_ainu_entries_filtered_json_path, "r") as f:
    wiktionary_ainu_entries: dict[str, str] = json.load(f)


valid_entries = {}
for title, text in tqdm(list(wiktionary_ainu_entries.items())):
    if re.search(r"^[a-z=\-]+$", title):
        valid_entries[title] = text

MAP = {
    "後置副詞": "postpadv",
    "助詞": "parti",
    "助動詞": "auxverb",
    "位置名詞": "noun",
    "動詞": "verb",
    "副詞": "adv",
    "疑問{{pronoun": "pronoun",
    "関係詞": "rel",
    "代名詞": "pron",
    "接尾辞": "suffix",
    "名詞": "noun",
    "連体詞": "adnom",
    "間投詞": "interj",
    "数詞": "num",
    "adverb": "adv",
    "interjection": "interj",
    "adjc": "verb",
    "adjective": "verb",
    "形容詞": "verb",
    "conjunction": "conj",
    "adnominal": "adnom",
    "numeral": "num",
    "pronoun": "pron",
    "pref": "prefix",
    "人称接辞": None,
    # non-pos
    "雨": None,
    "鳥": None,
    "魚": None,
    "色": None,
    "動物": None,
    "擬音語": None,
    "オノマトペ": None,
    "果実": None,
    "植物": None,
    "食品": None,
    "家族": None,
    "神事": None,
}

result = {}
for title, text in valid_entries.items():
    found = re.findall(r"\{\{head\|ain\|(?:head=.*?\|)?([^\|\}]+)[^\}]*\}\}", text)

    if "{{ain-verb" in text or "項動詞" in text:
        found.append("verb")

    if "===成句===" in text:
        found.append("colloc")

    found += re.findall(
        r"\[\[(?:Category|カテゴリ):(?:\{\{ain\}\}|アイヌ語)[_ ]\{?\{?([^\|\}\]]+)\}?\}?",
        text,
    )

    if title.startswith("-") or title.startswith("="):
        found.append("suffix")
    if title.endswith("-") or title.endswith("="):
        found.append("prefix")

    filtered = set(MAP[f] if f in MAP else f for f in found if f not in MAP or MAP[f])
    result[title] = filtered

result["tuki"] = {"noun"}

for title, pos in result.items():
    if not pos:
        print(title)
    if any(not re.match(r"^\p{sc=Latn}+$", p) for p in pos):
        print(title, pos)

with open(
    OUTPUT_DIR / "wiktionary_ainu_part_of_speech.json", "w", encoding="utf-8"
) as f:
    json.dump({k: list(v) for k, v in result.items()}, f, ensure_ascii=False, indent=4)

print("-" * 100)
all_pos = set()
for v in result.values():
    all_pos.update(v)

for pos in sorted(all_pos):
    print(pos)

# TODO: UPOS - XPOS

  0%|          | 0/2259 [00:00<?, ?it/s]

orun
----------------------------------------------------------------------------------------------------
adnom
adv
auxverb
colloc
conj
determiner
interj
noun
num
parti
postpadv
prefix
pron
pronoun
rel
root
suffix
verb


## Extract Etymology and Word Compositions from extracted Ainu entries


In [414]:
import json
import regex as re

with open(wiktionary_ainu_entries_filtered_json_path, "r") as f:
    wiktionary_ainu_entries: dict[str, str] = json.load(f)

from collections import defaultdict
# map from morpheme to its etymology
dictionary: dict[str, set[str]] = defaultdict(set)

# map from term to its composition and glossing
composition_dictionary: dict[str, list[tuple[str, str]]] = defaultdict(list)

for title, text in tqdm(list(wiktionary_ainu_entries.items())):
    if re.search(r"\{\{affix\|ain\|.*?\}\}", text):
        for affix in re.finditer(r"\{\{affix\|ain\|(.*)\}\}[。<]?", text):
            arguments = affix.group(1).replace("{{=}}", "=").split("|")
            # print(arguments)

            positional_arguments = []
            keyword_arguments = {}

            for argument in arguments:
                if "=" in argument:
                    key, value = argument.split("=")
                    keyword_arguments[key] = value
                else:
                    positional_arguments.append(argument)

            # print(positional_arguments)
            # print(keyword_arguments)

            # TODO: handle {{m|ain|...}} + {{m|ain|...}}

            for i, argument in enumerate(positional_arguments):
                if f"t{i+1}" in keyword_arguments and keyword_arguments[f"t{i+1}"] != "":
                    gloss = keyword_arguments[f"t{i+1}"].replace("～を", "").split("}}")[0]
                    dictionary[argument].add(gloss)
                    composition_dictionary[title].append((argument, gloss))
with open(OUTPUT_DIR / "wiktionary_ainu_glossed_morphemes.json", "w", encoding="utf-8") as f:
    json.dump(
        {k: list(v) for k, v in dictionary.items()}, f, ensure_ascii=False, indent=4
    )

with open(OUTPUT_DIR / "wiktionary_ainu_word_compositions.json", "w", encoding="utf-8") as f:
    json.dump(
        {k: [tuple(t) for t in v] for k, v in composition_dictionary.items()}, f, ensure_ascii=False, indent=4
    )

    # print(entry["text"])

  0%|          | 0/2259 [00:00<?, ?it/s]

## Extract Word Glosses



In [415]:
import json
import regex as re
from typing import TypedDict

with open(wiktionary_ainu_entries_filtered_json_path, "r") as f:
    wiktionary_ainu_entries: dict[str, str] = json.load(f)

from wiktionary.document import Document


class Entry(TypedDict):
    lemma: str
    pos: str
    glosses: list[str]


gloss_dictionary: dict[str, Entry] = {}

for title, text in tqdm(list(wiktionary_ainu_entries.items())):
    # print(title)
    if not re.match(r"^[\p{scx=Latn}=\- '’0-9_'’]+$", title):
        continue

    doc = Document.from_wikitext(text)
    for section in doc.sections[0].subsections:
        if section.title in [
            "{{pron}}",
            "{{etym}}",
            "{{etym}}1",
            "{{etym}}2",
            "参考文献",
            "出典",
        ]:
            continue
        # print("  " + section.title)
        result_glosses = set()

        for line in section.content.splitlines():

            if line.startswith("#"):
                if (
                    line.startswith("#*")
                    or line.startswith("#**")
                    or line.startswith("#:")
                    or line.startswith("##")
                ):
                    continue
                glosses = line.split("#", 1)[1].strip()
                glosses = re.sub(r"（.*?）", "", glosses)
                glosses = re.sub(r"\(.*?\)", "", glosses)
                glosses = re.sub(r"<!--.*?-->", "", glosses)
                glosses = re.sub(r"→.*$", "", glosses)
                for gloss in re.split(r"[、。]", glosses):
                    gloss = gloss.strip()
                    if not gloss:
                        continue
                    gloss = gloss.replace("〜", "～")
                    gloss = gloss.replace("……", "～")
                    gloss = gloss.replace("…", "～")
                    gloss = gloss.replace("...", "～")
                    gloss = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", gloss)
                    gloss = re.sub(r"\[\[(.*?)\]\]", r"\1", gloss)
                    gloss = re.sub(r"［.*?］", "", gloss)
                    gloss = re.sub(r"\{\{ふりがな\|(.*?)\|(.*?)\}\}", r"\1", gloss)
                    gloss = re.sub(
                        r"\{\{おくりがな2\|(.*?)\|(.*?)\|(.*?)\|(.*?)\}\}",
                        r"\1\3",
                        gloss,
                    )
                    gloss = re.sub(
                        r"\{\{おくりがな3\|(.*?)\|(.*?)\|(.*?)\|(.*?)\|(.*?)\|(.*?)\|(.*?)\}\}",
                        r"\1\3\4\6",
                        gloss,
                    )
                    gloss = re.sub(r"\{\{(?:lb|context|タグ|l)\|(.*?)\}\}", r"", gloss)
                    gloss = re.sub(r"<ref.*", r"", gloss)
                    gloss = gloss.replace("'", "")
                    gloss = gloss.strip()
                    if gloss.startswith("cf."):
                        continue
                    if "noun form of" in gloss:
                        gloss = re.sub(
                            r"\{\{noun form of ?\|ain\|(.*?)\|.*\|所属形.*",
                            r"\1の所属形",
                            gloss,
                        )
                    if "form of" in gloss:
                        gloss = re.sub(
                            r"\{\{form of ?\|.*所属形\|(?:tr=.*?\|)?(.*?)(?:\}|\|)",
                            r"\1の所属形",
                            gloss,
                        )
                    if "alternative form of" in gloss:
                        if "t=" in gloss:
                            gloss = re.sub(
                                r"\{\{alternative form of\|ain\|(.*?)(?:\|t=(.*?))?\}\}",
                                r"\2の別形",
                                gloss,
                            )
                        else:
                            gloss = re.sub(
                                r"\{\{alternative form of\|ain\|(.*?)\}\}",
                                r"\1の別形",
                                gloss,
                            )

                    if "verb form of" in gloss:
                        gloss = re.sub(
                            r"\{\{verb form of\|ain\|(.*?)\|.*\|(?:tr=.*?\|)?t=(.*?)($|\}\})",
                            r"\2",
                            gloss,
                        )
                        # plural of|a|lang=ain
                    if "plural of" in gloss:
                        gloss = re.sub(
                            r"\{\{plural of\|(.*?)\|",
                            r"\1",
                            gloss,
                        )

                        # {'verb form of|ain|rewsian||p|tr={{ain-kana-conv|rewsian}}|t=一晩泊まる'}
                    gloss = gloss.strip("{}")

                    if "{{quote|ain" in gloss:
                        continue
                    if "|from=" in gloss:
                        continue
                    if "ux" in gloss:
                        continue
                    if "|ref=" in gloss:
                        continue
                    if "akana" in gloss:
                        continue
                    if "|t=" in gloss:
                        continue
                    if "lang|" in gloss:
                        continue
                    if gloss:
                        result_glosses.add(gloss)

        # print("    " + str(result_glosses))

        cleaned_pos = section.title.strip("{}")

        pos = MAP[cleaned_pos] if cleaned_pos in MAP else cleaned_pos

        gloss_dictionary[title] = {
            "lemma": title,
            "pos": pos,
            "glosses": list(result_glosses),
        }

        # recleaned_glosses = []
        # for gloss in cleaned_glosses:
        #     if "noun form of" in gloss:
        #         repl = re.sub(
        #             r"\{\{noun form of\|ain\|(.*?)\|\|所属形\|(.*?)\}\}\|(.*?\}\}",
        #             r"\1",
        #             gloss,
        #         )
        # recleaned_glosses.append(cleaned_gloss[repl])
        #
        # print(cleaned_glosses)
# filtered_ainu_entries: dict[str, str] = {}
# for entry in wiktionary_ainu_entries:
#     doc = Document.from_wikitext(entry["text"])
#     for section in doc.sections:
#         if section.title not in [
#             "{{ain}}",
#             "{{L|ain}}",
#             "アイヌ語",
#             "[[:Category:{{ain}}|{{ain}}]]",
#             "[[:Category:{{ain}}|{{ain}}]]== <!--これは標準の内容の展開です。書き換えないでください-->",
#         ]:
#             continue
#         filtered_ainu_entries[entry["title"]] = str(object=section)
#         break
#     # except Exception as e:
#     #     print(e)
#     #     print(entry["text"])

  0%|          | 0/2259 [00:00<?, ?it/s]

In [416]:
for i, entry in enumerate(gloss_dictionary.values()):
    for gloss in entry["glosses"]:
        if "の所属形" in gloss:
            continue
        if "の別形" in gloss:
            continue
        if "参照" in gloss:
            continue
        if "を見よ" in gloss:
            continue
        if re.match(r"[^\p{scx=Han}\p{scx=Hira}\p{scx=Kana}～]", gloss):
            print(entry["lemma"], gloss)
            del gloss_dictionary[entry["lemma"]]

i tranverb}}を{{intrverb}}化する


RuntimeError: dictionary changed size during iteration

In [406]:
for i, entry in enumerate(gloss_dictionary.values()):
    if i > 10:
        break
    print(entry)

with open(OUTPUT_DIR / "wiktionary_ainu_glosses.json", "w", encoding="utf-8") as f:
    json.dump(gloss_dictionary, f, ensure_ascii=False, indent=4)


{'lemma': 'wan', 'pos': 'num', 'glosses': ['十人', '十']}
{'lemma': 'tu', 'pos': 'num', 'glosses': ['両方', '第二', '二つ', '多くの', '二人', '沢山の']}
{'lemma': 'rak', 'pos': 'verb', 'glosses': ['～の気配がある', '～の匂いがする', '～の味がある']}
{'lemma': 'mi', 'pos': 'verb', 'glosses': ['～を着る']}
{'lemma': 'on', 'pos': 'verb', 'glosses': ['発酵する']}
{'lemma': 'ona', 'pos': 'noun', 'glosses': ['父親']}
{'lemma': 'ay', 'pos': 'noun', 'glosses': ['矢']}
{'lemma': 'oro', 'pos': 'noun', 'glosses': ['強調する', '～の所', '場所をあらわす名詞の後に置いて', '～の場所として扱えない名詞の後に置いて場所を表す名詞句を作る', '所属形 oro ですでに言及した場所を示す']}
{'lemma': 'he', 'pos': 'parti', 'glosses': ['～か']}
{'lemma': 'i', 'pos': 'prefix', 'glosses': ['意味的に目的語を補い', 'それ', 'tranverb}}を{{intrverb}}化する']}
{'lemma': 'ne', 'pos': 'parti', 'glosses': ['～に']}
