# Extract vocabulary from Japanese Wiktionary

In [2]:
from pathlib import Path
from tqdm.notebook import tqdm

# Location of the most recent Japanese Wiktionary "pages-articles" multistream dump.
LATEST_WIKITIONARY_DUMP_URL = (
    "https://dumps.wikimedia.org/jawiktionary/latest/"
    "jawiktionary-latest-pages-articles-multistream.xml.bz2"
)

# Directory (relative to the notebook) that receives every generated JSON file.
OUTPUT_DIR = Path("..") / "output"
# JSON file holding the Ainu entries pulled out of the dump.
wiktionary_ainu_entries_json_path = OUTPUT_DIR.joinpath("wiktionary_ainu_entries.json")

### Extract Ainu entries from Japanese Wiktionary dump data

In [4]:
import tempfile
import wiktionary_dump_extractor
import bz2
import shutil
import requests

# Create the output directory up front in case the extractor does not create
# it itself (harmless when it already exists).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The compressed and decompressed dumps live in a temporary directory, which
# is deleted when the `with` block exits.
# NOTE: the dump is downloaded again on every run; no freshness check is done.
with tempfile.TemporaryDirectory() as temp_dir:
    downloaded_path = Path(temp_dir) / "jawiktionary-latest-pages-articles-multistream.xml.bz2"
    decompressed_path = Path(temp_dir) / "jawiktionary-latest-pages-articles-multistream.xml"

    # Stream the download to disk in small chunks. The timeout keeps a stalled
    # connection from hanging the notebook forever.
    with requests.get(LATEST_WIKITIONARY_DUMP_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        # content-length may be absent; tqdm then shows a bar without a total.
        total_size = int(r.headers.get('content-length', 0))
        # Using tqdm as a context manager closes the bar even if the download fails.
        with tqdm(total=total_size, unit='iB', unit_scale=True, desc="Downloading") as progress_bar:
            with open(downloaded_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
                    progress_bar.update(len(chunk))

    print("Finished downloading")

    # Decompress as a stream so the whole XML dump is never held in memory,
    # and so both file handles are closed deterministically.
    with bz2.open(downloaded_path, "rb") as compressed:
        with open(decompressed_path, "wb") as f:
            shutil.copyfileobj(compressed, f)

    print("Finished decompressing")

    # NOTE(review): the original called `wiktionary.extract_ainu_entries`, but
    # the only module imported in this notebook is `wiktionary_dump_extractor`,
    # so the call failed with NameError on a fresh kernel. Confirm that
    # `extract_ainu_entries` is exposed by this module.
    wiktionary_dump_extractor.extract_ainu_entries(str(decompressed_path), str(wiktionary_ainu_entries_json_path))

    print("Finished extracting")


Downloading:   0%|          | 0.00/91.8M [00:00<?, ?iB/s]

Finished downloading
Finished decompressing
Finished extracting


### Extract Part of Speech information from extracted Ainu entries

In [5]:
import json
import regex as re

# Read explicitly as UTF-8 instead of the platform default, matching how this
# notebook writes its own JSON output.
# NOTE(review): assumes the extractor writes UTF-8 — confirm.
with open(wiktionary_ainu_entries_json_path, "r", encoding="utf-8") as f:
    wiktionary_ainu_entries = json.load(f)


# Keep only entries whose title consists solely of lowercase ASCII letters,
# "=" and "-"; later duplicates of a title overwrite earlier ones.
valid_entries = {}
for entry in tqdm(wiktionary_ainu_entries):
    if re.search(r"^[a-z=\-]+$", entry["title"]):
        valid_entries[entry["title"]] = entry["text"]

# Raw label found in the wikitext -> normalised part-of-speech tag.
# A value of None means "drop this label" (it is not a part of speech).
MAP = {
    "後置副詞": "postpadv",
    "助詞": "parti",
    "助動詞": "auxverb",
    "位置名詞": "noun",
    "動詞": "verb",
    "副詞": "adv",
    # NOTE(review): maps to "pronoun" while "pronoun"/"代名詞" map to "pron",
    # so both tags appear in the output — confirm whether this is intended.
    "疑問{{pronoun": "pronoun",
    "関係詞": "rel",
    "代名詞": "pron",
    "接尾辞": "suffix",
    "名詞": "noun",
    "連体詞": "adnom",
    "間投詞": "interj",
    "数詞": "num",
    "adverb": "adv",
    "interjection": "interj",
    "adjc": "verb",
    "adjective": "verb",
    "形容詞": "verb",
    "conjunction": "conj",
    "adnominal": "adnom",
    "numeral": "num",
    "pronoun": "pron",
    "pref": "prefix",
    "人称接辞": None,
    # non-pos
    "雨": None,
    "鳥": None,
    "魚": None,
    "色": None,
    "動物": None,
    "擬音語": None,
    "オノマトペ": None,
    "果実": None,
    "植物": None,
    "食品": None,
    "家族": None,
    "神事": None,
}

# title -> set of normalised part-of-speech tags
result = {}
for title, text in valid_entries.items():
    # Labels from {{head|ain|<label>...}} templates (an optional head=... argument is skipped).
    found = re.findall(r"\{\{head\|ain\|(?:head=.*?\|)?([^\|\}]+)[^\}]*\}\}", text)

    if "{{ain-verb" in text or "項動詞" in text:
        found.append("verb")

    if "===成句===" in text:
        found.append("colloc")

    # Labels from category links such as [[Category:{{ain}}_<label>]] or [[カテゴリ:アイヌ語 <label>]].
    found += re.findall(
        r"\[\[(?:Category|カテゴリ):(?:\{\{ain\}\}|アイヌ語)[_ ]\{?\{?([^\|\}\]]+)\}?\}?",
        text,
    )

    # A leading/trailing "-" or "=" in the title marks the entry as an affix.
    if title.startswith(("-", "=")):
        found.append("suffix")
    if title.endswith(("-", "=")):
        found.append("prefix")

    # Normalise known labels, drop those mapped to None, keep unknown labels as-is.
    filtered = {MAP.get(f, f) for f in found if f not in MAP or MAP[f]}
    result[title] = filtered

# Manual override for this one entry.
result["tuki"] = {"noun"}

# Report entries with no tag at all, and entries carrying a tag that is not
# purely Latin script (i.e. a label MAP does not cover yet).
for title, pos in result.items():
    if not pos:
        print(title)
    if any(not re.match(r"^\p{sc=Latn}+$", p) for p in pos):
        print(title, pos)

with open(
    OUTPUT_DIR / "wiktionary_ainu_part_of_speech.json", "w", encoding="utf-8"
) as f:
    # Sets have no stable order; sort so the file is identical between runs.
    json.dump({k: sorted(v) for k, v in result.items()}, f, ensure_ascii=False, indent=4)

print("-" * 100)
# Every distinct tag that ended up in the result, for a quick sanity check.
all_pos = set()
for v in result.values():
    all_pos.update(v)

for pos in sorted(all_pos):
    print(pos)

# TODO: UPOS - XPOS

  0%|          | 0/2318 [00:00<?, ?it/s]

ma
hima
tagis
asu
num
pitu
siva
chwast
----------------------------------------------------------------------------------------------------
adnom
adv
auxverb
colloc
conj
determiner
interj
noun
num
parti
postpadv
prefix
pron
pronoun
rel
root
suffix
verb


### Extract Etymology and Word Compositions from extracted Ainu entries


In [3]:
import json
import regex as re
from collections import defaultdict

# Read explicitly as UTF-8 instead of the platform default, matching how this
# notebook writes its own JSON output.
# NOTE(review): assumes the extractor writes UTF-8 — confirm.
with open(wiktionary_ainu_entries_json_path, "r", encoding="utf-8") as f:
    wiktionary_ainu_entries = json.load(f)

# Map from morpheme (a positional argument of {{affix|ain|...}}) to the set of
# glosses given for it through the matching t<N>= arguments.
dictionary: dict[str, set[str]] = defaultdict(set)

for entry in tqdm(iterable=wiktionary_ainu_entries):
    # finditer yields nothing when an entry has no {{affix|ain|...}} template,
    # so no separate search is needed beforehand.
    for affix in re.finditer(r"\{\{affix\|ain\|(.*)\}\}[。<]?", entry["text"]):
        # "{{=}}" is turned back into a literal "=" before the argument list is split.
        arguments = affix.group(1).replace("{{=}}", "=").split("|")

        positional_arguments = []
        keyword_arguments = {}

        for argument in arguments:
            if "=" in argument:
                # Split on the first "=" only, so a value that itself contains
                # "=" no longer raises ValueError during unpacking.
                key, value = argument.split("=", 1)
                keyword_arguments[key] = value
            else:
                positional_arguments.append(argument)

        # TODO: handle {{m|ain|...}} + {{m|ain|...}}

        # t1 glosses the first positional argument, t2 the second, and so on.
        for i, argument in enumerate(positional_arguments, start=1):
            raw_gloss = keyword_arguments.get(f"t{i}", "")
            if raw_gloss != "":
                # Strip the "～を" marker and anything after a stray "}}" that
                # the greedy match may have pulled in.
                gloss = raw_gloss.replace("～を", "").split("}}")[0]
                dictionary[argument].add(gloss)

with open(OUTPUT_DIR / "wiktionary_ainu_glossed_morphemes.json", "w", encoding="utf-8") as f:
    # Sets have no stable order; sort so the file is identical between runs.
    json.dump(
        {k: sorted(v) for k, v in dictionary.items()}, f, ensure_ascii=False, indent=4
    )

  0%|          | 0/2318 [00:00<?, ?it/s]

In [5]:
# Print the full wikitext of each entry that contains either the {{etym}}
# template or the word "語源".
for entry in tqdm(iterable=wiktionary_ainu_entries):
    text = entry["text"]
    if not any(marker in text for marker in ("{{etym}}", "語源")):
        continue
    print(text)


  0%|          | 0/2318 [00:00<?, ?it/s]