In [1]:
import json
# Load the base term list. The file is JSON, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding (which is not UTF-8 on
# every system and would break on the non-ASCII text in this data).
with open('output/ff-ainu-karahuto-terms.json', encoding="utf-8") as f:
    terms = json.load(f)

In [7]:
# Titles of the corpus books whose word lists are read below. Each title is
# used verbatim as the base name of a "<title>.tsv" file, so the strings must
# match the file names exactly.
SAKHALIN_BOOKS = {
    # textbook series
    "からふとのアイヌご（入門）",
    "カラフトのアイヌ語（初級）",
    "カラフトのアイヌ語（中級）",
    # other sources
    "浅井タケ昔話全集I,II",
    "千徳太郎治のピウスツキ宛書簡",
    "ピウスツキ記念碑",
    "ニューエクスプレス・スペシャル 日本語の隣人たち I+II",
}

# Sum every word's count over all books before building the (word, frequency)
# pairs. Storing the per-book pairs directly would give a word that occurs in
# several books several conflicting entries (or silently merge them when the
# per-book counts happen to be equal).
frequency_by_word: dict[str, int] = {}

for book in SAKHALIN_BOOKS:
    # The word lists contain non-ASCII text, so decode them explicitly as UTF-8
    # rather than with the platform default encoding.
    with open("../corpus/output/words_by_book/" + book + ".tsv", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no data; skip them instead
                # of failing on the two-column unpack below.
                continue
            # Each data line is "<word>\t<frequency>".
            word, freq = stripped.split("\t")
            frequency_by_word[word] = frequency_by_word.get(word, 0) + int(freq)

# One (word, total frequency) pair per distinct word.
words: set[tuple[str, int]] = set(frequency_by_word.items())


In [44]:
import regex
from utils.sakhalin import extrapolate_sakhalin_from_hokkaido
from typing import TypedDict


# Lemmas of the terms as loaded, in list order. This is a snapshot: it is not
# updated when entries are appended to `terms` later on.
terms_index = [entry["lemma"] for entry in terms]

class _WordRequired(TypedDict):
    """Keys that every term entry carries."""

    # Headword of the entry.
    lemma: str
    # Glosses for the lemma; may be an empty list.
    glosses: list[str]
    # Part-of-speech labels for the lemma; may be an empty list.
    poses: list[str]


class Word(_WordRequired, total=False):
    """Shape of one entry in ``terms``.

    ``lemma``, ``glosses`` and ``poses`` are always present. ``notes`` and
    ``frequency`` are optional: entries created from corpus words are built
    without ``notes``, and ``frequency`` is read with a default of 0 where it
    has not been set.
    """

    # Free-text notes; presumably only present on entries that come from the
    # base term list - TODO confirm against the input JSON.
    notes: str
    # Corpus frequency of the lemma; absent when no count was recorded.
    frequency: int

def _extrapolate_missing_forms(entries: dict) -> dict:
    """Map extrapolated spellings that ``entries`` lacks to the source values.

    Every key of ``entries`` is passed through
    ``extrapolate_sakhalin_from_hokkaido``; when the resulting spelling is not
    itself a key of ``entries``, it is added to the result with the value of
    the key it was derived from. Spellings that already exist in ``entries``
    are never overridden. If several keys yield the same new spelling, the one
    iterated last wins.

    Args:
        entries: Mapping from a written form to its data (presumably
            Hokkaido-dialect spellings, judging by the helper's name -
            TODO confirm).

    Returns:
        A new dict containing only the extrapolated spellings.
    """
    extrapolated = {}
    for form, value in entries.items():
        sakhalin_form = extrapolate_sakhalin_from_hokkaido(form)
        if sakhalin_form not in entries:
            extrapolated[sakhalin_form] = value
    return extrapolated


# Part-of-speech labels per lemma. The JSON is decoded explicitly as UTF-8 so
# the result does not depend on the platform's default encoding.
with open("output/wiktionary_ainu_part_of_speech.json", encoding="utf-8") as f:
    part_of_speech = json.load(f)

extrapolated_part_of_speech = _extrapolate_missing_forms(part_of_speech)
combined_part_of_speech = {**part_of_speech, **extrapolated_part_of_speech}

# Glosses per morpheme, extended in the same way.
with open("output/wiktionary_ainu_glossed_morphemes.json", encoding="utf-8") as f:
    glossed_morphemes = json.load(f)

extrapolated_glosses = _extrapolate_missing_forms(glossed_morphemes)
combined_glosses = {**glossed_morphemes, **extrapolated_glosses}

# Merge the corpus word counts into `terms`.
#
# `words` may hold several (word, freq) pairs for the same word, so add them
# up first; otherwise a new word would be appended once per pair and an
# existing term would end up with whichever count happened to be seen last.
corpus_frequency: dict[str, int] = {}
for word, freq in words:
    corpus_frequency[word] = corpus_frequency.get(word, 0) + freq

# Lemma -> first entry in `terms` with that lemma. Kept up to date while
# appending so that no lemma is ever added twice.
terms_by_lemma = {}
for term in terms:
    terms_by_lemma.setdefault(term["lemma"], term)

# Iterate in sorted order so the appended entries (and therefore the output
# file) come out in the same order on every run; set order is arbitrary.
for word, freq in sorted(corpus_frequency.items()):
    term = terms_by_lemma.get(word)
    if term is None:
        # Word only known from the corpus: create an entry, filling in
        # glosses / parts of speech from the Wiktionary-derived tables
        # where available.
        term = {"lemma": word, "glosses": combined_glosses.get(word, []), "poses": combined_part_of_speech.get(word, []), "frequency": freq}
        terms.append(term)
        terms_by_lemma[word] = term
    else:
        # Word already has an entry: only record its corpus frequency.
        term["frequency"] = freq

# Entries whose lemma ends in "hci" take over the glosses and parts of speech
# of the corresponding shorter lemma, when that lemma is listed in
# `terms_index`. ("hci" is presumably the plural ending and the shorter lemma
# the singular form - TODO confirm.)
#
# Build the lookups once instead of scanning the lists for every such entry.
first_term_by_lemma = {}
for t in terms:
    # setdefault keeps the first entry for a lemma.
    first_term_by_lemma.setdefault(t["lemma"], t)
dictionary_lemmas = set(terms_index)

for term in terms:
    lemma = term["lemma"]
    if not lemma.endswith("hci"):
        continue
    # For "...ahci" the final "hci" is removed; for any other "...hci" only
    # the final "ci" is removed, so the base form keeps the "h".
    singular = lemma[:-3] if lemma.endswith("ahci") else lemma[:-2]
    if singular in dictionary_lemmas:
        base = first_term_by_lemma[singular]
        term["glosses"] = base["glosses"]
        term["poses"] = base["poses"]


# Maps the part-of-speech labels found in the source data (Japanese grammar
# terms and a few English labels) to the short tags used in the output.
# Labels that are not listed here are left unchanged by the lookup that uses
# this table.
POS_TABLE = {
    "自動詞": "vi",  # intransitive verb
    "他動詞": "vt",  # transitive verb
    "複他動詞": "vd",
    "完全動詞": "vc",
    "名詞": "n",
    "連体詞": "adn",
    "形容詞": "adj",
    "副詞": "adv",
    "接続詞": "cconj",
    "助詞": "post",
    "助動詞": "auxv",
    "auxverb": "auxv",
    "終助詞": "sfp",
    "接尾辞": "sfx",
    "接頭辞": "pfx",
    "間投詞": "intj",
    "interj": "intj",
    "後置副詞": "padv",
    "人称接辞": "pers",
    "繋辞": "cop",
    "位置名詞": "nl",
    "複数形": "pl",
    "副助詞": "advp",
    "suffix": "sfx",
    "prefix": "pfx",
    "verb": "v", # TODO: Get verb slot from Wiktionary
    "代名詞": "pron",
    "格助詞": "postp",
    "接続助詞": "sconj",
    "疑問詞": "int",
    "形式名詞": "nmlz",
    "noun": "n"
}


# Replace every part-of-speech label by its short tag from POS_TABLE; labels
# without a table entry are kept as they are.
for term in terms:
    normalized = []
    for label in term["poses"]:
        normalized.append(POS_TABLE.get(label) or label)
    term["poses"] = normalized

# Sanity check: list every distinct part-of-speech tag in use, one per line,
# so that labels missing from the mapping table are easy to spot.
all_poses = {label for entry in terms for label in entry["poses"]}
for p in sorted(all_poses):
    print(p)

# Keep an entry only if it either occurs more than once in the corpus or has
# at least one gloss, and its lemma does not consist solely of digits.
_digits_only = regex.compile(r"^\d+$")

terms = [
    t
    for t in terms
    if (t.get("frequency", 0) > 1 or t["glosses"])
    and not _digits_only.match(t["lemma"])
]


# Write the merged term list. ensure_ascii=False emits the non-ASCII text
# as-is, so the file must be opened with an explicit UTF-8 encoding; the
# platform default encoding may be unable to represent these characters.
with open("output/ff-ainu-karahuto-terms-with-corpus.json", "w", encoding="utf-8") as f:
    json.dump(terms, f, ensure_ascii=False)


adn
adv
advp
aux
auxv
colloc
conj
cop
int
intj
n
nl
nmlz
num
padv
parti
pers
pfx
pl
postp
pron
rel
root
sfp
sfx
v
vc
vd
vi
vt
