In [89]:
import json
with open('output/ff-ainu-karahuto-terms.json') as f:
    terms = json.load(f)

In [90]:
SAKHALIN_BOOKS = {
    "からふとのアイヌご（入門）",
    "カラフトのアイヌ語（中級）",
    "カラフトのアイヌ語（初級）",
    "ニューエクスプレス・スペシャル 日本語の隣人たち I+II",
    "ピウスツキ記念碑",
    "千徳太郎治のピウスツキ宛書簡",
    "浅井タケ昔話全集I,II",
}

words: set[tuple[str, int]] = set()

for book in SAKHALIN_BOOKS:
    with open("../corpus/output/words_by_book/" + book + ".tsv") as f:
        for line in f:
            word, freq = line.strip().split("\t")
            words.add((word, int(freq)))

print(sorted(words, key=lambda x: x[1], reverse=True)[:10])


[('taa', 6697), ('manu', 2356), ('teh', 1320), ('ike', 1198), ('kusu', 1107), ('nah', 1054), ('tani', 1016), ('neampe', 919), ('horokewpo', 892), ('orowa', 854)]


In [91]:
terms = [
    {
        "lemma": term["lemma"],
        "ja": term["glosses"],
        "en": [],
        "ru": [],
        "poses": term["poses"],
        "frequency": 0,
    }
    for term in terms
]

for t in terms[0:5]:
    print(t["ja"])


['５つの']
['５つ']
['歩く']
['弟', '坊や']
['弟', '坊や']


In [92]:
from pathlib import Path
import regex
from utils.sakhalin import extrapolate_sakhalin_from_hokkaido
from typing import TypedDict


terms_index = [term["lemma"] for term in terms]

class Word(TypedDict):
    lemma: str
    glosses: list[str]
    poses: list[str]
    notes: str
    frequency: int

with open("output/wiktionary_ainu_part_of_speech.json") as f:
    part_of_speech = json.load(f)
    extrapolated_part_of_speech = {}
    for lemma, poses in part_of_speech.items():
        replaced = extrapolate_sakhalin_from_hokkaido(lemma)
        if replaced not in part_of_speech:
                extrapolated_part_of_speech[replaced] = poses

combined_part_of_speech = {**part_of_speech, **extrapolated_part_of_speech}

extrapolated_glosses = {}

with open("output/wiktionary_ainu_glossed_morphemes.json") as f:
    glossed_morphemes = json.load(f)
    for morpheme, glosses in glossed_morphemes.items():
        replaced = extrapolate_sakhalin_from_hokkaido(morpheme)
        if replaced not in glossed_morphemes:
            extrapolated_glosses[replaced] = glosses

combined_glosses = {**glossed_morphemes, **extrapolated_glosses}

for (word, freq) in words:
    if word not in terms_index:
        terms.append({"lemma": word, "ja": combined_glosses.get(word, []), "poses": combined_part_of_speech.get(word, []), "frequency": freq})
    else:
        term = next(term for term in terms if term["lemma"] == word)
        term["frequency"] = freq

for file in (Path("input") / "sakhalin").glob(
    "ff-ainu-karahuto-terms-with-corpus-translated-*.json"
):
    print(file)
    with open(file) as f:
        translated = json.load(f)

        for term in terms:
            if term["lemma"] in translated:
                term["ja"] = translated[term["lemma"]]["ja"]
                term["en"] = translated[term["lemma"]]["en"]
                term["ru"] = translated[term["lemma"]]["ru"]

for term in terms:
    if term["lemma"].endswith("hci"):
        if term["lemma"].endswith("ahci"):
            singular = term["lemma"][:-3]
        else:
            singular = term["lemma"][:-2]

        if singular in terms_index and not term["ja"]:
            term["ja"] = next(t for t in terms if t["lemma"] == singular)["ja"]
            term["en"] = next(t for t in terms if t["lemma"] == singular)["en"]
            term["ru"] = next(t for t in terms if t["lemma"] == singular)["ru"]

        if singular in terms_index and not term["poses"]:
            term["poses"] = next(t for t in terms if t["lemma"] == singular)["poses"]

    if term["lemma"] == "hcihi":
        if term["lemma"].endswith("ahcihi"):
            singular = term["lemma"][:-4]
        else:
            singular = term["lemma"][:-3]  
        
        if not term["poses"]:
            term["poses"] = ["n"]

        if singular in terms_index and not term["ja"]:
            term["ja"] = [g + "こと" for g in next(t for t in terms if t["lemma"] == singular)["ja"] if g.startswith("n") ]
            term["en"] = ["that " + g for g in next(t for t in terms if t["lemma"] == singular)["en"] if g.startswith("n") ]
            term["ru"] = ["то " + g for g in next(t for t in terms if t["lemma"] == singular)["ru"] if g.startswith("n") ]

POS_TABLE = {
    "自動詞": "vt",
    "他動詞": "vi",
    "複他動詞": "vd",
    "完全動詞": "vc",
    "名詞": "n",
    "連体詞": "adn",
    "形容詞": "adj",
    "副詞": "adv",
    "接続詞": "cconj",
    "助詞": "post",
    "助動詞": "auxv",
    "auxverb": "auxv",
    "終助詞": "sfp",
    "接尾辞": "sfx",
    "接頭辞": "pfx",
    "間投詞": "intj",
    "interj": "intj",
    "後置副詞": "padv",
    "人称接辞": "pers",
    "繋辞": "cop",
    "位置名詞": "nl",
    "複数形": "pl",
    "副助詞": "advp",
    "suffix": "sfx",
    "prefix": "pfx",
    "verb": "v", # TODO: Get verb slot from Wiktionary
    "代名詞": "pron",
    "格助詞": "postp",
    "接続助詞": "sconj",
    "疑問詞": "int",
    "形式名詞": "nmlz",
    "noun": "n"
}


for term in terms:
    term["poses"] = [POS_TABLE.get(p, None) or p for p in term["poses"]]

all_poses = set()
for term in terms:
    for p in term["poses"]:
        all_poses.add(p)
for p in sorted(all_poses):
    print(p)

terms = [t for t in terms if t.get("frequency", 0) > 1 or t["ja"]]

terms = [t for t in terms if not regex.match(r"^\d+$", t["lemma"])]


# for lemma, glosses in translated.items():
#     if lemma in terms_index:
#         terms[terms_index.index(lemma)]["en"] = glosses["en"]
#         terms[terms_index.index(lemma)]["ru"] = glosses["ru"]


input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-12.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-2.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-20.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-8.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-15.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-4.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-1.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-13.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-16.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-10.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-19.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-7.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-14.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-17.json
input/sakhalin/ff-ainu-karahuto-terms-w

In [93]:
with open("input/sakhalin/additional_terms.json") as f:
    additional_terms = json.load(f)
    terms.extend(additional_terms)


In [94]:
with open("output/ff-ainu-karahuto-terms-with-corpus.json", "w") as f:
    json.dump(terms, f, ensure_ascii=False)

## AI Translations from Japanese to English and Russian

In [95]:
print("All terms:", len(terms))

all_translated = [t for t in terms if t["ja"]]
print("All translated:", len(all_translated))

translated_only_ja = [t for t in all_translated if "en" not in t or not t["en"]]
print("Translated only ja:", len(translated_only_ja))

All terms: 2832
All translated: 1188
Translated only ja: 0


In [96]:
# Generate templates for AI to fill in

import json

untranslated = []

split_terms = [
    translated_only_ja[i : i + 250] for i in range(0, len(translated_only_ja), 250)
]

print(len(split_terms))

UNTRANSLATED_DIR = Path("output/karahuto-untranslated")
UNTRANSLATED_DIR.mkdir(parents=True, exist_ok=True)

for file in UNTRANSLATED_DIR.glob("*.json"):
    file.unlink()

for i, t in enumerate(split_terms):
    print(len(t))
    with open(
        UNTRANSLATED_DIR / f"ff-ainu-karahuto-terms-with-corpus-untranslated-{i}.json",
        "w",
    ) as f:
        json.dump(
            {
                term["lemma"]: {"ja": term["ja"], "en": [], "ru": []}
                for term in t
                if term["ja"]
            },
            f,
            ensure_ascii=False,
        )

0
