# Generate Karahuto data

## Load and clean ff-ainu data

In [120]:
import json
from typing import TypedDict, cast

# Typed shape of one raw term record: a lemma string plus two string lists,
# "glosses" and "poses".
RawTerm = TypedDict(
    "RawTerm",
    {
        "lemma": str,
        "glosses": list[str],
        "poses": list[str],
    },
)

# Load the raw term list from JSON. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open('output/ff-ainu-karahuto-terms.json', encoding="utf-8") as f:
    raw_terms = cast(list[RawTerm], json.load(f))

In [121]:
# Typed shape of one dictionary term: the lemma, gloss lists keyed by
# language code ("ja", "en", "ru"), the "poses" tag list, and an integer
# frequency.
Term = TypedDict(
    "Term",
    {
        "lemma": str,
        "ja": list[str],
        "en": list[str],
        "ru": list[str],
        "poses": list[str],
        "frequency": int,
    },
)

# Convert every raw record into a Term: the raw glosses become the "ja"
# glosses, "en" and "ru" start out empty, and the frequency starts at zero.
terms: list[Term] = []
for raw in raw_terms:
    terms.append(
        {
            "lemma": raw["lemma"],
            "ja": raw["glosses"],
            "en": [],
            "ru": [],
            "poses": raw["poses"],
            "frequency": 0,
        }
    )

# Preview the "ja" glosses of the first five terms.
for preview in terms[:5]:
    print(preview["ja"])

['５つの']
['５つ']
['歩く']
['弟', '坊や']
['弟', '坊や']


In [122]:
# Set of the lemmas present in `terms` at this point, used for membership
# tests further down.
terms_index = {entry["lemma"] for entry in terms}
print("Unique lemmas:", len(terms_index))

Unique lemmas: 1076


## Get morphological data from Wiktionary

### Part of speech

In [123]:
from utils.sakhalin import extrapolate_sakhalin_from_hokkaido

# Load the lemma -> part-of-speech mapping. UTF-8 is requested explicitly so
# the result does not depend on the platform's default encoding, and the
# file is closed as soon as it has been parsed.
with open("output/wiktionary_ainu_part_of_speech.json", encoding="utf-8") as f:
    part_of_speech = json.load(f)

# For every loaded lemma, compute the form returned by
# extrapolate_sakhalin_from_hokkaido() and give that form the same tags,
# unless the loaded data already has its own entry for it.
extrapolated_part_of_speech = {}
for lemma, poses in part_of_speech.items():
    replaced = extrapolate_sakhalin_from_hokkaido(lemma)
    if replaced not in part_of_speech:
        extrapolated_part_of_speech[replaced] = poses

# The two key sets are disjoint (guaranteed by the check above), so the merge
# never overrides a loaded entry.
combined_part_of_speech = {**part_of_speech, **extrapolated_part_of_speech}

### Morpheme glosses

In [124]:

# Load the morpheme -> glosses mapping. UTF-8 is requested explicitly so the
# result does not depend on the platform's default encoding, and the file is
# closed as soon as it has been parsed.
with open("output/wiktionary_ainu_glossed_morphemes.json", encoding="utf-8") as f:
    glossed_morphemes = json.load(f)

# For every loaded morpheme, compute the form returned by
# extrapolate_sakhalin_from_hokkaido() and give that form the same glosses,
# unless the loaded data already has its own entry for it.
extrapolated_glosses = {}
for morpheme, glosses in glossed_morphemes.items():
    replaced = extrapolate_sakhalin_from_hokkaido(morpheme)
    if replaced not in glossed_morphemes:
        extrapolated_glosses[replaced] = glosses

# The two key sets are disjoint (guaranteed by the check above), so the merge
# never overrides a loaded entry.
combined_glosses = {**glossed_morphemes, **extrapolated_glosses}

## Extend with AI translations

In [125]:
from pathlib import Path


# Overwrite the "ja"/"en"/"ru" glosses of every term whose lemma appears in
# one of the translated JSON files.
# NOTE(review): glob() yields files in a filesystem-dependent order; if the
# same lemma occurs in more than one file, whichever file is visited last
# wins. Confirm whether the files are disjoint.
for file in (Path("input") / "sakhalin").glob(
    "ff-ainu-karahuto-terms-with-corpus-translated-*.json"
):
    print(file)
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file, encoding="utf-8") as f:
        translated = json.load(f)

    # The file is already parsed and closed; only the in-memory mapping is
    # needed from here on.
    for term in terms:
        if term["lemma"] not in translated:
            continue
        entry = translated[term["lemma"]]
        term["ja"] = entry["ja"]
        term["en"] = entry["en"]
        term["ru"] = entry["ru"]

input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-12.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-2.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-20.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-8.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-15.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-4.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-1.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-13.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-16.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-10.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-19.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-7.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-14.json
input/sakhalin/ff-ainu-karahuto-terms-with-corpus-translated-17.json
input/sakhalin/ff-ainu-karahuto-terms-w

## Add words from corpus

In [126]:
from collections import defaultdict
import regex

# Titles of the books whose per-book word lists are read below; each title
# is also the stem of the corresponding .tsv file name.
SAKHALIN_BOOKS = {
    "からふとのアイヌご（入門）",
    "カラフトのアイヌ語（中級）",
    "カラフトのアイヌ語（初級）",
    "ニューエクスプレス・スペシャル 日本語の隣人たち I+II",
    "ピウスツキ記念碑",
    "千徳太郎治のピウスツキ宛書簡",
    "浅井タケ昔話全集I,II",
}

# Tokens that are discarded outright (compared after the "mp" -> "np" rewrite).
SKIPPED_WORDS = frozenset({"m", "horo", "hetunaa", "pii", "cooruntee", "okta"})

# Matches tokens that consist only of digits and/or punctuation. Compiled
# once instead of on every line.
DIGITS_OR_PUNCTUATION = regex.compile(r"^[\d\p{P}]+$")

# word -> frequency summed over all books.
words: dict[str, int] = defaultdict(int)

# Iterate in sorted order: iteration order of a set of strings can differ
# between interpreter runs, which would make the insertion order of `words`
# (and everything derived from it) irreproducible.
for book in sorted(SAKHALIN_BOOKS):
    with open(
        "../corpus/output/words_by_book/" + book + ".tsv", encoding="utf-8"
    ) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line has no "word<TAB>freq" pair to unpack.
                continue
            word, freq = line.split("\t")

            # Rewrite every "mp" to "np" before any further checks.
            word = word.replace("mp", "np")

            if word in SKIPPED_WORDS:
                continue

            # Drop a trailing "=" (only on tokens longer than three
            # characters) or a trailing "?".
            if (len(word) > 3 and word.endswith("=")) or word.endswith("?"):
                word = word[:-1]
            if not word:
                continue

            if DIGITS_OR_PUNCTUATION.match(word):
                continue

            words[word] += int(freq)

print(len(words))

# Ten most frequent words.
print(sorted(words.items(), key=lambda x: x[1], reverse=True)[:10])

4403
[('taa', 6713), ('manu', 2388), ('teh', 1353), ('ike', 1208), ('kusu', 1197), ('nah', 1092), ('tani', 1072), ('neanpe', 934), ('orowa', 898), ('horokewpo', 897)]


In [127]:
# Map each lemma to the FIRST term carrying it, so the frequency update below
# is a dictionary lookup instead of a linear scan per word. setdefault keeps
# the first occurrence, matching what a front-to-back search would return.
first_term_by_lemma = {}
for existing in terms:
    first_term_by_lemma.setdefault(existing["lemma"], existing)

for word, freq in words.items():
    if word not in terms_index:
        # Word seen only in the corpus: create a new term. "en" and "ru" are
        # included (empty) so that every term has the full set of Term keys.
        terms.append(
            {
                "lemma": word,
                "ja": combined_glosses.get(word, []),
                "en": [],
                "ru": [],
                "poses": combined_part_of_speech.get(word, []),
                "frequency": freq,
            }
        )
    else:
        # Word already has a term: record its corpus frequency.
        first_term_by_lemma[word]["frequency"] = freq

In [128]:
from pathlib import Path
import regex
from typing import TypedDict

# Typed shape of a word record: lemma, "glosses" and "poses" string lists,
# a free-text "notes" string and an integer frequency.
Word = TypedDict(
    "Word",
    {
        "lemma": str,
        "glosses": list[str],
        "poses": list[str],
        "notes": str,
        "frequency": int,
    },
)

# Map each lemma to the FIRST term carrying it, so the base-form lookups
# below are dictionary hits rather than repeated linear scans. setdefault
# keeps the first occurrence, matching a front-to-back search.
term_by_lemma = {}
for candidate in terms:
    term_by_lemma.setdefault(candidate["lemma"], candidate)

for term in terms:
    lemma = term["lemma"]

    # Lemmas ending in "hci": fill missing glosses / tags from the base form
    # (called `singular` here) obtained by trimming the ending.
    if lemma.endswith("hci"):
        # NOTE(review): "...ahci" trims three characters, anything else only
        # two (leaving the "h" in place) - confirm this is intended.
        singular = lemma[:-3] if lemma.endswith("ahci") else lemma[:-2]

        if singular in terms_index:
            base = term_by_lemma[singular]
            if not term["ja"]:
                term["ja"] = base["ja"]
                term["en"] = base["en"]
                term["ru"] = base["ru"]
            if not term["poses"]:
                term["poses"] = base["poses"]

    # Lemmas ending in "hcihi". This used to compare with `==`, which made
    # the "ahcihi" check below unreachable and limited the branch to the
    # literal lemma "hcihi"; the suffix test mirrors the "hci" branch above.
    if lemma.endswith("hcihi"):
        # NOTE(review): these slices trim one character fewer than the
        # parallel "hci" slices would suggest for a two-character-longer
        # ending - verify against real lemmas.
        singular = lemma[:-4] if lemma.endswith("ahcihi") else lemma[:-3]

        if not term["poses"]:
            term["poses"] = ["n"]

        if singular in terms_index and not term["ja"]:
            base = term_by_lemma[singular]
            # NOTE(review): the filter keeps only glosses whose text starts
            # with the Latin letter "n"; for "ja" and "ru" glosses that is
            # presumably never true. Left as-is; confirm the intended
            # condition.
            term["ja"] = [g + "こと" for g in base["ja"] if g.startswith("n")]
            term["en"] = ["that " + g for g in base["en"] if g.startswith("n")]
            term["ru"] = ["то " + g for g in base["ru"] if g.startswith("n")]

# Maps part-of-speech labels (Japanese grammatical terms and a few English
# names) to the short codes used in the output. Labels without an entry are
# left unchanged by the lookup code that uses this table.
POS_TABLE = {
    # 自動詞 is "intransitive verb" and 他動詞 is "transitive verb"; the two
    # codes were previously swapped.
    "自動詞": "vi",
    "他動詞": "vt",
    "複他動詞": "vd",
    "完全動詞": "vc",
    "名詞": "n",
    "連体詞": "adn",
    "形容詞": "adj",
    "副詞": "adv",
    "接続詞": "cconj",
    "助詞": "post",
    "助動詞": "auxv",
    "auxverb": "auxv",
    "終助詞": "sfp",
    "接尾辞": "sfx",
    "接頭辞": "pfx",
    "間投詞": "intj",
    "interj": "intj",
    "後置副詞": "padv",
    "人称接辞": "pers",
    "繋辞": "cop",
    "位置名詞": "nl",
    "複数形": "pl",
    "副助詞": "advp",
    "suffix": "sfx",
    "prefix": "pfx",
    "verb": "v", # TODO: Get verb slot from Wiktionary
    "代名詞": "pron",
    "格助詞": "postp",
    "接続助詞": "sconj",
    "疑問詞": "int",
    "形式名詞": "nmlz",
    "noun": "n"
}


# Translate every tag through POS_TABLE; tags without an entry pass through
# unchanged.
for term in terms:
    term["poses"] = [POS_TABLE.get(tag) or tag for tag in term["poses"]]

# Collect the distinct tags now in use and list them alphabetically.
all_poses = {tag for entry in terms for tag in entry["poses"]}
for tag in sorted(all_poses):
    print(tag)

# Keep a term only if it has "ja" glosses or a frequency above one, and its
# lemma is not made up purely of digits.
terms = [
    t
    for t in terms
    if (t.get("frequency", 0) > 1 or t["ja"])
    and not regex.match(r"^\d+$", t["lemma"])
]


# for lemma, glosses in translated.items():
#     if lemma in terms_index:
#         terms[terms_index.index(lemma)]["en"] = glosses["en"]
#         terms[terms_index.index(lemma)]["ru"] = glosses["ru"]


adn
adv
advp
auxv
cconj
colloc
cop
int
intj
n
nl
nmlz
num
padv
parti
pers
pfx
pl
postp
pron
rel
root
sconj
sfp
sfx
v
vc
vd
vi
vt


## Add additional terms

In [129]:
# Load the hand-written additional terms. UTF-8 is requested explicitly so
# decoding does not depend on the platform's default encoding.
with open("input/sakhalin/additional_terms.json", encoding="utf-8") as f:
    additional_terms = cast(list[Term], json.load(f))

# Index the terms that exist right now, first occurrence per lemma.
# `terms_index` is not used here because it can be out of date: it lacks
# lemmas appended since it was built (an additional term would then be added
# as a duplicate) and may still contain lemmas that have since been filtered
# out of `terms` (the lookup would then raise StopIteration).
current_by_lemma = {}
for existing in terms:
    current_by_lemma.setdefault(existing["lemma"], existing)

for term in additional_terms:
    t = current_by_lemma.get(term["lemma"])
    if t is None:
        # Unknown lemma: add the term as-is and make it visible to later
        # entries in the same file.
        terms.append(term)
        current_by_lemma[term["lemma"]] = term
        continue

    # Known lemma: extend the glosses and add the frequencies. "en", "ru"
    # and "frequency" may be absent on either side, hence the defaults.
    t["ja"] = t["ja"] + term["ja"]
    t["en"] = t.get("en", []) + term["en"]
    t["ru"] = t.get("ru", []) + term["ru"]
    t["frequency"] = t.get("frequency", 0) + term.get("frequency", 0)

In [130]:
from typing import TypedDict, cast, NotRequired

PartialTerm = TypedDict("PartialTerm", {"ja": list[str], "en": list[str], "ru": list[str], "poses": list[str], "frequency": int}, total=False)

DerivedFrom = TypedDict("DerivedFrom", {"lemma": str, "from": str, "overwrite": NotRequired[PartialTerm]  })

# Load the derivation records. UTF-8 is requested explicitly so decoding
# does not depend on the platform's default encoding.
with open("input/sakhalin/derived_from.json", encoding="utf-8") as f:
    derived_from = cast(list[DerivedFrom], json.load(f))

for term in derived_from:
    # Look the source term up in the CURRENT list. Checking `terms_index`
    # and then searching without a default could raise StopIteration when
    # the index still holds a lemma that is no longer in `terms`, and would
    # miss lemmas appended after the index was built.
    found_term = next((t for t in terms if t["lemma"] == term["from"]), None)
    if found_term is None:
        print(f"{term['from']} not found")
        continue

    overwrite: PartialTerm = term.get("overwrite", {})

    # Each field comes from "overwrite" when a non-empty / non-zero value is
    # given there, otherwise from the source term. "en", "ru" and
    # "frequency" are read with defaults because not every term is
    # guaranteed to carry them.
    constructed_term: Term = {
        "lemma": term["lemma"],
        "ja": overwrite.get("ja", []) or found_term["ja"],
        "en": overwrite.get("en", []) or found_term.get("en", []),
        "ru": overwrite.get("ru", []) or found_term.get("ru", []),
        "poses": overwrite.get("poses", []) or found_term["poses"],
        "frequency": overwrite.get("frequency", 0)
        or found_term.get("frequency", 0),
    }

    print(constructed_term)

    target_term = next((t for t in terms if t["lemma"] == term["lemma"]), None)

    if target_term is None:
        terms.append(constructed_term)
    else:
        # The derived lemma already exists: replace its data in place.
        target_term["ja"] = constructed_term["ja"]
        target_term["en"] = constructed_term["en"]
        target_term["ru"] = constructed_term["ru"]
        target_term["poses"] = constructed_term["poses"]
        target_term["frequency"] = constructed_term["frequency"]


{'lemma': 'manuy', 'ja': ['という', 'そうだ'], 'en': ['called', 'it seems'], 'ru': ['называется', 'кажется'], 'poses': ['auxv'], 'frequency': 2388}


## Export the result

In [131]:
# Write the final term list. ensure_ascii=False emits non-ASCII characters
# verbatim, so the file is opened as UTF-8 explicitly; with a narrower
# platform default encoding the write could fail or produce a file that
# other tools cannot read as UTF-8.
with open(
    "output/ff-ainu-karahuto-terms-with-corpus.json", "w", encoding="utf-8"
) as f:
    json.dump(terms, f, ensure_ascii=False)

## Inspect the result

In [132]:
import json
# Print one JSON template line (followed by a comma) for every term that
# still has no "ja" glosses, most frequent terms first. The template keeps
# the lemma and tags and resets the glosses and the frequency.
by_frequency = sorted(
    terms, key=lambda entry: entry.get("frequency", 0), reverse=True
)
for t in by_frequency:
    if t["ja"]:
        continue
    template = {
        "lemma": t["lemma"],
        "ja": [],
        "en": [],
        "ru": [],
        "poses": t["poses"],
        "frequency": 0,
    }
    print(json.dumps(template, ensure_ascii=False) + ",")

{"lemma": "neya", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "ohta", "ja": [], "en": [], "ru": [], "poses": ["colloc"], "frequency": 0},
{"lemma": "neeteh", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "acahcipo", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "omantene", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "iineahsuy", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "anayne", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "uta", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "hii", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "okaaketa", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "waa", "ja": [], "en": [], "ru": [], "poses": [], "frequency": 0},
{"lemma": "hawehe", "ja": [], "en": [], "ru": [], "poses": ["n"], "frequency": 0},
{"lemma": "otakaata", "ja": [], "

## AI Translations from Japanese to English and Russian

In [133]:
# Report how many terms exist, how many have "ja" glosses, and how many of
# those still have no "en" glosses (key missing or list empty).
print("All terms:", len(terms))

all_translated = [entry for entry in terms if entry["ja"]]
print("All translated:", len(all_translated))

translated_only_ja = [entry for entry in all_translated if not entry.get("en")]
print("Translated only ja:", len(translated_only_ja))

All terms: 2817
All translated: 1225
Translated only ja: 48


In [134]:
# Generate templates for AI to fill in

import json

untranslated = []

# Number of terms written to each template file.
TEMPLATE_CHUNK_SIZE = 250

# Slice the terms that only have "ja" glosses into consecutive chunks.
split_terms = [
    translated_only_ja[i : i + TEMPLATE_CHUNK_SIZE]
    for i in range(0, len(translated_only_ja), TEMPLATE_CHUNK_SIZE)
]

print(len(split_terms))

UNTRANSLATED_DIR = Path("output/karahuto-untranslated")
UNTRANSLATED_DIR.mkdir(parents=True, exist_ok=True)

# Remove the JSON files left by an earlier run so stale chunks do not linger.
for file in UNTRANSLATED_DIR.glob("*.json"):
    file.unlink()

for i, t in enumerate(split_terms):
    print(len(t))
    # Each file maps lemma -> {"ja": existing glosses, "en": [], "ru": []}.
    template = {
        term["lemma"]: {"ja": term["ja"], "en": [], "ru": []}
        for term in t
        if term["ja"]
    }
    # ensure_ascii=False writes non-ASCII text verbatim, so the file is
    # opened as UTF-8 explicitly rather than with the platform default.
    with open(
        UNTRANSLATED_DIR / f"ff-ainu-karahuto-terms-with-corpus-untranslated-{i}.json",
        "w",
        encoding="utf-8",
    ) as f:
        json.dump(template, f, ensure_ascii=False)

1
48
