In [8]:
from pathlib import Path

# Directory (relative to the working directory) that holds the JSON files this
# notebook reads and the combined file it writes.
OUTPUT_DIR = Path("output")

In [9]:
import json
from typing import TypedDict


class TommyEntry(TypedDict):
    """Shape of one record in the Tommy1949 gloss list loaded from JSON."""

    # Headword as written in the source file. Not unique: the same lemma can
    # appear in several records (e.g. "=an" and "a" in the preview output).
    lemma: str
    # Gloss strings attached to this record.
    glosses: list[str]


# Load the Tommy1949 gloss list. The file contains Japanese text, so the
# encoding is pinned to UTF-8 instead of relying on the platform's default
# locale encoding (which is not UTF-8 everywhere).
with open(
    OUTPUT_DIR / "tommy1949_aynudictionary_glosses.json", "r", encoding="utf-8"
) as f:
    tommy1949_glosses: list[TommyEntry] = json.load(f)


print("Tommy1949:", len(tommy1949_glosses))

# Preview the first 12 records (indices 0-11) as a quick sanity check.
for i, entry in enumerate(tommy1949_glosses[:12]):
    print(f"{i}\t{entry['lemma']}\t{entry['glosses']}")


class WiktionaryGloss(TypedDict):
    """Shape of one value in the Wiktionary gloss mapping loaded from JSON."""

    # Headword of the entry.
    lemma: str
    # Presumably the part-of-speech label -- TODO confirm against the exporter.
    pos: str
    # Gloss strings for the entry; may be empty (e.g. "si" in the preview output).
    glosses: list[str]


# Load the Wiktionary gloss mapping (lemma -> entry). The file contains
# Japanese text, so the encoding is pinned to UTF-8 instead of relying on the
# platform's default locale encoding.
with open(OUTPUT_DIR / "wiktionary_ainu_glosses.json", "r", encoding="utf-8") as f:
    wiktionary_glosses: dict[str, WiktionaryGloss] = json.load(f)

print("Wiktionary:", len(wiktionary_glosses))

# Preview the first 12 entries (indices 0-11) in file order.
for i, (lemma, gloss) in enumerate(wiktionary_glosses.items()):
    print(f"{i}\t{lemma}\t{gloss['glosses']}")
    if i > 10:
        break

class FFEntry(TypedDict):
    """Shape of one record in the FF Saru term list loaded from JSON."""

    # Presumably the term written in kana -- TODO confirm against the data file.
    kana: str
    # Presumably the term in Latin transcription -- TODO confirm.
    latn: str
    # Gloss text; a single string, unlike the list-valued `glosses` of the
    # other sources.
    glss: str
    # Presumably the part-of-speech label -- TODO confirm.
    pos: str

# Load the FF Saru term list. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's default locale encoding.
with open(OUTPUT_DIR / "ff-ainu-saru-terms.json", "r", encoding="utf-8") as f:
    ff_saru_terms: list[FFEntry] = json.load(f)

print("FF Saru:", len(ff_saru_terms))

# with open(OUTPUT_DIR / "wiktionary_ainu_word_compositions.json", "r") as f:
#     wiktionary_morphemes = json.load(f)

# print("Wiktionary:", len(wiktionary_morphemes))

# max = 10
# count = 0
# for word, components in wiktionary_morphemes.items():
#     print(f"{word}\t{components}")
#     count += 1
#     if count > max:
#         break

# with open(OUTPUT_DIR / "wiktionary_ainu_glossed_morphemes.json", "r") as f:
#     glossed_morphemes = json.load(f)

# single_meaning_morphemes = [
#     m for m in glossed_morphemes if len(glossed_morphemes[m]) == 1
# ]

# max = 10
# count = 0
# for word, components in wiktionary_morphemes.items():
#     print(f"{word}\t{components}")
#     count += 1
#     if count > max:
#         break

Tommy1949: 14540
0	=an	['人は', '人が']
1	=an	['その人が']
2	=as	['話し相手を含まない私たち']
3	a	['煮物の汁が多い']
4	a	['名詞を作るときの接頭辞']
5	a	['～したなあ！～だなあ！']
6	a	['～なのか！']
7	a （pl.rok )	['座る', '起きている']
8	-a　（複数 -rok )	['過去に話題にしたことを表す']
9	a p	['～したものの～したのだが～だったが']
10	a=	['一般的に人は～する', 'ここの皆で～する']
11	a=	['その人が']
Wiktionary: 2231
0	wan	['十人', '十']
1	tu	['両方', '第二', '二つ', '多くの', '二人', '沢山の']
2	rak	['～の気配がある', '～の匂いがする', '～の味がある']
3	mi	['～を着る']
4	on	['発酵する']
5	ona	['父親']
6	ay	['矢']
7	oro	['強調する', '～の所', '場所をあらわす名詞の後に置いて', '～の場所として扱えない名詞の後に置いて場所を表す名詞句を作る', '所属形 oro ですでに言及した場所を示す']
8	he	['～か']
9	i	['意味的に目的語を補い', 'それ', 'tranverb}}を{{intrverb}}化する']
10	ne	['～に']
11	si	[]
FF Saru: 975


In [10]:
# Combine glosses
#
# Merge the three sources into a single key -> list-of-glosses mapping.
# Precedence is Wiktionary, then FF Saru, then Tommy1949: a later source only
# contributes a key that is still missing (or has an empty gloss list), and
# entries without any gloss are skipped for every source.

combined_glosses: dict[str, list[str]] = {
    k: v["glosses"] for k, v in wiktionary_glosses.items() if v["glosses"]
}

# NOTE(review): FF Saru entries are keyed by `kana`, while the Wiktionary and
# Tommy1949 keys shown in the previews are Latin-script lemmas, so these keys
# will not deduplicate against them. `latn` may be the intended key -- confirm
# before changing, since it alters the key space of the output file.
for entry in ff_saru_terms:
    if combined_glosses.get(entry["kana"]):
        continue
    if not entry["glss"]:
        continue
    # `glss` is a single string: wrap it in a list. `list(entry["glss"])`
    # would split the gloss into one list item per character.
    combined_glosses[entry["kana"]] = [entry["glss"]]

for entry in tommy1949_glosses:
    if combined_glosses.get(entry["lemma"]):
        continue
    if not entry["glosses"]:
        continue

    combined_glosses[entry["lemma"]] = entry["glosses"]


print(len(combined_glosses))
# Preview the first 12 merged entries (indices 0-11) in insertion order.
for i, (lemma, gloss) in enumerate(combined_glosses.items()):
    print(f"{lemma}\t{gloss}")
    if i > 10:
        break

# ensure_ascii=False writes the Japanese text verbatim, so the output encoding
# must be pinned to UTF-8 rather than left to the platform default.
with open(OUTPUT_DIR / "combined_glosses.json", "w", encoding="utf-8") as f:
    json.dump(combined_glosses, f, ensure_ascii=False)

16508
wan	['十人', '十']
tu	['両方', '第二', '二つ', '多くの', '二人', '沢山の']
rak	['～の気配がある', '～の匂いがする', '～の味がある']
mi	['～を着る']
on	['発酵する']
ona	['父親']
ay	['矢']
oro	['強調する', '～の所', '場所をあらわす名詞の後に置いて', '～の場所として扱えない名詞の後に置いて場所を表す名詞句を作る', '所属形 oro ですでに言及した場所を示す']
he	['～か']
i	['意味的に目的語を補い', 'それ', 'tranverb}}を{{intrverb}}化する']
ne	['～に']
ni	['木']
