In [3]:
from pathlib import Path

OUTPUT_DIR = Path("output")

In [5]:
import json

with open(OUTPUT_DIR / "tommy1949_decomposed_words.json", "r") as f:
    decomposed_words = json.load(f)

max = 10
count = 0
for word, components in decomposed_words.items():
    print(f"{word}\t{components}")
    count += 1
    if count > max:
        break

with open(OUTPUT_DIR / "wiktionary_ainu_word_compositions.json", "r") as f:
    wiktionary_morphemes = json.load(f)

max = 10
count = 0
for word, components in wiktionary_morphemes.items():
    print(f"{word}\t{components}")
    count += 1
    if count > max:
        break

with open(OUTPUT_DIR / "wiktionary_ainu_glossed_morphemes.json", "r") as f:
    glossed_morphemes = json.load(f)

single_meaning_morphemes = [
    m for m in glossed_morphemes if len(glossed_morphemes[m]) == 1
]

max = 10
count = 0
for word, components in wiktionary_morphemes.items():
    print(f"{word}\t{components}")
    count += 1
    if count > max:
        break

ahupkarpo	['ahupkar', 'po']
aniuske	['ani', 'uske']
arnorayke	['ar', 'no', 'rayke']
arustekka	['ar', 'ustek', 'ka']
aynukor	['aynu', 'kor']
aynukorkur	['aynu', 'kor', 'kur']
catcari	['cat', 'cari']
a=eanasappe	['a', 'e', 'anasap', 'pe']
earmuye	['ear', 'muye']
easipamam	['e', 'asip', 'amam']
a=eatup	['a', 'e', 'atu', 'p']
rep	[['re', '三つの'], ['-p', 'もの']]
tanpa	[['tan', 'この'], ['pa', '年']]
oyapa	[['oya', '他の'], ['pa', '年']]
ahupte	[['ahup', '入る'], ['-te', 'させる']]
ahunke	[['ahun', '入る'], ['-ke', 'させる']]
ari	[['a', '多い'], ['-re', '〜させる']]
anpe	[['an', 'ある'], ['pe', 'もの']]
hawean	[['hawe', '声・言葉'], ['an', 'ある']]
ikure	[['iku', '酒を飲む'], ['-re', 'させる']]
ikuruy	[['iku', '酒を飲む'], ['ruy', '激しく〜する']]
isepo	[['-se', '〜と鳴く']]


In [76]:
# Combine the two dictionaries with unmarked morphemes

ambiguous_morphemes: dict[str, tuple[set[str], set[str]]] = {}


# Process decomposed_words: if morpheme has single meaning, add to composition dictionary
for word, components in decomposed_words.items():
    marked_components = []
    for i, morpheme in enumerate(components):
        affix_map = {
            "p": "-p",
            "i": "i=",
            "u": "u-",
            "yay": "yay-",
            "no": "-no",
            "e": "e-",
            "ar": "ar-",
            "pe": "-pe",
            "hi": "-hi",
        }

        # handle special cases
        if morpheme in affix_map:
            morpheme = affix_map[morpheme]

        manual_gloss = {
            "yay-": "自分を",
            "e-": "～について",
            "ko-": "～とともに",
            "o-": "～に",
            "po": "子",
            "ar-": "片",
            "kar": "作る",
            "kor": "持つ",
            "u-": "互い",
            "sir": "世界",
            "o": "付く",
            "kamuy": "神",
            "ramu": "心",
            "un": "にある",
            "itak": "話す",
            "or": "場所",
            "nu": "感じる",
            "ram": "心",
            "kus": "通す",
            "us": "付く",
            "wa": "から",
            "sak": "欠く",
            "kusu": "だから",
            "pan": "衰える",
            "tar": "荷縄",
            "sik": "目",
            "kunne": "暗い",
            "ne": "である",
            "resu": "育てる",
            "mo": "静かな",
            "pena": "上の方",
            "ta": "に",
            "iki": "する",
            'sesek': '熱い',
            'ekotanne': '～で村になる',
            'renka': '望み',
            'hum': '響き',
            'an': 'ある',
            'ru': 'すこし',
            're': '名',
            'pa': '頭',
            'pus': '弾ける',
            'anu': '残す'
        }

        manual_suffix = {
            "-p": "もの",
            "-pe": "もの",  # pe 水気,
            "-hi": "ところ",
        }

        if morpheme in manual_suffix:
            if i == len(components) - 1:
                marked_components.append([morpheme, manual_suffix[morpheme]])
        elif morpheme in manual_gloss:
            marked_components.append([morpheme, manual_gloss[morpheme]])
        elif morpheme in single_meaning_morphemes:
            # Add morpheme with its single meaning
            marked_components.append([morpheme, glossed_morphemes[morpheme][0]])
        else:

            # check if morpheme is in wiktionary_morphemes
            if morpheme in glossed_morphemes:
                # not single meaning
                cleaned_gloss = set()
                for gloss in glossed_morphemes[morpheme]:
                    if gloss.startswith("～を") or gloss.startswith("〜を"):
                        cleaned_gloss.add(gloss[2:])
                    else:
                        cleaned_gloss.add(gloss)

                if len(cleaned_gloss) > 1:
                    if morpheme not in ambiguous_morphemes:
                        ambiguous_morphemes[morpheme] = (set(), set())
                    ambiguous_morphemes[morpheme][0].add(word)
                    ambiguous_morphemes[morpheme][1].update(cleaned_gloss)
            else:
                # print(f"{word} has unknown morpheme: {morpheme}")
                pass

    if marked_components:
        if word not in wiktionary_morphemes:
            wiktionary_morphemes[word] = marked_components
        else:
            # print(f"Word {word} already exists in wiktionary_morphemes")
            pass

for morpheme, (words, glosses) in ambiguous_morphemes.items():
    print(f"{morpheme} has ambiguous morpheme: {len(words)} with meanings: {glosses}")
    for word in words:
        print(f"    {word}")

# Save updated composition dictionary
with open(OUTPUT_DIR / "combined_word_compositions.json", "w", encoding="utf-8") as f:
    json.dump(wiktionary_morphemes, f, ensure_ascii=False, indent=4)