In [10]:
import json

with open("../../dictionary/output/wiktionary_ainu_glossed_morphemes.json", "r") as f:
    data = json.load(f)

with open("../output/ainu_words_all.tsv", "r") as f:
    words = [line.split("\t", 1)[0] for line in f.read().splitlines()]

with open("../output/ainu_words_all.tsv", "r") as f:
    words = [line.split("\t", 1)[0] for line in f.read().splitlines()]

print("morphemes       ", list(data.keys()) [:30])

print("words_in_corpora", words[:30])

morphemes        ['re', '-p', 'tan', 'pa', 'oya', 'ahup', '-te', 'ahun', '-ke', 'a', '-re', 'an', 'pe', 'hawe', 'iku', 'ruy', '-se', 'ipe', 'yayirayke', 'rara', 'pasuy', 'ape', 'icakkere', 'meru', 'ruska', 'u-', 'paskuma', 'ikonnup', 'ko-', 'si-']
words_in_corpora ['a=', 'ne', 'wa', '=an', 'kor', 'an', 'ka', 'ta', 'hine', 'e=', 'ki', 'kusu', 'an=', 'pe', 'ruwe', 'i=', 'kamuy', 'p', 'oka', 'hi', 'a', 'kane', 'or', 'taa', 'na', 'ku=', 'utar', 'sekor', 'ye', 'ene']


In [5]:
import regex as re

# Assume `data.keys()` are your known morphemes and `words` is your word list.

morphemes = set(data.keys())  # or just use list(data.keys()) but a set is faster.


def can_segment(word, morphemes):
    """
    Returns True if 'word' can be segmented entirely by the morphemes in 'morphemes'.
    Uses a simple dynamic programming approach (word-break).
    """
    dp = [False] * (len(word) + 1)
    dp[0] = True

    for i in range(len(word)):
        if dp[i]:
            for mor in morphemes:
                # Check if the morpheme matches starting at i
                if word.startswith(mor, i):
                    dp[i + len(mor)] = True
    return dp[-1]


covered = []
uncovered = []

for w in words:
    # Remove boundary markers like '=' and '-' (and their combinations if any)
    w_normalized = re.sub(r"[=\-]+", "", w)

    if can_segment(w_normalized, morphemes):
        covered.append(w)
    else:
        uncovered.append(w)

print("Number of covered words: ", len(covered))
print("Number of uncovered words:", len(uncovered))

Number of covered words:  6439
Number of uncovered words: 27168


In [8]:
freq_map: dict[str, int] = {}

with open("../output/ainu_words_all.tsv", "r") as f:
    for line in f.read().splitlines():
        word, freq = line.split("\t", 1)
        freq_map[word] = int(freq)

In [9]:
for word in sorted(uncovered, key=lambda x: freq_map[x], reverse=True):
    print(word, freq_map[word])


ka 21514
ruwe 12544
i= 11116
p 9752
na 7433
ku= 7180
yakka 5314
no 4260
nep 3820
isam 3562
korka 3489
kuni 3443
ayne 3214
manu 3033
k= 2915
konno 2905
ya 2889
siri 2873
menoko 2782
suy 2770
yan 2655
pakno 2546
tura 2539
cis 2279
eci= 2224
rusuy 2157
sine 2035
okay 1887
nankor 1828
rok 1824
poronno 1729
hawki 1706
v 1554
okkaypo 1405
teh 1355
neno 1352
yupo 1295
sirkunpato 1249
nen 1247
ike 1238
sonno 1232
en= 1222
koraci 1207
neya 1198
pet 1173
turano 1165
humi 1146
ekimne 1134
suke 1126
nah 1094
wano 1078
ci= 1059
ray 1035
nekon 1031
apkas 987
poho 975
tu 973
i 966
cip 956
poka 955
eino 939
okake 922
neampe 920
en 917
noyne 910
hotke 898
horokewpo 895
ney 870
ponno 866
huci 862
uske 846
pirkano 844
monimahpo 809
nani 795
konto 790
hetap 783
an? 774
soy 764
sap 757
yupihi 750
naa 721
tapne 720
tap 703
uk 696
inne 692
sanke 688
sino 679
hawan 676
kasi 672
okere 666
hemanta 664
asinuma 655
tono 651
haw 650
hawas 637
iruska 610
oske 591
hokure 589
okkayo 589
iteki 584
ponmenoko 582
onkami