In [3]:
import json

with open("../../dictionary/output/wiktionary_ainu_glossed_morphemes.json", "r") as f:
    data = json.load(f)
    print("morphemes       ", list(data.keys()) [:30])

with open("../output/ainu_words_all.tsv", "r") as f:
    words = [line.split("\t", 1)[0] for line in f.read().splitlines()]
    print("words           ", words[:30])

# POS for unbound morphemes
with open("../../dictionary/output/wiktionary_ainu_part_of_speech.json", "r") as f:
    part_of_speech = json.load(f)
    print("part_of_speech  ", list(part_of_speech.keys())[:30])

with open("../../dictionary/output/sakhali-terms-extended.json", "r") as f:
    karahuto_terms = json.load(f)
    karahuto_terms_translated = [
        term["lemma"] for term in karahuto_terms if "ja" in term and term["ja"]
    ]
    print("karahuto_terms  ", karahuto_terms_translated[:30])


with open("../output/ainu_words_all.tsv", "r") as f:
    words = [line.split("\t", 1)[0] for line in f.read().splitlines()]
    print("words_in_corpora", words[:30])


morphemes        ['re', '-p', 'tan', 'pa', 'oya', 'ahup', '-te', 'ahun', '-ke', 'a', '-re', 'an', 'pe', 'hawe', 'iku', 'ruy', '-se', 'ipe', 'yayirayke', 'rara', 'pasuy', 'ape', 'icakkere', 'meru', 'ruska', 'u-', 'paskuma', 'ikonnup', 'ko-', 'si-']
words            ['a=', 'ne', 'wa', '=an', 'kor', 'an', 'ka', 'ta', 'hine', 'e=', 'ki', 'kusu', 'an=', 'pe', 'ruwe', 'i=', 'kamuy', 'p', 'oka', 'hi', 'a', 'kane', 'or', 'taa', 'na', 'ku=', 'utar', 'sekor', 'ye', 'ene']
part_of_speech   ['wan', 'tu', 'rak', 'ci', 'mi', 'on', 'ona', 'o', 'ay', 'oro', 'he', 'i', 'ne', 'si', 'ni', 're', 'as', 'te', 'no', 'un', 'os', 'or', 'pet', 'ma', 'e', 'mon', 'an', 'am', 'us', 'a']
karahuto_terms   ['asne', 'asneh', 'ahkas', 'ahkapo', 'ahkapoho', 'ahkari', 'ahsuy', 'ahci', 'ahcihi', 'ahte', 'ahto', 'ahturi', 'ahtopokun', 'ahni', 'ahrus', 'aa', 'aatay', 'aaca', 'aacaha', 'aacapo', 'aane', 'aahunka', 'ay', 'aynu', 'aynuitah', 'ayne', 'ayhe', 'aw', 'awehe', 'asi']
words_in_corpora ['a=', 'ne', 'wa', '=an', 'kor'

In [4]:
freq_map: dict[str, int] = {}

with open("../output/ainu_words_all.tsv", "r") as f:
    for line in f.read().splitlines():
        word, freq = line.split("\t", 1)
        freq_map[word] = int(freq)

In [5]:
import regex as re


def can_segment(word, known_morphemes):
    """
    Return True if 'word' can be segmented entirely by
    the set/list of known morphemes.
    """
    dp = [False] * (len(word) + 1)
    dp[0] = True
    for i in range(len(word)):
        if dp[i]:
            for morpheme in known_morphemes:
                if word.startswith(morpheme, i):
                    dp[i + len(morpheme)] = True
    return dp[-1]


covered = []
uncovered = []

for w in words:
    # Strip out boundary markers
    w_normalized = re.sub(r"[=\-]+", "", w)

    # If the normalized form appears in 'part_of_speech', count it as covered
    if w_normalized in part_of_speech or w_normalized in karahuto_terms_translated:
        covered.append(w)
    else:
        # Otherwise, try to segment it with known morphemes
        if can_segment(w_normalized, data.keys()):
            covered.append(w)
        else:
            uncovered.append(w)

uncovered_words_with_freq = [w for w in uncovered if freq_map[w] > 1]

print("Number of covered words: ", len(covered))
print("Number of uncovered words:", len(uncovered))
print(
    "Number of uncovered words with frequency > 1: ",
    len(uncovered_words_with_freq),
)

Number of covered words:  8142
Number of uncovered words: 25465
Number of uncovered words with frequency > 1:  11852


In [6]:
for word in sorted(uncovered_words_with_freq, key=lambda x: freq_map[x], reverse=True):
    print(f"{word}", freq_map[word])

k= 2915
konno 2905
v 1554
yupo 1295
sirkunpato 1249
en= 1222
nekon 1031
eino 939
neampe 920
en 917
hotke 898
hetap 783
an? 774
ponmenoko 582
epunkine 566
ounno 558
korsi 515
rewsi 509
itekke 508
ya? 465
? 450
hemnoye 448
sini 445
rametok 435
okkay 428
kuskeraypo 411
iwak 392
nepki 367
iskar 364
eattukonnaan 344
néno 332
hikeka 329
inu 328
hoskino 326
heino 321
usi 320
hoppa 314
sinna 313
konna 307
unno 295
koonkami 286
isitoma 281
enon 280
yupke 276
otasut 275
rerko 272
yuputari 262
omap 261
kunine 261
sinki 258
iwor 252
reyep 250
pewre 245
cik 241
enta 238
tutko 231
etoranne 217
kosonte 216
takup 214
ronno 214
pito 211
moyre 210
kotankonnispa 210
nope 208
c= 205
néwaanpe 205
huciape 204
utura 202
pak 200
híne 200
eyam 196
mintar 193
son 192
tasum 192
iokunnuka 190
esirkirap 185
yuptek 185
weniruska 185
tewano 183
b 183
húci 181
sísam 180
kaspaotte 179
kemeyki 172
pokon 172
eytasa 171
emko 169
1 168
sinne 167
hawoka 167
ratcitara 165
tunas 165
neyakka 165
ruwe? 164
rataskep 164
toyta 1