In [12]:
from pathlib import Path

# Directory holding the dictionary TSV that is read below and receiving the
# decomposed-words JSON that is written at the end. The path is relative, so it
# is resolved against the kernel's current working directory.
OUTPUT_DIR = Path("../output")

In [26]:
import csv
import regex
import unicodedata

# Maps each normalised lemma to the list of components obtained by splitting
# its etymology on "-" and "=". Rebuilt from scratch on every run of the cell.
decomposed_words = {}

# newline="" is what the csv module expects from the file object it wraps, and
# the explicit encoding keeps the read independent of the platform's default
# codec (assumes the TSV was written as UTF-8 -- TODO confirm against the
# notebook that produced it).
with open(
    OUTPUT_DIR / "tommy1949_aynudictionary.tsv", "r", encoding="utf-8", newline=""
) as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        etymology = row["etymology"]

        # Keep only rows whose etymology is segmented with "-" and does not
        # itself begin with "-".
        if "-" not in etymology or etymology.startswith("-"):
            continue

        lemma = unicodedata.normalize("NFKC", row["lemma"])
        # Drop bracketed text. The match is greedy: it spans from the first
        # opening bracket to the last closing bracket in the lemma.
        lemma = regex.sub(r"[\(（\[].*[\)）\]]", "", lemma)
        # Inside a character class "+" is literal, so this removes whitespace
        # characters and "+" signs alike.
        lemma = regex.sub(r"[\s+]", "", lemma)
        # Remove a hyphen when neither neighbouring character is a vowel.
        # Lookarounds leave the neighbours unconsumed, so consecutive cases
        # such as "k-t-p" lose every qualifying hyphen rather than only the
        # first one.
        lemma = regex.sub(r"(?<=[^aiueoAIUEO])-(?=[^aiueoAIUEO])", "", lemma)

        # A leading "a=" is stripped from both lemma and etymology unless the
        # etymology ends in "p" or "pe"; those entries keep the prefix.
        strip_prefix = (
            lemma.startswith("a=")
            and etymology.startswith("a=")
            and not etymology.endswith(("p", "pe"))
        )
        if strip_prefix:
            lemma = lemma[2:]
            etymology = etymology[2:]

        # A later row with the same normalised lemma overwrites the earlier one.
        decomposed_words[lemma] = regex.split(r"[-=]", etymology)

In [27]:
# Snapshot the (word, components) pairs as a list and report how many there are.
items = [*decomposed_words.items()]
print(len(items))

10269


In [28]:
# Collect, in dictionary order, every word that contains at least one
# character outside a-z / A-Z, then print the count followed by the words.
non_letter = regex.compile(r"[^a-zA-Z]")
incorrect_words = [entry for entry, _parts in items if non_letter.search(entry)]

print(len(incorrect_words))

for flagged in incorrect_words:
    print(flagged)


387
a=eanasappe
a=eatup
a=ecatkep
a=ehotkep
a=eisramnep
a=eiwankep
a=ekarpe
a=ekiroroanpe
a=ekuwakorpe
a=eminap
a=eninuype
a=enupurpe
a=enuwapcup
a=eosirokpe
a=epanup
a=epkoasuras
a=eramekotep
a=eramusarakpe
a=eranakpe
a=eratkip
a=erayappe
a=esapamuyep
a=esinap
a=esisip
a=esiyukamip
a=esukep
a=etasumpe
a=etoytap
a=eusitomap
a=eyampe
a=eyayramekotep
a=eynup
a=eywankep
hoku-nispake
a=huskorep
a=kemekarpe
a=omappe
a=oyanenep
a=oypep
a=racitkerep
a=ramuosmap
a=sikakustep
a=tekekarpe
a=ukoresup
a=ykoep
a=ykoykarpe
an=ekaripo
-ap
apa-harkiso
apausta-kartono
apa-utur
ape-etoho
apekes-utur
ape-una
apkas-easkay
apkas-enitan
ar-uweun
asirpeker-epkes
asur-orke
atane-ceppo
-ay
ay-eekimne
ay-episte
aynu-puri
aynu-ramat
aynu-uhoppa
aynu-uttap
ay-uk
Caca-nupuri
car-osuke
ceo-kurki
cep-atte
cep-etu
cepkmiataykamkuma→ay
cikap-amihi
cikap-etu
cikapkina-apappo
cikapkina-epuy
cikir-unrukep
cikuni-etoho
cima-amuspe
cipiyep-atte
cipkoyki-rera
cise-huraye-apto
cis-ekot
citarpe-kamasu
citatoy-onne
ci-turepkop

In [16]:
import json
# ensure_ascii=False makes json.dump emit non-ASCII characters verbatim, so the
# file must be opened with an encoding able to represent them. UTF-8 is given
# explicitly so the write does not depend on the platform's default codec.
with open(OUTPUT_DIR / "tommy1949_decomposed_words.json", "w", encoding="utf-8") as f:
    json.dump(decomposed_words, f, ensure_ascii=False)