In [63]:
import pandas as pd
import re

In [96]:
# Lookup table from an accented lowercase vowel to its unaccented base letter.
# It is written as (base letter, accented variants) groups and flattened into
# a single {accented: base} dict, so each base letter is only spelled once.
accented_map = {
    accented: base
    for base, accented_variants in (
        ("a", ("á", "à", "â", "ä", "ã", "å", "ā", "ǎ")),
        ("e", ("é", "è", "ê", "ë", "ē", "ė", "ę")),
        ("i", ("í", "ì", "î", "ï", "ī", "į", "ǐ")),
        ("o", ("ó", "ò", "ô", "ö", "õ", "ø", "ō", "ǫ")),
        ("u", ("ú", "ù", "û", "ü", "ū")),
        ("y", ("ý", "ÿ")),
    )
    for accented in accented_variants
}

# Character classes used as conditioning environments by the helpers below.
# ":" (the length mark used in the corpora) is listed with the vowels, so a
# segment that follows "a:" is still treated as standing after a vowel.
vowels = "aeiou:"
# Only ASCII consonant letters plus the secondary-articulation marks ʷ and ʲ
# are listed. Any other character (e.g. ŋ, ɟ, ʦ, Ɂ) belongs to neither class
# and therefore never satisfies a "vowel" or "consonant" environment.
consonants = "ʷʲbcdfghjklmnpqrstvwxyz"


def _replace_between(before, after, char, replacement, word):
    """Replace `char` in `word` when flanked by the given character classes.

    Args:
        before: characters allowed immediately to the left of the match.
        after: characters allowed immediately to the right of the match.
        char: what to match. It is inserted into the pattern unescaped, so it
            is interpreted as a regular expression, not as a literal string.
        replacement: replacement passed to ``re.sub`` (backslash escapes and
            group references are honoured).
        word: the string to rewrite.

    Returns:
        `word` with every match replaced. The flanking characters are checked
        with lookbehind/lookahead, so they are never consumed or altered.
    """
    pattern = r'(?<=[' + before + r'])' + char + r'(?=[' + after + r'])'
    return re.sub(pattern, replacement, word)


def replace_Vconditional(char, replacement, word):
    """Replace `char` with `replacement` where it stands between two `vowels` characters."""
    return _replace_between(vowels, vowels, char, replacement, word)


def replace_Cconditional(char, replacement, word):
    """Replace `char` with `replacement` where it stands between two `consonants` characters."""
    return _replace_between(consonants, consonants, char, replacement, word)


def replace_CVconditional(char, replacement, word):
    """Replace `char` with `replacement` where a `consonants` character precedes and a `vowels` character follows."""
    return _replace_between(consonants, vowels, char, replacement, word)


def replace_VCconditional(char, replacement, word):
    """Replace `char` with `replacement` where a `vowels` character precedes and a `consonants` character follows."""
    return _replace_between(vowels, consonants, char, replacement, word)


def remove_accents(word):
    """Return `word` with each character found in `accented_map` swapped for its mapped value.

    The lookup is done one character at a time; characters that have no entry
    in `accented_map` are copied through unchanged.
    """
    plain_chars = []
    for ch in word:
        plain_chars.append(accented_map.get(ch, ch))
    return "".join(plain_chars)

Bardi (Nyulnyulan)

In [65]:
# Input forms for the Bardi section, each written with a leading "*".
# They become the "Proto-form" column of the comparison table, and the
# asterisk is stripped as the first step of the derivation.
BN_corpus1 = ["*wa:mba", "*wa:ɭi", "*ŋaɟi:din", "*ni:miɲ", "*ba:ɳi", "*babur", "*baɟalbara", "*baɲɟuɖa", "*binkara", "*dumbar", "*-kabu", "*di:lba", "*ɟabal", "*kabiɻ", "*kuɭibil", "*makuri", "*ŋa:nka", "*waɻa", "*jaŋki", "*ji:la", "*mu:kaɳ", "*lijan"]

# Target forms, used as the "Attested form" column. Index-aligned with
# BN_corpus1: entry i here is what the derivation of BN_corpus1[i] is compared against.
BN_corpus2 = ["a:mba", "a:ɭi", "ŋaji:din", "ni:mi", "ba:ɳi", "bawur", "bajalbar", "baɲɟuɖ", "binkar", "dumbar", "-ko", "di:lba", "ɟawal", "kawiɻ", "kuɭil", "mor", "ŋa:nka", "aɻa", "aŋki", "i:la", "mu:waɳ", "lijan"]

In [77]:
def derive_bardi(proto_form):
    """Apply the ordered Bardi rewrite rules to one proto-form and return the predicted form.

    The rules run in a fixed order because later rules feed on the output of
    earlier ones (e.g. intervocalic k-loss creates the "au" that then
    contracts to "o"). Empty and one-character inputs are handled without
    raising.

    Args:
        proto_form: a form from BN_corpus1, with or without the leading "*".

    Returns:
        The form after all rules have been applied.
    """
    # Drop the leading asterisk.
    word = proto_form[1:] if proto_form.startswith("*") else proto_form

    # Word-initial w, then word-initial j, are lost.
    if word.startswith("w"):
        word = word[1:]
    if word.startswith("j"):
        word = word[1:]
    # Word-final ɲ is lost.
    if word.endswith("ɲ"):
        word = word[:-1]

    # Word-final a is lost after r or ɖ.
    if word.endswith(("ra", "ɖa")):
        word = word[:-1]

    word = re.sub(r'(?<=a)ɟ', 'j', word)  # ɟ > j after a
    word = re.sub(r'(?<=r)i', '', word)   # i is lost after r

    # b > w for all 2 syllabic words.
    # Count syllable nuclei only: ":" is listed in `vowels` so that it counts
    # as a vocalic environment, but it is a length mark, not a syllable.
    syllable_count = sum(1 for ch in word if ch in vowels and ch != ":")
    # "-kabu" is exempt; compare by equality (a substring test would also
    # exempt unrelated short forms such as "kab" or "a").
    if syllable_count <= 2 and word != "-kabu":
        word = replace_Vconditional("b", "w", word)
    # k > w for all 2 syllabic words -- currently applied to this one item only.
    word = word.replace("mu:kaɳ", "mu:waɳ")

    # Remaining intervocalic k and b are lost ...
    word = replace_Vconditional("k", "", word)
    word = replace_Vconditional("b", "", word)

    # ... and the vowel sequences this leaves behind contract.
    for sequence, contracted in (("au", "o"), ("aa", "a"), ("ii", "i")):
        word = word.replace(sequence, contracted)
    return word


new_corpus = [derive_bardi(w) for w in BN_corpus1]

# Side-by-side comparison of input, target and derived form; "right?" flags exact matches.
dfBN = pd.DataFrame({"Proto-form": BN_corpus1,"Attested form": BN_corpus2,"try": new_corpus})
dfBN["right?"] = dfBN["Attested form"] == dfBN["try"]
# Rows where the derivation does not reproduce the target form.
missing = dfBN[dfBN["Attested form"] != dfBN["try"]]
dfBN

Unnamed: 0,Proto-form,Attested form,try,right?
0,*wa:mba,a:mba,a:mba,True
1,*wa:ɭi,a:ɭi,a:ɭi,True
2,*ŋaɟi:din,ŋaji:din,ŋaji:din,True
3,*ni:miɲ,ni:mi,ni:mi,True
4,*ba:ɳi,ba:ɳi,ba:ɳi,True
5,*babur,bawur,bawur,True
6,*baɟalbara,bajalbar,bajalbar,True
7,*baɲɟuɖa,baɲɟuɖ,baɲɟuɖ,True
8,*binkara,binkar,binkar,True
9,*dumbar,dumbar,dumbar,True


Proto-Quechua to Tena

In [73]:
# Input forms for the Proto-Quechua to Tena section; used as the "Proto-form"
# column of the comparison table. Unlike the Bardi list, these carry no leading "*".
PQT_corpus1 = ["ʧumpi", "nutku", "hampatu", "ljantu", "akla", "inti", "wakli", "utka", "kunka", "timpu", "pukju", "mutki", "sanku"]

# Target forms, used as the "Attested form" column. Index-aligned with PQT_corpus1.
PQT_corpus2 = ["ʧumbi", "nuktu", "hambatu", "ljandu", "agla", "indi", "wagli", "ukta", "kunga", "timbu", "pugju", "mukti", "sangu"]

In [78]:
# Derive the predicted Tena form of every proto-form, one word at a time.
# The rules are applied in the order listed.
new_corpus = []
for proto_form in PQT_corpus1:
    # The cluster tk is reordered to kt before any other rule runs.
    form = proto_form.replace("tk", "kt")
    for pattern, outcome in (
        # t, p, k become d, b, g after m or n.
        (r'(?<=m)t', 'd'),
        (r'(?<=n)t', 'd'),
        (r'(?<=m)p', 'b'),
        (r'(?<=n)p', 'b'),
        (r'(?<=m)k', 'g'),
        (r'(?<=n)k', 'g'),
        # k becomes g before l or j.
        (r'k(?=l)', 'g'),
        (r'k(?=j)', 'g'),
    ):
        form = re.sub(pattern, outcome, form)
    new_corpus.append(form)

# Side-by-side comparison of input, target and derived form; "right?" flags exact matches.
dfPQT = pd.DataFrame(
    {
        "Proto-form": PQT_corpus1,
        "Attested form": PQT_corpus2,
        "try": new_corpus,
    }
)
dfPQT["right?"] = dfPQT["Attested form"].eq(dfPQT["try"])
# Rows where the derivation does not reproduce the target form.
missing = dfPQT[dfPQT["Attested form"].ne(dfPQT["try"])]
dfPQT

Unnamed: 0,Proto-form,Attested form,try,right?
0,ʧumpi,ʧumbi,ʧumbi,True
1,nutku,nuktu,nuktu,True
2,hampatu,hambatu,hambatu,True
3,ljantu,ljandu,ljandu,True
4,akla,agla,agla,True
5,inti,indi,indi,True
6,wakli,wagli,wagli,True
7,utka,ukta,ukta,True
8,kunka,kunga,kunga,True
9,timpu,timbu,timbu,True


Zacatepec Chatino (Otomanguean)

In [99]:
# Input forms for the Zacatepec Chatino section, each written with a leading "*".
ZCO_corpus1 = ["*kitè", "*jatiɁ", "*titá", "*leta", "*keku", "*jàtáɁ", "*luʦeɁ", "*kekaɁ", "*kʷitèèʔ", "*kʷetǫ", "*lùtí", "*sitiɁ", "*kʷituluʔ", "*sulu", "*tènè", "*tuɁwa", "*kitì"]
# The list is overwritten with its accent-stripped version, so only characters
# left by remove_accents() take part in the derivation and the comparison.
ZCO_corpus1 = [remove_accents(w) for w in ZCO_corpus1]
# Target forms, index-aligned with ZCO_corpus1 and stripped of accents the same way.
ZCO_corpus2 = ["kitʲè", "jatiɁ", "titʲá", "lita", "kiko", "jatǎɁ", "loʦeɁ", "kikaɁ", "kʷitʲeēʔ", "kʷitǫ", "lotǐ", "sitʲiɁ", "tʲoloʔ", "solo", "tinē", "toɁwa", "kitʲì"]
ZCO_corpus2 = [remove_accents(w) for w in ZCO_corpus2]

In [None]:
# Derive the predicted Zacatepec Chatino form of every (accent-stripped)
# proto-form, one word at a time. The rules are applied in the order listed.
new_corpus = []
for proto_form in ZCO_corpus1:
    # Drop the leading asterisk.
    form = proto_form[1:] if proto_form[0] == "*" else proto_form

    # t becomes tʲ after i.
    form = re.sub(r'(?<=i)t', 'tʲ', form)

    # e becomes i between two characters of `consonants`; every u becomes o.
    form = replace_Cconditional("e", "i", form)
    form = form.replace("u", "o")

    # kʷi > null / olo or something
    form = form.replace("kʷitʲoloʔ", "tʲoloʔ")

    new_corpus.append(form)

# Side-by-side comparison of input, target and derived form; "right?" flags exact matches.
dfZCO = pd.DataFrame(
    {
        "Proto-form": ZCO_corpus1,
        "Attested form": ZCO_corpus2,
        "try": new_corpus,
    }
)
dfZCO["right?"] = dfZCO["Attested form"].eq(dfZCO["try"])
# Rows where the derivation does not reproduce the target form.
missing = dfZCO[dfZCO["Attested form"].ne(dfZCO["try"])]
dfZCO

Unnamed: 0,Proto-form,Attested form,try,right?
0,*kite,kitʲe,kitʲe,True
1,*jatiɁ,jatiɁ,jatiɁ,True
2,*tita,titʲa,titʲa,True
3,*leta,lita,lita,True
4,*keku,kiko,kiko,True
5,*jataɁ,jataɁ,jataɁ,True
6,*luʦeɁ,loʦeɁ,loʦeɁ,True
7,*kekaɁ,kikaɁ,kikaɁ,True
8,*kʷiteeʔ,kʷitʲeeʔ,kʷitʲeeʔ,True
9,*kʷeto,kʷito,kʷito,True
