In [1]:
import pandas as pd
import re
import regex

# Reading cells 

The cell tables in `etc/` were made manually; here we just annotate each of them with its POS and concatenate them into a single table:

In [2]:

# Load the hand-made cell tables and tag each one with its part of speech
# before stacking them into a single cells table.
LatInfLexi_verbs_cells = pd.read_csv("etc/LatInfLexi-verbs_cells.csv")
LatInfLexi_verbs_cells["POS"] = "verb"
LatInfLexi_nouns_cells = pd.read_csv("etc/LatInfLexi-nouns_cells.csv")
# The noun tag must go on the noun table: assigning it to the verb table
# would relabel every verb cell as "noun" and leave noun cells without a POS.
LatInfLexi_nouns_cells["POS"] = "noun"
LatInfLexi_cells = pd.concat([LatInfLexi_verbs_cells, LatInfLexi_nouns_cells])

# Reading forms

We get the forms tables from the previous version (v1.1):

In [3]:
%%bash 
# Restore the noun and verb form tables as they were at the v1.1 revision.
git checkout v1.1 -- LatInfLexi-nouns.csv
git checkout v1.1 -- LatInfLexi-verbs.csv

We read these tables

In [4]:
# Load the verb and noun form tables from the working directory.
LatInfLexi_verbs, LatInfLexi_nouns = (
    pd.read_csv(csv_path)
    for csv_path in ("LatInfLexi-verbs.csv", "LatInfLexi-nouns.csv")
)

In [5]:
LatInfLexi_verbs

Unnamed: 0,lexeme,PoSTag:features,form,form_IPA,freqTFTL,freqAntiquitas,freqAetasPatrum,freqMediumAeuum,freqRecentiorLatinitas
0,abalieno,VERB:Fin+Ind+Pres+-+Act+1+Sing+-+-,abaliēnō,abalieːnoː,0,0,0,0,0
1,abalieno,VERB:Fin+Ind+Pres+-+Act+2+Sing+-+-,abaliēnās,abalieːnaːs,0,0,0,0,0
2,abalieno,VERB:Fin+Ind+Pres+-+Act+3+Sing+-+-,abaliēnat,abalieːnat,5,2,2,1,0
3,abalieno,VERB:Fin+Ind+Pres+-+Act+1+Plur+-+-,abaliēnāmus,abalieːnaːmus,0,0,0,0,0
4,abalieno,VERB:Fin+Ind+Pres+-+Act+2+Plur+-+-,abaliēnātis,abalieːnaːtis,2,1,1,0,0
...,...,...,...,...,...,...,...,...,...
850387,uulnero,VERB:Part+-+Fut+-+Act+-+Plur+Voc+Fem,uulnerātūrae,wulneraːtuːraj,0,0,0,0,0
850388,uulnero,VERB:Part+-+Fut+-+Act+-+Plur+Voc+Neut,uulnerātūra,wulneraːtuːra,0,0,0,0,0
850389,uulnero,VERB:Part+-+Fut+-+Act+-+Plur+Abl+Masc,uulnerātūrīs,wulneraːtuːriːs,0,0,0,0,0
850390,uulnero,VERB:Part+-+Fut+-+Act+-+Plur+Abl+Fem,uulnerātūrīs,wulneraːtuːriːs,0,0,0,0,0


# Combining forms tables

Adding POS

In [6]:
# Tag every row with the part of speech of the table it comes from.
LatInfLexi_verbs = LatInfLexi_verbs.assign(POS="verb")
LatInfLexi_nouns = LatInfLexi_nouns.assign(POS="noun")

Generating form_id

In [7]:
def add_form_id(df, suffix=""):
    """Add a ``form_id`` column to ``df``, in place, built from its row labels.

    The current index is moved into a new first column named ``form_id`` and
    replaced by a fresh default index; each value is then rewritten as
    ``"form_<label><suffix>"``.

    Args:
        df: DataFrame to modify in place.
        suffix: Text appended to every identifier.

    Returns:
        None; ``df`` itself is mutated.
    """
    df.rename_axis("form_id", inplace=True)
    df.reset_index(inplace=True)
    labels = df["form_id"].map(str)
    df["form_id"] = "form_" + labels + suffix

# Distinct suffixes per table: both tables are numbered from their own row
# labels, so the suffix is what tells a verb id apart from a noun id.
add_form_id(LatInfLexi_verbs, "_v")
add_form_id(LatInfLexi_nouns, "_n")

Concatenating forms tables:

In [8]:
# Stack verbs and nouns. Each input carries its own 0..n-1 row labels, so a
# plain concat would produce duplicate labels; ignore_index renumbers the rows
# (form_id, not the row label, is the real identifier).
LatInfLexi_forms = pd.concat([LatInfLexi_verbs, LatInfLexi_nouns], ignore_index=True)

# Converting forms to Paralex format

Obtaining columns in Paralex format:

In [9]:
# Original LatInfLexi column name -> Paralex column name.
col_map = {
    "form": "orth_form",
    "form_IPA": "phon_form",
    "PoSTag:features": "cell",
    "freqTFTL": "frequency",
    "freqAntiquitas": "frequency_Antiquitas",
    "freqAetasPatrum": "frequency_AetasPatrum",
    "freqMediumAeuum": "frequency_MediumAeuum",
    "freqRecentiorLatinitas": "frequency_RecentiorLatinitas",
}
LatInfLexi_forms = LatInfLexi_forms.rename(columns=col_map)

Mapping cells to new scheme:

In [10]:
# Lookup from the original LatInfLexi cell label to the new cell_id,
# then translate every form's cell through it.
LatInfLexi_cells_mapper = dict(
    zip(LatInfLexi_cells["LatInFlexi-cell"], LatInfLexi_cells["cell_id"])
)
LatInfLexi_forms["cell"] = LatInfLexi_forms["cell"].map(LatInfLexi_cells_mapper)

In [11]:
LatInfLexi_forms.sample(5)

Unnamed: 0,form_id,lexeme,cell,orth_form,phon_form,frequency,frequency_Antiquitas,frequency_AetasPatrum,frequency_MediumAeuum,frequency_RecentiorLatinitas,POS
489442,form_489442_v,lustro/-or,fut.act.ptcp.nom.n.pl,lūstrātūra,luːstraːtuːra,0,0,0,0,0,verb
659995,form_659995_v,proruo,gdv.dat.n.pl,prōruendīs,proːruendiːs,0,0,0,0,0,verb
529154,form_529154_v,oblecto,prs.pass.inf,oblectārī,oblektaːriː,39,0,13,23,3,verb
760603,form_760603_v,spuo,prs.act.ptcp.voc.n.sg,spuēns,spueːns,2,0,2,0,0,verb
250547,form_250547_v,diligo,gdv.dat.n.pl,dīligendīs,diːliɡendiːs,26,1,8,16,1,verb


Setting form_id as index

In [12]:
# Use form_id as the row label from here on. Re-running this cell alone fails,
# because the form_id column no longer exists once it has become the index.
LatInfLexi_forms = LatInfLexi_forms.set_index("form_id")

In [13]:
LatInfLexi_forms

Unnamed: 0_level_0,lexeme,cell,orth_form,phon_form,frequency,frequency_Antiquitas,frequency_AetasPatrum,frequency_MediumAeuum,frequency_RecentiorLatinitas,POS
form_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
form_0_v,abalieno,prs.act.ind.1.sg,abaliēnō,abalieːnoː,0,0,0,0,0,verb
form_1_v,abalieno,prs.act.ind.2.sg,abaliēnās,abalieːnaːs,0,0,0,0,0,verb
form_2_v,abalieno,prs.act.ind.3.sg,abaliēnat,abalieːnat,5,2,2,1,0,verb
form_3_v,abalieno,prs.act.ind.1.pl,abaliēnāmus,abalieːnaːmus,0,0,0,0,0,verb
form_4_v,abalieno,prs.act.ind.2.pl,abaliēnātis,abalieːnaːtis,2,1,1,0,0,verb
...,...,...,...,...,...,...,...,...,...,...
form_12451_n,uxor,gen.pl,uxōrum,uksoːrum,265,15,124,122,4,noun
form_12452_n,uxor,dat.pl,uxōribus,uksoːribus,725,30,317,373,5,noun
form_12453_n,uxor,acc.pl,uxōrēs,uksoːreːs,2073,90,1064,903,16,noun
form_12454_n,uxor,voc.pl,uxōrēs,uksoːreːs,2073,90,1064,903,16,noun


Adding stress

In [14]:

# Alternation of the consonant symbols recognised by the segmenter.
C = r"b|d|ɡ|m|n|l|r|z|p|pʰ|f|t|tʰ|s|k|kʰ|h"
# Alternation of vowel nuclei: a vowel (optionally long), optionally combined
# with a neighbouring glide (j/w) before or after it, plus the sequence "waj".
V = r"waj|[jw][aeiouy]ː|[aeiouy]ː?|[jw][aeiouy]|[aeiouy][jw]"
# Pattern for a whole word as a sequence of consonants and nuclei; only the
# nuclei are captured (group 1), so their spans can be read back per match.
segmenter = f"(?:{C}|({V}))*?"


def search_vowels(word):
    """Return the spans of all vowel nuclei found in ``word``.

    The whole word is matched against ``segmenter``; every capture of its
    group 1 (a vowel nucleus) contributes one ``(start, end)`` span, in order.

    Args:
        word: Transcribed word to segment.

    Returns:
        A list of ``(start, end)`` index pairs, one per vowel nucleus.

    Raises:
        ValueError: If ``word`` cannot be matched in full by ``segmenter``.
    """
    segmented = regex.fullmatch(segmenter, word)
    if segmented is None:
        # Without this check the failure surfaces as an AttributeError on None,
        # which does not say which word was the problem.
        raise ValueError(f"Cannot segment {word!r} into known consonants and vowels")
    return segmented.spans(1)

# Stop (optionally aspirated) followed by a liquid. The stops are written with
# the symbols of the transcription itself, i.e. "k" rather than orthographic "c".
_LIQUID_CLUSTER = re.compile("^[bdɡpkt]ʰ?[rl]$")


def find_latin_stress(word):
    """Insert the stress mark "ˈ" before the stressed vowel of ``word``.

    Rules applied, in order:
      * the marker "#DEF#" and words without any vowel nucleus are returned
        unchanged;
      * words with one or two nuclei are stressed on the first one;
      * otherwise the penult is stressed when it is long (ends in "ː", "j" or
        "w") or is followed by two or more consonants that do not form a
        stop + liquid cluster; in every other case the antepenult is stressed.

    Args:
        word: Transcribed word (no stress mark yet), or "#DEF#".

    Returns:
        The word with "ˈ" inserted directly before the stressed vowel (after
        a leading glide of the nucleus, if there is one).
    """
    if word == "#DEF#":
        return word

    def stress(span):
        # Place the mark before the vowel itself, skipping a leading glide.
        start = span[0]
        if word[start] in 'wj':
            start += 1
        return word[:start] + "ˈ" + word[start:]

    indexes = search_vowels(word)

    # Nothing to stress (e.g. empty string): leave the word as it is.
    if not indexes:
        return word

    # 2 syllables or less => stress first syllable
    if len(indexes) <= 2:
        return stress(indexes[0])

    *_, antepenult, penult, ultimate = indexes

    # If the penult has a long vowel it is stressed
    if word[slice(*penult)][-1] in {"ː", "j", "w"}:
        return stress(penult)

    # Consonants between the penult and the ultimate nuclei
    c_seq = word[penult[1]:ultimate[0]]

    # At most one C after the penult => short penult => stress antepenult.
    # The aspiration mark is part of its consonant, so it is not counted:
    # "pʰ", "tʰ", "kʰ" are each a single consonant.
    if len(c_seq.replace("ʰ", "")) < 2:
        return stress(antepenult)

    # C seq is a liquid cluster => short penult => stress antepenult
    if _LIQUID_CLUSTER.match(c_seq):
        return stress(antepenult)

    # other C sequence => long penult => stress penult
    return stress(penult)


# Mark stress on every phonological form.
LatInfLexi_forms["phon_form"] = LatInfLexi_forms["phon_form"].map(find_latin_stress)

Separating sounds with spaces

In [15]:

def splitter(series, split_pattern):
    """Space-separate the pieces of each string in ``series``.

    Every string is split with the regular expression ``split_pattern``; the
    non-empty pieces are then joined back together with single spaces.

    Args:
        series: pandas Series of strings.
        split_pattern: Regular expression used as the split separator.

    Returns:
        A Series of strings with pieces separated by spaces.
    """
    def join_nonempty(pieces):
        return " ".join(filter(None, pieces))

    pieces_per_word = series.str.split(pat=split_pattern, regex=True)
    return pieces_per_word.apply(join_nonempty)

# Inventory of sounds used as split units: consonants, glides, plain vowels
# and stressed vowels. "y"/"yː" are listed unstressed as well as stressed so
# that all vowels are treated alike.
sounds = ['b', 'd', 'ɡ', 'm', 'n', 'l', 'r', 'z', 'p', 'pʰ', 'f', 't', 'tʰ', 's',
          'k', 'kʰ', 'h', 'j', 'w', 'a', 'aː', 'e', 'eː', 'i', 'iː', 'o', 'oː',
          'u', 'uː', 'y', 'yː',
          'ˈa', 'ˈaː', 'ˈe', 'ˈeː', 'ˈi', 'ˈiː', 'ˈo', 'ˈoː',
          'ˈu', 'ˈuː',
          'ˈy', 'ˈyː']
# Longest symbols first, so that e.g. "ˈaː" is tried before "a".
split_pattern = "(" + "|".join(sorted(sounds, key=len, reverse=True)) + ")"
LatInfLexi_forms["phon_form"] = splitter(LatInfLexi_forms["phon_form"], split_pattern)

# Adjusting the transcription

In [16]:
# Adding frequencies to the cells

In [17]:
# Total frequency of each cell, summed over all of its forms.
# .sum() is used instead of agg(sum): passing the builtin to agg is deprecated
# in recent pandas versions, while .sum() gives the same result.
cells_freq = (
    LatInfLexi_forms
    .groupby("cell")[["frequency",
                      "frequency_Antiquitas",
                      "frequency_AetasPatrum",
                      "frequency_MediumAeuum",
                      "frequency_RecentiorLatinitas"]]
    .sum()
)
cells_freq.index.name = "cell_id"

In [18]:
cells_freq

Unnamed: 0_level_0,frequency,frequency_Antiquitas,frequency_AetasPatrum,frequency_MediumAeuum,frequency_RecentiorLatinitas
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abl.pl,835283,115180,310927,382000,27176
abl.sg,2816933,297275,962861,1463839,92958
acc.pl,1565479,216650,557840,734860,56129
acc.sg,2663992,246582,945900,1391128,80382
dat.pl,835283,115180,310927,382000,27176
...,...,...,...,...,...
prs.pass.sbjv.3.sg,168334,9945,66246,83631,8512
sup.abl,156341,17168,41767,88748,8658
sup.acc,591907,50305,198546,323267,19789
voc.pl,2113371,236521,758001,1044528,74321


In [19]:
# Index the cells table by cell_id so it lines up with the frequency table.
LatInfLexi_cells = LatInfLexi_cells.set_index("cell_id")

In [20]:
# Attach the per-cell frequencies, matching rows on the index of both tables.
LatInfLexi_cells = LatInfLexi_cells.merge(
    cells_freq,
    left_index=True,
    right_index=True,
)

# Creating the lexemes table

In [21]:
# One row per (lexeme, POS) pair with its frequencies summed over all forms.
# .sum() is used instead of agg(sum): passing the builtin to agg is deprecated
# in recent pandas versions, while .sum() gives the same result.
# NOTE(review): POS is moved back to a column, so a lexeme present with both
# POS values would appear twice in the lexeme_id index - confirm none exist.
LatInfLexi_lexemes = (
    LatInfLexi_forms
    .groupby(["lexeme", "POS"])[["frequency",
                                 "frequency_Antiquitas",
                                 "frequency_AetasPatrum",
                                 "frequency_MediumAeuum",
                                 "frequency_RecentiorLatinitas"]]
    .sum()
    .reset_index("POS", drop=False)
)
LatInfLexi_lexemes.index.name = "lexeme_id"

In [22]:
LatInfLexi_lexemes

Unnamed: 0_level_0,POS,frequency,frequency_Antiquitas,frequency_AetasPatrum,frequency_MediumAeuum,frequency_RecentiorLatinitas
lexeme_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abalieno,verb,393,101,176,113,3
abdico,verb,2256,773,760,683,40
abdo,verb,4850,971,2111,1644,124
abduco,verb,3194,937,1057,1116,84
abeo,verb,6626,1723,1466,3229,208
...,...,...,...,...,...,...
uulgus,noun,7917,1588,2524,2311,1494
uulnero,verb,10975,1167,4903,4784,121
uulnus,noun,18041,3618,7531,6637,255
uultus,noun,25464,4688,9117,11269,390


# Output

Writing it all to file

In [23]:
# Write the three tables next to the notebook, one CSV file each.
outputs = (
    (LatInfLexi_lexemes, "LatInfLexi-lexemes.csv"),
    (LatInfLexi_forms, "LatInfLexi-forms.csv"),
    (LatInfLexi_cells, "LatInfLexi-cells.csv"),
)
for table, csv_path in outputs:
    table.to_csv(csv_path)

Remove temporary files from v1.1

In [24]:
%%bash
# Delete the two v1.1 form tables from the working directory.
# NOTE(review): `git checkout <tree-ish> -- <path>` also updates the index, so
# these files may still be staged after removal - confirm and unstage if needed.
rm LatInfLexi-nouns.csv
rm LatInfLexi-verbs.csv
