In [1]:
import pandas as pd
import zipfile
import json
from nltk.tokenize import MWETokenizer

In [59]:
file = "../2_2_Data_Acquisition_ETCSL/Output/alltexts.csv"
etcsl = pd.read_csv(file, keep_default_na=False)
etcsl = etcsl.loc[etcsl["lang"].str.contains("sux")]  # throw out non-Sumerian words

In [60]:
etcsl["lemma"] = etcsl.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
etcsl['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in etcsl['lemma'] ] 
# kick out empty forms
etcsl["lemma"] = etcsl["lemma"].str.lower()

In [4]:
file = "../2_1_Data_Acquisition_ORACC/output/parsed.csv"
lexical = pd.read_csv(file, keep_default_na=False)
lexical = lexical.loc[lexical["lang"].str.contains("sux")]

In [5]:
lexical["lemma"] = lexical.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
lexical['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in lexical['lemma'] ] 
# kick out empty forms
lexical["lemma"] = lexical["lemma"].str.lower()

In [6]:
lexical = lexical[~lexical["field"].isin(["sg", "pr"])] # remove lemmas that derive from the fields "sign" 
# or "pronunciation" in sign lists.

In [7]:
lexical = lexical.groupby([lexical['id_text'], lexical['id_line']]).agg({
        'lemma': tuple
    }).reset_index()

In [64]:
etcsl = etcsl.groupby([etcsl['id_text'], etcsl['id_line'], etcsl['text_name']]).agg({
        'lemma': list
    }).reset_index()

In [35]:
# kick out tuples that contain any lemma with "[na]na" and tuples of length lower than 2
lex_vocab = list(set(lexical["lemma"]))
lex_vocab_l = []
for tup in lex_vocab:
    add = True
    if len(tup) < 2:
        continue
    for lemma in tup:
        if "[na]na" in lemma:
            add = False
    if add:
        lex_vocab_l.append(tup)
lex_vocab_l

[('ŋeštin[vine]n', 'šušru[grape]n'),
 ('a[arm]n', 'mah[great]v/i'),
 ('sa[bundle]n', 'sig[tie]v/t'),
 ('lu[person]n', 'izi[fire]n'),
 ('kinkin[millstone]n', 'zu[tooth]n'),
 ('gur[rim]n', 'balaŋdi[instrument]n'),
 ('gu[neck]n', 'niŋgilim[rodent]n'),
 ('dal[vessel]n', 'kugbabbar[silver]n'),
 ('lub[bag]n', 'patirum[leather-bag]n'),
 ('ašag[field]n', 'ŋeš[tree]n', 'ur[drag]v/t'),
 ('e₂.eš₂[prison]n', 'nun[prince]n'),
 ('a[water]n', 'sig[put]v/t'),
 ('kir[lamb]n', 'ŋeš[penis]n', 'zu[know]v/t'),
 ('ukuš[cucumber]n', 'liligi[plant]n'),
 ('a[arm]n', 'su[sprinkle]v/t', 'kugbabbar[silver]n'),
 ('ganum[stand]n', 'kaš[beer]n'),
 ('il[basket]n', 'siki[hair]n'),
 ('agin[thus]n', 'lu[person]n', 'zaʾe[you]ip'),
 ('giŋ[ax]n', 'kugbabbar[silver]n'),
 ('ušu[thirty]nu', 'mana[unit]n'),
 ('gidim[fork]n', 'tur[be-small]v/i'),
 ('aŋkiluh[broom]n', 'niŋkiluh[broom]n'),
 ('buru[harvest]n', 'ed[ascend]v/i'),
 ('lu[person]n', 'lu₂.im[criminal]n'),
 ('e[house]n', 'ŋeškešda[dam]n'),
 ('ninda[bread]n', 'abba[father

In [41]:
tokenizer = MWETokenizer(lex_vocab_l)

In [65]:
etcsl["lemma2"] = etcsl.apply(lambda x: (tokenizer.tokenize(x["lemma"])), axis=1)
etcsl[["lemma","lemma2"]]

Unnamed: 0,lemma,lemma2
0,[dubsaŋ[first]aj],[dubsaŋ[first]aj]
1,"[enki[1]dn, unu[dwelling]n, gal[big]v/i, ed[as...","[enki[1]dn, unu[dwelling]n, gal[big]v/i, ed[as..."
2,[anzag[horizon]n],[anzag[horizon]n]
3,"[anŋi[eclipse]n, zu[know]v/t, ama[mother]n, tu...","[anŋi[eclipse]n, zu[know]v/t, ama[mother]n, tu..."
4,"[gi[thicket]n, tuku[rock]v/t]","[gi[thicket]n, tuku[rock]v/t]"
5,"[an[na]na, kaš₄[na]na, an[na]na, kaš₄[na]na, m...","[an[na]na, kaš₄[na]na, an[na]na, kaš₄[na]na, m..."
6,"[mašmaš[sorcerer]n, erim[enemy]n, kur[differen...","[mašmaš[sorcerer]n, erim[enemy]n, kur[differen..."
7,"[ŋiriŋena[path]n, enki[1]dn, ki[place]n, unu[d...","[ŋiriŋena[path]n, enki[1]dn, ki[place]n, unu[d..."
8,"[šag[heart]n, pu₂[na]na, 1-kam-ma[1st]nu]","[šag[heart]n, pu₂[na]na, 1-kam-ma[1st]nu]"
9,[dubsaŋ[first]aj],[dubsaŋ[first]aj]


In [52]:
etcsl_l = tokenizer.tokenize_sents(etcsl["lemma"])
#probably better to use an apply function to tokenize within the DF.

In [55]:
mwe = []
for line in etcsl_l:
    for lemma in line:
        if "_" in lemma:
            mwe.append(line)
            continue
mwe, len(mwe)

([['šul[youth]v/i', 'a[arm]n_la[hang]v/t'],
  ['ŋiri[foot]n',
   'iš[na]na',
   'lu[person]n_inim[word]n',
   'zid[right]v/i',
   'ga[na]na',
   'ka×x[na]na',
   'ba[na]na'],
  ['enlil[1]dn', 'dug[speak]v/t_kur[different]v/i'],
  ['munus[woman]n_zid[right]v/i', 'mi[cvne]n_dug[speak]v/t'],
  ['munus[woman]n_zid[right]v/i', 'mi[cvne]n_dug[speak]v/t'],
  ['lugal[king]n', 'inim[word]n_sag[good]v/i'],
  ['ursaŋ[hero]n', 'ni[fear]n_gal[big]v/i', 'gur[lift]v/t'],
  ['lugal[king]n', 'ŋeštug[ear]n_daŋal[wide]v/i'],
  ['gud[ox]n_si[horn]n', 'ŋar[place]v/t'],
  ['ud[sun]n_zal[pass]v/t'],
  ['ursaŋ[hero]n',
   'piriŋ[lion]n_huš[reddish]v/i',
   'uru[flood]n',
   'me[being]n',
   'gal[big]v/i'],
  ['sahar[soil]n', 'kag[mouth]n_du[build]v/t'],
  ['lugal[king]n', 'mi[cvne]n_dug[speak]v/t'],
  ['en[lord]n', 'kur[mountain]n', 'lu[person]n_til[live]v/i'],
  ['kud[fish]n', 'e[house]n_du[build]v/t'],
  ['saŋki[forehead]n_gid[long]v/i'],
  ['gi[thicket]n', 'ana[what?]qp', 'gu[voice]n_de[pour]v/t'],
  ['an[

In [None]:
s1 = [n.split() for n in s]
s1

In [None]:
tokenizer.tokenize_sents(s1)

In [None]:
etcsl_t = etcsl[["lemma", "id_text", "id_line"]]

In [None]:
etcsl_t = etcsl_t[10000:10100]

In [None]:
etcsl_t_g = etcsl_t.groupby([etcsl_t["id_text"], etcsl_t["id_line"]]).agg({"lemma": list})

In [None]:
etcsl_t_g["lemma"][1]

In [None]:
etcsl_lines = list(etcsl_t_g["lemma"])

In [None]:
tokenizer.tokenize_sents(etcsl_lines)

In [8]:
lexical

Unnamed: 0,id_text,id_line,lemma
0,dcclt/P000723,4,"(x-bad[na]na,)"
1,dcclt/P000723,5,"(x-|en+ib|[na]na,)"
2,dcclt/P000723,7,"(gada-sukkal[na]na,)"
3,dcclt/P000723,8,"(gal-ga[na]na,)"
4,dcclt/P000723,9,"(tug₂-gara₂[na]na,)"
5,dcclt/P000723,12,"(gal-ŋišgal[na]na,)"
6,dcclt/P000723,13,"(gal-|ga₂×di.me|[na]na,)"
7,dcclt/P000723,14,"(saŋŋa[na]na, |ga₂×gar.me|[na]na)"
8,dcclt/P000723,15,"(x[na]na, |ga₂×x|[na]na)"
9,dcclt/P000724,5,"(x-suhur[na]na,)"


In [32]:
t = ('kinkin[millstone]n', 'zu[tooth]n')
len(t)

2