# 3.3 Looking at the Lexical Vocabulary from the Perspective of the Literary Material

The goal of this notebook is to explore the connection between the literary corpus and individual lexical texts. In order to do so we will construct the full literary vocabulary (lemmas, bigrams, and trigrams), build a DTM of the lexical texts over that vocabulary, and see which lexical texts have a larger or smaller intersection with it.

In [48]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # this suppresses a warning about pandas from tqdm
import pandas as pd
from ipywidgets import interact
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm.auto import tqdm
tqdm.pandas() # initiate pandas support in tqdm, allowing progress_apply() and progress_map()
from nltk import trigrams, bigrams
import zipfile
import json

In [2]:
# Load the literary corpus, one row per text line (the displayed columns are
# id_text, id_line, lemma, and lemma_mwe), and show it.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles
# produced by this project.
lit_lines = pd.read_pickle('output/litlines.p')
lit_lines

Unnamed: 0,id_text,id_line,lemma,lemma_mwe
0,epsd2/literary/P209784,4,niŋšu[goods]n ŋal[be]v/i,niŋšu[goods]n ŋal[be]v/i
1,epsd2/literary/P209784,5,ibi[smoke]n,ibi[smoke]n
2,epsd2/literary/P209784,6,an[sky]n e[leave]v/i,an[sky]n e[leave]v/i
3,epsd2/literary/P251427,4,x[na]na x-x[na]na gal[big]v/i anki[universe]n ...,x[na]na x-x[na]na gal[big]v/i anki[universe]n ...
4,epsd2/literary/P251427,5,utu[1]dn nirŋal[authoritative]aj dumu[child]n ...,utu[1]dn nirŋal[authoritative]aj dumu[child]n ...
...,...,...,...,...
44196,epsd2/literary/X010001,58,lal[syrup]n ŋeštin[vine]n ulušin[beer]n kurun[...,lal[syrup]n ŋeštin[vine]n ulušin[beer]n kurun[...
44197,epsd2/literary/X010001,60,kirugu[notation]n ešakamak[third]nu,kirugu[notation]n ešakamak[third]nu
44198,epsd2/literary/X010001,62,šid[count]v/t 4(u)[na]na 9(diš)[na]na mu[name]n,šid[count]v/t 4(u)[na]na 9(diš)[na]na mu[name]n
44199,epsd2/literary/X010001,63,širnamšubak[subscript]n gula[1]dn,širnamšubak[subscript]n gula[1]dn


Make ngrams: unigrams, bigrams, and trigrams. Represent bigrams and trigrams as MWEs, connected by underscores. Create a full list of all lemmas and ngrams, omitting all non-lemmatized words (or ngrams that include non-lemmatized words).

In [3]:
def make_ngrams(lemmas):
    """Add the unigrams, bigrams, and trigrams of one line to `lit_vocab`.

    `lemmas` is a string of whitespace-separated lemmas. Bigrams and trigrams
    are formed from consecutive lemmas and joined with underscores. Entries
    that contain '[na]na' (a non-lemmatized word) are discarded.

    The result is appended to the module-level list `lit_vocab`, which must
    exist before this function is called. Nothing is returned.
    """
    tokens = lemmas.split()
    # Multi-word entries: first all consecutive pairs, then all triples.
    joined = ['_'.join(gram) for gram in bigrams(tokens)]
    joined += ['_'.join(gram) for gram in trigrams(tokens)]
    # Deduplicate within the line, then drop anything involving '[na]na'.
    kept = [entry for entry in set(tokens + joined) if '[na]na' not in entry]
    lit_vocab.extend(kept)
    return None

In [4]:
# make_ngrams() appends to this module-level list, so it has to exist (and be
# empty) before the apply below runs.
lit_vocab = []
lit_lines['lemma'].progress_apply(make_ngrams)
# Deduplicate across lines and put the vocabulary in sorted order.
lit_vocab = sorted(set(lit_vocab))
lit_vocab[:25]

HBox(children=(FloatProgress(value=0.0, max=44201.0), HTML(value='')))




['a.igi.lu[boatman]n',
 'a.igi.lu[boatman]n_šir[song]n',
 'a.igi.lu[boatman]n_šir[song]n_dug[good]v/i',
 'a.zi&zi.lagab[grass]n',
 'a.zi&zi.lagab[grass]n_a[water]n',
 'a.zi&zi.lagab[grass]n_a[water]n_de[pour]v/t',
 'a.zi&zi.lagab[grass]n_duašaga[1]sn',
 'a.zi&zi.lagab[grass]n_e[speak]v/t',
 'a.zi&zi.lagab[grass]n_gid[long]v/i',
 'a.zi&zi.lagab[grass]n_gid[long]v/i_ašag[field]n',
 'a.zi&zi.lagab[grass]n_mu[grow]v/i',
 'a.zi&zi.lagab[grass]n_munud[bed]n',
 'a[arm]n',
 'a[arm]n_ak[do]v/t',
 'a[arm]n_ak[do]v/t_enlil[1]dn',
 'a[arm]n_al[cvne]n',
 'a[arm]n_al[cvne]n_e[speak]v/t',
 'a[arm]n_ala[manacles]n',
 'a[arm]n_ala[manacles]n_la[hang]v/t',
 'a[arm]n_an[1]dn',
 'a[arm]n_an[1]dn_šum[give]v/t',
 'a[arm]n_an[sky]n',
 'a[arm]n_an[sky]n_bad[open]v/t',
 'a[arm]n_ana[what?]qp',
 'a[arm]n_ana[what?]qp_si[fill]v/t']

> Note: This step can also be done with `CountVectorizer`, setting `ngram_range = (1, 3)`. Disadvantages of that approach:
> - we do not need a full DTM for the literary corpus
> - the DTM would have to be made on *lines* instead of *documents* to prevent words from consecutive lines from forming bigrams or trigrams; afterwards, use `groupby` and `agg` to aggregate the DTM to the document level


In [5]:
#lit_comp = lit_lines.groupby(['id_text']).agg({'lemma' : ' '.join}).reset_index()
#lit_comp['lemma'] = [lem for lem in lit_comp['lemma'] if not '[na]na' in lem] # remove unlemmatized 

In [6]:
#tv = TfidfVectorizer(token_pattern = r'[^ ]+', ngram_range = (1,3))
#dtm = tv.fit_transform(lit_comp['lemma'])
#lit_df = pd.DataFrame(dtm.toarray(), columns= tv.get_feature_names(), index=lit_comp["id_text"])
#cols = [col for col in lit_df.columns if not '[na]na' in col]
#lit_df = lit_df[cols]

# Read Lexical Corpus

In [7]:
# Load the lexical corpus, one row per line (entry) of a lexical text.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles
# produced by this project.
lex_lines = pd.read_pickle('output/lexlines.p')
# Join the words of each entry with underscores, the same form used for the
# bigrams and trigrams of the literary vocabulary.
lex_lines['lemma'] = lex_lines['lemma'].str.replace(' ', '_', regex=False)
# Drop every entry that contains a non-lemmatized word. regex=False matches the
# literal text '[na]na'; the earlier non-raw pattern with backslash-escaped
# brackets relied on invalid string escapes, which newer Python versions warn about.
lex_lines = lex_lines.loc[~lex_lines['lemma'].str.contains('[na]na', regex=False)]
lex_lines

Unnamed: 0,id_text,id_line,lemma
0,dcclt/P117394,2,kid[mat]n
1,dcclt/P117394,3,kid[mat]n_andul[shade]n
2,dcclt/P117394,4,kid[mat]n_antadul[cloak]n
3,dcclt/P117395,2,ŋeše[key]n
4,dcclt/P117395,3,pakud[~tree]n
...,...,...,...
69313,dcclt/signlists/Q000056,531,gakkul[mash-tub]n
69315,dcclt/signlists/Q000056,534,kilib[total]n
69316,dcclt/signlists/Q000056,535,šuniŋin[total]n
69317,dcclt/signlists/Q000056,536,šuniŋin[total]n


In [21]:
# One row per lexical text: concatenate the entries of all its lines,
# separated by single spaces.
lex_comp = (
    lex_lines
    .groupby('id_text')['lemma']
    .agg(' '.join)
    .reset_index()
)
lex_comp

Unnamed: 0,id_text,lemma
0,dcclt/P117394,kid[mat]n kid[mat]n_andul[shade]n kid[mat]n_an...
1,dcclt/P117395,ŋeše[key]n pakud[~tree]n raba[clamp]n
2,dcclt/P117396,hašhur[apple]n hašhur[apple]n_baza[dwarf]n haš...
3,dcclt/P117397,laqipu[1]dn ninkugnunak[1]dn ninagrunak[1]dn
4,dcclt/P117404,ig[door]n_eren[cedar]n ig[door]n_dib[board]n i...
...,...,...
809,dcclt/signlists/P333171,nun[object]n nun[prince]n nun[object]n gurud[t...
810,dcclt/signlists/P447993,ba[allot]v/t zaʾe[you]ip ŋaʾe[i]ip sag[good]v/...
811,dcclt/signlists/P447994,zah[disappear]v/i zah[disappear]v/i zah[disapp...
812,dcclt/signlists/P447997,lahar[ewe]n sag[good]v/i ne[brazier]n zah[mark...


Since the data are drawn from multiple (sub)projects, it is possible that there are duplicates. We take the version with the largest number of (lemmatized) words.

In [22]:
# Strip the project prefix: keep only the last seven characters of the ID.
lex_comp['id_text'] = lex_comp['id_text'].str[-7:]
# Number of whitespace-separated entries in each text.
lex_comp['length'] = lex_comp['lemma'].str.split().str.len()
# Put the longest version of each text first so that drop_duplicates keeps it.
lex_comp = (
    lex_comp
    .sort_values(by='length', ascending=False)
    .drop_duplicates(subset='id_text', keep='first')
)
lex_comp

Unnamed: 0,id_text,lemma,length
760,Q000050,izi[fire]n ne[brazier]n didal[ashes]n didal[as...,1020
758,Q000047,lu[person]n lugal[king]n namdumu[status]n sukk...,902
762,Q000055,a[water]n duru[wet]v/i a[water]n a[water]n aya...,777
753,Q000039,taškarin[boxwood]n esi[tree]n ŋešnu[tree]n hal...,706
754,Q000040,gašam[reed]n gišulhi[reed]n gizi[reed]n gi.ne[...,645
...,...,...,...
711,P427591,har[ring]n,1
45,P225075,šagadu[belt]n,1
796,P333147,umun[insect]n,1
366,P229897,tun[lip]n,1


In [43]:
# Binary document-term matrix of the lexical texts, restricted to the literary
# vocabulary: a cell is 1 if the lexical text contains that lemma/n-gram.
cv = CountVectorizer(
    preprocessor=lambda x: x,       # identity: overrides the default preprocessing (no lowercasing)
    tokenizer=lambda x: x.split(),  # each underscore-joined entry is one token
    token_pattern=None,             # unused with a custom tokenizer; None avoids the sklearn warning
    vocabulary=lit_vocab,
    binary=True,
)
dtm = cv.fit_transform(lex_comp['lemma'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# NOTE(review): toarray() makes the matrix dense (texts x full literary
# vocabulary); the all-zero columns are removed in the next cell.
lex_df = pd.DataFrame(dtm.toarray(), columns=feature_names, index=lex_comp["id_text"])

In [44]:
# Keep only the vocabulary items that occur in at least one lexical text
# (columns whose sum is not zero).
attested = lex_df.sum(axis=0) != 0
lex_df = lex_df.loc[:, attested].copy()
# Remember the vocabulary columns before any summary column is added.
vocab = lex_df.columns

In [45]:
# Per lexical text: the number of vocabulary columns with a match (row sum of
# the binary matrix over the `vocab` columns).
lex_df["n_matches"] = lex_df.loc[:, vocab].sum(axis="columns", numeric_only=True)

In [46]:
# Show the matrix: one row per lexical text, one column per matched
# vocabulary item, plus the n_matches total in the last column.
lex_df

Unnamed: 0_level_0,a[arm]n,a[arm]n_ak[do]v/t,a[arm]n_bad[open]v/t,a[arm]n_dar[split]v/t,a[arm]n_daŋal[wide]v/i,a[arm]n_durah[goat]n,a[arm]n_e[leave]v/i,a[arm]n_gab[left]n,a[arm]n_gal[big]v/i,a[arm]n_gud[ox]n,...,šutug[reed-hut]n,šutum[storehouse]n,šutur[garment]n,šuziʾana[1]dn,šuš[cover]v/t,šušin[1]sn,šušru[distressed]v/i,šuʾi[barber]n,šuʾura[goose]n,n_matches
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q000050,1,1,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,532
Q000047,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,404
Q000055,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,598
Q000039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,202
Q000040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P427591,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
P225075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
P333147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
P229897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [99]:
# First get the metadata. 
cat = {}
for proj in ['dcclt', 'dcclt/signlists', 'dcclt/nineveh', 'dcclt/ebla']:
    f = proj.replace('/', '-')
    file = f"jsonzip/{f}.zip" # The ZIP file was downloaded in the previous notebook
    z = zipfile.ZipFile(file) 
    st = z.read(f"{proj}/catalogue.json").decode("utf-8")
    j = (json.loads(st))
    cat.update(j["members"])
cat_df = pd.DataFrame(cat).T
cat_df["id_text"] = cat_df["id_text"].fillna(cat_df["id_composite"])
cat_df = cat_df[["id_text", "designation"]]

In [100]:
# Attach the match counts to the catalogue data. The inner join on id_text
# keeps only texts present in both the catalogue and lex_df.
lex = cat_df.merge(lex_df['n_matches'], how='inner', on='id_text')

In [102]:
# Rank the lexical texts by the number of matches, highest first.
lex = lex.sort_values('n_matches', ascending=False)
lex

Unnamed: 0,id_text,designation,n_matches
806,Q000055,OB Nippur Ea,598
804,Q000050,OB Nippur Izi,532
802,Q000047,OB Nippur Lu,404
194,P228842,"MSL 14, 018 Bb",333
808,Q000057,OB Nippur Diri,262
...,...,...,...
11,P209818,"Ontario 2, 502",0
32,P225015,"TIM 10, 038",0
31,P225009,"TIM 10, 032",0
600,P297195,"BIN 02, 054",0


Now normalize!