In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # this suppresses a warning about pandas from tqdm
import pandas as pd
from ipywidgets import interact
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import zipfile
import json
from tqdm.auto import tqdm
tqdm.pandas() # initiate pandas support in tqdm, allowing progress_apply() and progress_map()
import os
import sys
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from lexicalrichness_v import LexicalRichness as lr

In [3]:
# Load the two pickled line-level data frames from the output directory.
# Later cells read the 'lemma' column of both frames, and 'id_text' and
# 'lemma_mwe' of lit_lines. Pickles should only be loaded from trusted sources.
lit_lines, lex_lines = (
    pd.read_pickle(path) for path in ('output/litlines.p', 'output/lexlines.p')
)

Get a list of all lexical vocabulary items.

In [5]:
# Every lexical entry exactly as it appears in the 'lemma' column
# (an entry may consist of several space-separated words).
vocab = list(lex_lines['lemma'])

# All individual words of those entries, flattened into a single list.
vocab2 = [word for entry in vocab for word in entry.split()]

# Union of full entries and their single words, without duplicates.
vocab_s = set(vocab).union(vocab2)

# Sorted vocabulary, leaving out any item that contains '[na]na'.
vocab_l = sorted(entry for entry in vocab_s if '[na]na' not in entry)

# Word count of each vocabulary item; the longest one (m) is used further
# down as the upper bound of the n-gram range.
length = [len(entry.split()) for entry in vocab_l]
m = max(length)

Use CountVectorizer on *lines* so that ngrams do not extend over the end of a line.

In [6]:
# Count, for every literary line, how often each vocabulary item occurs.
# - token_pattern r'[^ ]+' makes every run of non-space characters one token.
# - ngram_range (1, m) builds n-grams up to the longest vocabulary item.
# - vocabulary=vocab_l restricts the columns to the known vocabulary items.
# NOTE(review): CountVectorizer lowercases its input by default
# (lowercase=True); a vocabulary item containing upper-case characters would
# never be matched -- confirm that the lemmas are already lower-case.
cv = CountVectorizer(token_pattern = r'[^ ]+', ngram_range = (1,m), vocabulary = vocab_l)
dtm = cv.fit_transform(lit_lines['lemma'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per literary line, labelled with the id of the text it belongs to.
df = pd.DataFrame(dtm.toarray(), columns=feature_names, index=lit_lines["id_text"])
df

Unnamed: 0_level_0,a[arm]n,a[arm]n ak[do]v/t,a[arm]n apin[plow]n,a[arm]n bad[open]v/t,a[arm]n bad[wall]n,a[arm]n badsi[parapet]n,a[arm]n be[diminish]v/t,a[arm]n da[line]n,a[arm]n dabašin[object]n,a[arm]n daluš[sling]n,...,šuʾabdu[1]wn,šuʾi[barber]n,šuʾi[barber]n egir[back]n,šuʾi[barber]n gin[firm]v/i,šuʾi[barber]n gina[offering]n,šuʾi[barber]n gu[neck]n,šuʾi[barber]n lugal[king]n,šuʾi[barber]n saŋ[head]n,šuʾu[stone]n,šuʾura[goose]n
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
epsd2/literary/P209784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/P209784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/P209784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/P251427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/P251427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
epsd2/literary/X010001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/X010001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/X010001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
epsd2/literary/X010001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Combine lines into compositions with groupby and aggregate. Note that the number of columns is identical to the number of lexical items in the Venn diagram in 3.1.

In [7]:
# Add up the line-level counts of all rows that share the same id_text, so
# that each row of df now represents one text instead of one line.
# .sum() replaces .agg(sum): passing the builtin `sum` to agg is deprecated
# in recent pandas versions, and the result is the same.
df = df.groupby(['id_text']).sum()
df.shape

(911, 10146)

Remove columns with 0 attestations in the literary corpus

In [8]:
# Total count of each vocabulary item over all rows of df.
column_totals = df.sum(axis=0)
# Keep only the columns whose total is non-zero; copy so that df is an
# independent frame rather than a view on the previous one.
df = df.loc[:, column_totals.ne(0)].copy()
df.shape

(911, 3558)

In [9]:
# Read the vocabulary file into a list with one entry per line
# (line terminators are stripped by splitlines()).
with open('output/lex_lit_vocab.txt', 'r', encoding = 'utf8') as vocab_file:
    vocab_text = vocab_file.read()
v = vocab_text.splitlines()

In [10]:
# Turn the list of lines into a set: removes duplicates and allows the set
# difference computed further down.
v = set(v)

In [11]:
# Number of distinct entries read from output/lex_lit_vocab.txt.
len(v)

3539

In [12]:
# Column labels of df (the vocabulary items kept after dropping all-zero
# columns) as a set, for comparison with v.
df_s = set(df.columns)

In [14]:
# The file joins the words of an entry with '_' while the columns of df use
# a space; rewrite the entries so both sets use the same separator.
v = set(entry.replace('_', ' ') for entry in v)

In [15]:
# Re-check the size: had the replacement of '_' made two entries identical,
# the set would now be smaller than before.
len(v)

3539

In [16]:
# Column labels of df that do not occur in v (the vocabulary read from
# output/lex_lit_vocab.txt).
df_s - v

{'amar[young]n ga[milk]n',
 'duru[wet]v/i šu[hand]n',
 'e[house]n šerda[crime]n',
 'guza[chair]n gar[knob]n',
 'ilu[lament]n di[speak]v/t',
 'iri[city]n gal[big]v/i',
 'ki[place]n uludin[form]n',
 'lu[person]n ma[ship]n rugu[withstand]v/t',
 'lu[person]n šezar[heap]n',
 'maš[goat]n gaba[chest]n',
 'munus[woman]n zid[right]v/i munus[woman]n sag[good]v/i',
 'ninda[bread]n kurum[ration]n',
 'niŋ[thing]n huldim[rotten]aj',
 'saŋ[head]n geme[worker]n',
 'saŋ[head]n si[horn]n',
 'siki[hair]n gig[sick]v/i',
 'udu[sheep]n šub[fall]v/i',
 'umma[old-woman]n gal[big]v/i',
 'usakar[moon]n kugbabbar[silver]n'}

In [21]:
# Single-column DataFrame (double brackets) with the counts for one of the
# items that appear in df but not in v.
test = df[["amar[young]n ga[milk]n"]]

In [23]:
# All values of lit_lines['lemma_mwe'] that contain the underscore-joined
# form of the same item as a substring.
t2 = [
    mwe_line
    for mwe_line in lit_lines['lemma_mwe']
    if 'amar[young]n_ga[milk]n' in mwe_line
]

In [24]:
# Display the matching lemma_mwe values.
t2

['amar[young]n_ga[milk]n_gu[eat]v/t']

The approach here and the approach in section 3.1 differ slightly in that the present approach will also find partly overlapping matches. In section 3.1, the literary line 'amar\[young\]n_ga\[milk\]n_gu\[eat\]v/t' matches the lexical entry 'amar\[young\]n_ga\[milk\]n_gu\[eat\]v/t' but not the shorter entry 'amar\[young\]n_ga\[milk\]n'. In the n-gram approach used here, both matches are found.