# Some basic statistics on the Akkadian ORACC corpus

In [1]:
import glob
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Load the data

In [2]:
path =r'output' # indicate the local path where files are stored
allFiles = glob.glob(path + "/*.csv")
list_ = []
files_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['id_text'] = [file_[7:-4].replace('_', '/') + '/' + text for text in df['id_text']]
    df['lemma'] = [lemma.replace('$', '') for lemma in df['lemma']]
    list_.append(df)
    files_.append(file_[7:-4].replace('_', '/'))
data = pd.concat(list_).reset_index(drop=True)

# How many texts are included?

In [5]:
data.shape

(7071, 2)

In [19]:
length = [text[:5] for text in data['lemma']]
length

['ana[t',
 'ana[t',
 'x[NA]',
 'x[NA]',
 'ana[t',
 'Rib-H',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'x[NA]',
 'x[NA]',
 'x[NA]',
 'ana[t',
 'ana[t',
 'x[NA]',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'umma[',
 'ana[t',
 'umma[',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'Rib-H',
 'Rib-H',
 'Rib-H',
 'ana[t',
 'ana[t',
 'Rib-H',
 'ana[t',
 'ana[t',
 'Rib-H',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ṭuppu',
 'ana[t',
 'šanīt',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'šanīt',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',
 'šâlu[',
 'ana[t',
 'ana[t',
 'ana[t',
 'ana[t',


In [71]:
length = [len(data['lemma'][i].split()) for i in range(len(data['lemma']))]
length = sorted(length)
length = pd.DataFrame(length)
length.describe

<bound method NDFrame.describe of          0
0        1
1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        1
10       1
11       1
12       1
13       1
14       1
15       1
16       1
17       1
18       1
19       1
20       2
21       2
22       2
23       2
24       2
25       2
26       2
27       2
28       2
29       2
...    ...
7041  1701
7042  1732
7043  1752
7044  1834
7045  1889
7046  1910
7047  1913
7048  1984
7049  1992
7050  2138
7051  2149
7052  2151
7053  2170
7054  2179
7055  2191
7056  2291
7057  2479
7058  2580
7059  2621
7060  2643
7061  2915
7062  3069
7063  3172
7064  3262
7065  3448
7066  3829
7067  3947
7068  4715
7069  6146
7070  6278

[7071 rows x 1 columns]>

In [10]:
cv = CountVectorizer(analyzer='word', token_pattern=r'[^ ]+')
fit = cv.fit_transform(data['lemma'])
dtm = pd.DataFrame(fit.toarray(), columns = cv.get_feature_names(), index = data['id_text'])

# Keep lemmatized words only

In [61]:
words = [word for word in dtm.columns if not word.endswith('[na]na')]
dtm = dtm[words]
dtm.describe

<bound method NDFrame.describe of                      a-eridu[1]dn  a-x[00]pn  a[the-sign-a₂]n  \
id_text                                                         
aemw/amarna/P270838             0          0                0   
aemw/amarna/P270861             0          0                0   
aemw/amarna/P270863             0          0                0   
aemw/amarna/P270871             0          0                0   
aemw/amarna/P270872             0          0                0   
aemw/amarna/P270873             0          0                0   
aemw/amarna/P270874             0          0                0   
aemw/amarna/P270875             0          0                0   
aemw/amarna/P270876             0          0                0   
aemw/amarna/P270877             0          0                0   
aemw/amarna/P270879             0          0                0   
aemw/amarna/P270880             0          0                0   
aemw/amarna/P270881             0          0            

In [14]:
terms = ['immeru[sheep]n', 'puhādu[lamb]n', 'arhu[cow]n', 'būru[(bull)-calf]n']
dtm[terms].sum()

immeru[sheep]n        435
puhādu[lamb]n          40
arhu[cow]n             18
būru[(bull)-calf]n     14
dtype: int64

In [15]:
dtm.sum().sort_values(ascending=False)

ina[in]prp                              25434
ana[to]prp                              24914
ša[of]det                               24424
šarru[king]n                            18566
u[and]cnj                               16753
māru[son]n                              16188
bēlu[lord]n                             12900
ša[that]rel                             11070
ūmu[day]n                                8007
lā[not]mod                               6906
bītu[house]n                             6662
mātu[land]n                              5733
libbu[interior]n                         4993
ilu[god]n                                4282
šību[witness]n                           4035
alāku[go]v                               3971
pānu[front]n                             3607
muhhu[skull]n                            3577
unqu[ring]n                              3516
epēšu[do]v                               3384
ištu[from]prp                            3256
šuāti[him]ip                      