# Search Sign Lists
Testing

In [92]:
%matplotlib inline  
# %matplotlib inline enables drawing of visualizations in the Notebook
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # this suppresses a warning about pandas from tqdm
import pandas as pd
from ipywidgets import interact
import os
import sys
from tqdm.auto import tqdm
tqdm.pandas() # initiate pandas support in tqdm, allowing progress_apply() and progress_map()
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
from nltk.tokenize import MWETokenizer
import zipfile
import json
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

#### 3.1.0.1 Read Lexical Data
The module `utils` in the `utils` directory of Compass includes the function `get_data()` which essentially runs the same code as the [Extended ORACC Parser](../2_1_Data_Acquisition_ORACC/2_1_3_extended_ORACC-JSON_parser.ipynb) (see there for explanation of the code). Its only parameter is a string with [ORACC](http://oracc.org) project names, separated by commas. It returns a Pandas DataFrame in which each word is represented by a row.

In [34]:
# Comma-separated ORACC project names; this string is the only argument get_data() takes.
projects = "dcclt, dcclt/nineveh, dcclt/signlists, dcclt/ebla"
# get_data() comes from the local `utils` module (star-imported in the imports cell).
# Per the description above it returns a DataFrame with one row per word.
words = get_data(projects)

Downloading JSON
Saving http://build-oracc.museum.upenn.edu/json/dcclt-nineveh.zip as jsonzip/dcclt-nineveh.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/nineveh', max=1.0, style=Progress…


Saving http://build-oracc.museum.upenn.edu/json/dcclt-ebla.zip as jsonzip/dcclt-ebla.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/ebla', max=1.0, style=ProgressSty…


Saving http://build-oracc.museum.upenn.edu/json/dcclt-signlists.zip as jsonzip/dcclt-signlists.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/signlists', max=1.0, style=Progre…


Saving http://build-oracc.museum.upenn.edu/json/dcclt.zip as jsonzip/dcclt.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt', max=1.0, style=ProgressStyle(de…


Parsing JSON


HBox(children=(FloatProgress(value=0.0, description='dcclt/nineveh', max=664.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='dcclt/ebla', max=156.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='dcclt/signlists', max=136.0, style=ProgressStyle(descript…

dcclt/signlists/X003931 is not available or not complete



HBox(children=(FloatProgress(value=0.0, description='dcclt', max=4526.0, style=ProgressStyle(description_width…




#### 3.1.0.2 Lemmas
Create a lemma column and lowercase all lemmas.

The `lemma` column is created by combining Citation Form (`cf`), Guide Word (`gw`) and Part of Speech (`pos`). The Pandas `apply()` function (called here through tqdm's `progress_apply()`, which adds a progress bar) applies a function to every row (`axis = 1`) or column (`axis = 0`) of a dataframe. The function used here is `prepare()`, defined below. It concatenates the strings of the `cf`, `gw`, and `pos` columns (with `[` and `]` as separators), so that a single lemma looks like `lugal[king]N`, and then lowercases the result (`lugal[king]n`). The function has one condition: if there is no Citation Form (column `cf` equals the empty string) the contents of the column `form` are taken, followed by `[NA]NA`. The absence of a Citation Form implies that the word was not lemmatized (perhaps an unknown or a broken word). The field `form` contains the raw transliteration - the result may be `x-ra-bi[NA]NA` (lowercased: `x-ra-bi[na]na`).

If the field `form` is empty as well (which happens, for instance, when a row represents a horizontal ruling on the tablet), this would result in the `lemma` entry `[NA]NA`. In those cases `prepare()` sets the value of `lemma` to the empty string instead.

For the current analysis we will use *lemmatized* forms for the comparison between literary and lexical vocabulary. The advantage of using lemmatized forms is that we can easily match, for instance `a naŋ` (to drink water) with `a mu-naŋ` (he drank water), because both are lemmatized as `a[water]N naŋ[drink]V/t`. The unlemmatized forms, therefore, are of little importance here. We need to keep them, for now, because we will also compare *sequences* of lemmas in lexical and literary texts. Premature removal of unlemmatized forms would result in false positives. For instance, the sequence `dumu[child]N x[NA]NA lugal[king]N` should *not* result in a match for the lemma sequence (or multiple word expression) `dumu[child]N lugal[king]N`.

In [42]:
def prepare(row):
    """Add a lowercased `lemma` and a text-qualified `id_line` to one word row.

    The lemma is `cf[gw]pos` when a citation form is present, `form[NA]NA`
    when only the raw form is present, and the empty string when both are
    empty; it is lowercased in every case. `id_line` is replaced by
    `<id_text>.<line number zero-padded to 4 characters>`.

    The row is modified in place and the same object is returned, so the
    function can be passed to `DataFrame.apply(..., axis=1)`.
    """
    citation_form = row['cf']
    if citation_form == '':
        # No citation form: fall back on the raw form, or on nothing at all.
        raw_form = row['form']
        lemma = '' if raw_form == '' else f'{raw_form}[NA]NA'
    else:
        lemma = f'{citation_form}[{row["gw"]}]{row["pos"]}'
    row['lemma'] = lemma.lower()

    # Qualify the line number with the text ID; zero-padding keeps string sort order numeric.
    text_id = row['id_text']
    padded_line = str(row['id_line']).zfill(4)
    row['id_line'] = f'{text_id}.{padded_line}'
    return row

In [43]:
# Run prepare() on every row (axis = 1) with a progress bar; progress_apply() is
# enabled by the tqdm.pandas() call in the imports cell.
# NOTE(review): this cell is not idempotent. prepare() rewrites `id_line` as
# "<id_text>.<line>", so running it again on the same `words` prefixes the text ID
# a second time. Re-run the get_data() cell before re-running this one.
words = words.progress_apply(prepare, axis = 1)

HBox(children=(FloatProgress(value=0.0, max=1529463.0), HTML(value='')))




Select the lines that include an entry with field = 'sg'

In [49]:
# IDs of every line that has at least one word whose field is 'sg'.
lines = words.loc[words['field'].eq('sg'), 'id_line']
# Keep all words (whatever their field) that sit on one of those lines.
words = words[words['id_line'].isin(lines)]

In [54]:
# Narrow the table to the columns used in the rest of the notebook.
words2 = words.loc[
    :,
    ['id_text', 'id_line', 'lang', 'lemma', 'form', 'field'],
]

In [55]:
# Display the reduced table for inspection.
words2

Unnamed: 0,id_text,id_line,lang,lemma,form,field
1370,dcclt/nineveh/P382640,dcclt/nineveh/P382640.0004,sux,bar[na]na,bar,sv
1371,dcclt/nineveh/P382640,dcclt/nineveh/P382640.0004,sux,ba-ar₂[na]na,ba-ar₂,pr
1372,dcclt/nineveh/P382640,dcclt/nineveh/P382640.0004,sux,bar[na]na,BAR,sg
1373,dcclt/nineveh/P382640,dcclt/nineveh/P382640.0004,akk-x-stdbab,bāru[the-sign-bar]n,ba-a-ru,sn
1374,dcclt/nineveh/P382640,dcclt/nineveh/P382640.0004,akk-x-stdbab,x[na]na,x,eq
...,...,...,...,...,...,...
1526534,dcclt/P229481,dcclt/P229481.0029,sux,tuk[na]na,TUK,sg
1526535,dcclt/P229481,dcclt/P229481.0030,sux,x-x[na]na,x-x,pr
1526536,dcclt/P229481,dcclt/P229481.0030,sux,tuk[na]na,TUK,sg
1526537,dcclt/P229481,dcclt/P229481.0031,sux,x-x[na]na,x-x,pr


In [87]:
# Only the rows whose field is 'eq'.
words3 = words2[words2['field'].eq('eq')]

In [96]:
# The form to look up among the 'sg' rows.
requested = 'TE'
# Line IDs on which an 'sg' row has exactly the requested form.
relevant = words2.loc[
    words2['field'].eq('sg') & words2['form'].eq(requested),
    'id_line',
]
# The 'eq' rows found on those lines, reduced to the columns that are displayed.
words4 = words3.loc[
    words3['id_line'].isin(relevant),
    ['id_text', 'lang', 'form', 'lemma'],
]

In [97]:
# HTML link template: the first placeholder is the text ID in the URL, the second is the
# visible link text. HTML attributes are separated by whitespace only, so the stray comma
# that followed the href attribute has been removed.
# NOTE(review): the URL always points into the top-level dcclt project, although id_text
# may come from a subproject such as dcclt/nineveh - confirm that these links resolve.
anchor = '<a href="http://oracc.org/dcclt/{}" target="_blank">{}</a>'
# Work on a copy so that words4 keeps the plain text IDs.
words5 = words4.copy()
# The last seven characters of id_text are the bare text ID (e.g. P382640).
words5['id_text'] = words5.id_text.str[-7:]
words5['id_text'] = [anchor.format(val, val) for val in words5['id_text']]

In [100]:
# max(1, ...) keeps the slider range valid (min <= max) even when words5 is empty.
@interact(sort_by = list(words5.columns), rows = (1, max(1, len(words5)), 1))
def sort_df(sort_by = "id_text", ascending = False, rows = 25):
    """Return the first `rows` rows of `words5`, sorted by one column, as a Styler.

    Parameters
    ----------
    sort_by : str
        Column of `words5` to sort on. The default is "id_text": the previous
        default "id_line" is not a column of `words5`, so a direct call raised
        a KeyError (the widget silently ignored the invalid default).
    ascending : bool
        Sort direction.
    rows : int
        Number of rows to show after sorting.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styled view of the selected rows; returning the Styler lets the HTML
        links in `id_text` render as links.
    """
    top_rows = words5.sort_values(by = sort_by, ascending = ascending).reset_index(drop=True)[:rows]
    return top_rows.style

interactive(children=(Dropdown(description='sort_by', options=('id_text', 'lang', 'lemma', 'form'), value='id_…