# Search Sign Lists
Search sign lists by sign. Display glosses, lemmatizations, and equivalencies in other languages.

## 0. Preparation
Import the required modules

In [1]:
import pandas as pd
from tqdm.auto import tqdm
from ipywidgets import interact
import os
import sys
# Make the helper module in the sibling `utils` directory importable from this
# notebook by appending its absolute path to the module search path.
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
# NOTE(review): the wildcard import hides which names come from `utils`;
# `get_data` is the only one called in the cells visible here.
from utils import *

## 1 Read Lexical Data
The module `utils` in the `utils` directory of Compass includes the function `get_data()` which essentially runs the same code as the [Extended ORACC Parser](../2_1_Data_Acquisition_ORACC/2_1_3_extended_ORACC-JSON_parser.ipynb) (see there for explanation of the code). Its only parameter is a string with [ORACC](http://oracc.org) project names, separated by commas. It returns a Pandas DataFrame in which each word is represented by a row.

In [2]:
# ORACC project names, passed to get_data() as a single comma-separated string.
projects = "dcclt, dcclt/nineveh, dcclt/signlists, dcclt/ebla"
# Download and parse the project JSON; the result is used below as a DataFrame
# with (among others) the columns id_text, id_line, field, cf, gw and pos.
words = get_data(projects)

Downloading JSON
Saving http://build-oracc.museum.upenn.edu/json/dcclt.zip as jsonzip/dcclt.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt', max=1.0, style=ProgressStyle(de…


Saving http://build-oracc.museum.upenn.edu/json/dcclt-nineveh.zip as jsonzip/dcclt-nineveh.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/nineveh', max=1.0, style=Progress…


Saving http://build-oracc.museum.upenn.edu/json/dcclt-ebla.zip as jsonzip/dcclt-ebla.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/ebla', max=1.0, style=ProgressSty…


Saving http://build-oracc.museum.upenn.edu/json/dcclt-signlists.zip as jsonzip/dcclt-signlists.zip.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='dcclt/signlists', max=1.0, style=Progre…


Parsing JSON


HBox(children=(FloatProgress(value=0.0, description='dcclt', max=4526.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='dcclt/nineveh', max=664.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='dcclt/ebla', max=156.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='dcclt/signlists', max=136.0, style=ProgressStyle(descript…




## 2 Select Sign Lists
Select the lines that include an entry with field = 'sg'. Format the data to create a proper `id_line` (as string) and to create a `lemma` column.

In [3]:
# Build a line identifier that is unique across texts: the text ID followed by
# the stringified line ID.
# NOTE: this rewrites `id_line` in place, so the cell is not idempotent --
# re-run the cell that creates `words` before running this one a second time.
words['id_line'] = words['id_text'] + words['id_line'].map(str)

# Keep only the words on lines that contain a sign ('sg') entry. The explicit
# .copy() makes `words` an independent DataFrame, so adding the `lemma` column
# below does not trigger SettingWithCopyWarning.
lines = words.loc[words.field == 'sg', 'id_line']
words = words.loc[words.id_line.isin(lines)].copy()

# Lemma in the form cf[gw]pos; words with an empty `cf` get an empty lemma.
# The three columns are walked in parallel instead of doing three
# `words.iloc[i][...]` row look-ups per word, which makes the loop fast enough
# that a progress bar is no longer needed.
words['lemma'] = [f'{cf}[{gw}]{pos}' if cf != '' else ''
                  for cf, gw, pos in zip(words['cf'], words['gw'], words['pos'])]

HBox(children=(FloatProgress(value=0.0, max=92595.0), HTML(value='')))




## 3 Select Relevant Columns

In [4]:
# Narrow the frame to the identifiers, language, written form, lemma and field
# type -- the only columns used in the cells below.
words = words.loc[:, [
    'id_text', 'id_line', 'id_word',
    'lang', 'form', 'lemma', 'field',
]]

## 4 Sign, Gloss, Sign Name, Lemma
Create separate DataFrames for Sign (sg), Sign Name (sn), Equivalencies (eq; equivalencies in Akkadian or other languages), Sumerian word (sv) and Gloss (pr). Merge those DataFrames on `id_line`.

The only DataFrame that may have more than one entry per `id_line` is `eq`. This is the case in some Middle Babylonian exemplars of Sa, where the Sumerian sign is associated with Akkadian, Hurrian, and Ugaritic translations. For that reason, the first merge (between sg and eq) has `how = 'right'`, so that every equivalent keeps its own row.

Each of the DataFrames has the column `id_line` plus, depending on the field, the columns `form`, `lemma`, and/or `lang`. The column `lang` is only relevant for sv (the lemmatized word, usually in Sumerian, represented by the sign) and for eq (the equivalent in another language). In eq the column `lang` is renamed `lang2`. The columns `form` and `lemma` are given relevant names (for instance `sign_name` in sn, or `equiv` and `equiv_lemm` in eq) in each of the DataFrames before merging; only in sg does `form` (the sign itself) keep its name.

In [5]:
# One DataFrame per field type, each keyed on `id_line`. Columns are renamed
# here so that they are already distinct when the frames are merged.

# sg: the sign itself; its `form` column keeps its name.
sg = words.loc[words.field == 'sg', ['id_line', 'form']]

# sn: sign name.
sn = words.loc[words.field == 'sn', ['id_line', 'form']].rename(
    columns={'form': 'sign_name'})

# eq: equivalent; the only frame that also keeps the text and word IDs.
eq = words.loc[
    words.field == 'eq',
    ['id_text', 'id_line', 'id_word', 'lang', 'form', 'lemma'],
].rename(columns={'lang': 'lang2', 'form': 'equiv', 'lemma': 'equiv_lemm'})

# sv: the lemmatized word; `lang` keeps its name.
sv = words.loc[words.field == 'sv', ['id_line', 'lang', 'form', 'lemma']].rename(
    columns={'form': 'form_sux', 'lemma': 'lemma_sux'})

# pr: gloss.
pr = words.loc[words.field == 'pr', ['id_line', 'form']].rename(
    columns={'form': 'gloss'})

In [6]:
# Join everything on `id_line`. The first merge is a right join onto `eq`, so
# the result has one row per equivalent; sign name, lemmatized word and gloss
# are then attached with left joins. Missing values become empty strings.
sign_l = (
    sg.merge(eq, on='id_line', how='right')
      .merge(sn, on='id_line', how='left')
      .merge(sv, on='id_line', how='left')
      .merge(pr, on='id_line', how='left')
      .fillna('')
)
# Keep the columns shown in the search table, in display order.
sign_l = sign_l[[
    'id_word', 'form', 'lang', 'lemma_sux',
    'lang2', 'equiv', 'equiv_lemm', 'gloss', 'sign_name',
]]

## 5 Save Pickled DataFrame for Future Use.
The code in sections 6 and 7 below may be run in a separate notebook (`search_signlist.ipynb`).

In [7]:
# Create the target directory first: to_pickle() does not create missing
# directories and fails if `output/` does not exist yet.
os.makedirs('output', exist_ok=True)
sign_l.to_pickle('output/sign_lines.p')

## 6 Prepare Data for Search

In [8]:
# HTML link template: the first placeholder completes the URL, the second is
# the visible link text. Attributes are separated by whitespace only; the
# comma that used to follow the href value was not valid HTML.
anchor = '<a href="http://oracc.org/dcclt/{}" target="_blank">{}</a>'
# Work on a copy so that `sign_l` keeps the plain word IDs.
t = sign_l.copy()
t['id_word'] = [anchor.format(val, val) for val in t['id_word']]
# Sorted list of the distinct values of `form`, offered as the options of the
# `search` drop-down in the interactive cell.
signs = sorted(set(sign_l['form']))

## 7 Interactive Search

In [9]:
@interact(sort_by = t.columns, rows = (1, len(t), 1), search = signs)
def sort_df(sort_by = "id_word", ascending = False, rows = 25, search = 'A'):
    """Return the entries for one sign as a sorted, styled table.

    sort_by   -- name of the column of `t` to sort on
    ascending -- sort direction
    rows      -- maximum number of rows to return
    search    -- sign to look up; matched exactly against the `form` column

    Returns the `.style` (pandas Styler) of the selected rows.
    """
    matches = t.loc[t.form == search]
    ordered = matches.sort_values(by=sort_by, ascending=ascending)
    # Renumber from 0 so the displayed index reflects the sort order.
    ordered = ordered.reset_index(drop=True)
    # NOTE(review): presumably the Styler is returned so that the HTML links
    # in `id_word` render as clickable anchors -- confirm.
    return ordered.head(rows).style

interactive(children=(Dropdown(description='sort_by', options=('id_word', 'form', 'lang', 'lemma_sux', 'lang2'…