# 3 Overlap in Lexical and Admin Vocabulary

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # this suppresses a warning about pandas from tqdm
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from ipywidgets import interact
from tqdm.auto import tqdm
tqdm.pandas() # initiate pandas support in tqdm, allowing progress_apply() and progress_map()
from nltk.tokenize import MWETokenizer
import zipfile
import json
import os
import sys
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *

In [None]:
# Project path handed to the get_data() helper (imported from utils).
# NOTE(review): presumably an ORACC project identifier (Ur III administrative
# texts) -- confirm against utils.get_data.
projects = "epsd2/admin/ur3"
# The cells below treat `words` as a DataFrame with (at least) the columns
# lang, cf, gw, pos, form, id_text and id_line.
words = get_data(projects)

In [None]:
# Keep only the rows whose "lang" value contains "sux".
# na=False makes a missing language code count as a non-match instead of
# putting NaN into the boolean mask; .copy() yields an independent frame so
# that the "lemma" column added further down does not raise
# SettingWithCopyWarning.
words = words.loc[words["lang"].str.contains("sux", na=False)].copy()

In [None]:
# Build one lemma string per word: "cf[gw]pos" when the citation form (cf) is
# non-empty, otherwise the raw form tagged "[NA]NA". Everything is lowercased
# afterwards, so unlemmatized words end in "[na]na".
words["lemma"] = words.progress_apply(
    lambda row: f"{row['form']}[NA]NA" if row["cf"] == ''
    else f"{row['cf']}[{row['gw']}]{row['pos']}",
    axis=1,
)
words["lemma"] = words["lemma"].str.lower()

In [None]:
# Collapse the word-level table to one row per (text, line): the lemmas of a
# line are concatenated into a single space-separated string.
adm_lines = (
    words.groupby(['id_text', 'id_line'])['lemma']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# Read the lexical vocabulary (one entry per line) and keep it sorted.
with open('output/lex_vocab.txt', encoding='utf8') as r:
    lex_vocab = sorted(r.read().splitlines())

In [None]:
# Multi-word vocabulary entries join their parts with "_". Only those entries
# (the ones that actually contain an underscore) are turned into tuples and
# registered with the multi-word-expression tokenizer.
lex = [tuple(item.split("_")) for item in lex_vocab if "_" in item]
tokenizer = MWETokenizer(lex)

In [None]:
# Split every administrative line into its lemmas, let the MWE tokenizer
# process the token lists, and store the result as a space-separated string
# in a new "lemma_mwe" column.
lemma_list = list(map(str.split, adm_lines["lemma"]))
lemma_mwe = tokenizer.tokenize_sents(lemma_list)
adm_lines["lemma_mwe"] = list(map(' '.join, lemma_mwe))

In [None]:
# Flatten all administrative lines into one token list, then build the two
# vocabularies as sets: the administrative one (dropping unlemmatized tokens,
# which contain "[na]na") and the lexical one.
adm_words2 = [token for line in adm_lines['lemma_mwe'] for token in line.split()]
adm_words_s2 = {lemma for lemma in adm_words2 if '[na]na' not in lemma}
lexical_words_s2 = set(lex_vocab)

In [None]:
# Entries shared by the lexical and administrative vocabularies, with the
# "_" connectors replaced by spaces, in sorted order.
adm_lex = sorted(
    item.replace('_', ' ') for item in lexical_words_s2 & adm_words_s2
)

In [None]:
# Load the pickled line-level lexical data; later cells use its "id_text" and
# "lemma" columns.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# comes from a trusted source.
lex_lines = pd.read_pickle('output/lexlines.p')

In [None]:
# Text IDs whose lines are re-labelled below so that they are counted under
# the single ID "dcclt/Q000043".
# NOTE(review): the name suggests these are exemplars of the composition
# "Ura 6" -- confirm against the DCCLT catalogue.
Ura6 = ["dcclt/P227657",
"dcclt/P227743",
"dcclt/P227791",
"dcclt/P227799",
"dcclt/P227925",
"dcclt/P227927",
"dcclt/P227958",
"dcclt/P227967",
"dcclt/P227979",
"dcclt/P228005",
"dcclt/P228008",
"dcclt/P228200",
"dcclt/P228359",
"dcclt/P228368",
"dcclt/P228488",
"dcclt/P228553",
"dcclt/P228562",
"dcclt/P228663",
"dcclt/P228726",
"dcclt/P228831",
"dcclt/P228928",
"dcclt/P229015",
"dcclt/P229093",
"dcclt/P229119",
"dcclt/P229304",
"dcclt/P229332",
"dcclt/P229350",
"dcclt/P229351",
"dcclt/P229352",
"dcclt/P229353",
"dcclt/P229354",
"dcclt/P229356",
"dcclt/P229357",
"dcclt/P229358",
"dcclt/P229359",
"dcclt/P229360",
"dcclt/P229361",
"dcclt/P229362",
"dcclt/P229365",
"dcclt/P229366",
"dcclt/P229367",
"dcclt/P229890",
"dcclt/P229925",
"dcclt/P230066",
"dcclt/P230208",
"dcclt/P230230",
"dcclt/P230530",
"dcclt/P230586",
"dcclt/P231095",
"dcclt/P231128",
"dcclt/P231424",
"dcclt/P231446",
"dcclt/P231453",
"dcclt/P231458",
"dcclt/P231742",
"dcclt/P266520"]
# Overwrite id_text for every line that belongs to one of the texts above, so
# that a later groupby on id_text treats them as one text.
lex_lines.loc[lex_lines["id_text"].isin(Ura6), "id_text"] = "dcclt/Q000043"

In [None]:
#lex_comp = lex_lines.groupby(
#    [lex_lines["id_text"]]).aggregate(
#    {"lemma": ' '.join}).reset_index()

In [None]:
# Count, for every lexical line, how often each shared vocabulary entry
# (adm_lex) occurs. Lines are split on whitespace and n-grams of up to five
# tokens are considered, so multi-word entries can match as well.
cv = CountVectorizer(preprocessor = lambda x: x, tokenizer = lambda x: x.split(), vocabulary = adm_lex, ngram_range=(1, 5))
dtm = cv.fit_transform(lex_lines['lemma'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:
    feature_names = cv.get_feature_names()
lex_lines_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names, index=lex_lines["id_text"])
# Add the line-level counts up per text. .sum() replaces .agg(sum): passing
# the builtin sum to agg is deprecated in recent pandas releases.
lex_comp_dtm = lex_lines_dtm.groupby('id_text').sum().reset_index()

In [None]:
# Number of distinct shared vocabulary entries per text: an entry counts once
# as soon as its frequency is non-zero.
lex_comp_dtm["n_matches"] = lex_comp_dtm[adm_lex].ne(0).sum(axis=1)

In [None]:
# One row per text: all lemmas of a text joined into a single
# space-separated string.
lex_comp = lex_lines.groupby("id_text", as_index=False).agg({"lemma": ' '.join})

In [None]:
def lex_length(lemmas):
    """Return the number of lemmatized tokens in a whitespace-separated string.

    Tokens containing '[na]na' (unlemmatized words) are not counted.
    """
    return sum(1 for token in lemmas.split() if '[na]na' not in token)

In [None]:
# Length of each text, measured in lemmatized tokens (see lex_length).
lex_comp['length'] = lex_comp['lemma'].apply(lex_length)

In [None]:
# Attach the text lengths, reduce every id_text to its last seven characters,
# and, where that produces duplicates, keep only the longest version. Texts
# without any lemmatized content (length 0) are dropped at the end.
lex_comp_dtm = (
    lex_comp_dtm
    .merge(lex_comp[['id_text', 'length']], on='id_text', how='inner')
    .assign(id_text=lambda df: df['id_text'].str[-7:])
    .sort_values(by='length', ascending=False)
    .drop_duplicates(subset='id_text', keep='first')
    .loc[lambda df: df['length'] > 0]
)

In [None]:
# Collect the catalogue entries ("members") of each project's catalogue.json
# into one dictionary; entries from later projects overwrite identical keys
# from earlier ones.
cat = {}
for proj in ['dcclt', 'dcclt/signlists', 'dcclt/nineveh', 'dcclt/ebla']:
    f = proj.replace('/', '-')
    file = f"jsonzip/{f}.zip" # The ZIP file was downloaded in notebook 3_1
    # The context manager closes the archive again; previously every ZipFile
    # handle was left open.
    with zipfile.ZipFile(file) as z:
        st = z.read(f"{proj}/catalogue.json").decode("utf-8")
    j = json.loads(st)
    cat.update(j["members"])
# Transpose so that every catalogue member becomes one row.
cat_df = pd.DataFrame(cat).T
# Entries without an id_text fall back to their id_composite.
cat_df["id_text"] = cat_df["id_text"].fillna(cat_df["id_composite"])
cat_df = cat_df.fillna('')
cat_df = cat_df[["id_text", "designation", "subgenre"]]

In [None]:
# Combine the catalogue data with the match counts and lengths, keeping only
# texts present in both tables, and normalize the number of matches by the
# length of the text.
lex = cat_df.merge(
    lex_comp_dtm[['id_text', 'n_matches', 'length']],
    on='id_text',
    how='inner',
)
lex['norm'] = lex['n_matches'].div(lex['length'])

In [None]:
# HTML link template for a text ID. Attributes are separated by whitespace
# only; the stray comma that used to follow the href value was invalid HTML.
anchor = '<a href="http://oracc.org/dcclt/{}" target="_blank">{}</a>'
# Display copy of the table: id_text becomes a clickable link and a "PQ"
# column records whether the ID starts with "Q" (composite) or not (exemplar).
lex2 = lex.copy()
lex2['id_text'] = [anchor.format(val, val) for val in lex['id_text']]
# startswith() also copes with an empty ID, where indexing [0] would raise.
lex2['PQ'] = ['Composite' if i.startswith('Q') else 'Exemplar' for i in lex['id_text']]

In [None]:
@interact(sort_by = lex2.columns, rows = (1, len(lex2), 1), min_length = (0,500,5), show = ["Exemplars", "Composites", "All"])
def sort_df(sort_by = "norm", ascending = False, rows = 25, min_length = 250, show = 'All'):
    """Return a styled, filtered and sorted view of ``lex2``.

    Parameters
    ----------
    sort_by : column of ``lex2`` to sort on.
    ascending : sort direction.
    rows : maximum number of rows to show.
    min_length : only texts whose ``length`` is at least this value are shown.
    show : "Exemplars", "Composites" or "All"; selects rows through the
        ``PQ`` column.

    Returns
    -------
    pandas Styler for the selected rows, without the ``PQ`` column.
    """
    if show == 'All':
        table = lex2
    else:
        # Widget labels are plural, the values in the PQ column singular.
        table = lex2.loc[lex2['PQ'] == show[:-1]]
    table = table.loc[table['length'] >= min_length]
    # Sort before dropping the helper column: 'PQ' is one of the sort_by
    # options, and sorting after the drop raised a KeyError for it.
    table = table.sort_values(by = sort_by, ascending = ascending)
    table = table.drop('PQ', axis = 1).reset_index(drop=True)
    return table[:rows].style