# 3 Overlap in Lexical and Admin Vocabulary

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # this suppresses a warning about pandas from tqdm
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from ipywidgets import interact
from tqdm.auto import tqdm
tqdm.pandas() # initiate pandas support in tqdm, allowing progress_apply() and progress_map()
from nltk.tokenize import MWETokenizer
import zipfile
import json
import os
import sys
# Make the sibling ../utils directory importable before the wildcard import below.
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
# NOTE(review): the wildcard import hides which names come from utils; get_data(),
# called later in this notebook, is presumably one of them -- confirm, and consider
# importing the needed names explicitly.
from utils import *

In [2]:
# Project identifier handed to get_data(). get_data is not imported by name in this
# notebook, so it presumably arrives through `from utils import *` -- TODO confirm.
# The recorded output shows it downloading a JSON zip and then parsing it.
projects = "epsd2/admin/ur3"
words = get_data(projects)

Downloading JSON
Saving http://build-oracc.museum.upenn.edu/json/epsd2-admin-ur3.zip as jsonzip/epsd2-admin-ur3.zip.


HBox(children=(IntProgress(value=1, bar_style='info', description='epsd2/admin/ur3', max=1, style=ProgressStyl…


Parsing JSON


HBox(children=(IntProgress(value=0, description='epsd2/admin/ur3', max=71561, style=ProgressStyle(description_…

epsd2/admin/ur3/P102902 is not available or not complete



In [3]:
# Keep only the rows whose language code contains "sux".
# na=False treats a missing language code as "no match"; without it str.contains
# returns NaN for such rows and the boolean mask raises. regex=False because "sux"
# is a plain literal, not a pattern.
words = words.loc[words["lang"].str.contains("sux", na=False, regex=False)]

In [4]:
def _lemma_for(row):
    """Build the lemma string for one word row.

    Rows with a non-empty citation form ("cf") become "cf[gw]pos"; all other rows
    fall back to the written form with placeholder fields: "form[NA]NA".
    """
    if row["cf"] == '':
        return f"{row['form']}[NA]NA"
    return f"{row['cf']}[{row['gw']}]{row['pos']}"

# Row-wise apply with a tqdm progress bar, then lower-case the whole column
# (so the placeholder ends up as "[na]na").
words["lemma"] = words.progress_apply(_lemma_for, axis=1).str.lower()

HBox(children=(IntProgress(value=0, max=3307520), HTML(value='')))




In [5]:
# One row per (text, line): the lemmas of all words on that line, joined by spaces.
adm_lines = (
    words
    .groupby(["id_text", "id_line"])["lemma"]
    .agg(" ".join)
    .reset_index()
)

In [6]:
# Read the lexical vocabulary (one entry per line) and keep it sorted.
with open('output/lex_vocab.txt', encoding='utf8') as vocab_file:
    lex_vocab = sorted(vocab_file.read().splitlines())

In [7]:
# Vocabulary entries use "_" between the words of a multi-word expression.
# Split every entry on "_" and keep only those with more than one part; these
# tuples are the expressions the MWETokenizer will merge.
lex = [
    parts
    for parts in (tuple(entry.split("_")) for entry in lex_vocab)
    if len(parts) > 1
]
tokenizer = MWETokenizer(lex)

In [8]:
# Split each line into its lemmas, let the tokenizer merge known multi-word
# expressions, and store the re-joined line next to the original.
lemma_list = list(map(str.split, adm_lines["lemma"]))
lemma_mwe = tokenizer.tokenize_sents(lemma_list)
adm_lines["lemma_mwe"] = list(map(' '.join, lemma_mwe))

In [9]:
# Single-word vocabulary: every lemma except the "[na]na" placeholders that mark
# words without a citation form.
adm_words1 = words["lemma"]
adm_words_s1 = set(filter(lambda lemma: '[na]na' not in lemma, adm_words1))
# Vocabulary after multi-word expressions have been merged, again without placeholders.
adm_words2 = [token for line in adm_lines['lemma_mwe'] for token in line.split()]
adm_words_s2 = {lemma for lemma in adm_words2 if '[na]na' not in lemma}
# Combined vocabulary: single words plus merged expressions.
adm_words_s2 |= adm_words_s1
lexical_words_s2 = set(lex_vocab)

In [10]:
# Entries present in both vocabularies, with "_" turned back into spaces, sorted.
adm_lex = sorted(entry.replace('_', ' ') for entry in lexical_words_s2 & adm_words_s2)

In [11]:
# Load the pickled lexical data; later cells read its "id_text" and "lemma" columns.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
lex_lines = pd.read_pickle('output/lexlines.p')

In [12]:
# Text IDs whose lines are relabelled as "dcclt/Q000043" below, so that a later
# groupby on id_text treats all of them as a single document.
# NOTE(review): the variable name suggests these are exemplars of the list "Ura 6"
# folded into its composite text -- confirm.
Ura6 = ["dcclt/P227657",
"dcclt/P227743",
"dcclt/P227791",
"dcclt/P227799",
"dcclt/P227925",
"dcclt/P227927",
"dcclt/P227958",
"dcclt/P227967",
"dcclt/P227979",
"dcclt/P228005",
"dcclt/P228008",
"dcclt/P228200",
"dcclt/P228359",
"dcclt/P228368",
"dcclt/P228488",
"dcclt/P228553",
"dcclt/P228562",
"dcclt/P228663",
"dcclt/P228726",
"dcclt/P228831",
"dcclt/P228928",
"dcclt/P229015",
"dcclt/P229093",
"dcclt/P229119",
"dcclt/P229304",
"dcclt/P229332",
"dcclt/P229350",
"dcclt/P229351",
"dcclt/P229352",
"dcclt/P229353",
"dcclt/P229354",
"dcclt/P229356",
"dcclt/P229357",
"dcclt/P229358",
"dcclt/P229359",
"dcclt/P229360",
"dcclt/P229361",
"dcclt/P229362",
"dcclt/P229365",
"dcclt/P229366",
"dcclt/P229367",
"dcclt/P229890",
"dcclt/P229925",
"dcclt/P230066",
"dcclt/P230208",
"dcclt/P230230",
"dcclt/P230530",
"dcclt/P230586",
"dcclt/P231095",
"dcclt/P231128",
"dcclt/P231424",
"dcclt/P231446",
"dcclt/P231453",
"dcclt/P231458",
"dcclt/P231742",
"dcclt/P266520"]
# Overwrite id_text in place for every row that belongs to one of the texts above.
lex_lines.loc[lex_lines["id_text"].isin(Ura6), "id_text"] = "dcclt/Q000043"

In [13]:
# Count, per lexical line, how often each shared vocabulary entry occurs.
# The preprocessor is the identity and the tokenizer splits on whitespace, so lemmas
# are used exactly as stored. ngram_range=(1, 5) lets vocabulary entries that consist
# of up to five space-separated words match as n-grams.
cv = CountVectorizer(preprocessor = lambda x: x, tokenizer = lambda x: x.split(), vocabulary = adm_lex, ngram_range=(1, 5))
dtm = cv.fit_transform(lex_lines['lemma'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Dense line-by-term matrix indexed by text id. This materializes the full matrix in
# memory, as the original did.
lex_lines_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names, index=lex_lines["id_text"])
# Collapse lines into one row per text. .sum() replaces .agg(sum): passing the builtin
# is deprecated in recent pandas and resolves to the same groupby sum.
lex_comp_dtm = lex_lines_dtm.groupby('id_text').sum().reset_index()

In [14]:
# Number of distinct shared vocabulary entries that occur at least once in each text.
has_match = lex_comp_dtm[adm_lex].astype(bool)
lex_comp_dtm["n_matches"] = has_match.sum(axis=1)
# Keep only the last seven characters of the id (drops the "dcclt/" project prefix).
lex_comp_dtm["id_text"] = lex_comp_dtm["id_text"].str[-7:]

In [15]:
# Load the stored per-text table, discard its old match counts, and attach the
# counts computed in this notebook. The inner join keeps only texts present in both.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
match_counts = lex_comp_dtm[['n_matches', 'id_text']]
lex = pd.read_pickle('output/lexdtm.p').drop(columns='n_matches')
lex = lex.merge(match_counts, on='id_text', how='inner')

In [16]:
# Normalized score: matches divided by the text's "length" column, which comes from
# the pickled table (its unit is not visible here -- TODO confirm).
lex['norm'] = lex['n_matches'] / lex['length']

In [17]:
# HTML link template for a text's Oracc page. The original template had a stray comma
# between the href and target attributes, which is not valid HTML attribute syntax.
anchor = '<a href="http://oracc.org/dcclt/{}" target="_blank">{}</a>'
# Work on a copy so that `lex` keeps the plain ids.
lex2 = lex.copy()
lex2['id_text'] = [anchor.format(val, val) for val in lex['id_text']]
# Ids starting with "Q" are labelled composites; everything else is an exemplar.
# startswith() does not raise on an empty id, unlike indexing with [0].
lex2['PQ'] = ['Composite' if i.startswith('Q') else 'Exemplar' for i in lex['id_text']]

In [18]:
@interact(sort_by = lex2.columns, rows = (1, len(lex2), 1), min_length = (0,500,5), show = ["Exemplars", "Composites", "All"])
def sort_df(sort_by = "norm", ascending = False, rows = 25, min_length = 200, show = 'All'):
    """Display a filtered, sorted slice of ``lex2``.

    Parameters
    ----------
    sort_by : column of ``lex2`` to sort on (any column, including ``PQ``).
    ascending : sort direction.
    rows : maximum number of rows to display.
    min_length : only texts whose ``length`` is at least this value are shown.
    show : "Exemplars", "Composites", or "All".

    Returns
    -------
    A pandas Styler for the selected rows, without the helper column ``PQ``.
    """
    if show == 'All':
        selected = lex2
    else:
        # The widget labels are plural; the PQ column holds the singular form.
        selected = lex2.loc[lex2['PQ'] == show[:-1]]
    selected = selected.loc[selected['length'] >= min_length]
    # Sort before dropping PQ: the dropdown offers every column of lex2, and the
    # original dropped PQ first, so choosing it as the sort key raised a KeyError.
    selected = selected.sort_values(by = sort_by, ascending = ascending)
    selected = selected.drop('PQ', axis = 1).reset_index(drop=True)[:rows]
    return selected.style

interactive(children=(Dropdown(description='sort_by', index=5, options=('id_text', 'designation', 'subgenre', …