# For Creating Output Only
The following code is used to create Markdown tables from Pandas DataFrames. The tables can be included in the Compass Markdown files.

In [None]:
from tabulate import tabulate
import pandas as pd

# Tables for 3_2 (lit/lex comparison)

In [None]:
# Load the pickled DataFrame stored at output/lit.p; the cells below read its
# length, id_text, ttr, norm and mtld columns.
lit_df2 = pd.read_pickle('output/lit.p')

In [None]:
# Build the table of literary texts: keep the rows of lit_df2 whose `length`
# is at least `min_length`, turn each `id_text` into a Markdown link, and
# round the ttr, norm and mtld columns to three decimals.
min_length = 200
markdown = "[{}](http://oracc.org/epsd2/literary/{})"
# Filter once and copy: the links are derived from the rows that were actually
# kept (no second, duplicated filter on lit_df2 that must be kept in sync),
# and assigning a column to an explicit copy avoids pandas'
# SettingWithCopyWarning.
lit_tab = lit_df2.loc[lit_df2['length'] >= min_length].copy()
lit_tab['id_text'] = [markdown.format(val, val) for val in lit_tab['id_text']]
lit_tab = lit_tab.round({'ttr': 3, 'norm': 3, 'mtld': 3})

In [None]:
# Export settings for the literary table.
rows = 10  # number of rows to be exported
col = 'norm'  # column by which to sort
asc = True
# Sort lit_tab by `col`, keep the first `rows` rows and render them with
# tabulate's "github" table format (index column omitted).
top_rows = lit_tab.sort_values(by=col, ascending=asc).head(rows)
tab = tabulate(
    top_rows,
    headers=lit_tab.columns,
    tablefmt="github",
    showindex=False,
)
# Write the rendered table to disk as UTF-8 text.
with open('output/lit_tab.txt', 'w', encoding='utf8') as out_file:
    out_file.write(tab)

# Tables for 3_3 (lex/lit comparison)

In [None]:
# Load the pickled DataFrame stored at output/lex.p; the next cells read its
# id_text, length and norm columns.
lex2 = pd.read_pickle('output/lex.p')

In [None]:
# Build the table of lexical texts: keep the composite texts of lex2 whose
# `length` is at least `min_length` and turn each `id_text` into a Markdown
# link.
min_length = 250
markdown = "[{}](http://oracc.org/dcclt/{})"
# Label every row of lex2 by the first character of its id_text
# ('Q' -> Composite, anything else -> Exemplar). startswith() is used instead
# of i[0] so that an empty id does not raise IndexError.
lex2['PQ'] = ['Composite' if i.startswith('Q') else 'Exemplar' for i in lex2['id_text']]
# Apply both row filters in one step and copy: the links are then derived from
# the rows that were actually kept (no duplicated filter on lex2), and
# assigning a column to an explicit copy avoids pandas' SettingWithCopyWarning.
keep = (lex2['length'] >= min_length) & (lex2['PQ'] == 'Composite')
lex_tab = lex2.loc[keep].copy()
lex_tab['id_text'] = [markdown.format(val, val) for val in lex_tab['id_text']]
# The helper column is only needed for filtering, not in the exported table.
lex_tab = lex_tab.drop('PQ', axis=1)

In [None]:
# Export settings for the lexical table.
rows = 10  # number of rows to be exported
col = 'norm'  # column by which to sort
asc = False
# Sort lex_tab by `col` (descending), keep the first `rows` rows and render
# them with tabulate's "github" table format (index column omitted).
top_rows = lex_tab.sort_values(by=col, ascending=asc).head(rows)
tab = tabulate(
    top_rows,
    headers=lex_tab.columns,
    tablefmt="github",
    showindex=False,
)
# Write the rendered table to disk as UTF-8 text.
with open('output/lex_tab.txt', 'w', encoding='utf8') as out_file:
    out_file.write(tab)

# Tables for 3_4 (lex/adm comparison)

In [None]:
# Rebind lex2 to the pickled DataFrame stored at output/adm_lex.p; the next
# cells read its id_text, length and norm columns.
lex2 = pd.read_pickle('output/adm_lex.p')

In [None]:
# Build the table for the lex/adm comparison: keep the composite texts of lex2
# whose `length` is at least `min_length` and turn each `id_text` into a
# Markdown link.
min_length = 250
markdown = "[{}](http://oracc.org/dcclt/{})"
# Label every row of lex2 by the first character of its id_text
# ('Q' -> Composite, anything else -> Exemplar). startswith() is used instead
# of i[0] so that an empty id does not raise IndexError.
lex2['PQ'] = ['Composite' if i.startswith('Q') else 'Exemplar' for i in lex2['id_text']]
# Apply both row filters in one step and copy: the links are then derived from
# the rows that were actually kept (no duplicated filter on lex2), and
# assigning a column to an explicit copy avoids pandas' SettingWithCopyWarning.
keep = (lex2['length'] >= min_length) & (lex2['PQ'] == 'Composite')
lex_tab = lex2.loc[keep].copy()
lex_tab['id_text'] = [markdown.format(val, val) for val in lex_tab['id_text']]
# The helper column is only needed for filtering, not in the exported table.
lex_tab = lex_tab.drop('PQ', axis=1)

In [None]:
# Export settings for the lex/adm table.
rows = 10  # number of rows to be exported
col = 'norm'  # column by which to sort
asc = False
# Sort lex_tab by `col` (descending), keep the first `rows` rows and render
# them with tabulate's "github" table format (index column omitted).
top_rows = lex_tab.sort_values(by=col, ascending=asc).head(rows)
tab = tabulate(
    top_rows,
    headers=lex_tab.columns,
    tablefmt="github",
    showindex=False,
)
# Write the rendered table to disk as UTF-8 text.
with open('output/lex_adm_tab.txt', 'w', encoding='utf8') as out_file:
    out_file.write(tab)

# Testing
Add one word at a time and see how that influences ttr. Does ttr arrive at a plateau?

In [None]:
# For every text in lit_comp, compute the `ttr` reported by `lr` for growing
# prefixes of its lemma_mwe value and plot one curve per text, to see whether
# the value levels off.
# NOTE(review): `plt`, `lr` and `lit_comp` are not defined in the visible
# cells - presumably matplotlib.pyplot, a lexical-richness class and a
# DataFrame created elsewhere; confirm they exist before running this cell.
plt.figure(figsize=(20,10))
# `text_id` rather than `id`: at cell level the loop variable is a global and
# would shadow the builtin id() for the rest of the session.
for text_id in lit_comp['id_text']:
    c = lit_comp.loc[lit_comp['id_text'] == text_id, 'lemma_mwe']
    c = c.iloc[0]  # lemma_mwe value of the first row with this id

    ttr_l = []
    # NOTE(review): range(1, len(c)) stops one short of the full sequence, so
    # the longest prefix measured is c[:-1] - confirm whether the complete
    # text should be included as the final point.
    enum = range(1, len(c))
    for ind in enum:
        t = c[:ind]
        ttr = lr(t).ttr
        ttr_l.append(ttr)
    plt.plot(enum, ttr_l)
plt.show()

In [None]:
# Rows of lit_df2 whose mtld lies strictly between 86 and 162.6, ordered by
# ascending mtld.
mtld_values = lit_df2['mtld']
lit_df2.loc[(mtld_values > 86) & (mtld_values < 162.6)].sort_values(by='mtld')

The following needs to be redone with Q numbers.

In [None]:
# Lookup tables from text id to a numeric group code:
# tetrad -> 1, decad -> 2, houseF -> 3, proverbs -> 4.
tetrad = dict.fromkeys(
    ('c.2.5.8.1', 'c.2.5.3.2', 'c.2.5.5.2', 'c.4.16.1'),
    1,
)
decad = dict.fromkeys(
    (
        'c.2.4.2.01', 'c.2.5.5.1', 'c.5.5.4', 'c.4.07.2', 'c.4.05.1',
        'c.4.80.2', 'c.1.1.4', 'c.1.3.2', 'c.4.28.1', 'c.1.8.1.5',
    ),
    2,
)
houseF = dict.fromkeys(
    (
        'c.5.1.2', 'c.5.1.3', 'c.1.8.1.4', 'c.1.6.2', 'c.2.1.5',
        'c.2.4.2.02', 'c.2.2.2', 'c.5.6.1', 'c.5.1.1', 'c.5.3.2',
        'c.1.4.3', 'c.5.6.3', 'c.5.4.1', 'c.5.3.1',
    ),
    3,
)
proverbs = dict.fromkeys(
    (
        'c.6.1.01', 'c.6.1.02', 'c.6.1.03', 'c.6.1.04', 'c.6.1.05',
        'c.6.1.06', 'c.6.1.07', 'c.6.1.08', 'c.6.1.09', 'c.6.1.10',
        'c.6.1.11', 'c.6.1.12', 'c.6.1.13', 'c.6.1.14', 'c.6.1.15',
        'c.6.1.16', 'c.6.1.17', 'c.6.1.18', 'c.6.1.19', 'c.6.1.20',
        'c.6.1.21', 'c.6.1.22', 'c.6.1.23', 'c.6.1.24', 'c.6.1.25',
        'c.6.1.26', 'c.6.1.27', 'c.6.1.28', 'c.6.2.1', 'c.6.2.2',
        'c.6.2.3', 'c.6.2.4', 'c.6.2.5',
    ),
    4,
)

In [None]:
# Combine the four lookup dictionaries (tetrad, decad, houseF, proverbs) into
# one; later dictionaries win if a key occurs more than once.
educL = {**tetrad, **decad, **houseF, **proverbs}
# Keep the rows of lit_df2 whose id_text is a key of educL, ordered by norm.
educ = lit_df2.loc[lit_df2['id_text'].isin(educL)].sort_values(by='norm')
educ

In [None]:
# Add the group code stored in educL for each text as a new `category` column.
educ['category'] = [educL[text_id] for text_id in educ['id_text']]

In [None]:
# Display educ ordered by ascending mtld (educ itself is not modified).
educ.sort_values(by = 'mtld')

In [None]:
# Summary statistics of the norm column of etcsl.
# NOTE(review): `etcsl` is not defined in the visible cells - confirm where it
# is created.
etcsl.norm.describe()

In [None]:
# Rows of etcsl whose norm equals 0.874 after rounding to three decimals,
# returned as a Styler for display.
rounded_norm = etcsl['norm'].round(3)
etcsl.loc[rounded_norm == 0.874].style

In [None]:
import seaborn as sns
# Scatter plot of norm (x) against mtld (y) for the texts in educ, coloured by
# `category` and with marker size scaled by `length`.
# x and y are passed by keyword: from seaborn 0.12 on only `data` may be given
# positionally, so the positional form raises TypeError.
sns.scatterplot(x='norm', y='mtld', data=educ, hue='category',
                size='length', sizes=(50, 200), alpha=0.75)

In [None]:
# Correlation between the mtld and norm columns of lit_df2 (Series.corr with
# its default method).
lit_df2["mtld"].corr(lit_df2['norm'])

In [None]:
# Scatter plot of mtld (x) against norm (y) for the rows of lit_df2 with norm
# above 0.5.
lit_df2.loc[lit_df2.norm > .5].plot.scatter(x = 'mtld', y = 'norm')

In [None]:
# Look up the group code stored in educL for text c.2.5.5.2.
educL['c.2.5.5.2']

In [None]:
# Correlation between norm and mtld, restricted to the rows of lit_df2 with
# norm above 0.6 (kept as an independent copy in test_df).
high_norm = lit_df2['norm'] > .6
test_df = lit_df2.loc[high_norm].copy()
test_df['norm'].corr(test_df['mtld'])

In [None]:
# Rows of lit_df2 with mtld above 200.
lit_df2.loc[lit_df2.mtld > 200]

In [None]:
# Rows of etcsl_df2 whose id_text starts with 'c.2'.
# NOTE(review): `etcsl_df2` is not defined in the visible cells - confirm
# where it is created.
hymns = etcsl_df2.loc[etcsl_df2.id_text.str.startswith('c.2')]

In [None]:
# Display hymns ordered by id_text (hymns itself is not modified).
hymns.sort_values(by = 'id_text')

In [None]:
# Mean norm per group, where rows of etcsl_df2 are grouped by the first five
# characters of their id_text.
id_prefix = etcsl_df2['id_text'].str[:5]
genres = etcsl_df2.groupby(id_prefix).agg({'norm': 'mean'})

In [None]:
# Plot genres with the default settings of DataFrame.plot().
genres.plot()

In [None]:
# Summary statistics for six selected columns of etcsl.
# NOTE(review): `etcsl` is not defined in the visible cells - confirm where it
# is created.
etcsl[['mtld', 'length', 'norm', 'lex_var', 'ttr', 'n_matches']].describe()

In [None]:
# Distinct items in the lemma_mwe value of the row at position 13 of
# etcsl_comp.
# NOTE(review): `etcsl_comp` is not defined in the visible cells; presumably
# lemma_mwe holds a sequence of lemmas per text - confirm.
DD = set(etcsl_comp.lemma_mwe.iloc[13])

In [None]:
# Number of items of DD that do not occur in lex_vocab.
len(DD.difference(lex_vocab))

In [None]:
# Number of distinct items in DD.
len(DD)

In [None]:
# Display the row at position 13 of etcsl_comp.
etcsl_comp.iloc[13]

In [None]:
# Row(s) of etcsl_df2 whose id_text is 'c.1.4.3'.
etcsl_df2.loc[etcsl_df2.id_text == 'c.1.4.3']

In [None]:
# Median of the norm column of etcsl.
etcsl.norm.median()

> # Some thoughts

> * Step 1. Measure length of lemma_mwe in etcsl_comp and remove rows with len < 200.
> * Step 2. Create DTM (see below) of etcsl_comp, binary = True and vocabulary = lemma_mwe from lex (use lex_lines)
> * Step 3. Order compositions by highest match
> * Step 4. Normalize for text length (from Step 1)
> * Step 5. Same process for individual lex texts (which has highest match for Ura 4?)
> * Step 6. TF-IDF

> In a future iteration: do *not* select among lexical texts - let the script figure out which lex compositions are most relevant.

> Perhaps: make DTM first - show that DTM.shape gives same numbers for lex vocabulary as second Venn diagram above. Remove all columns where sum == 0. Show that DTM.shape now gives total of overlap as in Venn diagram above. Then remove rows <= minimum. Tricky!