# Lexicon Generator - Prototype 1

## Import Statements

In [40]:
from nate_givens_toolkit import cloud_io as cloud
from nate_givens_toolkit import local_io as local
import pandas as pd
import numpy as np
import sys

## Global Variables

In [2]:
CLEAN_CORPORA_DIR = 'clean_corpora/'
DATA_DIR = 'data_files/'
TRANS_MATS_DIR = 'transition_matrices/'
BUCKET = 'lexgen'

## Read in Data Files

### Raw Corpora

In [5]:
raw_corpora = cloud.read_csv_from_s3('raw_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [6]:
raw_corpora

Unnamed: 0,filename,lang_code,source_url,last_load_dtime,note
0,en_full_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 02:04:07.270824,HermitDave's version of the full 2018 English ...
1,en_50k_2018.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.589244,HermitDave's version of the top 50k 2018 Engli...
2,en_full_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:53.984856,HermitDave's version of the full 2016 English ...
3,en_50k_2016.txt,en,https://raw.githubusercontent.com/hermitdave/F...,2021-03-26 03:25:54.377632,HermitDave's version of the top 50k 2016 Engli...
4,de_full_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.967039,HermitDave's version of the full 2018 German f...
5,de_50k_2018.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.970505,HermitDave's version of the top 50k 2018 Germa...
6,de_full_2016.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.974519,HermitDave's version of the full 2016 English ...
7,de_50k_2016.txt,de,https://raw.githubusercontent.com/hermitdave/F...,2021-04-06 19:42:26.978002,HermitDave's version of the top 50k 2016 Engli...
8,af_full_2018.txt,af,https://raw.githubusercontent.com/hermitdave/F...,2021-04-07 20:09:57.721287,HermitDave's version of the full 2018 Afrikaan...
9,af_full_2016.txt,af,https://raw.githubusercontent.com/hermitdave/F...,2021-04-07 20:13:55.305772,HermitDave's version of the full 2016 Afrikaan...


### Clean Corpora

In [3]:
clean_corpora = cloud.read_csv_from_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [4]:
clean_corpora

Unnamed: 0,filename,raw_corpora_filename,last_load_dtime,note
0,en_50k_2018_A.txt,en_50k_2018.txt,2021-04-01 01:07:27.259981,Cleaned version of en_50k_2018 with top 200 wo...
1,de_50k_2018_A.txt,de_50k_2018.txt,2021-04-06 20:25:02.967162,Cleaned version of de_50k_2018 with top 200 wo...
2,af_full_2018_A.txt,af_full_2018.txt,2021-04-07 20:30:13.085269,Cleaned version of af_full_2018 with top 200 w...


### Transition Matrices

In [7]:
trans_mats = cloud.read_csv_from_s3('trans_mats_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [8]:
trans_mats

Unnamed: 0,filename,clean_corpus_filename,prefix_len,last_load_dtime
0,en_50k_2018_A-tm1.dat,en_50k_2018_A.txt,1,2021-04-02 01:26:32.184511
1,en_50k_2018_A-tm2.dat,en_50k_2018_A.txt,2,2021-04-02 01:26:32.184511
2,en_50k_2018_A-tm3.dat,en_50k_2018_A.txt,3,2021-04-02 01:26:32.184511
3,en_50k_2018_A-tm4.dat,en_50k_2018_A.txt,4,2021-04-02 01:26:32.184511
4,de_50k_2018_A-tm1.dat,de_50k_2018_A.txt,1,2021-04-07 20:03:20.399818
5,de_50k_2018_A-tm2.dat,de_50k_2018_A.txt,2,2021-04-07 20:03:20.399818
6,de_50k_2018_A-tm3.dat,de_50k_2018_A.txt,3,2021-04-07 20:03:20.399818
7,de_50k_2018_A-tm4.dat,de_50k_2018_A.txt,4,2021-04-07 20:03:20.399818
8,af_full_2018_A-tm1.dat,af_full_2018_A.txt,1,2021-04-07 20:31:26.257230
9,af_full_2018_A-tm2.dat,af_full_2018_A.txt,2,2021-04-07 20:31:26.257230


## Logic

## Select Corpora

For this prototype, we'll pick 2, assign weighs, and then check to make sure they exist

In [83]:
# enter the corpora
selected_corpora = [{
    'filename' : 'en_50k_2018_A.txt' 
    , 'weight' : 60
    }
    ,{
    'filename' : 'de_50k_2018_A.txt'
    , 'weight' : 40
    }
]

In [85]:
# normalize the weights 
total_weight = 0.0
for corpus in selected_corpora:
    total_weight += corpus['weight']
for corpus in selected_corpora:
    corpus['weight'] = corpus['weight'] / total_weight

### Merge Transition Matrices

In [86]:
# read in all the transition matrices
original_trans_mats = {
    'prefix_1':[]
    ,'prefix_2':[]
    ,'prefix_3':[]
    ,'prefix_4':[]
}

for corpus in selected_corpora:     
    for key, tm in zip(original_trans_mats.keys(), [f'{corpus["filename"].replace(local.get_file_extension(corpus["filename"]), "")}-tm{x}.dat' for x in range(1,5)]):
        original_trans_mats[key].append(cloud.read_csv_from_s3(tm, TRANS_MATS_DIR, BUCKET, sep='|'))

In [93]:
original_trans_mats['prefix_1'][0].head()

Unnamed: 0,from_str,to_char,rel_frequency,weight
0,,y,0.008568,0.6
1,,i,0.019913,0.6
2,,t,0.0593,0.6
3,,a,0.057908,0.6
4,,o,0.019873,0.6


In [104]:
# merge the dataframes using the provided weights
new_tms = {}

for prefix_key in original_trans_mats.keys():
    for corpus, df in zip(selected_corpora, original_trans_mats[prefix_key]):
        df['weight'] = float(corpus['weight'])
    new_tm = pd.concat(original_trans_mats[prefix_key])
    new_tm['new_weight'] = new_tm['rel_frequency'] * new_tm['weight']
    new_tm = new_tm.drop(['rel_frequency', 'weight'], axis=1)
    new_tm.groupby(['from_str', 'to_char']).sum().reset_index()
    new_tms[prefix_key] = new_tm

In [121]:
# renormalize the new transition matrices

for prefix_key in new_tms.keys():
    grouped_df = new_tms['prefix_1'].groupby(['from_str']).sum().reset_index()
    imbalanced_from_str_list = zip(grouped_df['from_str'].tolist(), grouped_df['new_weight'].tolist())
    for from_str, weight in imbalanced_from_str_list:
        