# Transition Matrix Generator

## Imports and Settings

In [19]:
from nate_givens_toolkit import cloud_io as cloud
from nate_givens_toolkit import local_io as local
import pandas as pd
from datetime import datetime

## Global Variables

In [20]:
# Directory the selected clean corpus file is read from.
CLEAN_CORPORA_DIR = 'clean_corpora/'
# Directory holding the inventory tables (clean corpora and transition matrices).
DATA_DIR = 'data_files/'
# Directory the generated transition matrix files are written to.
TRANS_MATS_DIR = 'transition_matrices/'
# Bucket name passed to every cloud read/write call in this notebook.
BUCKET = 'lexgen'

## Logic

### Read in Data Tables

#### Clean Corpora

In [21]:
# Load the pipe-delimited clean corpora inventory from DATA_DIR in BUCKET.
# NOTE(review): the result is presumably a pandas DataFrame (it is previewed with .head()) - confirm against cloud_io.
clean_corpora = cloud.read_csv_from_s3('clean_corpora_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [22]:
# Preview the first rows of the clean corpora inventory (notebook display only).
clean_corpora.head()

Unnamed: 0,filename,raw_corpora_filename,last_load_dtime,note
0,en_50k_2018_A.txt,en_50k_2018.txt,2021-04-01 01:07:27.259981,Cleaned version of en_50k_2018 with top 200 wo...
1,de_50k_2018_A.txt,de_50k_2018.txt,2021-04-06 20:25:02.967162,Cleaned version of de_50k_2018 with top 200 wo...
2,af_full_2018_A.txt,af_full_2018.txt,2021-04-07 20:30:13.085269,Cleaned version of af_full_2018 with top 200 w...


#### Transition Matrices

In [23]:
# Load the pipe-delimited transition matrices inventory from DATA_DIR in BUCKET.
# New rows for the matrices generated below are added to this table before it is written back.
trans_mats = cloud.read_csv_from_s3('trans_mats_inventory.dat', DATA_DIR, BUCKET, sep='|')

In [24]:
# Preview the first rows of the transition matrices inventory (notebook display only).
trans_mats.head()

Unnamed: 0,filename,clean_corpus_filename,prefix_len,last_load_dtime
0,en_50k_2018_A-tm1.dat,en_50k_2018_A.txt,1,2021-04-02 01:26:32.184511
1,en_50k_2018_A-tm2.dat,en_50k_2018_A.txt,2,2021-04-02 01:26:32.184511
2,en_50k_2018_A-tm3.dat,en_50k_2018_A.txt,3,2021-04-02 01:26:32.184511
3,en_50k_2018_A-tm4.dat,en_50k_2018_A.txt,4,2021-04-02 01:26:32.184511
4,de_50k_2018_A-tm1.dat,de_50k_2018_A.txt,1,2021-04-07 20:03:20.399818


### Select Clean Corpus

In [25]:
# Name of the clean corpus to process: it is read from CLEAN_CORPORA_DIR and
# its name is used to derive the transition matrix output filenames.
clean_corpus_filename = 'af_full_2018_A.txt'

### Create Output Filenames

In [26]:
# Build the four output filenames '<corpus name without extension>-tm<N>.dat', N = 1..4.
# The extension is looked up once rather than once per generated name.
_corpus_ext = local.get_file_extension(clean_corpus_filename)
if _corpus_ext and clean_corpus_filename.endswith(_corpus_ext):
    # Strip only the trailing extension; str.replace would also remove the same
    # text if it happened to occur earlier in the filename.
    _corpus_stem = clean_corpus_filename[:-len(_corpus_ext)]
else:
    # Helper result is empty or not a suffix: keep the previous replace-based behaviour.
    _corpus_stem = clean_corpus_filename.replace(_corpus_ext, "")
tm_names = [f'{_corpus_stem}-tm{x}.dat' for x in range(1, 5)]

### Process Clean Corpus

In [27]:
# Load the selected clean corpus (pipe-delimited) from CLEAN_CORPORA_DIR in BUCKET.
# The processing loop below reads the first column as the word and the second as its frequency.
# NOTE(review): if the extra keyword arguments are forwarded to pandas.read_csv, words such as
# "null", "NA" or "nan" would be parsed as NaN by default; keep_default_na=False may be needed - confirm.
clean_corpus = cloud.read_csv_from_s3(clean_corpus_filename, CLEAN_CORPORA_DIR, BUCKET, sep='|')

In [28]:
# Preview the first rows of the clean corpus (notebook display only).
clean_corpus.head()

Unnamed: 0,word,freq
0,die,5.77427
1,nie,5.77427
2,ek,5.77427
3,is,5.77427
4,het,5.77427


In [29]:
# Four independent transition matrix dictionaries, one per prefix length (1 to 4 characters).
# Each one is a two-level mapping:
#   outer key   -> the prefix (1 - 4 characters)
#   outer value -> the inner dictionary for that prefix
#   inner key   -> the suffix (1 character)
#   inner value -> the frequency of transitioning to that suffix given the prefix
tms = [dict() for _ in range(4)]

In [30]:
# One dictionary per prefix length, mapping each from_substr to its total frequency.
# These totals are the denominators used later to turn the raw transition
# frequencies into probabilities conditioned on from_substr.
from_substr_freqs = [dict() for _ in range(4)]

In [31]:
# Walk the clean corpus and accumulate, for every prefix length 1-4:
#   tms[t][from_substr][to_char]  -> summed frequency of from_substr being followed by to_char
#   from_substr_freqs[t][from_substr] -> summed frequency of from_substr overall
# where t is the 0-based matrix index (prefix length = t + 1).
for row in clean_corpus.itertuples(index=False):
    # Pad with a space on each side: the leading space is the prefix of the first
    # character and the trailing space is recorded as the word's last transition.
    word = f' {row[0]} '
    frequency = float(row[1])
    for i in range(1, len(word)):
        to_char = word[i]
        # Only prefix lengths that fit before position i are considered: a prefix of
        # length t + 1 needs i > t, so t runs over range(min(i, 4)).
        for t in range(min(i, 4)):
            from_substr = word[i - t - 1:i]

            # Add this word's frequency to the (from_substr -> to_char) transition.
            transitions = tms[t].setdefault(from_substr, {})
            transitions[to_char] = transitions.get(to_char, 0.0) + frequency

            # Add it to the running total for from_substr (the normalization denominator).
            from_substr_freqs[t][from_substr] = from_substr_freqs[t].get(from_substr, 0.0) + frequency

In [32]:
# Convert raw transition frequencies into relative frequencies by dividing every
# entry by the total frequency recorded for its prefix.
for matrix_idx in range(4):
    matrix = tms[matrix_idx]
    for prefix, prefix_total in from_substr_freqs[matrix_idx].items():
        suffix_freqs = matrix[prefix]
        # Values are updated in place; the key set does not change during iteration.
        for suffix in suffix_freqs:
            suffix_freqs[suffix] = suffix_freqs[suffix] / prefix_total

In [33]:
# Flatten each nested transition matrix dictionary into a three-column DataFrame
# with one row per (prefix, suffix) pair.
tm_dfs = []
for matrix_idx in range(4):
    records = [
        (prefix, suffix, rel_freq)
        for prefix, suffix_freqs in tms[matrix_idx].items()
        for suffix, rel_freq in suffix_freqs.items()
    ]
    tm_dfs.append(pd.DataFrame(records, columns=['from_str', 'to_char', 'rel_frequency']))

### Save Transition Matrices to S3

In [34]:
# Upload every transition matrix to TRANS_MATS_DIR in BUCKET, pairing each
# DataFrame with the output filename generated for its prefix length.
for out_name, matrix_df in zip(tm_names, tm_dfs):
    cloud.write_csv_to_s3(out_name, TRANS_MATS_DIR, BUCKET, matrix_df, sep='|', index=False)
    

### Update Transition Matrices Inventory

In [35]:
# Register the freshly written matrices in the inventory: one row per output file,
# all sharing the same load timestamp. prefix_len runs 1..4 in the order of tm_names.
# NOTE(review): datetime.utcnow() is deprecated as of Python 3.12; it is kept here so
# the stored timestamp format stays the same as in the existing inventory rows.
load_dtime = str(datetime.utcnow())
new_rows = pd.DataFrame(
    [
        {
            'filename': tm_name,
            'clean_corpus_filename': clean_corpus_filename,
            'prefix_len': prefix_len,
            'last_load_dtime': load_dtime,
        }
        for prefix_len, tm_name in enumerate(tm_names, start=1)
    ]
)
# Drop any existing rows for these files so that re-running the notebook for the
# same corpus refreshes last_load_dtime instead of adding duplicate entries.
trans_mats = trans_mats[~trans_mats['filename'].isin(tm_names)]
# DataFrame.append was removed in pandas 2.0; build all rows and concatenate once.
trans_mats = pd.concat([trans_mats, new_rows], ignore_index=True)

In [36]:
# Write the updated transition matrices inventory back to DATA_DIR in BUCKET,
# under the same filename it was read from.
cloud.write_csv_to_s3('trans_mats_inventory.dat', DATA_DIR, BUCKET, trans_mats, sep='|', index=False)