# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [1]:
# standard libraries - installed by default
import collections
import datetime
import pickle
import sqlite3
import string
import os
import timeit

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_file_db_utils import *
from part_00_process_functions import *

### set input and output paths

In [4]:
# base file path
base_file_path = '/project/finding_anagrams'

In [5]:
# input path
in_file_path = 'data'
in_file_path = os.path.join(base_file_path, in_file_path)

In [6]:
# output db path and name
db_path = 'db'
db_path = os.path.join(base_file_path, db_path)

In [7]:
if os.path.exists(db_path):
    pass
else:
    os.makedirs(db_path)

In [8]:
db_name = 'words.db'

### process control flags

In [9]:
# Use numpy to perform matrix opertions and determine from/to and exact anagram relationships
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

matrix_extraction_option = 4

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices
# 3 seems to be the sweet spot
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams
write_data = False

# set to None to include all letters
# test with a subset of letters by setting the letter_subset_list to ['q', 'x'] or 
# a different set of letters
letter_subset_list = ['x']
#letter_subset_list = None

# generate a sample dataset of ten words that start with each letter

# testo = wg_df.groupby(['first_letter']).sample(n = 10, random_state = 123)


In [10]:
# start a timer to record the entire operation
total_time_start = datetime.datetime.now()

### load input data

In [11]:
# load the word_df, the words from Part 1
sql = 'select * from words;'
word_df = query_db(sql = sql, db_path = db_path, db_name = db_name)

...query execution took: 1.770999 seconds...


In [12]:
word_df.shape

(234370, 8)

In [13]:
# extract the column of word ids as a numpy array
word_id_list = word_df['word_id'].to_numpy(dtype = int)    

In [14]:
# create a dataframe with the letters sorted by the frequency of words that
# start with a particular letter
agg_word_df = word_df['first_letter'].groupby(word_df['first_letter']).agg(np.size).to_frame()

# set column names
agg_word_df.columns = ['word_count']

# reset the index to rename columns
agg_word_df = agg_word_df.reset_index()

# sort the dataframe by frequency
agg_word_df = agg_word_df.sort_values(by='word_count')

In [15]:
# extract the letters sorted by word frequency
sorted_first_letters = agg_word_df['first_letter'].tolist()

In [16]:
# load the letter dictionary from part 1
in_file_name = 'letter_dict.pkl'
letter_dict = load_pickle(in_file_path = in_file_path, in_file_name=in_file_name)

In [17]:
# load the word dictionary from part 1
in_file_name = 'word_dict.pkl'
word_dict = load_pickle(in_file_path = in_file_path, in_file_name=in_file_name)

In [18]:
# load the char matrix from part 1
in_file_name = 'char_matrix.npy'
ipn = os.path.join(in_file_path, in_file_name)
char_matrix = np.load(file = ipn)

### create the word group df: wg_df

In [19]:
# drop duplicates based on the word group. 
# by default, this will only keep the first record and it will drop all others
wg_df = word_df.drop_duplicates(subset = ['word_group_id']).copy()

In [20]:
wg_df = wg_df.sort_values(by = 'word_id')

In [21]:
# unique word groups
wg_df.shape

(215842, 8)

In [22]:
# get the word group ids
word_group_id_list = wg_df['word_group_id'].to_numpy()
# and the associated word_id
word_id_list = wg_df['word_id'].to_numpy()

In [23]:
# trim the char matrix by word id
# and not the word_group id
wchar_matrix = char_matrix[word_id_list, :]

In [24]:
# i don't use these objects, but i can't delete them?
# build a word_id to word_group_id dictionary
word_id_wg_id_dict = dict()
# and a word_group_id to word_id dictionary
wg_id_word_id_dict = dict()

for word_id, wg_id in zip(wg_df['word_id'], wg_df['word_group_id']):
    word_id_wg_id_dict[word_id] = wg_id
    wg_id_word_id_dict[wg_id] = word_id

In [25]:
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str[:n_subset_letters]

In [26]:
# there will be three parts to this function
# The first part does a selection by a single character
# The second part does the selection based on the newly created subselection

# peforming selections on a dataframe is slow.
# Especially so since we are comparing characters
    
# now, do it again, but this time use the dictionary
# by word length and n least common letters
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str[:n_subset_letters]
nc_ls_df = wg_df[['n_chars', 'letter_selector']].drop_duplicates()

print('...creating', nc_ls_df.shape[0], 'sets of ids')

...creating 16101 sets of ids


In [27]:
n_char_matrix_dict, single_letter_matrix_dict, letter_selector_matrix_dict, nc_ls_matrix_dict, split_count_df, single_letter_df = split_matrix(
    letter_dict = letter_dict,
    word_group_id_list = word_group_id_list,
    nc_ls_df = nc_ls_df,
                 wg_df = wg_df,
                 wchar_matrix = wchar_matrix,
db_path = db_path, 
db_name = db_name)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
Total extraction time: 68.083913
...now writing: word_count_by_n_char
...now writing: word_count_by_single_letter
...now writing: word_count_by_letter_selector
...now writing: word_count_by_n_char_and_letter_selector


In [28]:
wg_df = pd.merge(left = wg_df, right = single_letter_df)

In [29]:
# join this back to the wg_df
wg_df = pd.merge(left = wg_df, right = split_count_df)

# demonstrate the different matrix extraction options with the word 'achiever'

In [30]:
demo_word = 'achiever'

wg_id = word_df.loc[word_df['lcase'] == demo_word, 'word_group_id'].iloc[0]

demo_wg_df = wg_df.loc[wg_df['word_group_id'] == wg_id, : ]

# option 2
n_char = demo_wg_df['n_chars'].iloc[0]

# option 3
first_letter = demo_wg_df['first_letter'].iloc[0]

# option 4
least_common_letter = demo_wg_df['letter_selector'].iloc[0][0]

# option 5
letter_selector = demo_wg_df['letter_selector'].iloc[0]

# option 6
nc_ls_tuple = demo_wg_df['nc_ls_tuple'].iloc[0]


In [31]:
# 1 full matrix: complete | get_values_full_matrix
# 2 n char (word length): complete | get_values_n_char
# 3 first letter: complete | get_value_first_letter
# 3 focal letter: complete | get_values_letter_selector
# 4 n char and least common letters: complete | get_values_n_char_letter_selector

## Select on the full matrix: option 1

In [32]:
# demo the full matrix selection
output = get_values_full_matrix(wg_id = wg_id, 
                    wchar_matrix = wchar_matrix,
                   word_group_id_list = word_group_id_list)

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by word-length: option 2

In [33]:
# demo the n char selection
output = get_values_n_char(wg_id = wg_id,
                      n_char = n_char,
                      n_char_matrix_dict = n_char_matrix_dict)

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the first letter: option 3

In [34]:
# demo the first letter selection
output = get_values_single_letter(wg_id = wg_id, single_letter = least_common_letter,
                                                           single_letter_matrix_dict = single_letter_matrix_dict)         

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the single least common letter: option 4

In [35]:
# demo the first letter selection
output = get_values_single_letter(wg_id = wg_id, single_letter = least_common_letter,
                                                           single_letter_matrix_dict = single_letter_matrix_dict)         

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the letter selector: option 5

In [36]:
# demo with the letter selector
output = get_values_letter_selector(wg_id = wg_id,
                      letter_selector = letter_selector,
                      letter_selector_matrix_dict = letter_selector_matrix_dict)

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by word-length and the letter selector: option 6

In [37]:
# demo with the n_char letter selector
output = get_values_n_char_letter_selector(wg_id = wg_id,
                           nc_ls_tuple = nc_ls_tuple,                           
                           nc_ls_matrix_dict=nc_ls_matrix_dict)

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word = demo_word,
                   word_df = word_df,
                   demo_output = output)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


In [38]:
# we've tested with one word, let's time many evaluations to get a sense of how quickly 
# the different matrix extraction options work
# use the timeit() function to evaluate how long, on average, a single matrix operation
# takes to complete

n_trials = 10

code_snippet_dict = {
    'Selecting by full matrix':
"""get_values_full_matrix(wg_id = wg_id, wchar_matrix = wchar_matrix, word_group_id_list = word_group_id_list)""",
    'Selecting by word length':
"""get_values_n_char(wg_id = wg_id, n_char = n_char, n_char_matrix_dict = n_char_matrix_dict)""",
    'Selecting by first letter':
"""get_values_single_letter(wg_id = wg_id, single_letter = first_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",    
    'Selecting by single least common letter':
"""get_values_single_letter(wg_id = wg_id, single_letter = least_common_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",
    'Selecting by letter selector':
"""get_values_letter_selector(wg_id = wg_id, letter_selector = letter_selector, letter_selector_matrix_dict = letter_selector_matrix_dict)""",
    'Selecting by word length and letter selector':
"""get_values_n_char_letter_selector(wg_id = wg_id, nc_ls_tuple = nc_ls_tuple, nc_ls_matrix_dict=nc_ls_matrix_dict)"""
}

for csd, cs in code_snippet_dict.items():
    
    total_time = timeit.timeit(cs, number=n_trials, globals=globals())

    # total time    
    total_time_formatted = '{:,}'.format(round(total_time, 4))    

    # average time
    avg_time = total_time / n_trials
    avg_time_formatted = '{:,}'.format(round(avg_time, 6)) 
        
    print(csd)
    # average number of seconds per trial
    print('Total time:', total_time_formatted, 'seconds. Average time:', avg_time_formatted, 'seconds.')
    
    

Selecting by full matrix
Total time: 0.3227 seconds. Average time: 0.032275 seconds.
Selecting by word length
Total time: 0.2711 seconds. Average time: 0.02711 seconds.
Selecting by first letter
Total time: 0.4186 seconds. Average time: 0.04186 seconds.
Selecting by single least common letter
Total time: 0.0712 seconds. Average time: 0.007123 seconds.
Selecting by letter selector
Total time: 0.0012 seconds. Average time: 0.000123 seconds.
Selecting by word length and letter selector
Total time: 0.0012 seconds. Average time: 0.000117 seconds.


In [39]:
# the combination of the word length and letter selector is the fastest

### estimate total number of from/to word pairs

In [40]:
# how many anagrams are there?
# let's estimate the number of anagrams by assuming that the number of
# parent/from words is a function of word length. 
# estimate_total_pairs estimates the total number of from/to word pairs

In [41]:
n_possible_anagrams = estimate_total_pairs(word_df = word_df, wg_df = wg_df,
                         nc_ls_matrix_dict = nc_ls_matrix_dict)

...estimated number of from/to pair word pairs: 196,320,089


### discover from/to word group id pairs

In [42]:

# initialize counters to count the number of to (child words) from a focal word.
# we could do this in post-processing, but the data are already in memory and it's a simple
# calculation to make.
# we want to minimize the number of trips through our data.

# the number of candidate words examined for each focal word

# a list to hold the dataframes generated for each letter
proc_time_df_list = []

# subset the list of leters
if letter_subset_list:
    letters = letter_subset_list[:]
else:
    letters = sorted_first_letters

anagram_pair_count = 0
# use numpy to pre-allocate an array that will be updated while enumerating.
# this eliminates list.append() calls

output_list = np.full(shape=(n_possible_anagrams, 2), fill_value=-1, dtype=int)

wg_count = 0

for curr_letter in letters:
    # enumerate by each letter
    # this isn't absolutely necessary, we could just enumerate by word id,
    # but for testing and development, letters are a handy way to chunk up the data.

    # this dictionary will store the calculations for each letter
    proc_time_dict = {}

    # the list of words that start with the focal letter
    curr_wg_df = wg_df.loc[wg_df["first_letter"] == curr_letter, :]

    # sort the dataframe by n_chars and letter_selector, if it exists.
    # this will cut down on dictionary lookups for matrix_extraction_types 3 and 4.
    curr_wg_df = curr_wg_df.sort_values(by=["n_chars", "letter_selector"])
    curr_word_group_id_list = curr_wg_df["word_group_id"].to_numpy()
    curr_nc_ls_tuple_list = curr_wg_df["nc_ls_tuple"].to_numpy()

    wg_count += len(curr_word_group_id_list)

    n_curr_words = "{:,}".format(curr_wg_df.shape[0])
    print(
        "...finding parent anagrams for",
        n_curr_words,
        "words that start with",
        curr_letter,
    )

    # enumerate by word id, working with integers is faster than words
    for row in curr_wg_df.itertuples(index = False):
        wg_id = row.word_group_id
            
        # start timing to record processing for each word
        s_time = datetime.datetime.now()

        # get the current word length, from the word id
        # to_word, to_word_length, curr_first_letter, clg, clgr = word_dict[word_group_id]
        to_word_length = word_dict[wg_id][1]

        # get the tuple associated with the word id
        # much faster to look up stored values for the hash value than it is to
        # only look up if the hash value has changed

        # get the possible candidate word_group_ids and char matrix
        ####
        ## TODO: CODE OPTIONS 1 THROUGH 4. OR 5?
        ####
        if matrix_extraction_option == 1:
            # option 1: full matrix        
            outcome_word_id_list = get_values_full_matrix(wg_id = wg_id, wchar_matrix = wchar_matrix, word_group_id_list = word_group_id_list)
        elif matrix_extraction_option == 2:
            # option 2: word length
            outcome_word_id_list = get_values_n_char(wg_id = wg_id, n_char = row.n_chars, n_char_matrix_dict = n_char_matrix_dict)
        elif matrix_extraction_option == 3:            
            # option 3: first character
            outcome_word_id_list = get_values_single_letter(wg_id = wg_id, single_letter = row.first_letter,
                                                           single_letter_matrix_dict = single_letter_matrix_dict)
        elif matrix_extraction_option == 4:
            # option 4: single least common letter
            outcome_word_id_list = get_values_single_letter(wg_id = wg_id, single_letter = row.letter_selector[0],
                                                           single_letter_matrix_dict = single_letter_matrix_dict)
        elif matrix_extraction_option == 5:
            # option 5: letter selector / focal letter
            outcome_word_id_list = get_values_letter_selector(wg_id = wg_id, letter_selector = row.letter_selector, letter_selector_matrix_dict = letter_selector_matrix_dict)                            
        else:     
            # option 6: word length and letter selector
            outcome_word_id_list = get_values_n_char_letter_selector(
                wg_id=wg_id, nc_ls_tuple=row.nc_ls_tuple, nc_ls_matrix_dict=nc_ls_matrix_dict
            )
        
        # if the outcome is greater than or equal to zero, then the current word is an
        # anagram of the other word
        # a value  >= 0 means that the current word contains the exact same number of focal letters
        # mite --> time or miter --> time
        # a value >= 1 means that current word contains at least the same number of focal letters
        # terminator --> time
        # a value of <=-1 means that the current word does not have the
        # correct number of letters and is therefore not an anagram.
        # trait <> time

        # number of parent words found
        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words > 1:
            # we have matches
            # the focal word

            # enumerate the from/parent words
            new_anagram_pair_count = anagram_pair_count + n_from_words
            # the from words
            # print(anagram_pair_count)
            # print(new_anagram_pair_count)
            # print(len(outcome_word_id_list))
            # print(output_list.shape)
            output_list[
                anagram_pair_count:new_anagram_pair_count, 0
            ] = outcome_word_id_list[:, 0]

            # the to word
            output_list[
                anagram_pair_count:new_anagram_pair_count, 1
            ] = outcome_word_id_list[:, 1]

            # set the anagram pair count
            anagram_pair_count = new_anagram_pair_count

        del outcome_word_id_list

        # record the time for the word
        e_time = datetime.datetime.now()
        p_time = e_time - s_time
        p_time = round(p_time.total_seconds(), 8)        

        proc_time_dict[wg_id] = (p_time, n_from_words)

    # create a dataframe from the proc_time_dict
    proc_time_df = pd.DataFrame.from_dict(data=proc_time_dict, orient="index")
    proc_time_df = proc_time_df.reset_index()
    proc_time_df.columns = [
        "word_group_id",
        "n_seconds",
        "n_from_word_groups"        
    ]

    # display processing time for the current letter
    total_proc_time = round(proc_time_df["n_seconds"].sum(), 2)
    print(
        "...finding parent anagrams for",
        curr_letter,
        "words took",
        total_proc_time,
        "seconds...",
    )

    proc_time_df_list.append(proc_time_df)

# let's do some stuff that counts the words
proc_time_df = pd.concat(objs = proc_time_df_list)

...finding parent anagrams for 364 words that start with x
...finding parent anagrams for x words took 0.33 seconds...


### shape and store output data

In [43]:
# truncate the output array to only include indices with a from/to word pair
output_indices = np.all(output_list >= 0, axis = 1)
output_list = output_list[output_indices, ]
del output_indices

In [44]:
# how many anagram pairs were found?
n_total_anagrams = len(output_list)
n_total_anagrams_formatted = '{:,}'.format(n_total_anagrams)
print('...total anagrams', n_total_anagrams_formatted)

...total anagrams 26,489


In [45]:
## count the number of to words
# https://docs.python.org/3/library/collections.html#collections.Counter
# number of to words

### write anagram pairs to SQLite

In [46]:
# write the anagram pairs to the database
if write_data:
    store_anagram_pairs(output_list = output_list, db_path = db_path, db_name = db_name)    

### store number of from/to word pairs and time related to processing

In [47]:
# we need to split up the dataframes tracking the total number of options and the processing time and found words

In [48]:
# we have three dataframes: wg_df, word_df, and proc_time_df

In [49]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,letter_selector,first_letter_lookup,nc_ls_tuple,full_matrix_lookup,n_char_lookup,single_letter_lookup,letter_selector_lookup,nc_ls_lookup
0,A,a,1,a,0,0,a,a,a,133001,"(1, a)",215842,215842,133001,133001,133001
1,aa,aa,2,a,1,1,a,a,a,133001,"(2, a)",215842,215816,133001,133001,133000
2,aal,aal,3,a,2,2,al,la,la,133001,"(3, la)",215842,215717,98258,65528,65527
3,all,all,3,a,5394,5305,al,la,la,133001,"(3, la)",215842,215717,98258,65528,65527
4,aalii,aalii,5,a,3,3,ail,lai,lai,133001,"(5, lai)",215842,211614,98258,42754,42731


In [50]:
proc_time_df.head()

Unnamed: 0,word_group_id,n_seconds,n_from_word_groups
0,214062,0.005999,6748
1,214287,0.003001,4445
2,214335,0.0,327
3,214291,0.003002,1456
4,214293,0.0,1099


In [51]:
def format_anagaram_processing(
    output_list: np.array,
    proc_time_df: pd.DataFrame,
    word_df: pd.DataFrame,
    wg_df: pd.DataFrame,
    matrix_extraction_option: int,
    db_path: str,
    db_name: str) -> None:        
    
    # remove columns that will be duplicated, this is necessary for a
    # subsequent join to the word_df
    drop_col_names = [
        "word",
        "lcase",
        "n_chars",
        "first_letter",
        "word_id",
        "letter_group",
        "letter_group_ranked",
    ]

    wg_df = wg_df.drop(labels=drop_col_names, axis=1)

    # indicate which words were used in the data processing
    wg_df["word_processed"] = 1

    # merge the word_df and wg_df, this has the processing times and the number of candidates.
    word_df = pd.merge(left=word_df, right=wg_df)

    # convert the nc_ls_tuple column to a string
    word_df["nc_ls_tuple"] = word_df["nc_ls_tuple"].map(
        lambda x: ",".join([str(x[0]), x[1]])
    )
    
    # the count of to/child words, and in order to do that,
    # we need to count the number of times each word_group_id
    # exists in the from/parent column
    to_word_counter = collections.Counter(output_list[:, 0])

    # now, use the map function to get the number of from/to words and the number of
    # candidate words for each word
    proc_time_df["n_to_word_groups"] = proc_time_df["word_group_id"].map(to_word_counter)
    
    # select and re-order columns
    col_names = [
        "word",
        "lcase",
        "n_chars",
        "first_letter",
        "word_id",
        "word_group_id",
        "letter_group",
        "letter_group_ranked",
        "letter_selector",
        "nc_ls_tuple",
        "full_matrix_lookup",
        "n_char_lookup",
        "first_letter_lookup",
        "single_letter_lookup",
        "letter_selector_lookup",
        "nc_ls_lookup",        
        "word_processed",
    ]

    word_df = word_df[col_names]

    # rename some columns to include the matrix extraction option
    rename_col_dict = {"full_matrix_lookup": "me_01_full_matrix_lookup",
                       "n_char_lookup": "me_02_n_char_lookup",
                       "first_letter_lookup": "me_03_first_letter_lookup",
                       "single_letter_lookup": "me_04_single_letter_lookup",
                       "letter_selector_lookup": "me_05_letter_selector_lookup",
                       "nc_ls_lookup": "me_06_nc_ls_lookup"}
    
    word_df = word_df.rename(columns = rename_col_dict)   
    

    # add a matrix extraction option
    proc_time_df["matrix_extraction_option"] = int(matrix_extraction_option)
    

    return proc_time_df, word_df


In [None]:
def store_anagram_processing(proc_time_df: str, word_df:str, matrix_extraction_option: str, db_path: str, db_name: str) -> None:

    # create database connection objects
    db_conn = build_db_conn(db_path=db_path, db_name=db_name)

    # output table name
    table_name = f"words_me_{str(matrix_extraction_option).zfill(2)}"

    # write the processing option table
    proc_time_df.to_sql(name=table_name, con=db_conn, if_exists="replace", index=False)

    # write the word df to disk
    table_name = 'words_processed'
    word_df.to_sql(name = table_name, con=db_conn, if_exists='replace')

    # close the connection
    db_conn.close()       

    return None

In [None]:
def display_total_processing_time(proc_time_df:pd.DataFrame, total_time_start: datetime.datetime) -> None:


    anagram_discovery_time = proc_time_df["n_seconds"].sum()
    anagram_discovery_time = anagram_discovery_time / 60
    anagram_discovery_time = round(anagram_discovery_time, 2)

    print("...anagram discovery time:", anagram_discovery_time, "minutes")

    # record the total time
    total_time_end = datetime.datetime.now()
    total_time_proc = total_time_end - total_time_start
    total_time_proc = total_time_proc.total_seconds()
    total_time_proc = total_time_proc / 60
    total_time_proc = round(total_time_proc, 2)

    print("...total processing time:", total_time_proc, "minutes")

    return None



In [52]:
proc_time_df, word_df = store_anagaram_processing_time(output_list=output_list, proc_time_df=proc_time_df, word_df=word_df,
                               wg_df = wg_df,
                               matrix_extraction_option = matrix_extraction_option, db_path=db_path,
                               db_name=db_name

...anagram discovery time: 0.01 minutes
...total processing time: 2.22 minutes


In [None]:
display_total_processing_time(proc_time_df = proc_time_df, total_time_start = total_time_start)