# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [1]:
# standard libraries - installed by default
import collections
import datetime
import pickle
import sqlite3
import string
import os
import timeit

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_file_db_utils import *
from part_00_process_functions import *

### set input and output paths

In [4]:
# base file path: root directory of the project; the input and output
# directories are built relative to it with os.path.join()
base_file_path = '/project/finding_anagrams'

In [5]:
# input path: the 'data' directory located under the base file path
in_file_path = os.path.join(base_file_path, 'data')

In [6]:
# output db path: the 'db' directory located under the base file path
# (the name of the db itself is set separately)
db_path = os.path.join(base_file_path, 'db')

In [7]:
# create the output db directory, including any missing parent directories.
# exist_ok = True makes this a no-op when the directory is already present and
# removes the gap between checking for the path and creating it.
# NOTE: if db_path exists but is a regular file, this raises FileExistsError
# instead of silently continuing.
os.makedirs(db_path, exist_ok = True)

In [8]:
# name of the database; passed together with db_path to the functions
# that load the input data and store the results
db_name = 'words.db'

### process control flags

In [9]:
# Use numpy to perform matrix operations and determine from/to and exact anagram relationships
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

matrix_extraction_option = 6

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices
# 3 seems to be the sweet spot
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams
# NOTE: this flag only gates the call to store_anagram_pairs();
# store_anagram_processing() is called regardless of its value
write_data = False

# set to None to include all letters
# test with a subset of letters by setting the letter_subset_list to ['q', 'x'] or
# a different set of letters
letter_subset_list = ['x']
# letter_subset_list = None

# (disabled) generate a sample dataset of ten words that start with each letter
# NOTE: wg_df is not assigned until load_input_data() is called later on, so the
# line below can only be enabled after that point

# testo = wg_df.groupby(['first_letter']).sample(n = 10, random_state = 123)


In [10]:
# start a timer to record the entire operation
# the start time is passed to display_total_processing_time() at the very end
total_time_start = datetime.datetime.now()

### load input data

In [11]:
# load the input data objects from the db and the input file path
(word_df,
 wg_df,
 letter_dict,
 char_matrix,
 word_group_id_list,
 word_id_list,
 wchar_matrix) = load_input_data(
    db_path = db_path,
    db_name = db_name,
    in_file_path = in_file_path,
)

...loading words into a dataframe...
...query execution took: 2.131913 seconds...
...loading word groups into a dataframe...
...query execution took: 2.21272 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [13]:
# Split the char_matrix into N sub matrices
# See split_matrix() for a more elaborate description. 
# This function does a lot of things. Effectively, it computes and stores values in the wg_df, and splits the matrix into various components.

In [14]:
# split the matrix into the sub-matrix dictionaries used by extraction
# options 2 through 6; wg_df is replaced by the version split_matrix() returns
(wg_df,
 n_char_matrix_dict,
 single_letter_matrix_dict,
 letter_selector_matrix_dict,
 nc_ls_matrix_dict) = split_matrix(
    letter_dict = letter_dict,
    word_group_id_list = word_group_id_list,
    wg_df = wg_df,
    wchar_matrix = wchar_matrix,
    n_subset_letters = n_subset_letters,
)

...creating 16,101 sub-matrices...
...1,000 sub-matrices created...
...2,000 sub-matrices created...
...3,000 sub-matrices created...
...4,000 sub-matrices created...
...5,000 sub-matrices created...
...6,000 sub-matrices created...
...7,000 sub-matrices created...
...8,000 sub-matrices created...
...9,000 sub-matrices created...
...10,000 sub-matrices created...
...11,000 sub-matrices created...
...12,000 sub-matrices created...
...13,000 sub-matrices created...
...14,000 sub-matrices created...
...15,000 sub-matrices created...
...16,000 sub-matrices created...
... 16,101 sub-matrices created...
Total extraction time: 101.08 seconds.


# demonstrate the different matrix extraction options with the word 'achiever'

In [15]:
# the word used to demonstrate each matrix extraction option
demo_word = 'achiever'

# look up the word group id of the demo word
wg_id = word_df.loc[word_df['lcase'] == demo_word, 'word_group_id'].iloc[0]

# the word group record for that id; its first row supplies the values below
demo_wg_df = wg_df.loc[wg_df['word_group_id'] == wg_id]

# option 1 - Full matrix: no additional data needed

# option 2 - Number of characters
n_char = demo_wg_df['n_chars'].iloc[0]

# option 3 - First letter
first_letter = demo_wg_df['first_letter'].iloc[0]

# option 5 - Multiple least common letters
letter_selector = demo_wg_df['letter_selector'].iloc[0]

# option 4 - Least common letter: the first element of the letter selector
least_common_letter = letter_selector[0]

# option 6 - Number of characters and multiple least common letters
nc_ls_tuple = demo_wg_df['nc_ls_tuple'].iloc[0]


## Select on the full matrix: option 1

In [16]:
# option 1: select against the full matrix
output = get_values_full_matrix(
    wg_id = wg_id,
    wchar_matrix = wchar_matrix,
    word_group_id_list = word_group_id_list,
)

# summarize the from words found for the demo word
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by word-length: option 2

In [17]:
# option 2: select against the sub-matrix for the demo word's number of characters
output = get_values_n_char(
    wg_id = wg_id,
    n_char = n_char,
    n_char_matrix_dict = n_char_matrix_dict,
)

# summarize the from words found for the demo word
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the first letter: option 3

In [18]:
# demo the first letter selection
# NOTE: option 3 must select the sub-matrix by first_letter; passing
# least_common_letter here would simply repeat the option 4 demo
output = get_values_single_letter(
    wg_id = wg_id,
    single_letter = first_letter,
    single_letter_matrix_dict = single_letter_matrix_dict,
)

# this is an array of from words to the word 'achiever'
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the single least common letter: option 4

In [19]:
# option 4: select against the sub-matrix for the single least common letter
output = get_values_single_letter(
    wg_id = wg_id,
    single_letter = least_common_letter,
    single_letter_matrix_dict = single_letter_matrix_dict,
)

# summarize the from words found for the demo word
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by the letter selector: option 5

In [20]:
# option 5: select against the sub-matrix for the demo word's letter selector
output = get_values_letter_selector(
    wg_id = wg_id,
    letter_selector = letter_selector,
    letter_selector_matrix_dict = letter_selector_matrix_dict,
)

# summarize the from words found for the demo word
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


## Select on the matrices split by word-length and the letter selector: option 6

In [21]:
# option 6: select against the sub-matrix for the demo word's
# (number of characters, letter selector) tuple
output = get_values_n_char_letter_selector(
    wg_id = wg_id,
    nc_ls_tuple = nc_ls_tuple,
    nc_ls_matrix_dict = nc_ls_matrix_dict,
)

# summarize the from words found for the demo word
format_demo_output(
    demo_word = demo_word,
    word_df = word_df,
    demo_output = output,
)

There are 45 parent/from word groups for the word achiever
There are 46 parent/from words for the word achiever
The first five words are:
1348               achiever
12440          archdeceiver
12445         archdetective
12624          architective
12737    archrepresentative
Name: lcase, dtype: object
The last five words are:
224512       urethrovesical
225856        vaucheriaceae
225857       vaucheriaceous
227047    vibrotherapeutics
233655       zepharovichite
Name: lcase, dtype: object


In [22]:
# we've tested with one word, now time repeated evaluations to get a sense of
# how quickly the different matrix extraction options work
# timeit.timeit() runs a statement n_trials times and returns the total number
# of seconds; dividing by n_trials gives the average time of a single operation

n_trials = 10

# description of the matrix extraction option -> the statement to time
code_snippet_dict = {
    'Matrix Selection Option 1: Selecting by full matrix':
"""get_values_full_matrix(wg_id = wg_id, wchar_matrix = wchar_matrix, word_group_id_list = word_group_id_list)""",
    'Matrix Selection Option 2: Selecting by word length':
"""get_values_n_char(wg_id = wg_id, n_char = n_char, n_char_matrix_dict = n_char_matrix_dict)""",
    'Matrix Selection Option 3: Selecting by first letter':
"""get_values_single_letter(wg_id = wg_id, single_letter = first_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",
    'Matrix Selection Option 4: Selecting by single least common letter':
"""get_values_single_letter(wg_id = wg_id, single_letter = least_common_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",
    'Matrix Selection Option 5: Selecting by letter selector':
"""get_values_letter_selector(wg_id = wg_id, letter_selector = letter_selector, letter_selector_matrix_dict = letter_selector_matrix_dict)""",
    'Matrix Selection Option 6: Selecting by word length and letter selector':
"""get_values_n_char_letter_selector(wg_id = wg_id, nc_ls_tuple = nc_ls_tuple, nc_ls_matrix_dict=nc_ls_matrix_dict)""",
}

for description, snippet in code_snippet_dict.items():
    # the snippets refer to notebook-level names, so timeit is handed the
    # global namespace to resolve them
    total_time = timeit.timeit(snippet, number = n_trials, globals = globals())

    # average number of seconds per trial
    avg_time = total_time / n_trials

    print(description)
    print(f'Total time: {round(total_time, 4):,} seconds. Average time: {round(avg_time, 6):,} seconds.')
    
    

Matrix Selection Option 1: Selecting by full matrix
Total time: 0.3654 seconds. Average time: 0.036536 seconds.
Matrix Selection Option 2: Selecting by word length
Total time: 0.2404 seconds. Average time: 0.02404 seconds.
Matrix Selection Option 3: Selecting by first letter
Total time: 0.2647 seconds. Average time: 0.026469 seconds.
Matrix Selection Option 4: Selecting by single least common letter
Total time: 0.0315 seconds. Average time: 0.003146 seconds.
Matrix Selection Option 5: Selecting by letter selector
Total time: 0.0011 seconds. Average time: 0.000106 seconds.
Matrix Selection Option 6: Selecting by word length and letter selector
Total time: 0.0018 seconds. Average time: 0.000177 seconds.


In [23]:
# the letter selector based options (5 and 6) are the fastest by a wide margin
# in the run above, option 5 was slightly faster than option 6 for this single word

### estimate total number of from/to word pairs

In [24]:
# how many anagrams are there?
# let's estimate the number of anagrams by assuming that the number of
# parent/from words is a function of word length. 
# estimate_total_pairs estimates the total number of from/to word pairs
# the reason for estimating the upper bound is that it is interesting to know
# and it also means that we can use the estimated value to allocate an
# object in memory as opposed to incrementally appending to a list
# the object in memory is a NumPy Array that will store integers: from word group id | to word group id

In [25]:
# estimate the total number of from/to word pairs; the estimate is handed to
# generate_from_to_word_group_pairs() as n_possible_anagrams
n_possible_anagrams = estimate_total_pairs(wg_df = wg_df, nc_ls_matrix_dict = nc_ls_matrix_dict)

...estimated number of from/to pair word pairs: 194,572,272


### discover from/to word group id pairs

In [26]:
# find the from/to word group id pairs with the selected matrix extraction
# option, limited to the letters in letter_subset_list (None means all letters)
proc_time_df, output_list = generate_from_to_word_group_pairs(
    wg_df = wg_df,
    letter_subset_list = letter_subset_list,
    n_possible_anagrams = n_possible_anagrams,
    matrix_extraction_option = matrix_extraction_option,
    wchar_matrix = wchar_matrix,
    word_group_id_list = word_group_id_list,
    n_char_matrix_dict = n_char_matrix_dict,
    single_letter_matrix_dict = single_letter_matrix_dict,
    letter_selector_matrix_dict = letter_selector_matrix_dict,
    nc_ls_matrix_dict = nc_ls_matrix_dict,
)

...finding parent anagrams for 364 words that start with x
...finding parent anagrams for x words took 0.32 seconds...
...total anagrams: 26,489
...total anagram discovery time: 0.323 seconds


### write anagram pairs to SQLite

In [27]:
# store the anagram pairs in the database, but only when write_data is set
if write_data:
    store_anagram_pairs(
        output_list = output_list,
        db_path = db_path,
        db_name = db_name,
    )

### store number of from/to word pairs and time related to processing

In [28]:
# we have three dataframes: wg_df, word_df, and proc_time_df

In [29]:
# format the processing results; proc_time_df and word_df are replaced by the
# versions returned here
proc_time_df, word_df = format_anagaram_processing(
    output_list = output_list,
    proc_time_df = proc_time_df,
    word_df = word_df,
    wg_df = wg_df,
    matrix_extraction_option = matrix_extraction_option,
)

In [30]:
# store the processing time and word data for this matrix extraction option
# NOTE(review): unlike store_anagram_pairs(), this call is not gated by
# write_data - confirm that is intentional
store_anagram_processing(
    proc_time_df = proc_time_df,
    word_df = word_df,
    matrix_extraction_option = matrix_extraction_option,
    db_path = db_path,
    db_name = db_name,
)

In [31]:
# report the anagram discovery time and the total processing time measured
# from total_time_start
display_total_processing_time(
    proc_time_df = proc_time_df,
    total_time_start = total_time_start,
)

...anagram discovery time: 0.323 seconds | 0.0054 minutes
...total processing time: 6.8 minutes
