# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [1]:
# standard libraries - installed by default
import collections
import datetime
import pickle
import sqlite3
import string
import os
import timeit

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_file_db_utils import *
from part_00_process_functions import *

### set input and output paths

In [4]:
# base file path: root directory that the input, output, and db paths
# in the following cells are joined onto
base_file_path = '/project/finding_anagrams'

In [5]:
# input path: the 'data' folder underneath the base file path
in_file_path = os.path.join(base_file_path, 'data')

In [6]:
# output path: the 'data' folder underneath the base file path
out_file_path = os.path.join(base_file_path, 'data')

In [7]:
# output db path: the 'db' folder underneath the base file path
# (the database file name is set separately in db_name)
db_path = os.path.join(base_file_path, 'db')

In [8]:
# create the db directory (and any missing parent directories).
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
# NOTE: if db_path exists but is a regular file, this raises FileExistsError
# instead of silently continuing.
os.makedirs(db_path, exist_ok=True)

In [9]:
# file name of the database; passed together with db_path to the
# database helper functions used below
db_name = 'words.db'

### process control flags

In [10]:
# Use numpy to perform matrix operations and determine from/to and exact anagram relationships
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

# the option number used when generating the from/to word group pairs
matrix_extraction_option = 5

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices
# 3 seems to be the sweet spot
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams
write_data = False

# set to None to include all letters
# test with a subset of letters by setting the letter_subset_list to ['q', 'x'] or
# a different set of letters
letter_subset_list = ['x']
#letter_subset_list = None

# generate a sample dataset of ten words that start with each letter

# testo = wg_df.groupby(['first_letter']).sample(n = 10, random_state = 123)


In [11]:
# start a timer to record the entire operation
# (passed to display_total_processing_time in the last cell)
total_time_start = datetime.datetime.now()

### load input data

In [12]:
# load the word_df, the words from Part 1
# query_db is not defined in this notebook; presumably it comes from one of
# the part_00 star imports above
sql = 'select * from words;'
word_df = query_db(sql = sql, db_path = db_path, db_name = db_name)

...query execution took: 1.968885 seconds...


In [13]:
# (number of rows, number of columns) of the word dataframe
word_df.shape

(234370, 8)

In [14]:
# extract the column of word ids as a numpy array
# NOTE: word_id_list is reassigned further down from wg_df, once the
# word groups have been de-duplicated
word_id_list = word_df['word_id'].to_numpy(dtype = int)

In [15]:
# create a dataframe with one row per first letter and the number of words
# that start with that letter, sorted from the least to the most frequent.
# GroupBy.size() counts the rows in each group directly, so there is no need
# to apply np.size through agg() and then rename the column by hand.
# The result has a fresh integer index and two columns:
# 'first_letter' and 'word_count'.
agg_word_df = (
    word_df.groupby('first_letter')
    .size()
    .reset_index(name='word_count')
    .sort_values(by='word_count')
)

In [16]:
# extract the letters sorted by word frequency
# agg_word_df is sorted ascending by word_count, so the list runs from the
# least common to the most common first letter
sorted_first_letters = agg_word_df['first_letter'].tolist()

In [17]:
# load the letter dictionary from part 1
# load_pickle is not defined in this notebook; presumably it comes from one
# of the part_00 star imports above
in_file_name = 'letter_dict.pkl'
letter_dict = load_pickle(in_file_path = in_file_path, in_file_name=in_file_name)

In [18]:
# load the word dictionary from part 1
# in_file_name is reused for each input file, so it is overwritten here
in_file_name = 'word_dict.pkl'
word_dict = load_pickle(in_file_path = in_file_path, in_file_name=in_file_name)

In [19]:
# load the char matrix from part 1
in_file_name = 'char_matrix.npy'
# ipn: full path to the saved numpy array
ipn = os.path.join(in_file_path, in_file_name)
char_matrix = np.load(file = ipn)

### create the word group df: wg_df

In [20]:
# drop duplicates based on the word group.
# by default, this will only keep the first record and it will drop all others
# .copy() gives wg_df its own data so that columns can be added to it later
# without touching word_df
wg_df = word_df.drop_duplicates(subset = ['word_group_id']).copy()

In [21]:
# order the word groups by word_id; the id arrays extracted from wg_df
# below therefore follow this order
wg_df = wg_df.sort_values(by = 'word_id')

In [22]:
# unique word groups: (number of rows, number of columns)
wg_df.shape

(215842, 8)

In [23]:
# get the word group ids
word_group_id_list = wg_df['word_group_id'].to_numpy()
# and the associated word_id
# (this replaces the word_id_list built earlier from the full word_df)
word_id_list = wg_df['word_id'].to_numpy()

In [24]:
# trim the char matrix by word id
# and not the word_group id
# word_id_list is used as a row index into char_matrix, keeping one row per
# word group and all columns
wchar_matrix = char_matrix[word_id_list, :]

In [25]:
# i don't use these objects, but i can't delete them?
# build a word_id to word_group_id dictionary
#word_id_wg_id_dict = dict()
# and a word_group_id to word_id dictionary
#wg_id_word_id_dict = dict()

#for word_id, wg_id in zip(wg_df['word_id'], wg_df['word_group_id']):
#    word_id_wg_id_dict[word_id] = wg_id
#    wg_id_word_id_dict[wg_id] = word_id

In [26]:
# letter selector: the first n_subset_letters characters of letter_group_ranked
# NOTE: the same assignment is repeated in the next cell
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str[:n_subset_letters]

In [27]:
# performing selections on a dataframe is slow, especially when comparing
# characters, so the selection keys are prepared up front.

# letter selector: the first n_subset_letters characters of the ranked
# letter group of each word group
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str.slice(stop=n_subset_letters)

# every distinct combination of word length and letter selector
nc_ls_df = wg_df.loc[:, ['n_chars', 'letter_selector']].drop_duplicates()

print(f'...creating {nc_ls_df.shape[0]} sets of ids')

...creating 16101 sets of ids


In [29]:
# split the word-group char matrix for the different extraction options.
# split_matrix hands back an updated wg_df together with four lookup objects
# (presumably dictionaries of sub-matrices, judging by their names)
(wg_df,
 n_char_matrix_dict,
 single_letter_matrix_dict,
 letter_selector_matrix_dict,
 nc_ls_matrix_dict) = split_matrix(letter_dict=letter_dict,
                                   word_group_id_list=word_group_id_list,
                                   nc_ls_df=nc_ls_df,
                                   wg_df=wg_df,
                                   wchar_matrix=wchar_matrix,
                                   out_file_path=out_file_path)

...check for previously saved matrix splits...
...saving matrix splits...


# demonstrate the different matrix extraction options with the word 'achiever'

In [None]:
# the word used to demonstrate each matrix extraction option
demo_word = 'achiever'

# word group id of the demo word (first matching row in word_df)
wg_id = word_df.loc[word_df['lcase'] == demo_word, 'word_group_id'].iloc[0]

# the word group record of the demo word; the selectors for each option
# are read from this record
demo_wg_df = wg_df.loc[wg_df['word_group_id'] == wg_id, : ]

# option 2
n_char = demo_wg_df['n_chars'].iloc[0]

# option 3
first_letter = demo_wg_df['first_letter'].iloc[0]

# option 4
# the first character of the letter selector
least_common_letter = demo_wg_df['letter_selector'].iloc[0][0]

# option 5
letter_selector = demo_wg_df['letter_selector'].iloc[0]

# option 6
# nc_ls_tuple is not created in this notebook; presumably split_matrix
# adds it to wg_df - TODO confirm
nc_ls_tuple = demo_wg_df['nc_ls_tuple'].iloc[0]


In [None]:
# 1 full matrix: complete | get_values_full_matrix
# 2 n char (word length): complete | get_values_n_char
# 3 first letter: complete | get_values_single_letter
# 4 single least common letter: complete | get_values_single_letter
# 5 letter selector (n least common letters): complete | get_values_letter_selector
# 6 n char and letter selector: complete | get_values_n_char_letter_selector

## Select on the full matrix: option 1

In [None]:
# option 1: compare the demo word against the full word-group char matrix
output = get_values_full_matrix(
    wg_id=wg_id,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
)

# the output holds the from words of the demo word; display them
format_demo_output(
    demo_word=demo_word,
    word_df=word_df,
    demo_output=output,
)

## Select on the matrices split by word-length: option 2

In [None]:
# option 2: compare the demo word using the matrices split by word length
output = get_values_n_char(
    wg_id=wg_id,
    n_char=n_char,
    n_char_matrix_dict=n_char_matrix_dict,
)

# the output holds the from words of the demo word; display them
format_demo_output(
    demo_word=demo_word,
    word_df=word_df,
    demo_output=output,
)

## Select on the matrices split by the first letter: option 3

In [None]:
# demo the first letter selection
# pass first_letter (not least_common_letter) so that this demo really
# exercises option 3, matches the option 3 timing snippet further down,
# and is not a duplicate of the option 4 demo
output = get_values_single_letter(wg_id=wg_id,
                                  single_letter=first_letter,
                                  single_letter_matrix_dict=single_letter_matrix_dict)

# this is an array of from words to the word 'achiever'
format_demo_output(demo_word=demo_word,
                   word_df=word_df,
                   demo_output=output)

## Select on the matrices split by the single least common letter: option 4

In [None]:
# option 4: compare the demo word using the matrices split by a single
# letter, here the single least common letter of the demo word
output = get_values_single_letter(
    wg_id=wg_id,
    single_letter=least_common_letter,
    single_letter_matrix_dict=single_letter_matrix_dict,
)

# the output holds the from words of the demo word; display them
format_demo_output(
    demo_word=demo_word,
    word_df=word_df,
    demo_output=output,
)

## Select on the matrices split by the letter selector: option 5

In [None]:
# option 5: compare the demo word using the matrices split by letter selector
output = get_values_letter_selector(
    wg_id=wg_id,
    letter_selector=letter_selector,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
)

# the output holds the from words of the demo word; display them
format_demo_output(
    demo_word=demo_word,
    word_df=word_df,
    demo_output=output,
)

## Select on the matrices split by word-length and the letter selector: option 6

In [None]:
# option 6: compare the demo word using the matrices split by word length
# and letter selector
output = get_values_n_char_letter_selector(
    wg_id=wg_id,
    nc_ls_tuple=nc_ls_tuple,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
)

# the output holds the from words of the demo word; display them
format_demo_output(
    demo_word=demo_word,
    word_df=word_df,
    demo_output=output,
)

In [None]:
# one word has been tested; now time repeated evaluations of each matrix
# extraction option with timeit to see how long, on average, a single
# matrix operation takes.

# number of times each snippet is executed
n_trials = 10

# label -> code snippet, evaluated by timeit against the notebook globals
code_snippet_dict = {
    'Matrix Selection Option 1: Selecting by full matrix':
        """get_values_full_matrix(wg_id = wg_id, wchar_matrix = wchar_matrix, word_group_id_list = word_group_id_list)""",
    'Matrix Selection Option 2: Selecting by word length':
        """get_values_n_char(wg_id = wg_id, n_char = n_char, n_char_matrix_dict = n_char_matrix_dict)""",
    'Matrix Selection Option 3: Selecting by first letter':
        """get_values_single_letter(wg_id = wg_id, single_letter = first_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",
    'Matrix Selection Option 4: Selecting by single least common letter':
        """get_values_single_letter(wg_id = wg_id, single_letter = least_common_letter, single_letter_matrix_dict = single_letter_matrix_dict)""",
    'Matrix Selection Option 5: Selecting by letter selector':
        """get_values_letter_selector(wg_id = wg_id, letter_selector = letter_selector, letter_selector_matrix_dict = letter_selector_matrix_dict)""",
    'Matrix Selection Option 6: Selecting by word length and letter selector':
        """get_values_n_char_letter_selector(wg_id = wg_id, nc_ls_tuple = nc_ls_tuple, nc_ls_matrix_dict=nc_ls_matrix_dict)""",
}

for csd, cs in code_snippet_dict.items():
    # total number of seconds for all trials, and the per-trial average
    total_time = timeit.timeit(cs, number=n_trials, globals=globals())
    avg_time = total_time / n_trials

    # round, then format with a thousands separator
    total_time_formatted = f'{round(total_time, 4):,}'
    avg_time_formatted = f'{round(avg_time, 6):,}'

    print(csd)
    print('Total time:', total_time_formatted, 'seconds. Average time:', avg_time_formatted, 'seconds.')
    
    

In [None]:
# the combination of the word length and letter selector is the fastest

### estimate total number of from/to word pairs

In [None]:
# how many anagrams are there?
# let's estimate the number of anagrams by assuming that the number of
# parent/from words is a function of word length.
# estimate_total_pairs estimates the total number of from/to word pairs.
# the reason for estimating the upper bound is that it is interesting
# to know, and it also means that we can use the estimated value to allocate an
# object in memory as opposed to incrementally appending to a list.
# the object in memory is a NumPy array that will store integers: from word group id | to word group id

In [None]:
# estimated number of from/to word pairs; passed to
# generate_from_to_word_group_pairs in the next step
n_possible_anagrams = estimate_total_pairs(wg_df = wg_df, nc_ls_matrix_dict = nc_ls_matrix_dict)

### discover from/to word group id pairs

In [None]:
# discover the from/to word group id pairs with the selected matrix
# extraction option; every split-matrix object is handed over so that the
# function can use whichever one the option requires
proc_time_df, output_list = generate_from_to_word_group_pairs(
    wg_df=wg_df,
    letter_subset_list=letter_subset_list,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
)

### write anagram pairs to SQLite

In [None]:
# write the anagram pairs to the database
# only runs when the write_data flag in the process control cell is True
if write_data:
    store_anagram_pairs(output_list = output_list, db_path = db_path, db_name = db_name)

### store number of from/to word pairs and time related to processing

In [None]:
# we have three dataframes: wg_df, word_df, and proc_time_df

In [None]:
# format the processing results; both proc_time_df and word_df are
# replaced by the versions that the function returns
proc_time_df, word_df = format_anagaram_processing(
    output_list=output_list,
    proc_time_df=proc_time_df,
    word_df=word_df,
    wg_df=wg_df,
    matrix_extraction_option=matrix_extraction_option,
)

In [None]:
# store the processing times and the word dataframe for the selected
# matrix extraction option in the database
store_anagram_processing(
    proc_time_df=proc_time_df,
    word_df=word_df,
    matrix_extraction_option=matrix_extraction_option,
    db_path=db_path,
    db_name=db_name,
)

In [None]:
# report the total processing time, measured from the timer started at the
# top of the notebook
display_total_processing_time(
    proc_time_df=proc_time_df,
    total_time_start=total_time_start,
)