# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [1]:
# standard libraries - installed by default
import collections
import os
import pickle
import sqlite3
import string
from time import perf_counter_ns

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_file_db_utils import *
from part_00_process_functions import *

### set input and output paths

In [4]:
# base file path
# root directory of the project; the input and db paths are joined onto it
base_file_path = '/project/finding_anagrams'

In [5]:
# input path: the 'data' directory underneath the base file path
in_file_path = os.path.join(base_file_path, 'data')

In [6]:
# output db path: the 'db' directory underneath the base file path
# (the db file name is set separately)
db_path = os.path.join(base_file_path, 'db')

In [7]:
# file name of the database; passed together with db_path to the
# load and store functions
db_name = 'words.db'

### process control flags

In [8]:
# Use numpy to perform matrix operations and determine from/to and exact anagram relationships
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

matrix_extraction_option = 5

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices
# 3 seems to be the sweet spot
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams
write_data = False

## Testing options
# None to include all letters
# ['q', 'x'] or a different set of letters to test a specific letter
# 'SAMPLE' to take a 10% sample by word length group
#letter_subset_list = ['x']
letter_subset_list = None

In [9]:
# start a timer to record the entire operation
# perf_counter_ns() returns an integer number of nanoseconds; this value is
# handed to display_total_processing_time() at the end of the run
total_time_start = perf_counter_ns()

### load input data

In [10]:
# Load everything the later cells need. Per the messages logged by
# load_input_data(), this covers the words and word groups dataframes,
# the letter dictionary, and the char matrix plus a subset of it.
(
    word_df,
    wg_df,
    letter_dict,
    char_matrix,
    word_group_id_list,
    word_id_list,
    wchar_matrix,
) = load_input_data(
    db_path=db_path,
    db_name=db_name,
    in_file_path=in_file_path,
)

...loading words into a dataframe...
...query execution took: 1.34 seconds...
...loading word groups into a dataframe...
...query execution took: 1.32 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [11]:
# Split the char_matrix into N sub matrices
# See split_matrix() for a more elaborate description. 
# This function does a lot of things. Effectively, it computes and stores values in the wg_df, and splits the matrix into various components.

In [12]:
# split_matrix() hands back wg_df together with the dictionaries of
# sub-matrices consumed by the anagram search below.
(
    wg_df,
    n_char_matrix_dict,
    single_letter_matrix_dict,
    letter_selector_matrix_dict,
    nc_ls_matrix_dict,
) = split_matrix(
    letter_dict=letter_dict,
    word_group_id_list=word_group_id_list,
    wg_df=wg_df,
    wchar_matrix=wchar_matrix,
    n_subset_letters=n_subset_letters,
    matrix_extraction_option=matrix_extraction_option,
)

...enumerating 16,101 records...
...1,000 records enumerated...
...2,000 records enumerated...
...3,000 records enumerated...
...4,000 records enumerated...
...5,000 records enumerated...
...6,000 records enumerated...
...7,000 records enumerated...
...8,000 records enumerated...
...9,000 records enumerated...
...10,000 records enumerated...
...11,000 records enumerated...
...12,000 records enumerated...
...13,000 records enumerated...
...14,000 records enumerated...
...15,000 records enumerated...
...16,000 records enumerated...
...2,387 sub-matrices created...
Total extraction time: 9.15 seconds.


### estimate total number of from/to word pairs

In [13]:
# how many anagrams are there?
# let's estimate the number of anagrams by assuming that the number of
# parent/from words is a function of word length. 
# estimate_total_pairs() estimates the total number of from/to word pairs
# the reason for estimating the upper bound is twofold: it is interesting
# to know, and it means that we can use the estimated value to allocate an
# object in memory as opposed to incrementally appending to a list - this is faster
# the object in memory is a NumPy Array that will store integers: from word group id | to word group id

In [14]:
# estimated number of from/to word pairs; passed on to the pair-generation step
n_possible_anagrams = estimate_total_pairs(
    wg_df=wg_df,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
)

...estimated number of from/to pair word pairs: 194,572,272


### discover from/to word group id pairs

In [15]:
# Run the from/to word group pair discovery. The call returns the processing
# time dataframe and the output list that the storage cells below consume.
proc_time_df, output_list = generate_from_to_word_group_pairs_simple(
    wg_df=wg_df,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
    letter_subset_list=letter_subset_list,
)

...finding parent anagrams for 215,842 words...
...found parent anagrams for 10,000 words...
...found parent anagrams for 20,000 words...
...found parent anagrams for 30,000 words...
...found parent anagrams for 40,000 words...
...found parent anagrams for 50,000 words...
...found parent anagrams for 60,000 words...
...found parent anagrams for 70,000 words...
...found parent anagrams for 80,000 words...
...found parent anagrams for 90,000 words...
...found parent anagrams for 100,000 words...
...found parent anagrams for 110,000 words...
...found parent anagrams for 120,000 words...
...found parent anagrams for 130,000 words...
...found parent anagrams for 140,000 words...
...found parent anagrams for 150,000 words...
...found parent anagrams for 160,000 words...
...found parent anagrams for 170,000 words...
...found parent anagrams for 180,000 words...
...found parent anagrams for 190,000 words...
...found parent anagrams for 200,000 words...
...found parent anagrams for 210,000 words...

### write anagram pairs to SQLite

In [16]:
# only persist the anagram pairs when the write_data flag is switched on
if write_data:
    store_anagram_pairs(
        output_list=output_list,
        db_path=db_path,
        db_name=db_name,
    )

### store number of from/to word pairs and time related to processing

In [17]:
# record the processing times for this matrix extraction option;
# note that this call is not gated by write_data
store_anagram_processing(
    proc_time_df=proc_time_df,
    matrix_extraction_option=matrix_extraction_option,
    db_path=db_path,
    db_name=db_name,
)

...now writing: words_me_05


In [18]:
# report the timings, using the timer started near the top of the notebook
display_total_processing_time(
    proc_time_df=proc_time_df,
    total_time_start=total_time_start,
)

...anagram discovery time: 150.47 seconds | 2.51 minutes
...total processing time: 206.0 seconds | 3.43 minutes
