# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [1]:
# standard libraries - installed by default
import collections
import datetime
import pickle
import sqlite3
import string
import os
import timeit

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_file_db_utils import *
from part_00_process_functions import *

### set input and output paths

In [4]:
# base file path: root directory of the project; the input ('data') and
# output ('db') directories below are joined onto it
base_file_path = '/project/finding_anagrams'

In [5]:
# input path: the 'data' directory under the project root
# (handed to load_input_data() further down)
in_file_path = os.path.join(base_file_path, 'data')

In [6]:
# output db path: the 'db' directory under the project root
# (the database file name itself is set separately in db_name)
db_path = os.path.join(base_file_path, 'db')

In [7]:
# make sure the database directory exists before it is used.
# exist_ok=True turns this into a no-op when the directory is already there,
# which replaces the separate os.path.exists() test and removes the
# check-then-create race between the test and the makedirs() call
os.makedirs(db_path, exist_ok=True)

In [8]:
# file name of the database; passed together with db_path to the load and
# store helpers below
# presumably a SQLite file (see the SQLite section below) - confirm in part_00_file_db_utils
db_name = 'words.db'

### process control flags

In [9]:
# Use numpy to perform matrix operations and determine from/to and exact anagram relationships
# matrix_extraction_option picks one of the following strategies; the value is
# passed to split_matrix() and to the pair-generation and storage helpers below
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

matrix_extraction_option = 5

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices
# 3 seems to be the sweet spot
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams
# (this flag guards the store_anagram_pairs() call further down)
write_data = False

# set to None to include all letters
# test with a subset of letters by setting the letter_subset_list to ['q', 'x'] or
# a different set of letters
letter_subset_list = ['x']
# letter_subset_list = None

In [10]:
# start a timer to record the entire operation; this start time is handed to
# display_total_processing_time() in the last cell
total_time_start = datetime.datetime.now()

### load input data

In [11]:
# load the word and word-group dataframes, the letter dictionary, and the
# char matrices from the database / input directory
(
    word_df,
    wg_df,
    letter_dict,
    char_matrix,
    word_group_id_list,
    word_id_list,
    wchar_matrix,
) = load_input_data(
    db_path=db_path,
    db_name=db_name,
    in_file_path=in_file_path,
)

...loading words into a dataframe...
...query execution took: 1.877372 seconds...
...loading word groups into a dataframe...
...query execution took: 1.416619 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [12]:
# Split the char_matrix into N sub-matrices.
# See split_matrix() for a more elaborate description.
# This function does a lot: it computes and stores values in the wg_df, and it splits the matrix into its various components.

In [13]:
# split the matrix into sub-matrices; split_matrix() hands back wg_df along
# with one dictionary per kind of sub-matrix
(
    wg_df,
    n_char_matrix_dict,
    single_letter_matrix_dict,
    letter_selector_matrix_dict,
    nc_ls_matrix_dict,
) = split_matrix(
    letter_dict=letter_dict,
    word_group_id_list=word_group_id_list,
    wg_df=wg_df,
    wchar_matrix=wchar_matrix,
    n_subset_letters=n_subset_letters,
    matrix_extraction_option=matrix_extraction_option,
)

...enumerating 16,101 records...
...1,000 records enumerated...
...2,000 records enumerated...
...3,000 records enumerated...
...4,000 records enumerated...
...5,000 records enumerated...
...6,000 records enumerated...
...7,000 records enumerated...
...8,000 records enumerated...
...9,000 records enumerated...
...10,000 records enumerated...
...11,000 records enumerated...
...12,000 records enumerated...
...13,000 records enumerated...
...14,000 records enumerated...
...15,000 records enumerated...
...16,000 records enumerated...
...2,387 sub-matrices created...
Total extraction time: 11.69 seconds.


### estimate total number of from/to word pairs

In [14]:
# how many anagrams are there?
# let's estimate the number of anagrams by assuming that the number of
# parent/from words is a function of word length.
# estimate_total_pairs() estimates the total number of from/to word pairs.
# the upper bound is estimated for two reasons: it is interesting to know,
# and the estimate lets us allocate an object in memory up front instead of
# incrementally appending to a list - this is faster.
# the object in memory is a NumPy array that will store integers: from word group id | to word group id

In [15]:
# estimate how many from/to word pairs to expect; the estimate is passed to
# the pair-generation step below
n_possible_anagrams = estimate_total_pairs(
    wg_df=wg_df,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
)

...estimated number of from/to pair word pairs: 194,572,272


### discover from/to word group id pairs

In [16]:
# do we want to sanitize the inputs? no.
# that is beyond the scope of this.
# little bobby tables

In [17]:
# discover the from/to word group id pairs.
# returns the processing-time dataframe, the list of pairs, and
# processed_word_id (consumed by the formatting step further down)
(
    proc_time_df,
    output_list,
    processed_word_id,
) = generate_from_to_word_group_pairs_simple(
    wg_df=wg_df,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
    letter_subset_list=letter_subset_list,
)

...finding parent anagrams for 364 words...
...found parent anagrams for 364 words...
...finding parent anagrams for 364 words took 0.0683 seconds | 0.0011 minutes...
...truncating output list...
...populating the count of to-words...
...total anagram pairs: 26,489


In [18]:
# preview the first rows of the processing-time dataframe
proc_time_df.head()

Unnamed: 0,word_group_id,n_seconds,n_from_word_groups,n_to_word_groups
0,214131,0.0,2,7
1,214139,0.0,4,12
2,214208,0.0,1,5
3,214209,0.0,2,12
4,214211,0.0,1,4


In [19]:
# (rows, columns) of the processing-time dataframe
proc_time_df.shape

(364, 4)

In [20]:
# preview the first rows of the word group dataframe
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,first_letter_id,single_letter_id,letter_selector_id,nc_ls_id
0,A,a,1,a,0,0,a,a,1,a,0,0,0,0
1,aa,aa,2,a,1,1,a,a,1,a,0,0,0,1
2,aal,aal,3,a,2,2,al,la,2,la,0,11,1081,2
3,all,all,3,a,5394,5305,al,la,1,la,0,11,1081,2
4,aalii,aalii,5,a,3,3,ail,lai,1,lai,0,11,1083,3


### write anagram pairs to SQLite

In [21]:
# persist the discovered anagram pairs, but only when the write_data flag
# from the process control section is switched on
if write_data:
    store_anagram_pairs(
        output_list=output_list,
        db_path=db_path,
        db_name=db_name,
    )

### store number of from/to word pairs and time related to processing

In [22]:
# we have three dataframes: wg_df, word_df, and proc_time_df

In [23]:
# format the processing results; proc_time_df and word_df are both replaced
# by the versions returned from format_anagaram_processing()
proc_time_df, word_df = format_anagaram_processing(
    proc_time_df=proc_time_df,
    word_df=word_df,
    wg_df=wg_df,
    processed_word_id=processed_word_id,
    matrix_extraction_option=matrix_extraction_option,
)

In [24]:
# 2024 02 22: all that we need for the word_df is the word_id, word_group_id, and the word_processed variable. 
# this stuff is stored elsewhere. 

In [25]:
# store the processing-time and word dataframes for this extraction option
store_anagram_processing(
    proc_time_df=proc_time_df,
    word_df=word_df,
    matrix_extraction_option=matrix_extraction_option,
    db_path=db_path,
    db_name=db_name,
)

...now writing: words_me_05
...now writing: words_processed_me_05


In [26]:
# report the anagram discovery time and the total run time, measured from
# total_time_start
display_total_processing_time(
    proc_time_df=proc_time_df,
    total_time_start=total_time_start,
)

...anagram discovery time: 0.0683 seconds | 0.0011 minutes
...total processing time: 24.207409 seconds | 0.4 minutes
