# Mike Babb
# babb.mike@outlook.com
# Find anagrams
## Part 2: Generate and store the anagrams v2.0

In [None]:
# standard libraries - installed by default
import collections
import os
import pickle
import sqlite3
import string
from time import perf_counter_ns

In [None]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [None]:
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *

### process control flags

In [None]:
# Use numpy to perform matrix operations and determine from/to and exact anagram relationships.
# matrix_extraction_option selects one of the options below; it is passed to
# split_matrix() and generate_from_to_word_group_pairs_simple() later in this notebook.
# Option 1: Full matrix
# Option 2: Word-length
# Option 3: First letter
# Option 4: Single-least common letter
# Option 5: n least common letters
# Option 6: word-length and n least common letters

matrix_extraction_option = 5

# max number of letters to slice to use for the generation of sub-matrices for
# options 5 and 6. More letters means more sub-matrices.
# 3 seems to be the sweet spot. Passed to split_matrix().
n_subset_letters = 3

# set write_data to True to store the generated list of anagrams.
# This flag only gates store_anagram_pairs(); store_anagram_processing() is
# called regardless of its value.
write_data = False

## Testing options (passed to generate_from_to_word_group_pairs_simple())
# None to include all letters
# ['q', 'x'] or a different set of letters to test a specific letter
# 'SAMPLE' to take a 10% sample by word length group
#letter_subset_list = ['x']
#letter_subset_list = 'SAMPLE'
letter_subset_list = None

In [None]:
# start a timer to record the entire operation; perf_counter_ns() returns an
# integer number of nanoseconds. The value is handed to
# display_total_processing_time() in the last cell of this notebook.
total_time_start = perf_counter_ns()

### load input data

In [None]:
# Load the inputs for this run with load_input_data(), pointing it at the
# database and the data output folder named in rc.
(
    word_df,
    wg_df,
    letter_dict,
    char_matrix,
    word_group_id_list,
    word_id_list,
    wchar_matrix,
) = load_input_data(
    db_path=rc.db_path,
    db_name=rc.db_name,
    in_file_path=rc.data_output_file_path,
)

In [None]:
# Split the char_matrix into N sub-matrices.
# See split_matrix() for a more elaborate description.
# This function does a lot: it computes and stores values in wg_df and splits the matrix into various components.

In [None]:
# Run split_matrix(): it returns an updated wg_df, the four *_matrix_dict
# objects consumed by the pair-generation step below, and p_time.
(
    wg_df,
    n_char_matrix_dict,
    single_letter_matrix_dict,
    letter_selector_matrix_dict,
    nc_ls_matrix_dict,
    p_time,
) = split_matrix(
    letter_dict=letter_dict,
    word_group_id_list=word_group_id_list,
    wg_df=wg_df,
    wchar_matrix=wchar_matrix,
    n_subset_letters=n_subset_letters,
    matrix_extraction_option=matrix_extraction_option,
)

### get the total number of from/to word pairs from the previous steps

In [None]:
# Total number of from/to word pairs, read via load_possible_anagrams().
n_possible_anagrams = load_possible_anagrams(
    db_path=rc.db_path,
    db_name=rc.db_name,
)

### discover from/to word group id pairs

In [None]:
# Discover the from/to word group id pairs. The call yields a processing-time
# DataFrame (proc_time_df) and output_list, both of which are used in the
# cells that follow.
proc_time_df, output_list = generate_from_to_word_group_pairs_simple(
    wg_df=wg_df,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
    letter_subset_list=letter_subset_list,
)

### write anagram pairs to SQLite

In [None]:
# Persist the anagram pairs to the database, but only when write_data is set.
if write_data:
    store_anagram_pairs(
        output_list=output_list,
        db_path=rc.db_path,
        db_name=rc.db_name,
    )

### store number of from/to word pairs and time related to processing

In [None]:
# Record the processing results for the chosen matrix extraction option.
# NOTE: this call is not guarded by write_data.
store_anagram_processing(
    proc_time_df=proc_time_df,
    matrix_extraction_option=matrix_extraction_option,
    db_path=rc.db_path,
    db_name=rc.db_name,
)

In [None]:
# Report the overall processing time, measured from total_time_start.
display_total_processing_time(
    proc_time_df=proc_time_df,
    total_time_start=total_time_start,
)