In [1]:
# standard libraries
from time import perf_counter_ns
import time

# external libraries
import pandas as pd
import numpy as np
import cupy as cp

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *


In [2]:
from cupyx.profiler import benchmark

def my_func(a):
    """Return the Euclidean (L2) norm of `a` computed along its last axis."""
    squared = a ** 2
    totals = cp.sum(squared, axis=-1)
    return cp.sqrt(totals)

a = cp.random.random((256, 1024))
print(benchmark(my_func, (a,), n_repeat=20))  


my_func             :    CPU:    65.900 us   +/-  9.852 (min:    54.400 / max:    85.600) us     GPU-0:   199.754 us   +/-  9.331 (min:   192.256 / max:   226.304) us


In [3]:
# Time one my_func(a) call two ways: with a pair of CUDA events (device
# timeline) and with a host-side wall clock around the same call.
start_gpu = cp.cuda.Event()
end_gpu = cp.cuda.Event()

start_gpu.record()
start_cpu = time.perf_counter()
out = my_func(a)
end_cpu = time.perf_counter()
end_gpu.record()
# wait for the end event to complete before reading the elapsed time
end_gpu.synchronize()
# NOTE: get_elapsed_time reports milliseconds (per the CuPy docs) while
# perf_counter differences are seconds, so the two printed numbers are in
# different units.
t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)
t_cpu = end_cpu - start_cpu
print(t_gpu, t_cpu)

0.6768640279769897 0.0006055999983800575


# LOAD INPUT DATA

In [4]:
word_df, wg_df, letter_dict, char_matrix, \
    word_group_id_list, word_id_list, wchar_matrix = load_input_data(
        db_path=rc.DB_PATH, db_name=rc.DB_NAME,
        in_file_path=rc.IN_FILE_PATH)


...loading words into a dataframe...
...query execution took: 0.5 seconds...
...loading word groups into a dataframe...
...query execution took: 0.49 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


# let's process 1000 rows using a single lookup using the full matrix

In [5]:
n_possible_anagrams = int(1e6)

In [6]:
f"{n_possible_anagrams :,}"

'1,000,000'

# Using the CPU

In [7]:
# draw a reproducible sample of 1000 word-group ids
n_samples = 1000
sample_wg_id = (
    wg_df['word_group_id']
    .sample(n=n_samples, random_state=42)
    .to_numpy()
)

# pre-allocated (from, to) pair buffer; rows still holding -1 were never filled
output_list = np.full(shape=(n_possible_anagrams, 2), fill_value=-1, dtype=int)

# running tallies
row_count = 0
anagram_pair_count = 0
intermediate_to_word_count = collections.Counter()

for row_count, wg_id in enumerate(sample_wg_id, start=1):
    # a row is a from/parent word when, after subtracting the focal row,
    # every column is still >= 0
    outcome_indices = np.all((wchar_matrix - wchar_matrix[wg_id, ]) >= 0, axis=1)
    n_from_words = outcome_indices.sum()

    if n_from_words >= 1:
        # ids of the from/parent words
        outcome_word_id_list = word_group_id_list[outcome_indices]
        new_anagram_pair_count = anagram_pair_count + n_from_words

        # column 0 holds the from/parent ids, column 1 the to/child id
        output_list[anagram_pair_count:new_anagram_pair_count, 0] = outcome_word_id_list
        output_list[anagram_pair_count:new_anagram_pair_count, 1] = wg_id

        # tally how often each from/parent id has been seen
        intermediate_to_word_count.update(outcome_word_id_list.tolist())

        # advance the write position
        anagram_pair_count = new_anagram_pair_count

    if row_count % 100 == 0:
        print(row_count)

print('truncating list')
# keep only the rows in which both columns were filled
output_list = output_list[np.all(output_list >= 0, axis=1),]
output_list.shape

100
200
300
400
500
600
700
800
900
1000
truncating list


(367646, 2)

# using the GPU

In [8]:
# convert to cupy objects
wchar_matrix_cp = cp.asarray(a = wchar_matrix)
word_group_id_list_cp = cp.asarray(a = word_group_id_list)

In [9]:
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [10]:
n_possible_anagrams

198842245

In [11]:
# Build (from, to) word-group pairs for a 1000-row sample on the GPU.
# sample 1000 rows
n_samples = 1000
sample_wg_id = wg_df['word_group_id'].sample(n = n_samples, random_state = 42).to_numpy()
# sample_wg_id = wg_df['word_group_id'].to_numpy()
# device copy of the sample; kept so the name stays defined, but the loop
# below walks the host values instead
sample_wg_id_cp = cp.asarray(a = sample_wg_id)

# pre-allocated (from, to) pair buffer on the device; -1 marks unused rows
output_list_cp = cp.full(shape=(n_possible_anagrams, 2),
                          fill_value=-1, dtype=int)

# establish counters for record keeping (plain Python ints on the host)
row_count = 0
anagram_pair_count = 0
intermediate_to_word_count = collections.Counter()

# Iterate over host integers: iterating a CuPy array yields 0-d device arrays,
# which turns every index, slice bound and comparison below into an implicit
# device-to-host conversion.
for wg_id in sample_wg_id.tolist():
    outcome = wchar_matrix_cp - wchar_matrix_cp[wg_id, ]

    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices_cp = cp.all(outcome >= 0, axis=1)
    outcome = None

    # one explicit transfer of the match count; everything that depends on it
    # (the branch and the slice bounds) then stays on the host
    n_from_words = int(outcome_indices_cp.sum())

    if n_from_words >= 1:
        # extract the from/parent ids based on the boolean mask
        outcome_word_id_list_cp = word_group_id_list_cp[outcome_indices_cp]

        new_anagram_pair_count = anagram_pair_count + n_from_words

        # column 0: from/parent ids, column 1: the to/child id
        output_list_cp[anagram_pair_count:new_anagram_pair_count, 0] = outcome_word_id_list_cp
        output_list_cp[anagram_pair_count:new_anagram_pair_count, 1] = wg_id

        # tally how often each from/parent id has been seen
        intermediate_to_word_count.update(outcome_word_id_list_cp.tolist())

        # set the anagram pair count
        anagram_pair_count = new_anagram_pair_count

    row_count += 1

    if row_count % 10000 == 0:
        print(row_count)

print('truncating list')
# Rows are written contiguously from 0, so the filled region is exactly the
# first `anagram_pair_count` rows; slicing avoids a boolean scan over the whole
# pre-allocated buffer. This matches the previous `>= 0` filter as long as
# word-group ids are non-negative (they are used as row indices above).
# .copy() detaches the result so the large buffer can be released.
output_list_cp = output_list_cp[:anagram_pair_count].copy()
print(output_list_cp.shape)

truncating list
(367646, 2)


In [12]:
# now, let's build 

In [13]:
# Plan: split the matrix, gather the values for each split, and then combine.
# The letter selector is the first n_subset_letters characters of
# "letter_group_ranked".
n_subset_letters = 3
# NOTE: this adds columns to wg_df in place.
wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]

# assign each distinct selector an integer id following sorted order
letter_selector_list = wg_df["letter_selector"].unique()
letter_selector_list.sort()
letter_selector_id_dict = {ls: i_ls for i_ls, ls in enumerate(letter_selector_list)}

wg_df["letter_selector_id"] = wg_df["letter_selector"].map(letter_selector_id_dict)
# Design note: the rows matching a given condition need to be identified on a
# single matrix, rather than stepping through it and creating new objects
# for every condition.
# So, given our wchar_matrix: which rows match a given letter selector?
# One idea is to add three columns to the matrix to track this.


In [14]:
# load letter ranks
sql = 'select letter, total_letter_rank from letter_count;'

In [15]:
lr_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [16]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,letter_selector_id
0,A,a,1,a,0,0,a,a,1,a,0
1,aa,aa,2,a,1,1,a,a,1,a,0
2,aal,aal,3,a,2,2,al,la,2,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,ma,1114


In [17]:
lr_dict = {l:r for l, r in zip(lr_df['letter'], lr_df['total_letter_rank'])}

In [18]:
def get_lgr_id(lgr:str):
    """Translate the characters of `lgr` into their `letter_dict` values.

    Returns a three-element list; positions beyond the length of `lgr` keep
    the placeholder -1. A character missing from `letter_dict` raises
    KeyError, and a string longer than three characters raises IndexError.
    """
    slots = [-1] * 3
    position = 0
    for letter in lgr:
        slots[position] = letter_dict[letter]
        position += 1
    return slots

In [19]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,letter_selector_id
0,A,a,1,a,0,0,a,a,1,a,0
1,aa,aa,2,a,1,1,a,a,1,a,0
2,aal,aal,3,a,2,2,al,la,2,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,ma,1114


In [70]:
ls_df = wg_df['letter_selector'].value_counts().to_frame().reset_index()

In [71]:
ls_df.head()

Unnamed: 0,letter_selector,count
0,gyh,2544
1,yhp,2494
2,yhm,2438
3,hmp,2180
4,yhd,2091


In [72]:
ls_df['ls_id_vector'] = ls_df['letter_selector'].map(get_lgr_id)

In [73]:
ls_df.head()

Unnamed: 0,letter_selector,count,ls_id_vector
0,gyh,2544,"[6, 24, 7]"
1,yhp,2494,"[24, 7, 15]"
2,yhm,2438,"[24, 7, 12]"
3,hmp,2180,"[7, 12, 15]"
4,yhd,2091,"[24, 7, 3]"


In [74]:
ls_id_cp = cp.asarray(a = ls_df['ls_id_vector'].to_list())

In [75]:
ls_id_cp.shape

(2387, 3)

In [85]:
ls_id_cp

array([[ 6, 24,  7],
       [24,  7, 15],
       [24,  7, 12],
       ...,
       [25, 10,  4],
       [25,  4, -1],
       [25,  0,  4]], shape=(2387, 3))

In [84]:
# Exploratory pass over the letter-selector id vectors; range(0, 1) restricts
# it to the first selector only.
for ls_id_index in range(0, 1):
    curr_ls_id_cp = ls_id_cp[ls_id_index, :]
    # drop the -1 placeholders so only real column ids remain
    outcome = curr_ls_id_cp >= 0
    col_selector = curr_ls_id_cp[outcome]
    # rows in which every selected column has a count of at least 1
    outcome_indices_cp = cp.all(wchar_matrix_cp[:, col_selector] >= 1, axis=1)
    ls_wchar_matrix_cp = wchar_matrix_cp[outcome_indices_cp, :]
    # NOTE(review): a bare expression inside a loop body is not echoed by the
    # notebook, so this line has no effect; the saved output may be stale.
    ls_wchar_matrix_cp.shape
    for ii in range(0, ls_wchar_matrix_cp.shape[0]):
        outcome = ls_wchar_matrix_cp - ls_wchar_matrix_cp[ii, :]
        # NOTE(review): this uses a strict `> 0`, whereas the other comparison
        # cells in this notebook use `>= 0`; with `> 0` a row never matches
        # itself. The result is also overwritten on every iteration and never
        # collected. Confirm the intent.
        outcome_check = cp.all(outcome > 0, axis = 1)


(3130, 26)

In [56]:
ls_df.head()

Unnamed: 0,letter_selector,lgr_id
0,a,"[0, -1, -1]"
2,la,"[11, 0, -1]"
3,lai,"[11, 0, 8]"
4,ma,"[12, 0, -1]"
5,nai,"[13, 0, 8]"


In [None]:
lgr

In [None]:
wg_df['lgr_id'].value_counts().shape

In [None]:
 outcome = wchar_matrix_cp - wchar_matrix_cp[wg_id, ]
    
    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices_cp = cp.all(outcome >= 0, axis=1)
    outcome = None

    n_from_words = outcome_indices_cp.sum()   

In [None]:
# build that rank tuple, and attach to the wg_df
def get_rank(lgr:str):
    """Translate the characters of `lgr` into their `lr_dict` values.

    Returns a three-element list; positions beyond the length of `lgr` keep
    the placeholder -1. A character missing from `lr_dict` raises KeyError,
    and a string longer than three characters raises IndexError.
    """
    ranks = [-1] * 3
    position = 0
    for letter in lgr:
        ranks[position] = lr_dict[letter]
        position += 1
    return ranks

In [None]:
wg_df['that_rank'] = wg_df['letter_selector'].map(get_rank)

In [None]:
wg_df.head()

In [None]:
new_wchar_matrix = np.zeros(shape = (wg_df.shape[0], wchar_matrix.shape[1] + 3))

In [None]:
new_wchar_matrix[:, :wchar_matrix.shape[1]] = wchar_matrix

In [None]:
ls_ranks = np.array(object = wg_df['that_rank'].to_list())

In [None]:
ls_ranks.shape

In [None]:
new_wchar_matrix[:, -3:] = ls_ranks

In [None]:
new_wchar_matrix

In [None]:
ls_ranks_cp = cp.asarray(ls_ranks)


In [None]:
ls_ranks_cp.shape

In [None]:
# let's get the submatrix for all rows that feature the letter a

In [None]:
wg_df.head()

In [None]:
# Scratch cell (never executed in the saved notebook): iterate over the
# per-row rank triples — least common letter 1, 2 and 3.
# NOTE(review): several names used below (outcome_indices on its first use,
# ls_wg_id_list, ls_wchar_matrix, row, letter_selector_matrix_dict) are not
# assigned anywhere in this cell, so it depends on leftover kernel state and
# will not run on a fresh kernel.
for ls in ls_ranks_cp:
    
    # compare the last three columns of the matrix against the current triple
    # NOTE(review): the rank columns were appended to new_wchar_matrix in an
    # earlier cell, not to wchar_matrix_cp — confirm which matrix is intended.
    outcome_cp = wchar_matrix_cp[:, -3:] == ls # this generates a true/false array
    # NOTE(review): outcome_cp has three columns; indexing the id list with it
    # probably needs a row-wise reduction first — confirm.
    outcome_indices_cp = word_group_id_list_cp[outcome_cp] # this is the working list of ids
   
    # subset the wchar_matrix to get the sub-matrix - this contains the N least common letters for a group of words
    # NOTE(review): this indexes with outcome_indices, not the
    # outcome_indices_cp computed on the previous line.
    ls_wchar_matrix_cp = wchar_matrix_cp[outcome_indices,]
    # this is the working matrix
    # so, let's subset the df

    # the statements below mirror the body of get_values_letter_selector_cp
    new_word_id = ls_wg_id_list == wg_id
    print(type(new_word_id))

    # now, perform the comparison
    outcome = ls_wchar_matrix - ls_wchar_matrix[new_word_id,]
    print(type(outcome))

    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices = cp.all(outcome >= 0, axis=1)
    print(type(outcome_indices))
    outcome = None

    # extract anagrams based on index values
    outcome_word_id_list = ls_wg_id_list[outcome_indices]
    print(type(outcome_word_id_list))

    output_list = format_output_list_cp(
        outcome_word_id_list=outcome_word_id_list, wg_id=wg_id
    )
    print(type(output_list))
    
    
    # same lookup again, this time through the helper function
    outcome_word_id_list = get_values_letter_selector_cp(
                wg_id=wg_id,
                letter_selector_id=row.letter_selector_id,
                letter_selector_matrix_dict=letter_selector_matrix_dict,
            )



In [None]:
a

In [None]:
cp.cuda.is_available()

In [None]:
def split_matrix_cp(
    letter_dict: dict,
    word_group_id_list: cp.ndarray,
    wg_df: pd.DataFrame,
    wchar_matrix: cp.ndarray,
    n_subset_letters: int,
    matrix_extraction_option: int = 0
):
    """Split the character matrix into dictionaries of sub-matrices.

    Matrix extraction options:
        0: build and return all of the sub-matrix dictionaries below
        1: full matrix - no sub-matrices are built
        2: word length - sub-matrices keyed by number of characters
        3: first letter - sub-matrices keyed by a single letter id
        4: single least common letter - same dictionary as option 3
        5: n least common letters - sub-matrices keyed by letter selector id
        6: word length and n least common letters - keyed by nc_ls_id

    Because of how the enumeration is set up, the dictionaries for options
    2 through 5 are all populated whenever the option is not 1; only the
    requested ones are returned.

    Parameters
    ----------
    letter_dict : dict
        Maps a letter to its column index in `wchar_matrix`.
    word_group_id_list : cp.ndarray
        Word group ids, aligned with the rows of `wchar_matrix`.
    wg_df : pd.DataFrame
        Word group table. Must contain "letter_group_ranked", "first_letter",
        "n_chars" and "word_group_id". NOTE: the columns "letter_selector",
        "first_letter_id", "single_letter_id" and "letter_selector_id" are
        added to the passed-in frame in place before it is merged.
    wchar_matrix : cp.ndarray
        Character matrix; it is row-indexed with word group ids.
    n_subset_letters : int
        Number of leading characters of "letter_group_ranked" that form the
        letter selector.
    matrix_extraction_option : int, default 0
        One of the options listed above.

    Returns
    -------
    tuple
        (wg_df, n_char_matrix_dict, single_letter_matrix_dict,
        letter_selector_matrix_dict, nc_ls_matrix_dict, p_time). `wg_df` is
        the merged frame carrying "nc_ls_id". Every populated dictionary maps
        a key to an (id list, sub-matrix) pair; dictionaries that do not
        belong to the requested option are returned as None. `p_time` is the
        value returned by `calc_time`.

    Raises
    ------
    ValueError
        If `matrix_extraction_option` is not one of 0-6. Previously such a
        value ran the whole enumeration and then failed with an unbound
        `n_sub_matrices`.
    """
    valid_options = (0, 1, 2, 3, 4, 5, 6)
    if matrix_extraction_option not in valid_options:
        raise ValueError(
            "matrix_extraction_option must be one of "
            f"{valid_options}, got {matrix_extraction_option!r}"
        )

    s_time = perf_counter_ns()

    # create the letter selector and the single-letter ids
    wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]
    wg_df["first_letter_id"] = wg_df["first_letter"].map(letter_dict)
    wg_df["single_letter_id"] = wg_df["letter_selector"].str[0].map(
        letter_dict)

    # build the letter selector id list and dict (ids follow sorted order)
    letter_selector_list = wg_df["letter_selector"].unique()
    letter_selector_list.sort()
    letter_selector_id_dict = {ls: i_ls for i_ls,
                               ls in enumerate(letter_selector_list)}

    wg_df["letter_selector_id"] = wg_df["letter_selector"].map(
        letter_selector_id_dict)

    # one record per distinct (n_chars, letter selector) combination
    nc_ls_df = wg_df[
        ["n_chars", "letter_selector_id", "letter_selector"]
    ].drop_duplicates()
    nc_ls_df["nc_ls_id"] = range(0, nc_ls_df.shape[0])

    wg_df = pd.merge(left=wg_df, right=nc_ls_df)

    # option 2: keyed by number of characters
    n_char_matrix_dict = {}
    # options 3 and 4: keyed by single-letter id
    single_letter_matrix_dict = {}
    # option 5: keyed by letter selector id
    letter_selector_matrix_dict = {}
    # option 6: keyed by nc_ls_id
    nc_ls_matrix_dict = {}

    n_sub_matrices = 0

    # only enumerate if matrix_extraction_option != 1
    if matrix_extraction_option != 1:

        # host-side id sets, used only inside this function for the
        # option 6 intersection
        n_char_set_dict = {}
        letter_selector_set_dict = {}

        # Every (n_chars, letter selector) record is enumerated once. This
        # over-enumerates for options 2 through 5; it is a trade-off between
        # code reuse and the amount of enumeration.
        n_records = nc_ls_df.shape[0]

        print("...enumerating", "{:,}".format(n_records), "records...")

        loop_count = 0
        for row in nc_ls_df.itertuples(index=False):
            nc = row.n_chars
            ls = row.letter_selector
            ls_id = row.letter_selector_id

            ####
            # MATRIX EXTRACTION OPTION 2: DICTIONARY BY NUMBER OF CHARACTERS
            ####
            if nc not in n_char_matrix_dict:
                nc_wg_id_host = wg_df.loc[
                    (wg_df["n_chars"] >= nc), "word_group_id"
                ].to_numpy()
                n_char_set_dict[nc] = set(nc_wg_id_host.tolist())

                # Keep the id list on the device, like every other id list
                # stored by this function: the get_values_*_cp helpers index
                # the id list with a CuPy boolean mask.
                nc_wg_id_list = cp.asarray(nc_wg_id_host)

                # subset the wchar_matrix to get the sub-matrix
                n_char_matrix_dict[nc] = (
                    nc_wg_id_list,
                    wchar_matrix[nc_wg_id_list,]
                )

            nc_wg_id_set = n_char_set_dict[nc]

            ####
            # MATRIX EXTRACTION OPTION 3 AND 4: DICTIONARY BY SINGLE-LETTER
            ####
            ll_id = letter_dict[ls[0]]

            if ll_id not in single_letter_matrix_dict:
                # rows whose count for this letter is positive
                has_letter = cp.all(wchar_matrix[:, [ll_id]] > 0, axis=1)

                # the mask is aligned with word_group_id_list
                single_letter_word_group_id_list = word_group_id_list[has_letter]

                single_letter_matrix_dict[ll_id] = (
                    single_letter_word_group_id_list,
                    wchar_matrix[single_letter_word_group_id_list, ]
                )

            (
                single_letter_word_group_id_list,
                single_letter_wchar_matrix,
            ) = single_letter_matrix_dict[ll_id]

            ####
            # MATRIX EXTRACTION OPTION 5: DICTIONARY BY LETTER SELECTOR
            ####
            if ls_id not in letter_selector_matrix_dict:
                column_selector = [letter_dict[curr_letter]
                                   for curr_letter in ls]

                # rows of the single-letter sub-matrix that feature every
                # letter of the selector
                has_all_letters = cp.all(
                    single_letter_wchar_matrix[:, column_selector] > 0, axis=1
                )
                ls_wg_id_list = single_letter_word_group_id_list[has_all_letters]

                # host-side set of the ids, needed for the option 6 intersection
                letter_selector_set_dict[ls_id] = set(ls_wg_id_list.tolist())

                letter_selector_matrix_dict[ls_id] = (
                    ls_wg_id_list,
                    wchar_matrix[ls_wg_id_list,]
                )

            ls_wg_id_set = letter_selector_set_dict[ls_id]

            ####
            # MATRIX EXTRACTION OPTION 6: DICTIONARY BY NUMBER OF CHARACTERS AND LETTER SELECTOR
            ####
            # The ids for this record are the intersection of the ids by
            # number of characters and the ids by letter selector. The set
            # intersection plus the conversion back to an array accounts for
            # about a third of this function's runtime, but it was the fastest
            # of the approaches tried: np.intersect1d, a pandas merge
            # (2024-02-05) and a collections.Counter (2024-02-06) were all
            # slower.
            if matrix_extraction_option in (0, 6):
                nc_ls_wg_id_set = nc_wg_id_set.intersection(ls_wg_id_set)
                nc_ls_wg_id_list = cp.fromiter(iter=nc_ls_wg_id_set, dtype=int)

                # now, get the rows
                nc_ls_matrix_dict[row.nc_ls_id] = (
                    nc_ls_wg_id_list,
                    wchar_matrix[nc_ls_wg_id_list,]
                )

            # progress reporting
            loop_count += 1
            if loop_count % 1000 == 0:
                print("...{:,}".format(loop_count), "records enumerated...")

        # number of sub-matrices for the requested option
        if matrix_extraction_option == 2:
            n_sub_matrices = len(n_char_matrix_dict)
        elif matrix_extraction_option in (3, 4):
            n_sub_matrices = len(single_letter_matrix_dict)
        elif matrix_extraction_option == 5:
            n_sub_matrices = len(letter_selector_matrix_dict)
        else:
            # options 0 and 6
            n_sub_matrices = len(nc_ls_matrix_dict)

    print("...{:,}".format(n_sub_matrices), "sub-matrices created...")
    p_time = calc_time(time_start=s_time)
    print("Total extraction time:", p_time, "seconds.")

    # drop the dictionaries that do not belong to the requested option so
    # that their memory can be released
    if matrix_extraction_option not in (0, 2):
        n_char_matrix_dict = None

    if matrix_extraction_option not in (0, 3, 4):
        single_letter_matrix_dict = None

    if matrix_extraction_option not in (0, 5):
        letter_selector_matrix_dict = None

    if matrix_extraction_option not in (0, 6):
        nc_ls_matrix_dict = None

    return (
        wg_df,
        n_char_matrix_dict,
        single_letter_matrix_dict,
        letter_selector_matrix_dict,
        nc_ls_matrix_dict,
        p_time
    )

In [None]:
word_group_id_list = cp.asarray(word_group_id_list)
wchar_matrix = cp.asarray(a = wchar_matrix)

In [None]:
# subset the matrix
n_subset_letters = 3
matrix_extraction_option = 5
wg_df, n_char_matrix_dict, single_letter_matrix_dict, letter_selector_matrix_dict, nc_ls_matrix_dict, p_time = split_matrix_cp(
    letter_dict=letter_dict,
    word_group_id_list=word_group_id_list,
    wg_df=wg_df,
    wchar_matrix=wchar_matrix_cp,
    n_subset_letters=n_subset_letters,
    matrix_extraction_option=matrix_extraction_option
)

In [None]:
type(wchar_matrix_cp)

In [None]:
def format_output_list_cp(outcome_word_id_list: cp.ndarray, wg_id: int) -> cp.ndarray:
    """Pair every id in `outcome_word_id_list` with `wg_id`.

    Returns an integer array with one row per id: column 0 holds the
    from/parent ids and column 1 holds `wg_id`, the to/child id.
    """
    n_pairs = outcome_word_id_list.shape[0]
    pairs = cp.zeros(shape=(n_pairs, 2), dtype=int)

    # from/parent ids on the left, the to/child id repeated on the right
    pairs[:, 0] = outcome_word_id_list
    pairs[:, 1] = wg_id

    return pairs


def get_values_full_matrix_cp(
    wg_id: int, wchar_matrix: cp.ndarray, word_group_id_list: cp.ndarray
):
    """Matrix extraction option 1: compare one row against the full matrix.

    Row `wg_id` of `wchar_matrix` is subtracted from every row; the rows
    whose difference is >= 0 in every column are selected, and their ids from
    `word_group_id_list` are paired with `wg_id` by `format_output_list_cp`.
    """
    # True for each row that stays non-negative in every column
    is_from_word = cp.all((wchar_matrix - wchar_matrix[wg_id,]) >= 0, axis=1)

    return format_output_list_cp(
        outcome_word_id_list=word_group_id_list[is_from_word], wg_id=wg_id
    )



def get_values_n_char_cp(wg_id: int, n_char: int, n_char_matrix_dict: dict):
    """Matrix extraction option 2: compare within the sub-matrix for `n_char`.

    `n_char_matrix_dict[n_char]` supplies an (id list, sub-matrix) pair. The
    row(s) whose id equals `wg_id` are subtracted from the sub-matrix; rows
    that remain >= 0 in every column are selected and their ids are paired
    with `wg_id` by `format_output_list_cp`.
    """
    id_list, sub_matrix = n_char_matrix_dict[n_char]

    # boolean mask locating the focal word inside the sub-matrix
    focal_mask = id_list == wg_id

    # True for each row that stays non-negative in every column
    is_from_word = cp.all((sub_matrix - sub_matrix[focal_mask,]) >= 0, axis=1)

    return format_output_list_cp(
        outcome_word_id_list=id_list[is_from_word], wg_id=wg_id
    )


def get_values_single_letter_cp(
    wg_id: int, single_letter_id: str, single_letter_matrix_dict: dict
):
    """Matrix extraction options 3 and 4: compare within a single-letter sub-matrix.

    `single_letter_matrix_dict[single_letter_id]` supplies an
    (id list, sub-matrix) pair. The row(s) whose id equals `wg_id` are
    subtracted from the sub-matrix; rows that remain >= 0 in every column are
    selected and their ids are paired with `wg_id` by `format_output_list_cp`.

    NOTE(review): `single_letter_id` is annotated as str, but
    split_matrix_cp keys this dictionary by `letter_dict` values, which
    appear to be integers - confirm and adjust the annotation.
    """
    id_list, sub_matrix = single_letter_matrix_dict[single_letter_id]

    # boolean mask locating the focal word inside the sub-matrix
    focal_mask = id_list == wg_id

    # True for each row that stays non-negative in every column
    is_from_word = cp.all((sub_matrix - sub_matrix[focal_mask,]) >= 0, axis=1)

    return format_output_list_cp(
        outcome_word_id_list=id_list[is_from_word], wg_id=wg_id
    )


def get_values_letter_selector_cp(
    wg_id: int, letter_selector_id: int, letter_selector_matrix_dict: dict
):
    """Matrix extraction option 5: compare within a letter-selector sub-matrix.

    `letter_selector_matrix_dict[letter_selector_id]` supplies an
    (id list, sub-matrix) pair. The row(s) whose id equals `wg_id` are
    subtracted from the sub-matrix; rows that remain >= 0 in every column are
    selected and their ids are paired with `wg_id` by `format_output_list_cp`.

    `letter_selector_id` is annotated as int because split_matrix_cp keys
    this dictionary by the enumerated letter selector id.

    The per-call `print(type(...))` debugging statements were removed: they
    wrote five lines to stdout for every word processed and did not affect
    the result.
    """
    ls_wg_id_list, ls_wchar_matrix = letter_selector_matrix_dict[
        letter_selector_id
    ]

    # boolean mask locating the focal word inside the sub-matrix
    new_word_id = ls_wg_id_list == wg_id

    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices = cp.all(
        (ls_wchar_matrix - ls_wchar_matrix[new_word_id,]) >= 0, axis=1
    )

    # extract the from/parent ids and pair them with wg_id
    return format_output_list_cp(
        outcome_word_id_list=ls_wg_id_list[outcome_indices], wg_id=wg_id
    )


def get_values_n_char_letter_selector_cp(
    wg_id: int, nc_ls_id: tuple, nc_ls_matrix_dict: dict
):   
    """Matrix extraction option 6: compare within an (n_chars, letter selector) sub-matrix.

    `nc_ls_matrix_dict[nc_ls_id]` supplies an (id list, sub-matrix) pair. The
    row(s) whose id equals `wg_id` are subtracted from the sub-matrix; rows
    that remain >= 0 in every column are selected and their ids are paired
    with `wg_id` by `format_output_list_cp`.

    NOTE(review): `nc_ls_id` is annotated as tuple, but split_matrix_cp
    assigns it from a `range`, i.e. integers - confirm and adjust the
    annotation.
    """
    id_list, sub_matrix = nc_ls_matrix_dict[nc_ls_id]

    # boolean mask locating the focal word inside the sub-matrix
    focal_mask = id_list == wg_id

    # True for each row that stays non-negative in every column
    is_from_word = cp.all((sub_matrix - sub_matrix[focal_mask,]) >= 0, axis=1)

    return format_output_list_cp(
        outcome_word_id_list=id_list[is_from_word], wg_id=wg_id
    )

In [None]:
####
# generate_from_to_word_group_pairs placeholder
####
def generate_from_to_word_group_pairs_simple_cp(
    wg_df: pd.DataFrame,
    n_possible_anagrams: int,
    matrix_extraction_option: int,
    wchar_matrix: cp.ndarray,
    word_group_id_list: cp.ndarray,
    n_char_matrix_dict: dict,
    single_letter_matrix_dict: dict,
    letter_selector_matrix_dict: dict,
    nc_ls_matrix_dict: dict,
    letter_subset_list: str = None,
    max_rows: int = 10,
):
    """Find from/to word group id pairs with CuPy and time each lookup.

    Parameters
    ----------
    wg_df : pd.DataFrame
        One row per word group. ``word_group_id`` is always read; depending on
        ``matrix_extraction_option`` the row attributes ``n_chars``,
        ``first_letter_id``, ``single_letter_id``, ``letter_selector_id`` or
        ``nc_ls_id`` are read as well. ``first_letter`` (and ``n_chars``) are
        used when subsetting via ``letter_subset_list``.
    n_possible_anagrams : int
        Number of rows to pre-allocate in the output buffer.
    matrix_extraction_option : int
        1 = full matrix, 2 = word length, 3 = first character,
        4 = single least common letter, 5 = letter selector,
        anything else = word length and letter selector.
    wchar_matrix, word_group_id_list : cp.ndarray
        Full matrix and id list, used by option 1.
    n_char_matrix_dict, single_letter_matrix_dict, \
letter_selector_matrix_dict, nc_ls_matrix_dict : dict
        Sub-matrix lookups used by options 2, 3/4, 5 and 6 respectively.
    letter_subset_list : str or list, optional
        ``'SAMPLE'`` draws 100 rows (with replacement, duplicates dropped) per
        ``n_chars``/``first_letter`` group; any other string or list keeps the
        rows whose ``first_letter`` is in it; ``None`` keeps every row.
    max_rows : int, optional
        Upper bound on the number of rows processed after subsetting. The
        default of 10 keeps the small trial run this function has always
        performed; pass ``None`` to process every selected row.

    Returns
    -------
    proc_time_df : pd.DataFrame
        Columns ``word_group_id``, ``n_seconds``, ``n_from_word_groups``,
        ``n_to_word_groups`` and ``matrix_extraction_option``.
    output_list : cp.ndarray
        The rows of the output buffer in which both columns are >= 0.

    Raises
    ------
    ValueError
        If more pairs are found than ``n_possible_anagrams`` rows were
        allocated.
    """

    # pre-allocate an array that will be updated while enumerating.
    # this eliminates list.append() calls which are fine in small amounts, but
    # hundreds of thousands of append calls are very slow.
    output_list = cp.full(shape=(n_possible_anagrams, 2),
                          fill_value=-1, dtype=int)

    # this dictionary will store the calculations for each word
    proc_time_dict = {}

    if letter_subset_list == 'SAMPLE':
        # generate 100 samples within each n_chars and first_letter group combination
        curr_wg_df = wg_df.groupby(['n_chars', 'first_letter']).sample(
            n=100, replace=True, random_state=123).drop_duplicates()
    elif isinstance(letter_subset_list, (str, list)):
        # subset by a specific set of letters or a single letter
        curr_wg_df = wg_df.loc[wg_df['first_letter'].isin(
            set(letter_subset_list)), :].copy()
    else:
        curr_wg_df = wg_df.copy()

    # apply the row cap before counting, so the progress messages below
    # describe the rows that are actually processed
    if max_rows is not None:
        curr_wg_df = curr_wg_df.iloc[0:max_rows]

    # display counts
    curr_word_count = curr_wg_df.shape[0]

    n_curr_words = "{:,}".format(curr_word_count)
    print(
        "...finding parent anagrams for",
        n_curr_words,
        "words..."
    )

    # establish counters for record keeping
    row_count = 0
    anagram_pair_count = 0
    intermediate_to_word_count = collections.Counter()

    # enumerate by word id, working with integers is faster than words
    for row in curr_wg_df.itertuples(index=False):
        # start timing to record processing for each word
        s_time = perf_counter_ns()

        # word group id
        wg_id = row.word_group_id

        if matrix_extraction_option == 1:
            # option 1: full matrix
            outcome_word_id_list = get_values_full_matrix_cp(
                wg_id=wg_id,
                wchar_matrix=wchar_matrix,
                word_group_id_list=word_group_id_list,
            )
        elif matrix_extraction_option == 2:
            # option 2: word length
            outcome_word_id_list = get_values_n_char_cp(
                wg_id=wg_id,
                n_char=row.n_chars,
                n_char_matrix_dict=n_char_matrix_dict,
            )
        elif matrix_extraction_option == 3:
            # option 3: first character
            outcome_word_id_list = get_values_single_letter_cp(
                wg_id=wg_id,
                single_letter_id=row.first_letter_id,
                single_letter_matrix_dict=single_letter_matrix_dict,
            )
        elif matrix_extraction_option == 4:
            # option 4: single least common letter
            outcome_word_id_list = get_values_single_letter_cp(
                wg_id=wg_id,
                single_letter_id=row.single_letter_id,
                single_letter_matrix_dict=single_letter_matrix_dict,
            )
        elif matrix_extraction_option == 5:
            # option 5: letter selector / focal letter
            outcome_word_id_list = get_values_letter_selector_cp(
                wg_id=wg_id,
                letter_selector_id=row.letter_selector_id,
                letter_selector_matrix_dict=letter_selector_matrix_dict,
            )
        else:
            # option 6: word length and letter selector
            outcome_word_id_list = get_values_n_char_letter_selector_cp(
                wg_id=wg_id,
                nc_ls_id=row.nc_ls_id,
                nc_ls_matrix_dict=nc_ls_matrix_dict,
            )

        # if the outcome is greater than or equal to zero, then the current word is an
        # anagram of the other word
        # a value  >= 0 means that the current word contains the exact same number of focal letters
        # mite --> time or miter --> time
        # a value >= 1 means that current word contains at least the same number of focal letters
        # terminator --> time
        # a value of <= -1 means that the current word does not have the
        # correct number of letters and is therefore not an anagram.
        # trait <> time

        # number of parent words found
        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words >= 1:
            # we have matches: copy them into the next free rows of the buffer
            new_anagram_pair_count = anagram_pair_count + n_from_words

            # fail with a clear message instead of an opaque shape mismatch
            # when the pre-allocated buffer is too small
            if new_anagram_pair_count > n_possible_anagrams:
                raise ValueError(
                    "n_possible_anagrams ({:,}) is too small: at least {:,} "
                    "rows are needed".format(
                        n_possible_anagrams, new_anagram_pair_count
                    )
                )

            output_list[anagram_pair_count:new_anagram_pair_count,
                        :] = outcome_word_id_list

            # running tally of the ids in the first output column; .tolist()
            # yields plain Python ints that can be used as Counter keys
            intermediate_to_word_count.update(
                outcome_word_id_list[:, 0].tolist()
            )

            # set the anagram pair count
            anagram_pair_count = new_anagram_pair_count

        # delete the intermediate list
        del outcome_word_id_list

        # record the time for the word
        p_time = calc_time(time_start=s_time, round_digits=-1)

        proc_time_dict[wg_id] = (p_time, n_from_words)

        row_count += 1
        if row_count % 10_000 == 0:
            print('...found parent anagrams for',
                  "{:,}".format(row_count), 'words...')

    # last update
    print('...found parent anagrams for', "{:,}".format(row_count), 'words...')

    # create a dataframe from the proc_time_dict; explicit column names keep
    # this working even when no rows were processed
    proc_time_df = pd.DataFrame(
        [
            (word_group_id, n_seconds, n_from_word_groups)
            for word_group_id, (n_seconds, n_from_word_groups)
            in proc_time_dict.items()
        ],
        columns=["word_group_id", "n_seconds", "n_from_word_groups"],
    )

    # display processing time for the current letter
    total_proc_time = proc_time_df["n_seconds"].sum()
    total_proc_time_s = round(total_proc_time, 2)
    total_proc_time_m = round(total_proc_time / 60, 2)
    print(
        "...finding parent anagrams for",
        n_curr_words,
        "words took",
        total_proc_time_s,
        "seconds |",
        total_proc_time_m,
        "minutes..."
    )

    # truncate the output array to only include rows with a from/to word pair
    # this removes any row that has a value of -1
    print('...truncating output list...')
    output_indices = cp.all(output_list >= 0, axis=1)
    output_list = output_list[output_indices,]
    del output_indices

    # the per-word tally accumulated inside the loop is reused here, so the
    # full output array does not have to be counted a second time.
    # https://docs.python.org/3/library/collections.html#collections.Counter
    print("...populating the count of to-words...")

    # now, use the map function to attach the tally to each processed word
    # group; ids missing from the Counter fall back to its default of 0
    proc_time_df["n_to_word_groups"] = proc_time_df["word_group_id"].map(
        intermediate_to_word_count
    )

    # record the matrix extraction option
    proc_time_df['matrix_extraction_option'] = matrix_extraction_option

    # how many anagram pairs were found?
    n_total_anagrams = output_list.shape[0]
    n_total_anagrams_formatted = "{:,}".format(n_total_anagrams)
    print("...total anagram pairs:", n_total_anagrams_formatted)

    return proc_time_df, output_list



In [None]:
# discover from/to word group id pairs with the CuPy implementation,
# using matrix extraction option 5 and no first-letter subsetting
letter_subset_list = None
matrix_extraction_option = 5
proc_time_df, output_list = generate_from_to_word_group_pairs_simple_cp(
    wg_df=wg_df,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=wchar_matrix,
    word_group_id_list=word_group_id_list,
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
    letter_subset_list=letter_subset_list,
)

In [None]:
# subset the matrix: split_matrix receives host-side (NumPy) copies of the
# id list and char matrix; note that wg_df is rebound to the returned frame
n_subset_letters = 3
matrix_extraction_option = 5
(
    wg_df,
    n_char_matrix_dict,
    single_letter_matrix_dict,
    letter_selector_matrix_dict,
    nc_ls_matrix_dict,
    p_time,
) = split_matrix(
    letter_dict=letter_dict,
    word_group_id_list=cp.asnumpy(word_group_id_list),
    wg_df=wg_df,
    wchar_matrix=cp.asnumpy(wchar_matrix),
    n_subset_letters=n_subset_letters,
    matrix_extraction_option=matrix_extraction_option,
)

In [None]:
# discover from/to word group id pairs with the non-CuPy implementation;
# the id list and char matrix are converted to NumPy arrays for the call
letter_subset_list = None
matrix_extraction_option = 5
proc_time_df, output_list = generate_from_to_word_group_pairs_simple(
    wg_df=wg_df,
    n_possible_anagrams=n_possible_anagrams,
    matrix_extraction_option=matrix_extraction_option,
    wchar_matrix=cp.asnumpy(wchar_matrix),
    word_group_id_list=cp.asnumpy(word_group_id_list),
    n_char_matrix_dict=n_char_matrix_dict,
    single_letter_matrix_dict=single_letter_matrix_dict,
    letter_selector_matrix_dict=letter_selector_matrix_dict,
    nc_ls_matrix_dict=nc_ls_matrix_dict,
    letter_subset_list=letter_subset_list,
)

In [None]:
# Small 3x3 demo matrix holding the integers 1..9 in row-major order
arr = np.arange(1, 10).reshape(3, 3)

# Pairwise row differences via broadcasting: result[i, j] == arr[i] - arr[j],
# giving an array of shape (3, 3, 3)
result = np.subtract(arr[:, None, :], arr[None, :, :])

print(result)

In [None]:
# Pairwise row differences of the whole char matrix via broadcasting:
# output[i, j] == wchar_matrix[i] - wchar_matrix[j].
# NOTE(review): for a matrix with n rows and c columns this allocates an
# (n, n, c) array -- presumably far too large for the full matrix; confirm
# the row count before running.
output = wchar_matrix[:, None, :] - wchar_matrix[None, :, :]