In [1]:
# Can we find parent/child word relationships faster
# One way to achieve this is to use the GPU via cupy
# https://docs.cupy.dev/en/stable/index.html
# cupy is basically numpy on the gpu. Let's start with a simple case.

In [2]:
# standard libraries
from time import perf_counter_ns
import time

In [3]:
# external libraries
import pandas as pd
import numpy as np

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *

In [4]:
# specific run constants
change_data_types = True

# Load input data
The char_matrix and the formatted words

In [5]:
word_df, wg_df, letter_dict, char_matrix, \
    word_group_id_list, word_id_list, wchar_matrix = load_input_data(
        db_path=rc.DB_PATH, db_name=rc.DB_NAME,
        in_file_path=rc.IN_FILE_PATH,change_data_types=change_data_types)

...loading words into a dataframe...
...query execution took: 0.49 seconds...
...loading word groups into a dataframe...
...query execution took: 0.48 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


# let's process 1000 rows using a single lookup on the full wchar_matrix

In [6]:
# load the total number of anagrams
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH,
                                             db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [7]:
f"{n_possible_anagrams :,}"

'99,421,122'

# Using the CPU
Create a data extraction scenario that is very similar to Matrix Extraction Technique 1: Using the full matrix. I remove the excess code and create a simple loop. 

In [8]:
run_start_time = perf_counter_ns()
# sample 1000 rows
n_samples = 1000
sample_wg_id = wg_df['word_group_id'].sample(n = n_samples, random_state = 42).to_list()

# let's add a specific word: acanthology - the study of spines (as of sea urchins) especially as an adjunct of taxonomy
# this is to ensure that extraction technique produces the correct values
# On a complete run, there should by 26 parent words and 329 child words
# This will find the 26 parent words, and depending on what else in the sample,
# a number of child words

if 746 not in sample_wg_id:
    sample_wg_id.append(746)    

# create an output object
output_list = np.full(shape=(n_possible_anagrams, 2),
                          fill_value=-1, dtype=int)
if change_data_types:
    output_list = output_list.astype(np.int32)

anagram_pair_count = 0

for i_wg_id, wg_id in enumerate(sample_wg_id):
    # identify parent words
    outcome = wchar_matrix - wchar_matrix[wg_id, ]
    
    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices = np.all(outcome >= 0, axis=1)    

    n_from_words = outcome_indices.sum()

    if n_from_words >= 1:

        outcome_word_id_list = word_group_id_list[outcome_indices]
        # extract anagrams based on index values    
        outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list,
                                                  wg_id=wg_id)

        # enumerate the from/parent words
        new_anagram_pair_count = anagram_pair_count + n_from_words

        output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list
        
        # set the anagram pair count
        anagram_pair_count = new_anagram_pair_count    
    
    if i_wg_id % 100 == 0:
        print(i_wg_id)
        
print('...time to find parent words...')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

0
100
200
300
400
500
600
700
800
900
1000
...time to find parent words...
Hours: 0 | minutes: 0 | seconds: 7.082
...truncating output list...
(367672, 2)
Hours: 0 | minutes: 0 | seconds: 7.8705


In [9]:
# with changing the data types: ~ 7 seconds
# without changing the data types: ~ 18 seconds

In [10]:
from_word_counter, to_word_counter = build_counters(output_list=output_list)

In [11]:
# the number of from word groups: should be 26
from_word_counter[746]

np.int64(26)

# Setup a similar workflow, using CuPy

In [12]:
# load the library
import cupy as cp

In [13]:
# perform a simple benchmark
from cupyx.profiler import benchmark

def my_func(a):
    return cp.sqrt(cp.sum(a**2, axis=-1))

a = cp.random.random((256, 1024))
print(benchmark(my_func, (a,), n_repeat=20))  


my_func             :    CPU:    61.090 us   +/- 10.142 (min:    48.600 / max:    86.800) us     GPU-0:   199.634 us   +/-  6.468 (min:   192.512 / max:   218.112) us


In [14]:
# initialize some gpu stats
# this page, https://docs.cupy.dev/en/stable/user_guide/performance.html,
# talks about performance best practices
start_gpu = cp.cuda.Event()
end_gpu = cp.cuda.Event()

start_gpu.record()
start_cpu = time.perf_counter()
out = my_func(a)
end_cpu = time.perf_counter()
end_gpu.record()
end_gpu.synchronize()
t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)
t_cpu = end_cpu - start_cpu
print(t_gpu, t_cpu)

0.3829759955406189 0.00033090000215452164


## Create cupy objects

In [15]:
char_matrix_cp = cp.asarray(a = char_matrix)
word_group_id_list_cp = cp.asarray(a = word_group_id_list)
word_id_list = cp.asarray(a = word_id_list)
wchar_matrix_cp = cp.asarray(a = wchar_matrix)
word_group_id_list_cp = cp.asarray(a = word_group_id_list)


In [16]:
if change_data_types:
    char_matrix_cp = char_matrix_cp.astype(cp.int8)
    word_group_id_list_cp = word_group_id_list_cp.astype(cp.int32)
    word_id_list = word_id_list.astype(cp.int32)
    wchar_matrix_cp = wchar_matrix_cp.astype(cp.int8)
    word_group_id_list_cp = word_group_id_list_cp.astype(cp.int32)

In [20]:
def format_output_list_cp(outcome_word_id_list_cp: cp.ndarray, wg_id: int) -> cp.ndarray:
    output_list = cp.zeros(
        shape=(outcome_word_id_list_cp.shape[0], 2), dtype=cp.int32)

    # update the output list with the word_id_list - these are from/parent words
    output_list[:, 0] = outcome_word_id_list_cp

    # update with the word_id - this is the to/child word
    output_list[:, 1] = wg_id

    return output_list


In [22]:
run_start_time = perf_counter_ns()
# the naive approach
# establish counters for record keeping
# sample 1000 rows
n_samples = 1000

#sample_wg_id = wg_df['word_group_id'].sample(n = n_samples, random_state = 42)
sample_wg_id = wg_df['word_group_id'].to_numpy(dtype = np.int32)
sample_wg_id_cp = cp.asarray(a = sample_wg_id)

output_list_cp = cp.full(shape=(n_possible_anagrams, 2),
                          fill_value=-1, dtype=int)

row_count = 0
anagram_pair_count = 0
intermediate_to_word_count = collections.Counter()

for wg_id in sample_wg_id_cp:
    outcome = wchar_matrix_cp - wchar_matrix_cp[wg_id, ]
    
    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices_cp = cp.all(outcome >= 0, axis=1)
    outcome = None

    n_from_words = outcome_indices_cp.sum()               
    
    if n_from_words >= 1:
        # extract anagrams based on index values    
        outcome_word_id_list_cp = word_group_id_list_cp[outcome_indices_cp]    
        
        # extract anagrams based on index values    
        outcome_word_id_list_cp = format_output_list_cp(outcome_word_id_list_cp=outcome_word_id_list_cp,
                                                  wg_id=wg_id)
        
        # enumerate the from/parent wordsds
        new_anagram_pair_count = anagram_pair_count + n_from_words

        # update the total output list
        output_list_cp[anagram_pair_count:new_anagram_pair_count,
                    :] = outcome_word_id_list_cp

        # set the anagram pair count
        anagram_pair_count = new_anagram_pair_count
    
    row_count += 1

    if row_count % 10000 == 0:
        print(row_count)

print('...time to find parent words...')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
...time to find parent words...
Hours: 0 | minutes: 2 | seconds: 55.6663
...truncating output list...
(367672, 2)
Hours: 0 | minutes: 2 | seconds: 55.6738


In [None]:
# very cool! this took about 1 second on a sample of 1000 words
# as expected, the GPU is faster.
# but, what about a full run?
# on all ~216K words, it takes a little less than 3 minutes.

# Implement a variant of matrix extraction 5: using the three least common letters
The variant here is that letter selectors are not stored - they are used once. With a max of three letters in each selector, there are 2,387 letter selectors. Each sub-matrix is created, words are queried against it, and the values are stored. This implementation, while similar to the existing version of matrix extraction technique 5, is a little faster than the existing version.

In [None]:
# let's split the matrix, gather the values for each split, and then combine
n_subset_letters = 3
wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]

#letter_selector_list = wg_df["letter_selector"].unique()
#letter_selector_list.sort()
#letter_selector_id_dict = {ls: i_ls for i_ls, ls in enumerate(letter_selector_list)}

#wg_df["letter_selector_id"] = wg_df["letter_selector"].map(letter_selector_id_dict)
# here's the thing: I need to be able to identify on a single matrix the rows that match various conditions.
# I can't step through it and create objects at abandon. 
# so, given our wchar_matrix: what are the rows that match to such and such?
# we can add three columns to track this... 

In [24]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,letter_selector_id
0,A,a,1,a,0,0,a,a,1,a,0
1,aa,aa,2,a,1,1,a,a,1,a,0
2,aal,aal,3,a,2,2,al,la,2,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,ma,1114


In [25]:
wg_df['n_records'] = int(1)

In [26]:
ls_df = build_letter_selector_df(df = wg_df,
                          ls_nchar = n_subset_letters,
                          letter_selector_col_name = 'letter_selector',                          
                          letter_selector_id_col_name = 'letter_selector_id')

In [27]:
ls_df.shape

(2387, 5)

In [28]:
ls_df.head()

Unnamed: 0,letter_selector,ls_count,ls_nchar_iter,ls_nchar,letter_selector_id
0,a,2,3,1,0
1,ae,1,3,2,1
2,ai,1,3,2,2
3,b,1,3,1,3
4,ba,4,3,2,4


In [29]:
ls_df['ls_count'].sum()

np.int64(215842)

In [30]:
ls_df['ls_count'].describe()

count    2387.000000
mean       90.423963
std       212.422095
min         1.000000
25%         3.000000
50%        14.000000
75%        75.000000
max      2544.000000
Name: ls_count, dtype: float64

In [31]:
ls_df = get_ls_index(df = ls_df)

...loading the letter dictionary...


In [32]:
# let's create another matrix that features the letter_selector_id and the corresponding id
# of each word_group
ls_id_wg_id, ls_index_array = build_ls_index_arrays(wg_df=wg_df, ls_df=ls_df)

In [35]:
if change_data_types:
    ls_id_wg_id = ls_id_wg_id.astype(np.int32)

In [37]:
run_start_time=perf_counter_ns()
# create the output list
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1)
if change_data_types:
    output_list = output_list.astype(np.int32)
output_time_list = []

# start counting
anagram_pair_count = 0

for ls_row_id, ls_row in enumerate(ls_index_array):    
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()    
   
    ##
    # SUBSET THE wchar_matrix by column selector
    ##    
    outcome_indices = np.all(wchar_matrix[:, ls_row] >= 1, axis=1)
    
    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]
        
    # this is the list of word group ids that correspond to the word group ids
    # in the ls_wchar_matrix
    temp_wg_id_list = word_group_id_list[outcome_indices]
    
    # this is the number of word groups that meet certain criteria. 
    # for example, words that feature the letters: 'buc'    
    n_search_space = temp_wg_id_list.shape[0]
    
    # the current list of words featuring the set of least common letters.
    # these are the words have the least common letters of 'buc'    
    curr_wg_id_list = ls_id_wg_id[ls_id_wg_id[:, 0] == ls_row_id, 1]
    
    # n_search_space >= n_lookups, always. 
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):            
        
        # this returns the index of where the current word_group_id within the
        # temp_wg_id_list
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]
        
        # this is where the selection happens. Through several nested operations,
        # we are subtracting the focal word-as-vector from the ls_wchar_matrix
        # from there, we compute where all cells in each row are GTE 0
        # the resulting true/false list is then used to select the parent word group id
        # from the temp_wg_id_list.
        #         
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]
                
        n_from_words = outcome_word_id_list.shape[0]
        
        if n_from_words > 0:
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)
                      
            # enumerate the from/parent words
            new_anagram_pair_count = anagram_pair_count + n_from_words         

            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_row_id, n_search_space, curr_time])

print('...time to find parent words...')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
...time to find parent words...
Hours: 0 | minutes: 0 | seconds: 33.0098
...truncating output list...
(73218235, 2)
Hours: 0 | minutes: 0 | seconds: 34.396


In [None]:
# wow! A little more than 30 seconds to process all of the data. And this is on the CPU. 

In [38]:
time_df = build_timing_and_output_objects(output_time_list=output_time_list,
                                          ls_df = ls_df)

In [39]:
time_df.head()

Unnamed: 0,letter_selector_id,n_search_space,total_time,letter_selector,ls_count,ls_nchar_iter,ls_nchar,ls_index,avg_lookup_time
0,0,133001,0.013339,a,2,3,1,"[True, False, False, False, False, False, Fals...",5.014737e-08
1,1,84952,0.006435,ae,1,3,2,"[True, False, False, False, True, False, False...",7.575219e-08
2,2,82414,0.005876,ai,1,3,2,"[True, False, False, False, False, False, Fals...",7.129614e-08
3,3,35319,0.00374,b,1,3,1,"[False, True, False, False, False, False, Fals...",1.059033e-07
4,4,23469,0.005301,ba,4,3,2,"[True, True, False, False, False, False, False...",5.646491e-08


In [40]:
time_df['n_search_space'].describe()

count      2387.000000
mean       7444.399665
std       13775.994922
min           1.000000
25%         340.000000
50%        1665.000000
75%        8655.500000
max      145106.000000
Name: n_search_space, dtype: float64

In [41]:
# check to make sure that all words have been accounted for
time_df['ls_count'].sum()

np.int64(215842)

In [42]:
time_df['avg_lookup_time'].describe()

count    2.387000e+03
mean     7.136168e-06
std      6.139913e-05
min      2.315772e-08
25%      4.044542e-08
50%      8.057735e-08
75%      4.008654e-07
max      1.273200e-03
Name: avg_lookup_time, dtype: float64

In [43]:
print('...building output counters...')
from_word_counter, to_word_counter = build_counters(output_list=output_list)

...building output counters...


In [44]:
# check to see if the word 'acanthology', with word_group_id 746 is in the counter
print(from_word_counter[746]) # should be 26
print(to_word_counter[746]) # should 329

26
329


In [45]:
time_df.head()  

Unnamed: 0,letter_selector_id,n_search_space,total_time,letter_selector,ls_count,ls_nchar_iter,ls_nchar,ls_index,avg_lookup_time
0,0,133001,0.013339,a,2,3,1,"[True, False, False, False, False, False, Fals...",5.014737e-08
1,1,84952,0.006435,ae,1,3,2,"[True, False, False, False, True, False, False...",7.575219e-08
2,2,82414,0.005876,ai,1,3,2,"[True, False, False, False, False, False, Fals...",7.129614e-08
3,3,35319,0.00374,b,1,3,1,"[False, True, False, False, False, False, Fals...",1.059033e-07
4,4,23469,0.005301,ba,4,3,2,"[True, True, False, False, False, False, False...",5.646491e-08


In [46]:
# drop the ls_index field
time_df = time_df.drop(labels = ['ls_index'], axis = 1)

In [47]:
# compute the total number of comps
time_df['total_comps'] = time_df['n_search_space'] * time_df['ls_count']

In [48]:
# let's save this experiment for later use
write_data_to_sqlite(df = time_df, table_name = "exp_01_mod_meo_5", db_path = rc.DB_PATH, db_name = rc.DB_NAME)

...now writing: exp_01_mod_meo_5


# Do it with CUPY

In [50]:
def format_output_list_cp(outcome_word_id_list: cp.ndarray, wg_id: int) -> cp.ndarray:
        
    output_list = cp.zeros(shape=(outcome_word_id_list.shape[0], 2), dtype=cp.int32)

    # update the output list with the word_id_list - these are from/parent words
    output_list[:, 0] = outcome_word_id_list    

    # update with the word_id - this is the to/child word
    output_list[:, 1] = wg_id

    return output_list

In [51]:
# create some more cupy objects
ls_index_array_cp = cp.asarray(ls_index_array)
ls_id_wg_id_cp = cp.asarray(ls_id_wg_id)
if change_data_types:
    ls_id_wg_id_cp = ls_id_wg_id_cp.astype(cp.int32)
    


In [52]:
ls_id_wg_id_cp.dtype

dtype('int32')

In [55]:
run_start_time = perf_counter_ns()
# create the output list
output_list_cp = cp.full(shape = (n_possible_anagrams, 2), fill_value=-1, dtype = int)
if change_data_types:
    output_list_cp = output_list_cp.astype(cp.int32)
output_time_list = []

# start counting
anagram_pair_count = 0

for ls_row_id, ls_row in enumerate(ls_index_array_cp):    
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()
        
    ##
    # SUBSET THE wchar_matrix by column selector
    ##    
    outcome_indices_cp = cp.all(wchar_matrix_cp[:, ls_row] >= 1, axis=1)
    
    # this is the sub-matrix from which to query
    ls_wchar_matrix_cp = wchar_matrix_cp[outcome_indices_cp, :]
        
    # this is the list of word group ids that correspond to the word group ids
    # in the ls_wchar_matrix
    temp_wg_id_list_cp = word_group_id_list_cp[outcome_indices_cp]
    # place into a dictionary to go from wg_id to wg_index.
    # What is the index of wg_id 675?    

    # this is the number of word groups that meet certain criteria. 
    # for example, words that feature the letters: 'buc'    
    n_search_space = temp_wg_id_list_cp.shape[0]
        
       
    # the current list of words featuring the set of least common letters.
    # these are the words have the least common letters of 'buc'    
    curr_wg_id_list = ls_id_wg_id_cp[ls_id_wg_id_cp[:, 0] == ls_row_id, 1]    
    
    # enumerate each word in the word group
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):    
        
        temp_wg_id = cp.where(temp_wg_id_list_cp == curr_wg_id)[0][0]        
        
        outcome_word_id_list_cp = temp_wg_id_list_cp[cp.all(a = (ls_wchar_matrix_cp - ls_wchar_matrix_cp[temp_wg_id, :]) >= 0, axis = 1)]
                
        n_from_words = outcome_word_id_list_cp.shape[0]
        
        if n_from_words > 0:
            outcome_word_id_list = format_output_list_cp(outcome_word_id_list=outcome_word_id_list_cp, wg_id=curr_wg_id)                       
            
            new_anagram_pair_count = anagram_pair_count + n_from_words            

            output_list_cp[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_row_id, n_search_space, curr_time])

print('...time to find parent words...')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
output_indices = cp.all(output_list_cp >= 0, axis=1)
output_list_cp = output_list_cp[output_indices,]
print(output_list_cp.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
...time to find parent words...
Hours: 0 | minutes: 2 | seconds: 0.1102
...truncating output list...
(73218235, 2)
Hours: 0 | minutes: 2 | seconds: 0.4092


In [56]:
time_df = build_timing_and_output_objects(output_time_list, ls_df = ls_df)

In [57]:
# yikes! That is slower!
# CPU: 84 seconds
# GPU: 112 seconds. 
# Not quite 30 seconds slower.

In [58]:
# count using numpy, and then create a Counter object
from_word_counter = cp.unique(ar=output_list[:, 1], return_counts=True)
to_word_counter = cp.unique(ar=output_list[:, 0], return_counts=True)


In [59]:
# this used to take 45 seconds, it now takes 6
# zip with asterisk notation won't unpack the tuple
from_word_counter = collections.Counter({wg_id: wg_count for wg_id, wg_count in zip(cp.asnumpy(from_word_counter[0]), cp.asnumpy(from_word_counter[1]))})
to_word_counter = collections.Counter({wg_id: wg_count for wg_id, wg_count in zip(cp.asnumpy(to_word_counter[0]), cp.asnumpy(to_word_counter[1]))})

In [60]:
# check to see if the word 'acanthology', with word_group_id 746 is in the counter
print(from_word_counter[746]) # 26
print(to_word_counter[746]) # 329

26
329


In [61]:
# the take away: using a GPU can be faster.
# But, it seems to only be faster when working on large matrices. 
# On smaller matrices, there isn't any speed up. In fact, there is slow down.