In [1]:
# Find anagrams. But use cupy. On Studio3? Perfect the workflow.

In [2]:
# standard libraries
from time import perf_counter_ns
import time

In [3]:

# external libraries
import pandas as pd
import numpy as np

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *


# LOAD INPUT DATA

In [4]:
word_df, wg_df, letter_dict, char_matrix, \
    word_group_id_list, word_id_list, wchar_matrix = load_input_data(
        db_path=rc.DB_PATH, db_name=rc.DB_NAME,
        in_file_path=rc.IN_FILE_PATH)


...loading words into a dataframe...
...query execution took: 0.46 seconds...
...loading word groups into a dataframe...
...query execution took: 0.45 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


# let's process 1000 rows using a single lookup using the full matrix

In [5]:
n_possible_anagrams = int(1e6)

In [6]:
f"{n_possible_anagrams :,}"

'1,000,000'

# Using the CPU

In [7]:
# sample 1000 rows
n_samples = 1000
sample_wg_id = wg_df['word_group_id'].sample(n = n_samples, random_state = 42).to_list()
if 746 not in sample_wg_id:
    sample_wg_id.append(746)
    #sample_wg_id = np.aray(sample_wg_id)

# establish counters for record keeping
output_list = np.full(shape=(n_possible_anagrams, 2),
                          fill_value=-1, dtype=int)

row_count = 0
anagram_pair_count = 0
intermediate_to_word_count = collections.Counter()

for wg_id in sample_wg_id:
    # identify parent words
    outcome = wchar_matrix - wchar_matrix[wg_id, ]
    
    # compute the score by finding where rows, across all columns, are GTE 0
    outcome_indices = np.all(outcome >= 0, axis=1)
    outcome = None

    n_from_words = outcome_indices.sum()

    if n_from_words >= 1:
        # extract anagrams based on index values    
        outcome_word_id_list = word_group_id_list[outcome_indices]    

        # we have matches
        # the focal word
        curr_output_list = np.zeros(shape=(n_from_words, 2), dtype=int)

        # update the output list with the word_id_list - these are from/parent words
        curr_output_list[:, 0] = outcome_word_id_list

        # update with the word_id - this is the to/child word
        curr_output_list[:, 1] = wg_id

        # enumerate the from/parent wordsds
        new_anagram_pair_count = anagram_pair_count + n_from_words

        output_list[anagram_pair_count:new_anagram_pair_count,
                    :] = curr_output_list

        # n_to_word_counter = collections.Counter(output_list[:, 0])
        intermediate_to_word_count.update(outcome_word_id_list.tolist())

        # set the anagram pair count
        anagram_pair_count = new_anagram_pair_count
    
    row_count += 1

    if row_count % 100 == 0:
        print(row_count)
print('truncating list')
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
del output_indices
output_list.shape

100
200
300
400
500
600
700
800
900
1000
truncating list


(367672, 2)

In [8]:
from_word_counter = collections.Counter(output_list[:,1])
to_word_counter = collections.Counter(output_list[:,0])

In [9]:
# the number of from word groups
from_word_counter[746]

26

# build a selector

In [10]:
# let's split the matrix, gather the values for each split, and then combine
n_subset_letters = 3
wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]

letter_selector_list = wg_df["letter_selector"].unique()
letter_selector_list.sort()
letter_selector_id_dict = {ls: i_ls for i_ls, ls in enumerate(letter_selector_list)}

wg_df["letter_selector_id"] = wg_df["letter_selector"].map(letter_selector_id_dict)
# here's the thing: I need to be able to identify on a single matrix the rows that match various conditions.
# I can't step through it and create objects at abandon. 
# so, given our wchar_matrix: what are the rows that match to such and such?
# we can add three columns to track this... 

In [11]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,letter_selector_id
0,A,a,1,a,0,0,a,a,1,a,0
1,aa,aa,2,a,1,1,a,a,1,a,0
2,aal,aal,3,a,2,2,al,la,2,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,ma,1114


In [12]:
# load letter ranks
sql = 'select letter, total_letter_rank from letter_count;'

In [13]:
lr_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [14]:
lr_dict = {l:r for l, r in zip(lr_df['letter'], lr_df['total_letter_rank'])}

In [15]:
wg_df['n_records'] = int(1)

In [16]:
col_names = ['letter_selector_id', 'letter_selector', 'n_records']

In [17]:
ls_df = wg_df[col_names].groupby(col_names[:-1]).agg(ls_count = ('n_records', 'sum')).reset_index()

In [18]:
ls_df.head()

Unnamed: 0,letter_selector_id,letter_selector,ls_count
0,0,a,2
1,1,ae,1
2,2,ai,1
3,3,b,1
4,4,ba,4


In [19]:
ls_df['ls_count'].sum()

np.int64(215842)

In [20]:
ls_df['ls_count'].describe()

count    2387.000000
mean       90.423963
std       212.422095
min         1.000000
25%         3.000000
50%        14.000000
75%        75.000000
max      2544.000000
Name: ls_count, dtype: float64

In [21]:
def get_ls_index(ls:str):
    return [letter_dict[l] for l in ls]

In [22]:
# this is effectively a column selector
ls_df['ls_index'] = ls_df['letter_selector'].map(get_ls_index)

In [23]:
ls_df.head()

Unnamed: 0,letter_selector_id,letter_selector,ls_count,ls_index
0,0,a,2,[0]
1,1,ae,1,"[0, 4]"
2,2,ai,1,"[0, 8]"
3,3,b,1,[1]
4,4,ba,4,"[1, 0]"


In [24]:
ls_df.head()

Unnamed: 0,letter_selector_id,letter_selector,ls_count,ls_index
0,0,a,2,[0]
1,1,ae,1,"[0, 4]"
2,2,ai,1,"[0, 8]"
3,3,b,1,[1]
4,4,ba,4,"[1, 0]"


In [25]:
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [26]:
# create the output list
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1)
output_time_list = []

# start counting
anagram_pair_count = 0

#for ls_id_index in range(0, 10):
for ls_row_id, ls_row in ls_df.iloc[:None].iterrows():    
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()
    
    # get letter selector id information
    ls_id = ls_row['letter_selector_id']
    ls_id_index = np.array(ls_row['ls_index'])    

    ##
    # BUILD A COLUMN SELECTOR
    ##
    # make sure that only values GTE 0 are selected so that the right number of
    # columns are return.
    #curr_ls_id = ls_id_index[ls_id_index >= 0]
    
    ##
    # SUBSET THE wchar_matrix by column selector
    ##    
    outcome_indices = np.all(wchar_matrix[:, ls_id_index] >= 1, axis=1)
    
    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]
        
    # this is the list of word group ids that correspond to the word group ids
    # in the ls_wchar_matrix
    temp_wg_id_list = word_group_id_list[outcome_indices]
    # place into a dictionary to go from wg_id to wg_index. What is the index
    # of wg_id 675?
    # wg_id_dict = {wg_id:wg_index for wg_index, wg_id in enumerate(temp_wg_id_list)}

    # this is the number of word groups that meet certain criteria. 
    # for example, words that feature the letters: 'bro'    
    n_search_space = temp_wg_id_list.shape[0]
        
    #def my_func(row):
    #    return temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[row, :]) >= 0, axis = 1)]

    #for ii in range(0, ls_wchar_matrix.shape[0]):    
    #for i_curr_wg_id, curr_wg_id in enumerate(temp_wg_id_list):
    # the current list of words featuring the set of least common letters.
    # these are the words have the least common letters of 'bro'    
    curr_wg_id_list = wg_df.loc[wg_df['letter_selector_id'] == ls_id, 'word_group_id'].to_numpy()
    # n_lookups = curr_wg_id_list.shape[0]
    # n_search_space >= n_lookups, always. 
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):
    
        
        #temp_wg_id = wg_id_dict[curr_wg_id]
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]
        #print(curr_wg_id, temp_wg_id)

        #outcome_word_id_list = my_func(row = temp_wg_id)
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]
                
        n_from_words = outcome_word_id_list.shape[0]
        
        if n_from_words > 0:
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)
            #print(outcome_word_id_list.shape)
            
            # enumerate the from/parent words
            new_anagram_pair_count = anagram_pair_count + n_from_words
            #print(anagram_pair_count, new_anagram_pair_count)

            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_id, n_search_space, curr_time])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300


In [27]:
col_names =['letter_selector_id', 'n_search_space', 'curr_time']
time_df = pd.DataFrame(data = output_time_list, columns=col_names)

In [28]:
get_hms(seconds = time_df['curr_time'].sum(),round_seconds_digits=4)

('0', '1', '24.6488')

In [29]:
# join in the other information
time_df = pd.merge(left = time_df, right = ls_df)

In [30]:
time_df.head()

Unnamed: 0,letter_selector_id,n_search_space,curr_time,letter_selector,ls_count,ls_index
0,0,133001,0.031666,a,2,[0]
1,1,84952,0.017679,ae,1,"[0, 4]"
2,2,82414,0.01636,ai,1,"[0, 8]"
3,3,35319,0.007622,b,1,[1]
4,4,23469,0.012168,ba,4,"[1, 0]"


In [31]:
time_df.tail()

Unnamed: 0,letter_selector_id,n_search_space,curr_time,letter_selector,ls_count,ls_index
2382,2382,397,0.005323,zyp,59,"[25, 24, 15]"
2383,2383,584,0.004198,zyr,1,"[25, 24, 17]"
2384,2384,412,0.004667,zys,12,"[25, 24, 18]"
2385,2385,631,0.004468,zyt,9,"[25, 24, 19]"
2386,2386,197,0.004716,zyu,18,"[25, 24, 20]"


In [32]:
time_df['n_search_space'].describe()

count      2387.000000
mean       7444.399665
std       13775.994922
min           1.000000
25%         340.000000
50%        1665.000000
75%        8655.500000
max      145106.000000
Name: n_search_space, dtype: float64

In [33]:
testo = time_df['n_search_space'] >= time_df['ls_count']

In [34]:
testo.sum()

np.int64(2387)

In [35]:
time_df['ls_count'].sum()

np.int64(215842)

In [36]:
time_df['avg_lookup_time'] = time_df['curr_time'] / (time_df['ls_count'] * time_df['n_search_space'])

In [37]:
time_df['avg_lookup_time'].describe()

count    2.387000e+03
mean     2.870869e-05
std      2.423333e-04
min      3.623823e-08
25%      9.609277e-08
50%      2.225363e-07
75%      1.419267e-06
max      4.599100e-03
Name: avg_lookup_time, dtype: float64

In [38]:
print('...truncating output list...')
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]

...truncating output list...


In [39]:
output_list.shape

(73218235, 2)

In [40]:
from_word_counter = collections.Counter(output_list[:,1])
to_word_counter = collections.Counter(output_list[:,0])

In [41]:
# check to see if the word 'acanthology', with word_group_id 746 is in the counter
print(from_word_counter[746])
print(to_word_counter[746])

26
329


# Do it with CUPY

In [42]:
import cupy as cp

In [43]:
from cupyx.profiler import benchmark

def my_func(a):
    return cp.sqrt(cp.sum(a**2, axis=-1))

a = cp.random.random((256, 1024))
print(benchmark(my_func, (a,), n_repeat=20))  


my_func             :    CPU:    58.995 us   +/- 12.949 (min:    41.600 / max:   101.700) us     GPU-0:   195.998 us   +/-  9.537 (min:   189.152 / max:   226.272) us


In [44]:
# initialize some gpu stats
start_gpu = cp.cuda.Event()
end_gpu = cp.cuda.Event()

start_gpu.record()
start_cpu = time.perf_counter()
out = my_func(a)
end_cpu = time.perf_counter()
end_gpu.record()
end_gpu.synchronize()
t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)
t_cpu = end_cpu - start_cpu
print(t_gpu, t_cpu)

0.5140479803085327 0.0004891999997198582


In [45]:
# create cupy objects

In [46]:
wchar_matrix_cp = cp.asarray(a = wchar_matrix)
word_group_id_list_cp = cp.asarray(a = word_group_id_list)

In [61]:
def format_output_list_cp(outcome_word_id_list: cp.ndarray, wg_id: int) -> cp.ndarray:
        
    output_list = cp.zeros(shape=(outcome_word_id_list.shape[0], 2), dtype=int)

    # update the output list with the word_id_list - these are from/parent words
    output_list[:, 0] = outcome_word_id_list
    

    # update with the word_id - this is the to/child word
    output_list[:, 1] = wg_id

    return output_list


In [62]:
# create the output list
output_list_cp = cp.full(shape = (n_possible_anagrams, 2), fill_value=-1)
output_time_list = []

# start counting
anagram_pair_count = 0

#for ls_id_index in range(0, 10):
for ls_row_id, ls_row in ls_df.iloc[:100].iterrows():    
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()
    
    # get letter selector id information
    ls_id = ls_row['letter_selector_id']
    #ls_id_index = cp.array(ls_row['ls_index'])    
    ls_id_index = ls_row['ls_index']

    ##
    # BUILD A COLUMN SELECTOR
    ##
    # make sure that only values GTE 0 are selected so that the right number of
    # columns are return.
    #curr_ls_id = ls_id_index[ls_id_index >= 0]
    
    ##
    # SUBSET THE wchar_matrix by column selector
    ##    
    outcome_indices_cp = cp.all(wchar_matrix_cp[:, ls_id_index] >= 1, axis=1)
    
    # this is the sub-matrix from which to query
    ls_wchar_matrix_cp = wchar_matrix_cp[outcome_indices_cp, :]
        
    # this is the list of word group ids that correspond to the word group ids
    # in the ls_wchar_matrix
    temp_wg_id_list_cp = word_group_id_list_cp[outcome_indices_cp]
    # place into a dictionary to go from wg_id to wg_index. What is the index
    # of wg_id 675?
    # wg_id_dict = {wg_id:wg_index for wg_index, wg_id in enumerate(temp_wg_id_list)}

    # this is the number of word groups that meet certain criteria. 
    # for example, words that feature the letters: 'bro'    
    n_search_space = temp_wg_id_list_cp.shape[0]
        
    #def my_func(row):
    #    return temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[row, :]) >= 0, axis = 1)]

    #for ii in range(0, ls_wchar_matrix.shape[0]):    
    #for i_curr_wg_id, curr_wg_id in enumerate(temp_wg_id_list):
    # the current list of words featuring the set of least common letters.
    # these are the words have the least common letters of 'bro'    
    curr_wg_id_list = wg_df.loc[wg_df['letter_selector_id'] == ls_id, 'word_group_id'].to_list()
    # n_lookups = curr_wg_id_list.shape[0]
    # n_search_space >= n_lookups, always. 
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):    
        
        #temp_wg_id = wg_id_dict[curr_wg_id]
        temp_wg_id = cp.where(temp_wg_id_list_cp == curr_wg_id)[0][0]
        #print(curr_wg_id, temp_wg_id)

        #outcome_word_id_list = my_func(row = temp_wg_id)
        outcome_word_id_list_cp = temp_wg_id_list_cp[cp.all(a = (ls_wchar_matrix_cp - ls_wchar_matrix_cp[temp_wg_id, :]) >= 0, axis = 1)]
                
        n_from_words = outcome_word_id_list_cp.shape[0]
        
        if n_from_words > 0:
            outcome_word_id_list = format_output_list_cp(outcome_word_id_list=outcome_word_id_list_cp, wg_id=curr_wg_id)
            #print(outcome_word_id_list.shape)
            
            # enumerate the from/parent words
            new_anagram_pair_count = anagram_pair_count + n_from_words
            #print(anagram_pair_count, new_anagram_pair_count)

            output_list_cp[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_id, n_search_space, curr_time])

0


In [63]:
col_names =['letter_selector_id', 'n_search_space', 'curr_time']
time_df = pd.DataFrame(data = output_time_list, columns=col_names)

In [64]:
get_hms(seconds = time_df['curr_time'].sum(),round_seconds_digits=4)

('0', '0', '8.5176')

In [65]:
# join in the other information
time_df = pd.merge(left = time_df, right = ls_df)
time_df['n_search_space'].describe()

count       100.000000
mean       8983.270000
std       17556.425612
min         447.000000
25%        2678.000000
50%        4210.500000
75%        9260.500000
max      133001.000000
Name: n_search_space, dtype: float64

In [66]:
print('...truncating output list...')
output_indices_cp = cp.all(output_list_cp >= 0, axis=1)
output_list_cp = output_list_cp[output_indices_cp,]

output_list_cp.shape

...truncating output list...


(3112755, 2)

In [None]:
from_word_counter = collections.Counter(cp.asnumpy(output_list_cp)[:,1])
to_word_counter = collections.Counter(cp.asnumpy(output_list_cp)[:,0])

# check to see if the word 'acanthology', with word_group_id 746 is in the counter
print(from_word_counter[746])
print(to_word_counter[746])
