In [1]:
# standard libraries
import ast
import os
import time
from time import perf_counter_ns

In [2]:
# external libraries
from matplotlib.patches import Rectangle
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import seaborn as sns

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *

In [3]:
# Load every input object used in the rest of the notebook.
# `rc` and `load_input_data` are not imported by name here; presumably they
# arrive through the star imports above — TODO confirm.
word_df, wg_df, letter_dict, char_matrix, \
    word_group_id_list, word_id_list, wchar_matrix = load_input_data(
        db_path=rc.DB_PATH, db_name=rc.DB_NAME,
        in_file_path=rc.IN_FILE_PATH)

...loading words into a dataframe...
...query execution took: 1.26 seconds...
...loading word groups into a dataframe...
...query execution took: 1.28 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [4]:
# constant helper column: summing it inside groupby().agg() gives the number
# of rows per group
wg_df['n_records'] = 1

In [5]:
def build_letter_selector(df: pd.DataFrame,
                          ls_nchar: int, col_names: list,
                          letter_selector_col_name: str,
                          letter_selector_id_col_name: str) -> pd.DataFrame:
    """Count the records that share each letter-selector prefix.

    A selector is the leading slice of ``df['letter_group_ranked']``. The
    selector column is written onto ``df`` itself, so the caller's frame is
    modified in place.

    Parameters
    ----------
    df
        Frame holding at least ``'letter_group_ranked'`` and ``'n_records'``.
    ls_nchar
        Prefix-length setting. The slice is ``[:ls_nchar + 1]``, so each
        selector holds up to ``ls_nchar + 1`` characters.
    col_names
        List of column names: the group-by key column(s) followed by
        ``'n_records'`` as the last entry. It must contain
        ``letter_selector_col_name``.
    letter_selector_col_name
        Name of the selector column created on ``df``.
    letter_selector_id_col_name
        Name of the 0-based running id column added to the result.

    Returns
    -------
    pd.DataFrame
        One row per distinct selector with ``ls_count`` (sum of
        ``n_records``), ``ls_nchar_iter`` (the ``ls_nchar`` argument),
        ``ls_nchar`` (actual selector length) and the id column.
    """
    # side effect: adds/overwrites the selector column on the caller's frame
    df[letter_selector_col_name] = df['letter_group_ranked'].str[:ls_nchar + 1]

    # every entry of col_names except the last is a grouping key
    ls_df = (
        df[col_names]
        .groupby(col_names[:-1])
        .agg(ls_count=('n_records', 'sum'))
        .reset_index()
    )

    ls_df['ls_nchar_iter'] = ls_nchar
    # shorter than ls_nchar + 1 when the source string itself is shorter
    ls_df['ls_nchar'] = ls_df[letter_selector_col_name].str.len()
    ls_df[letter_selector_id_col_name] = range(len(ls_df))

    return ls_df

In [6]:
# Enumerate the candidate letter selectors for every prefix-length setting
# and stack the per-setting tables into one frame.
col_names = ['letter_selector_temp', 'n_records']
ls_df_list = []
for ls_nchar in range(1, 17):
    ls_df = build_letter_selector(
        df=wg_df,
        ls_nchar=ls_nchar,
        col_names=col_names,
        letter_selector_col_name='letter_selector_temp',
        letter_selector_id_col_name='letter_selector_temp_id',
    )
    ls_df_list.append(ls_df)

tot_ls_df = pd.concat(objs=ls_df_list, axis=0)

In [7]:
# now, load the previously calculated search space counts

In [8]:
# Load the cached search-space counts (the file is produced by the to_csv
# cell near the end of this notebook).
ss_df = pd.read_csv(filepath_or_buffer='search_space_count.csv')
# ls_index is stored as the text form of a Python list, e.g. "[0, 4]".
# ast.literal_eval parses literals only; the previous eval() would execute
# any expression found in the CSV.
ss_df['ls_index'] = ss_df['ls_index'].map(ast.literal_eval)

In [9]:
ss_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_index,n_search_space
0,a,2,[0],133001
1,ae,1,"[0, 4]",84952
2,ai,1,"[0, 8]",82414
3,b,1,[1],35319
4,ba,12,"[1, 0]",23469


In [10]:
# Attach the cached search-space size to every selector row. The join keys
# are the two columns the frames share; naming them keeps the merge from
# silently changing if either frame gains a column.
tot_ls_df = pd.merge(left=tot_ls_df, right=ss_df, how='inner',
                     on=['letter_selector_temp', 'ls_count'])

In [11]:
tot_ls_df.shape

(1196843, 7)

In [12]:
tot_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar_iter,ls_nchar,letter_selector_temp_id,ls_index,n_search_space
0,a,2,1,1,0,[0],133001
1,ae,1,1,2,1,"[0, 4]",84952
2,ai,1,1,2,2,"[0, 8]",82414
3,b,1,1,1,3,[1],35319
4,ba,12,1,2,4,"[1, 0]",23469


In [13]:
# comparisons per selector: number of lookups (ls_count) times the number of
# rows each lookup scans (n_search_space)
tot_ls_df['tot_comps'] = tot_ls_df['ls_count'] * tot_ls_df['n_search_space']
# recomputes the selector length that build_letter_selector already stored
tot_ls_df['ls_nchar'] = tot_ls_df['letter_selector_temp'].str.len()

In [14]:
# sanity check: how many selector rows the ls_nchar_iter == 2 pass produced
# (value_counts().sum() adds the per-length counts back up to the row count)
tot_ls_df.loc[tot_ls_df['ls_nchar_iter'] == 2, 'ls_nchar'].value_counts().sum()

np.int64(2387)

In [15]:
# keep the three-character selectors from the ls_nchar_iter == 2 pass
# (that pass slices ls_nchar_iter + 1 = 3 characters); .copy() so the new
# columns added below do not write back into tot_ls_df
new_ls_df = tot_ls_df.loc[(tot_ls_df['ls_nchar_iter']==2) &
                      (tot_ls_df['ls_nchar'] == 3), :].copy()

In [16]:
# the graph suggests that search spaces greater than roughly 30K are the issue;
# note that the flag computed in the next cell uses a cut-off of 40,000

In [17]:
# 1 when the selector's search space is at or above the 40,000 cut-off, else 0
new_ls_df['n_search_space_cut'] = new_ls_df['n_search_space'].ge(40000).astype(int)

In [18]:
# number of three-character selectors at or above the cut-off
# open questions: do we need to split up half of the records, and how many
# selector records would the split add?
new_ls_df['n_search_space_cut'].sum()

np.int64(29)

In [19]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar_iter,ls_nchar,letter_selector_temp_id,ls_index,n_search_space,tot_comps,n_search_space_cut
346,bae,2,2,3,5,"[1, 0, 4]",15662,31324,0
347,bai,6,2,3,6,"[1, 0, 8]",13453,80718,0
348,bca,7,2,3,7,"[1, 2, 0]",7893,55251,0
349,bci,2,2,3,8,"[1, 2, 8]",6966,13932,0
350,bcl,553,2,3,9,"[1, 2, 11]",6039,3339567,0


In [20]:
# selectors flagged by the cut-off; compute_letter_selector extends these to
# four characters
add_split = new_ls_df.loc[new_ls_df['n_search_space_cut'] == 1, 'letter_selector_temp'].tolist()

In [21]:
len(add_split)

29

In [22]:
# convert to a set for the `in` test performed by compute_letter_selector
add_split = set(add_split)

In [23]:
len(add_split)

29

In [24]:
def compute_letter_selector(lgr, split_selectors=None, base_nchar=3):
    """Return the letter selector for one ranked letter group.

    The selector is the first ``base_nchar`` characters of ``lgr``. When that
    prefix is one of ``split_selectors`` it is extended by one character.

    Parameters
    ----------
    lgr : str
        Ranked letter-group string.
    split_selectors : collection of str, optional
        Prefixes that get one extra character. Defaults to the notebook-level
        ``add_split`` so that ``Series.map(compute_letter_selector)`` keeps
        working unchanged.
    base_nchar : int, optional
        Length of the default prefix (3 in this notebook).

    Returns
    -------
    str
        The selector. It is shorter than the requested length when ``lgr``
        itself is shorter.
    """
    if split_selectors is None:
        # original behaviour: read the notebook-level set
        split_selectors = add_split

    ls = lgr[:base_nchar]
    if ls in split_selectors:
        ls = lgr[:base_nchar + 1]
    return ls

In [25]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_temp
0,A,a,1,a,0,0,a,a,1,1,a
1,aa,aa,2,a,1,1,a,a,1,1,a
2,aal,aal,3,a,2,2,al,la,2,1,la
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai
4,aam,aam,3,a,4,4,am,ma,2,1,ma


In [26]:
# overwrite letter_selector_temp (last written as a side effect of
# build_letter_selector) with the final selector: three characters, or four
# when the three-character prefix is in add_split
wg_df['letter_selector_temp'] = wg_df['letter_group_ranked'].map(compute_letter_selector)

In [27]:
# count the word groups behind each final selector
col_names = ['letter_selector_temp', 'n_records']
new_ls_df = (
    wg_df[col_names]
    .groupby(col_names[:-1])
    .agg(ls_count=('n_records', 'sum'))
    .reset_index()
)
new_ls_df['ls_nchar'] = new_ls_df['letter_selector_temp'].str.len()

In [28]:
new_ls_df.shape

(2403, 3)

In [29]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar
0,a,2,1
1,ae,1,2
2,ai,1,2
3,b,1,1
4,ba,4,2


In [30]:
# Look up the cached search-space size of every final selector. ls_count is
# part of the key on purpose: ss_df can hold the same selector string with
# different counts (ss_df.head() shows 'ba' with ls_count 12, while the row
# matched here is 'ba' with ls_count 4). The keys are the two columns the
# frames share; naming them keeps the merge stable if a column is added.
new_ls_df = pd.merge(left=new_ls_df, right=ss_df, how='inner',
                     on=['letter_selector_temp', 'ls_count'])

In [31]:
new_ls_df.shape

(2403, 5)

In [32]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar,ls_index,n_search_space
0,a,2,1,[0],133001
1,ae,1,2,"[0, 4]",84952
2,ai,1,2,"[0, 8]",82414
3,b,1,1,[1],35319
4,ba,4,2,"[1, 0]",23469


In [33]:
# comparisons per selector: lookups (ls_count) times rows scanned per lookup
new_ls_df['tot_comps'] = new_ls_df['ls_count'] * new_ls_df['n_search_space']

In [34]:
new_ls_df.loc[new_ls_df['ls_nchar'] >= 3, 'tot_comps'].max()

19842264

In [35]:
# load the total number of anagrams; it is used below as the number of rows
# pre-allocated for output_list
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH,
                                             db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [36]:
# renumber the index 0..n-1; the run loop prints the index label as its
# progress counter
new_ls_df = new_ls_df.reset_index(drop = True)

In [37]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar,ls_index,n_search_space,tot_comps
0,a,2,1,[0],133001,266002
1,ae,1,2,"[0, 4]",84952,84952
2,ai,1,2,"[0, 8]",82414,82414
3,b,1,1,[1],35319,35319
4,ba,4,2,"[1, 0]",23469,93876


In [38]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_temp
0,A,a,1,a,0,0,a,a,1,1,a
1,aa,aa,2,a,1,1,a,a,1,1,a
2,aal,aal,3,a,2,2,al,la,2,1,la
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai
4,aam,aam,3,a,4,4,am,ma,2,1,ma


In [39]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar,ls_index,n_search_space,tot_comps
0,a,2,1,[0],133001,266002
1,ae,1,2,"[0, 4]",84952,84952
2,ai,1,2,"[0, 8]",82414,82414
3,b,1,1,[1],35319,35319
4,ba,4,2,"[1, 0]",23469,93876


In [40]:
# running 0-based id, one per selector row
new_ls_df['letter_selector_id'] = range(len(new_ls_df))

In [41]:
# Tag every word group with the id of its selector. No `on=` is passed, so
# pandas joins on whichever column names the two frames share.
# NOTE(review): wg_df is rebound here, so re-running this cell starts from
# the already-merged frame.
col_names = ['letter_selector_temp', 'letter_selector_id']
wg_df = pd.merge(left = wg_df, right = new_ls_df[col_names])

In [42]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_temp,letter_selector_id
0,A,a,1,a,0,0,a,a,1,1,a,0
1,aa,aa,2,a,1,1,a,a,1,1,a,0
2,aal,aal,3,a,2,2,al,la,2,1,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,1,ma,1115


In [43]:
new_ls_df.head()

Unnamed: 0,letter_selector_temp,ls_count,ls_nchar,ls_index,n_search_space,tot_comps,letter_selector_id
0,a,2,1,[0],133001,266002,0
1,ae,1,2,"[0, 4]",84952,84952,1
2,ai,1,2,"[0, 8]",82414,82414,2
3,b,1,1,[1],35319,35319,3
4,ba,4,2,"[1, 0]",23469,93876,4


In [44]:
# TODO: here!

In [45]:
# Run the search once per letter selector in new_ls_df and collect the
# resulting id pairs in output_list.
run_start_time=perf_counter_ns()
# pre-allocated pair buffer; -1 marks rows that have not been written
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1)
# one [ls_id, n_search_space, elapsed] entry per selector
output_time_list = []

# next free row of output_list
anagram_pair_count = 0

# .iloc[:None] selects every row; replace None with a small number to try the
# loop on the first few selectors only
for ls_row_id, ls_row in new_ls_df.iloc[:None].iterrows():
    # progress indicator
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()

    # selector id, and the wchar_matrix column positions of the selector's letters
    ls_id = ls_row['letter_selector_id']
    ls_id_index = np.array(ls_row['ls_index'])

    # boolean row mask: True where every selected column is >= 1
    outcome_indices = np.all(wchar_matrix[:, ls_id_index] >= 1, axis=1)

    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]

    # word group ids of the rows kept in ls_wchar_matrix (same order)
    temp_wg_id_list = word_group_id_list[outcome_indices]

    # number of rows in the search space for this selector,
    # e.g. the word groups that feature the letters 'bro'
    n_search_space = temp_wg_id_list.shape[0]

    # word groups assigned to this selector; each one is looked up below
    curr_wg_id_list = wg_df.loc[wg_df['letter_selector_id'] == ls_id, 'word_group_id'].to_numpy()
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):

        # row position of the current word group inside the sub-matrix. This is
        # a full scan of temp_wg_id_list per lookup (a wg_id -> position dict
        # was tried earlier), and [0][0] raises IndexError if the id is absent.
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]

        # ids of the rows whose every column is >= the current row's value;
        # the current row always qualifies (its difference is all zeros)
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]

        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words > 0:
            # format_output_list is a project helper; presumably it returns an
            # (n_from_words, 2) array pairing each match with wg_id — TODO confirm
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)

            # write the pairs into the next free slice of the buffer
            new_anagram_pair_count = anagram_pair_count + n_from_words

            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # advance the write cursor
            anagram_pair_count = new_anagram_pair_count

    # per-selector timing; calc_time is a project helper
    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_id, n_search_space, curr_time])

print('...truncating output list...')
# keep only rows where both columns are >= 0, i.e. drop the unwritten -1 rows
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
#compute_el_time(seconds=time_proc)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
...truncating output list...
(73218235, 2)


In [None]:
# the "simple" letter connect

In [None]:
def get_ls_index(ls:str):
    """Look up every character of ``ls`` in the notebook-level ``letter_dict``.

    Returns the looked-up values as a list, in character order. A character
    missing from ``letter_dict`` raises ``KeyError``.
    """
    column_ids = []
    for char in ls:
        column_ids.append(letter_dict[char])
    return column_ids

In [None]:
# "simple" selector: always the first three characters, with no extra splits
col_names = ['letter_selector', 'n_records']
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str[:3]
simple_ls_df = (
    wg_df[col_names]
    .groupby(col_names[:-1])
    .agg(ls_count=('n_records', 'sum'))
    .reset_index()
)
simple_ls_df['simple_letter_selector_id'] = range(len(simple_ls_df))
# letter_dict values for each selector's letters
simple_ls_df['ls_index_simple'] = simple_ls_df['letter_selector'].map(get_ls_index)


In [None]:
simple_ls_df.head()

In [None]:
wg_df.head()

In [None]:
# NOTE(review): repeats the assignment already made in the cell that builds
# simple_ls_df
wg_df['letter_selector'] = wg_df['letter_group_ranked'].str[:3]

In [None]:
simple_ls_df.head()

In [None]:
# columns to strip from wg_df; the drop itself is disabled — presumably it is
# re-enabled by hand to reset wg_df before repeating the merge (TODO confirm)
drop_col_names = ['letter_selector', 'ls_count', 'simple_letter_selector_id',
                  'ls_index', 'ls_index_simple']
#wg_df = wg_df.drop(labels = drop_col_names, axis = 1)

In [None]:
# attach the simple selector id, count and column index to every word group.
# No `on=` is given, so pandas joins on every column name the frames share;
# NOTE(review): re-running this cell after wg_df has gained simple_ls_df's
# columns changes the join keys.
wg_df = pd.merge(left=wg_df, right = simple_ls_df)

In [None]:
wg_df.head()

In [None]:
simple_ls_df.head()

In [None]:
# Same search as the split-selector run, driven by simple_ls_df (three-letter
# selectors only).
run_start_time=perf_counter_ns()
# pre-allocated pair buffer; -1 marks rows that have not been written
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1)
# one [ls_id, n_search_space, elapsed] entry per selector
output_time_list = []

# next free row of output_list
anagram_pair_count = 0

# .iloc[:None] selects every row; replace None with a small number to test
for ls_row_id, ls_row in simple_ls_df.iloc[:None].iterrows():
    # progress indicator
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()

    # selector id, and the wchar_matrix column positions of the selector's letters
    ls_id = ls_row['simple_letter_selector_id']
    ls_id_index = np.array(ls_row['ls_index_simple'])

    # boolean row mask: True where every selected column is >= 1
    outcome_indices = np.all(wchar_matrix[:, ls_id_index] >= 1, axis=1)

    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]

    # word group ids of the rows kept in ls_wchar_matrix (same order)
    temp_wg_id_list = word_group_id_list[outcome_indices]

    # number of rows in the search space for this selector
    n_search_space = temp_wg_id_list.shape[0]

    # word groups assigned to this selector; each one is looked up below
    curr_wg_id_list = wg_df.loc[wg_df['simple_letter_selector_id'] == ls_id, 'word_group_id'].to_numpy()
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):

        # row position of the current word group inside the sub-matrix (full
        # scan per lookup; [0][0] raises IndexError if the id is absent)
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]

        # ids of the rows whose every column is >= the current row's value;
        # the current row always qualifies (its difference is all zeros)
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]

        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words > 0:
            # format_output_list is a project helper; presumably it returns an
            # (n_from_words, 2) array pairing each match with wg_id — TODO confirm
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)

            # write the pairs into the next free slice of the buffer
            new_anagram_pair_count = anagram_pair_count + n_from_words

            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # advance the write cursor
            anagram_pair_count = new_anagram_pair_count

    # per-selector timing; calc_time is a project helper
    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_id, n_search_space, curr_time])

print('...truncating output list...')
# keep only rows where both columns are >= 0, i.e. drop the unwritten -1 rows
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
#compute_el_time(seconds=time_proc)

# old stuff

In [None]:
# Split the work by letter selector: the first n_subset_letters characters of
# letter_group_ranked.
n_subset_letters = 3
wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]

# number the distinct selectors in sorted order
letter_selector_list = wg_df["letter_selector"].unique()
letter_selector_list.sort()
letter_selector_id_dict = {ls: i_ls for i_ls, ls in enumerate(letter_selector_list)}

# NOTE(review): this overwrites any letter_selector_id column that an earlier
# cell merged into wg_df
wg_df["letter_selector_id"] = wg_df["letter_selector"].map(letter_selector_id_dict)
# Design note: the rows that match a given condition must be identified on the
# single wchar_matrix; creating new objects with abandon at every step is not
# an option. So, given wchar_matrix, which rows match a given selector?
# Three columns could be added to track this.

In [None]:
# constant column summed by the groupby below to count rows
# NOTE(review): the same assignment already appears near the top of the notebook
wg_df['n_records'] = int(1)

In [None]:
wg_df.head()

In [None]:
# grouping keys followed by the column to sum (the last entry)
col_names = ['letter_selector_id', 'letter_selector', 'n_records']

In [None]:
# one row per selector with the number of word groups it covers
ls_df = (
    wg_df[col_names]
    .groupby(col_names[:-1])
    .agg(ls_count=('n_records', 'sum'))
    .reset_index()
)

In [None]:
ls_df.head()

In [None]:
def get_ls_index(ls:str):
    """Look up every character of ``ls`` in the notebook-level ``letter_dict``.

    Returns the looked-up values as a list, in character order. A character
    missing from ``letter_dict`` raises ``KeyError``.

    NOTE(review): a function with this name and behaviour is also defined
    earlier in the notebook; one of the two definitions can be removed.
    """
    column_ids = []
    for char in ls:
        column_ids.append(letter_dict[char])
    return column_ids

In [None]:
# this is effectively a column selector: the letter_dict values of each
# selector's letters, used by the run loop to index wchar_matrix columns
ls_df['ls_index'] = ls_df['letter_selector'].map(get_ls_index)

In [None]:
# Run the search once per letter selector in ls_df and collect the resulting
# id pairs in output_list.
run_start_time=perf_counter_ns()
# pre-allocated pair buffer; -1 marks rows that have not been written
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1)
# one [ls_id, n_search_space, elapsed] entry per selector
output_time_list = []

# next free row of output_list
anagram_pair_count = 0

# .iloc[:None] selects every row; replace None with a small number to test
for ls_row_id, ls_row in ls_df.iloc[:None].iterrows():
    # progress indicator
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()

    # selector id, and the wchar_matrix column positions of the selector's letters
    ls_id = ls_row['letter_selector_id']
    ls_id_index = np.array(ls_row['ls_index'])

    # boolean row mask: True where every selected column is >= 1
    outcome_indices = np.all(wchar_matrix[:, ls_id_index] >= 1, axis=1)

    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]

    # word group ids of the rows kept in ls_wchar_matrix (same order)
    temp_wg_id_list = word_group_id_list[outcome_indices]

    # number of rows in the search space for this selector
    n_search_space = temp_wg_id_list.shape[0]

    # word groups assigned to this selector; each one is looked up below
    curr_wg_id_list = wg_df.loc[wg_df['letter_selector_id'] == ls_id, 'word_group_id'].to_numpy()
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):

        # row position of the current word group inside the sub-matrix (full
        # scan per lookup; [0][0] raises IndexError if the id is absent)
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]

        # ids of the rows whose every column is >= the current row's value;
        # the current row always qualifies (its difference is all zeros)
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]

        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words > 0:
            # format_output_list is a project helper; presumably it returns an
            # (n_from_words, 2) array pairing each match with wg_id — TODO confirm
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)

            # write the pairs into the next free slice of the buffer
            new_anagram_pair_count = anagram_pair_count + n_from_words

            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # advance the write cursor
            anagram_pair_count = new_anagram_pair_count

    # per-selector timing; calc_time is a project helper
    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_id, n_search_space, curr_time])

print('...truncating output list...')
# keep only rows where both columns are >= 0, i.e. drop the unwritten -1 rows
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
time_proc = calc_time(time_start=run_start_time, round_digits=4)
# NOTE(review): "elapase" looks like a misspelling, and the other run cells
# reference compute_el_time (commented out) — confirm which name the helper
# module actually defines
compute_elapase_time(seconds=time_proc)

In [None]:
def build_timing_and_output_objects(output_time_list:list, ls_df:pd.DataFrame):
    """Turn the per-selector timing records into a frame joined with ``ls_df``.

    ``output_time_list`` holds ``[letter_selector_id, n_search_space,
    total_time]`` rows. The summed time is handed to ``get_hms`` (its return
    value is not used here). The timing frame is then merged with ``ls_df``
    on the columns they share, and ``avg_lookup_time`` is added as
    ``total_time / ls_count``.
    """
    timing_columns = ['letter_selector_id', 'n_search_space', 'total_time']
    time_df = pd.DataFrame(data=output_time_list, columns=timing_columns)

    total_seconds = time_df['total_time'].sum()
    get_hms(seconds=total_seconds, round_seconds_digits=4)

    # bring in the selector information (no explicit keys: shared columns)
    time_df = pd.merge(left=time_df, right=ls_df)
    time_df['avg_lookup_time'] = time_df['total_time'] / time_df['ls_count']

    return time_df

In [None]:
# combine the timings collected by the run loop with the selector table
time_df = build_timing_and_output_objects(output_time_list=output_time_list,
                                          ls_df = ls_df)

In [None]:
time_df.head()

In [None]:
time_df['ls_count'].describe()

In [None]:
# selector length in characters
time_df['ls_n_chars'] = time_df['letter_selector'].str.len()

In [None]:
time_df.head()

In [None]:
# so, what is the relationship between total time and number of characters?
# n_search_space: the size of the letter selector
# ls_count: the number of look-ups of a search space of that size

In [None]:
time_df['ls_count'].sum()

In [None]:
# search-space size divided by the number of lookups made against it
time_df['n_ss_ls_ratio'] = time_df['n_search_space'] /  time_df['ls_count']

In [None]:
time_df['n_ss_ls_ratio'].describe()

In [None]:
# working copy limited to three-character selectors; .copy() so the columns
# added below do not write back into time_df
w_time_df = time_df.loc[time_df['ls_n_chars'] == 3, :].copy()

In [None]:
w_time_df['n_ss_ls_ratio'].describe()

In [None]:
w_time_df.head()

In [None]:
# comparisons per selector: rows scanned per lookup times number of lookups
w_time_df['total_comps'] = w_time_df['n_search_space'] * w_time_df['ls_count']

In [None]:
w_time_df['total_comps'].describe()

In [None]:
# log10 scale, used as the x axis of the "total comps" scatter plot
w_time_df['total_comps_log'] = np.log10(w_time_df['total_comps'])

In [None]:
w_time_df['total_comps_log'].describe()

In [None]:
w_time_df['total_time'].describe()

In [None]:
w_time_df['n_ss_ls_ratio'].describe()

In [None]:
# scratch cell: the same values are used as x_ticks in the ratio scatter plot
10, 100, 500, 1000, 10000, 20000, 40000

In [None]:
# log10 versions of both plot axes
# NOTE(review): np.log10 gives -inf for a value of 0 — worth checking that
# total_time is never 0 after rounding
w_time_df['n_ss_ls_ratio_log'] = np.log10(w_time_df['n_ss_ls_ratio'])
w_time_df['total_time_log'] = np.log10(w_time_df['total_time'])

In [None]:
w_time_df['n_ss_ls_ratio_log'].describe()

In [None]:
w_time_df['total_time_log'].describe()

In [None]:
w_time_df['total_time'].describe()

In [None]:
# Scatter of log10(total time) against log10(search-space size / lookups).
sns.set_theme(style = "whitegrid")
f, ax = plt.subplots(figsize = (20, 5))

# draw on the axes created above rather than on the implicit current axes
my_plot = sns.scatterplot(data=w_time_df, x="n_ss_ls_ratio_log", y="total_time_log",
                          marker='o', ax=ax)

# Both plotted columns are log10 values, so ticks sit at log10(value) and are
# labelled with the untransformed value.
# Fixed: the second entry was .05, which duplicated a later tick and broke the
# ascending order; .005 is the value that fits the sequence.
y_ticks = [0.001, .005, .01, .025, .05, .1, .25, .5, 1, 1.5]
y_ticks_log = [np.log10(yt) for yt in y_ticks]
y_tick_labels_formatted = ['{:.3f}'.format(ytl) for ytl in y_ticks]

x_ticks = [10, 100, 500, 1000, 10000, 20000, 40000]
x_ticks_log = [np.log10(xt) for xt in x_ticks]
print(x_ticks)
x_tick_labels_formatted = ['{:,}'.format(xtl) for xtl in x_ticks]

my_plot.set_yticks(ticks = y_ticks_log)
my_plot.set_yticklabels(labels = y_tick_labels_formatted, rotation=0)

my_plot.set_xticks(ticks = x_ticks_log)
my_plot.set_xticklabels(labels = x_tick_labels_formatted, rotation=0)

plt.title(label = "Total time by search space size over number of lookups")
plt.xlabel(xlabel = 'Search space size over number of lookups')
plt.ylabel(ylabel = "Total Time (seconds)")
# Build the path from components: the previous '..\\graphics\\...' literal
# only resolves to a directory path on Windows.
output_file_name = os.path.join(
    '..', 'graphics',
    'test_graphic_total_time_by_search_space_size_over_number_of_lookups.png')
output_file_name = os.path.normpath(output_file_name)
print(output_file_name)
my_plot.get_figure().savefig(fname = output_file_name)
plt.show()

In [None]:
w_time_df.head()

In [None]:
w_time_df['avg_lookup_time'].describe()

In [None]:
w_time_df['n_search_space'].describe()

In [None]:
# log10 versions of both axes of the average-lookup-time plot
w_time_df['n_search_space_log'] = np.log10(w_time_df['n_search_space'])
w_time_df['avg_lookup_time_log'] = np.log10(w_time_df['avg_lookup_time'])

In [None]:
# Scatter of log10(average lookup time) against log10(search-space size),
# coloured by total time.
sns.set_theme(style = "whitegrid")
f, ax = plt.subplots(figsize = (20, 5))

# draw on the axes created above rather than on the implicit current axes
my_plot = sns.scatterplot(data=w_time_df, x="n_search_space_log", y="avg_lookup_time_log",
                          hue='total_time',
                          marker='o', ax=ax)

# Both plotted columns are log10 values, so ticks sit at log10(value) and are
# labelled with the untransformed value.
y_ticks = [0.00001, .0001, .001, .01, .1, .2]
y_ticks_log = [np.log10(yt) for yt in y_ticks]
y_tick_labels_formatted = ['{:.5f}'.format(ytl) for ytl in y_ticks]

x_ticks = [10, 100, 500, 1000, 10000, 20000, 50000]
x_ticks_log = [np.log10(xt) for xt in x_ticks]
print(x_ticks)
x_tick_labels_formatted = ['{:,}'.format(xtl) for xtl in x_ticks]

my_plot.set_yticks(ticks = y_ticks_log)
my_plot.set_yticklabels(labels = y_tick_labels_formatted, rotation=0)

my_plot.set_xticks(ticks = x_ticks_log)
my_plot.set_xticklabels(labels = x_tick_labels_formatted, rotation=0)

plt.title(label = "Avg lookup time by search space size")
plt.xlabel(xlabel = 'Search space size')
plt.ylabel(ylabel = "Average time (seconds)")
# Fixed: this cell used the file name of the "total time by search space size
# over number of lookups" figure and therefore overwrote it. The path is also
# built from components, because the '..\\graphics\\...' literal only
# resolves to a directory path on Windows.
output_file_name = os.path.join(
    '..', 'graphics',
    'test_graphic_avg_lookup_time_by_search_space_size.png')
output_file_name = os.path.normpath(output_file_name)
print(output_file_name)
my_plot.get_figure().savefig(fname = output_file_name)
plt.show()

In [None]:
w_time_df.head()

In [None]:
# Scatter of log10(total time) against log10(total comparisons), coloured by
# search-space size.
sns.set_theme(style = "whitegrid")
f, ax = plt.subplots(figsize = (20, 5))

# Fixed: the call carried a dangling `size =` with no value, which made the
# whole cell a SyntaxError. It is removed; add `size='<column>'` here if
# point sizes should vary.
my_plot = sns.scatterplot(data=w_time_df, x="total_comps_log", y="total_time_log",
                          hue='n_search_space',
                          marker='o', ax=ax)

# The y column holds log10 values, so ticks sit at log10(value) and are
# labelled with the untransformed value.
# Fixed: the second entry was .05, which duplicated a later tick and broke the
# ascending order; .005 is the value that fits the sequence.
y_ticks = [0.001, .005, .01, .025, .05, .1, .25, .5, 1, 1.5]
y_ticks_log = [np.log10(yt) for yt in y_ticks]
y_tick_labels_formatted = ['{:.3f}'.format(ytl) for ytl in y_ticks]

# Here x_ticks are the tick positions (powers of ten, i.e. already log10) and
# x_ticks_log holds the corresponding untransformed values used for labels.
x_ticks = [0, 1, 2, 3, 4, 5, 6, 7, 8]
x_ticks_log = [10 ** xt for xt in x_ticks]
x_tick_labels_formatted = ['{:,}'.format(xtl) for xtl in x_ticks_log]

my_plot.set_yticks(ticks = y_ticks_log)
my_plot.set_yticklabels(labels = y_tick_labels_formatted, rotation=0)

my_plot.set_xticks(ticks = x_ticks)
my_plot.set_xticklabels(labels = x_tick_labels_formatted, rotation=0)

plt.title(label = "Total time by total comps")
plt.xlabel(xlabel = 'Total comparisons')
plt.ylabel(ylabel = "Total Time (seconds)")
# Build the path from components: the previous '..\\graphics\\...' literal
# only resolves to a directory path on Windows.
output_file_name = os.path.join(
    '..', 'graphics', 'test_graphic_total_time_by_total_comps.png')
output_file_name = os.path.normpath(output_file_name)
print(output_file_name)
my_plot.get_figure().savefig(fname = output_file_name)
plt.show()

In [None]:
# let's try to keep the total comps to less than 30K
w_time_df.loc[w_time_df['total_comps'] < 30000, 'total_time'].describe()

In [None]:
wg_df.head()

In [None]:
ls_df.shape

In [None]:
wg_df['n_chars'].describe()

In [None]:
# NOTE(review): named *_dict but initialised as a list, and not referenced
# anywhere else in the visible notebook
n_char_split_dict = []

In [None]:
# UPDATE THE SELECTORS
# Same steps as the three-letter version, now with four-character selectors.
n_subset_letters = 4
wg_df["letter_selector"] = wg_df["letter_group_ranked"].str[:n_subset_letters]

# number the distinct selectors in sorted order
letter_selector_list = wg_df["letter_selector"].unique()
letter_selector_list.sort()
letter_selector_id_dict = {ls: i_ls for i_ls, ls in enumerate(letter_selector_list)}

# overwrites the letter_selector / letter_selector_id columns already on wg_df
wg_df["letter_selector_id"] = wg_df["letter_selector"].map(letter_selector_id_dict)
# Design note: the rows that match a given condition must be identified on the
# single wchar_matrix; creating new objects with abandon at every step is not
# an option. So, given wchar_matrix, which rows match a given selector?
# Three columns could be added to track this.

In [None]:
# let's get cute and compute splits for all characters!

In [None]:
sorted(wg_df['letter_group_ranked'].str.len().unique().tolist())

In [None]:
# Count the word groups behind every selector for each prefix-length setting
# and stack the results. The slice keeps ls_nchar + 1 characters, and the
# letter_selector_temp column on wg_df is overwritten on every pass.
col_names = ['letter_selector_temp', 'n_records']
ls_df_list = []
for ls_nchar in range(1, 17):
    wg_df['letter_selector_temp'] = wg_df['letter_group_ranked'].str[:ls_nchar + 1]
    # group on every column except the last (n_records), which is summed
    ls_df = wg_df[col_names].groupby(col_names[:-1]).agg(ls_count = ('n_records', 'sum')).reset_index()
    ls_df_list.append(ls_df)

tot_ls_df = pd.concat(objs=ls_df_list,axis = 0)

In [None]:
tot_ls_df.shape

In [None]:
tot_ls_df.head()

In [None]:
tot_ls_df.head()

In [None]:
tot_ls_df.shape

In [None]:
# the stacked passes repeat some (selector, count) pairs; keep the first of each
tot_ls_df = tot_ls_df.drop_duplicates(subset = ['letter_selector_temp', 'ls_count'])

In [None]:
tot_ls_df.shape

In [None]:
tot_ls_df.head()

In [None]:
wg_df.head()

In [None]:
# rebuild the per-selector table from the current letter_selector columns
col_names = ['letter_selector_id', 'letter_selector', 'n_records']
ls_df = wg_df[col_names].groupby(col_names[:-1]).agg(ls_count = ('n_records', 'sum')).reset_index()
# this is effectively a column selector: the letter_dict values of each
# selector's letters
ls_df['ls_index'] = ls_df['letter_selector'].map(get_ls_index)

In [None]:
ls_df.shape

In [None]:
ls_df.head()

In [None]:
# letter_dict values of each selector's letters, used to index wchar_matrix
# columns in the counting loop below
tot_ls_df['ls_index'] = tot_ls_df['letter_selector_temp'].map(get_ls_index)

In [None]:
tot_ls_df.head()

In [None]:
# renumber the index 0..n-1; the counting loop prints the index label as its
# progress counter
tot_ls_df = tot_ls_df.reset_index(drop=True)

In [None]:
tot_ls_df.shape

In [None]:
tot_ls_df.index

In [None]:
# For every selector, count the wchar_matrix rows in which each of the
# selector's columns is >= 1. That count is the selector's search-space size.
n_search_space_list = []
for ls_row_id, ls_row in tot_ls_df.iloc[:None].iterrows():
    # progress indicator every 1000 selectors
    if not ls_row_id % 1000:
        print(ls_row_id)

    # column positions of the selector's letters
    ls_id_index = np.array(ls_row['ls_index'])

    # boolean row mask over wchar_matrix
    selected_columns = wchar_matrix[:, ls_id_index]
    outcome_indices = np.all(selected_columns >= 1, axis=1)

    n_search_space_list.append(outcome_indices.sum())

In [None]:
# one count per tot_ls_df row, in the order the loop visited them
tot_ls_df['n_search_space'] = n_search_space_list

In [None]:
tot_ls_df.head()

In [None]:
# cache the counts; this file is read back with pd.read_csv near the top of
# the notebook, where the ls_index lists come back as text and are re-parsed
tot_ls_df.to_csv(path_or_buf='search_space_count.csv', index = False)

In [None]:
# try the above with cupy!
import cupy as cp

In [None]:
# cupy copy of wchar_matrix for the cupy version of the counting loop
wchar_matrix_cp = cp.asarray(a=wchar_matrix)

In [None]:
# cupy version of the search-space count: for every selector, count the rows
# of wchar_matrix_cp in which each of the selector's columns is >= 1.
n_search_space_list = []
for ls_row_id, ls_row in tot_ls_df.iloc[:None].iterrows():
    # progress indicator every 1000 selectors
    if ls_row_id % 1000 == 0:
        print(ls_row_id)

    # column positions of the selector's letters, as a cupy array
    ls_id_index = cp.asarray(ls_row['ls_index'])

    # boolean row mask over the cupy matrix
    outcome_indices = cp.all(wchar_matrix_cp[:, ls_id_index] >= 1, axis=1)

    # Fixed: a cupy reduction returns a zero-dimensional cupy array, not a
    # Python/NumPy scalar. Convert it to int so the list holds plain integers
    # that can be assigned to a DataFrame column.
    n_search_space_list.append(int(outcome_indices.sum()))

In [None]:
w_time_df.head()

In [None]:
# NOTE(review): n_search_space_list is filled by loops over tot_ls_df, not
# ls_df; pandas raises ValueError if the list length differs from len(ls_df)
# — confirm which frame was intended here
ls_df['n_search_space'] = n_search_space_list

In [None]:
ls_df.head()


In [None]:
# selector length in characters
ls_df['ls_n_chars'] = ls_df['letter_selector'].str.len()

In [None]:
ls_df['ls_n_chars'].value_counts()

In [None]:
# comparisons per selector: lookups (ls_count) times rows scanned per lookup
ls_df['total_comps'] = ls_df['ls_count'] * ls_df['n_search_space']

In [None]:
ls_df.loc[ls_df['ls_n_chars'] >= 3, 'total_comps'].describe()