# Exp 05: Split large letter selectors
Use the data generated in the previous step to identify large search spaces and split them accordingly.


In [1]:
# standard libraries
from time import perf_counter_ns
import time

In [2]:
# external libraries
import numpy as np
import pandas as pd

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *

In [3]:
# specify which letter selector to use:
#   True  -> the existing letter selectors (prefixes of up to 3 characters)
#   False -> the modified letter selectors, where large search spaces are split further
use_existing = True

# search-space size at or above which a letter selector is flagged for splitting
# (compared with >= further down) - larger values flag fewer selectors and so
# produce fewer letter selectors overall
search_space_cut = 40000
# prefix length used for a flagged letter selector instead of the base length of 3.
# Larger numbers produce more letter selectors
# A value of 3 produces nothing different
# A value of 4 creates 16 additional letter selectors (with the 40000 cut above)
additional_letter_length = 4

In [4]:
# load the shared inputs through the project helper load_input_data; per its log
# output it reads the words, the word groups, the letter dictionary and the
# char matrix.
# NOTE(review): `rc` is not imported by name in this notebook; presumably it
# arrives via `from _run_constants import *` - confirm.
word_df, wg_df, letter_dict, char_matrix, \
    word_group_id_list, word_id_list, wchar_matrix = load_input_data(
        db_path=rc.DB_PATH, db_name=rc.DB_NAME,
        in_file_path=rc.IN_FILE_PATH)

...loading words into a dataframe...
...query execution took: 1.32 seconds...
...loading word groups into a dataframe...
...query execution took: 1.51 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [5]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count
0,A,a,1,a,0,0,a,a,1
1,aa,aa,2,a,1,1,a,a,1
2,aal,aal,3,a,2,2,al,la,2
3,aalii,aalii,5,a,3,3,ail,lai,1
4,aam,aam,3,a,4,4,am,ma,2


In [6]:
# add a constant column of ones so that records can be counted by summing it
# NOTE(review): presumably consumed by the letter selector helpers below - confirm
wg_df['n_records'] = int(1)

In [7]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records
0,A,a,1,a,0,0,a,a,1,1
1,aa,aa,2,a,1,1,a,a,1,1
2,aal,aal,3,a,2,2,al,la,2,1
3,aalii,aalii,5,a,3,3,ail,lai,1,1
4,aam,aam,3,a,4,4,am,ma,2,1


In [8]:
# build the letter selector table (prefix length 3) from the word groups.
# NOTE(review): judging by the wg_df.head() outputs before and after this cell,
# the call also appears to add a 'letter_selector_mod' column to wg_df as a
# side effect - confirm in build_letter_selector_df.
# The `ls_df` name is assigned again in the run cell further down.
ls_df = build_letter_selector_df(df = wg_df, ls_nchar=3,                          
                                 letter_selector_col_name='letter_selector_mod',
                                 letter_selector_id_col_name='letter_selector_mod_id')

In [9]:
# now, load the previously calculated search space counts
# (every column of the exp_02_search_space_size table)
sql = 'select * from exp_02_search_space_size;'

In [10]:
# run the query through the project helper query_db; the result is used as a dataframe below
tot_ls_df = query_db(sql = sql, db_path = rc.DB_PATH, db_name = rc.DB_NAME)

...query execution took: 6.74 seconds...


In [11]:
tot_ls_df.head()

Unnamed: 0,letter_selector,ls_count,ls_nchar_iter,ls_nchar,letter_selector_id,n_records,letter_selector_unique_id,n_search_space
0,a,4,1,1,0,1,0,133001
1,b,24368,1,1,1,1,1,35319
2,c,3921,1,1,2,1,2,80811
3,d,16164,1,1,3,1,3,56705
4,e,1,1,1,4,1,4,145106


In [12]:
# tot_comps: ls_count multiplied by n_search_space for each letter selector
# (presumably the total number of comparisons that selector requires - confirm)
tot_ls_df['tot_comps'] = tot_ls_df['ls_count'] * tot_ls_df['n_search_space']
# recompute ls_nchar from the selector string itself; this overwrites the
# ls_nchar column that came back from the database
tot_ls_df['ls_nchar'] = tot_ls_df['letter_selector'].str.len()

In [13]:
# check to make sure the counts are correct: number of rows produced by the
# 3-character iteration (value_counts().sum() counts the non-null ls_nchar values)
tot_ls_df.loc[tot_ls_df['ls_nchar_iter'] == 3, 'ls_nchar'].value_counts().sum()

np.int64(2387)

In [14]:
# c1
# flag (1/0) the letter selectors whose search space is at or above search_space_cut
# the graph suggests that search spaces greater than 30K are the issue
# NOTE(review): the search_space_cut configured at the top of the notebook is 40000, not 30K
tot_ls_df['n_search_space_cut'] = (tot_ls_df['n_search_space'] >= search_space_cut).astype(int)

In [15]:
# let's select where ls_nchar_iter == 3 and the selector itself is exactly
# 3 characters long; .copy() gives an independent frame rather than a view of tot_ls_df
mod_ls_df = tot_ls_df.loc[(tot_ls_df['ls_nchar_iter']==3) &
                      (tot_ls_df['ls_nchar'] == 3), :].copy()

In [16]:
# number of 3-character letter selectors flagged for splitting (sum of the 1/0 flag)
mod_ls_df['n_search_space_cut'].sum()
# open question: this will add up to how many records?

np.int64(29)

In [17]:
mod_ls_df.head()

Unnamed: 0,letter_selector,ls_count,ls_nchar_iter,ls_nchar,letter_selector_id,n_records,letter_selector_unique_id,n_search_space,tot_comps,n_search_space_cut
372,bae,2,3,3,5,1,341,15662,31324,0
373,bai,6,3,3,6,1,342,13453,80718,0
374,bca,7,3,3,7,1,343,7893,55251,0
375,bci,2,3,3,8,1,344,6966,13932,0
376,bcl,553,3,3,9,1,345,6039,3339567,0


In [18]:
# unique letter selectors that were flagged as having a large search space
add_split = set(mod_ls_df.loc[mod_ls_df['n_search_space_cut'] == 1, 'letter_selector'])
len(add_split)

29

In [19]:
# split large search spaces by an extra letter
def compute_letter_selector(lgr, base_length=3, split_set=None, split_length=None):
    """Return the letter selector (a prefix) for one ranked letter group.

    The selector is normally the first ``base_length`` characters of ``lgr``.
    When that prefix is one of the selectors in ``split_set``, a prefix of
    ``split_length`` characters is returned instead.

    Parameters
    ----------
    lgr : str
        The ranked letter group to derive the selector from.
    base_length : int, default 3
        Length of the standard prefix.
    split_set : collection of str, optional
        Prefixes that should be split further. Defaults to the notebook-level
        ``add_split``.
    split_length : int, optional
        Prefix length used for members of ``split_set``. Defaults to the
        notebook-level ``additional_letter_length``.

    Returns
    -------
    str
        The selector; shorter than the requested length when ``lgr`` is shorter.
    """
    # fall back to the notebook-level settings only when nothing was passed in,
    # so the single-argument call used with Series.map keeps working
    if split_set is None:
        split_set = add_split

    ls = lgr[:base_length]
    if ls in split_set:
        if split_length is None:
            split_length = additional_letter_length
        ls = lgr[:split_length]
    return ls

In [20]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_mod
0,A,a,1,a,0,0,a,a,1,1,a
1,aa,aa,2,a,1,1,a,a,1,1,a
2,aal,aal,3,a,2,2,al,la,2,1,la
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai
4,aam,aam,3,a,4,4,am,ma,2,1,ma


In [21]:
# recompute the modified letter selector from the ranked letter group,
# overwriting the letter_selector_mod column already present in wg_df
wg_df['letter_selector_mod'] = wg_df['letter_group_ranked'].map(compute_letter_selector)

In [22]:
# create a new letter_selector table from the recomputed letter_selector_mod column.
# create_letter_selector=False presumably tells the helper to use the column as it
# already exists in wg_df rather than derive it - confirm in build_letter_selector_df.
# NOTE: this re-binds mod_ls_df, which until now held the filtered copy of tot_ls_df.
mod_ls_df = build_letter_selector_df(df = wg_df,
                                     ls_nchar=3,
                                     letter_selector_col_name='letter_selector_mod',
                                     letter_selector_id_col_name='letter_selector_mod_id',
                                     create_letter_selector=False)
mod_ls_df.shape

(2403, 5)

In [23]:
mod_ls_df.head()

Unnamed: 0,letter_selector_mod,ls_count,ls_nchar_iter,ls_nchar,letter_selector_mod_id
0,a,2,3,1,0
1,ae,1,3,2,1
2,ai,1,3,2,2
3,b,1,3,1,3
4,ba,4,3,2,4


In [24]:
# attach the ls_index column (displayed below as a list of booleans per selector)
# NOTE(review): presumably one flag per letter in the letter dictionary - confirm in get_ls_index
mod_ls_df = get_ls_index(df = mod_ls_df, letter_selector_col_name='letter_selector_mod')

...loading the letter dictionary...


In [25]:
mod_ls_df['ls_nchar'].value_counts()

ls_nchar
3    2250
2     111
1      26
4      16
Name: count, dtype: int64

In [26]:
# load the total number of anagrams; it is used further down as the row count
# of the preallocated output array
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH,
                                             db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [27]:
# renumber the rows 0..n-1, discarding the previous index
mod_ls_df = mod_ls_df.reset_index(drop = True)

In [28]:
mod_ls_df.head()

Unnamed: 0,letter_selector_mod,ls_count,ls_nchar_iter,ls_nchar,letter_selector_mod_id,ls_index
0,a,2,3,1,0,"[True, False, False, False, False, False, Fals..."
1,ae,1,3,2,1,"[True, False, False, False, True, False, False..."
2,ai,1,3,2,2,"[True, False, False, False, False, False, Fals..."
3,b,1,3,1,3,"[False, True, False, False, False, False, Fals..."
4,ba,4,3,2,4,"[True, True, False, False, False, False, False..."


In [29]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_mod
0,A,a,1,a,0,0,a,a,1,1,a
1,aa,aa,2,a,1,1,a,a,1,1,a
2,aal,aal,3,a,2,2,al,la,2,1,la
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai
4,aam,aam,3,a,4,4,am,ma,2,1,ma


In [30]:
mod_ls_df.head()

Unnamed: 0,letter_selector_mod,ls_count,ls_nchar_iter,ls_nchar,letter_selector_mod_id,ls_index
0,a,2,3,1,0,"[True, False, False, False, False, False, Fals..."
1,ae,1,3,2,1,"[True, False, False, False, True, False, False..."
2,ai,1,3,2,2,"[True, False, False, False, False, False, Fals..."
3,b,1,3,1,3,"[False, True, False, False, False, False, Fals..."
4,ba,4,3,2,4,"[True, True, False, False, False, False, False..."


In [31]:
# attach the modified letter selector id to every word group.
# No `on=` is given, so pandas joins on the columns the two frames share
# (an inner join by default). validate='many_to_one' makes the merge raise
# pandas.errors.MergeError if a selector occurs more than once in mod_ls_df,
# instead of silently duplicating word group rows.
col_names = ['letter_selector_mod', 'letter_selector_mod_id']
wg_df = pd.merge(left = wg_df, right = mod_ls_df[col_names], validate = 'many_to_one')

In [32]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_mod,letter_selector_mod_id
0,A,a,1,a,0,0,a,a,1,1,a,0
1,aa,aa,2,a,1,1,a,a,1,1,a,0
2,aal,aal,3,a,2,2,al,la,2,1,la,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai,1083
4,aam,aam,3,a,4,4,am,ma,2,1,ma,1115


In [33]:
# build the existing (unmodified) letter selector table with prefix length 3,
# then attach its ls_index column.
# NOTE(review): judging by the wg_df.head() output that follows, the first call
# also appears to add a 'letter_selector' column to wg_df - confirm in
# build_letter_selector_df.
curr_ls_df = build_letter_selector_df(df = wg_df,
                                      ls_nchar=3,
                                      letter_selector_col_name='letter_selector',
                                      letter_selector_id_col_name='letter_selector_id')
curr_ls_df = get_ls_index(df = curr_ls_df, letter_selector_col_name='letter_selector')

...loading the letter dictionary...


In [34]:
curr_ls_df.head()

Unnamed: 0,letter_selector,ls_count,ls_nchar_iter,ls_nchar,letter_selector_id,ls_index
0,a,2,3,1,0,"[True, False, False, False, False, False, Fals..."
1,ae,1,3,2,1,"[True, False, False, False, True, False, False..."
2,ai,1,3,2,2,"[True, False, False, False, False, False, Fals..."
3,b,1,3,1,3,"[False, True, False, False, False, False, Fals..."
4,ba,4,3,2,4,"[True, True, False, False, False, False, False..."


In [35]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,n_records,letter_selector_mod,letter_selector_mod_id,letter_selector
0,A,a,1,a,0,0,a,a,1,1,a,0,a
1,aa,aa,2,a,1,1,a,a,1,1,a,0,a
2,aal,aal,3,a,2,2,al,la,2,1,la,1081,la
3,aalii,aalii,5,a,3,3,ail,lai,1,1,lai,1083,lai
4,aam,aam,3,a,4,4,am,ma,2,1,ma,1115,ma


In [36]:
# attach the existing letter selector id to every word group.
# No `on=` is given, so pandas joins on the columns the two frames share
# (an inner join by default). validate='many_to_one' makes the merge raise
# pandas.errors.MergeError if a selector occurs more than once in curr_ls_df,
# instead of silently duplicating word group rows.
col_names = ['letter_selector', 'letter_selector_id']
wg_df = pd.merge(left = wg_df, right = curr_ls_df[col_names], validate = 'many_to_one')

In [37]:
# number of letter selectors using the existing technique
wg_df['letter_selector'].unique().shape

(2387,)

In [38]:
# number of letter selectors using the modified technique
wg_df['letter_selector_mod'].unique().shape

(2403,)

In [39]:
# choose between the existing and the modified letter selectors
if use_existing:
    print('using the existing splits')
    ls_df, ls_id_col_name = curr_ls_df.copy(), 'letter_selector_id'
else:
    print('using the modified splits')
    print(f'Search space cut: {search_space_cut} | Additional letter length {additional_letter_length}')
    ls_df, ls_id_col_name = mod_ls_df.copy(), 'letter_selector_mod_id'

# (selector id, word group id) pairs and the stacked ls_index rows,
# built the same way whichever selector table was chosen
ls_id_wg_id = wg_df[[ls_id_col_name, 'word_group_id']].to_numpy(dtype = np.int32)
ls_index_array = np.array(ls_df['ls_index'].to_list())

# run it!
# everything from here on is timed as one run
run_start_time=perf_counter_ns()
# create the output list: one row per possible pair, two int32 columns,
# pre-filled with -1 so that rows which are never written can be dropped at the end
output_list = np.full(shape = (n_possible_anagrams, 2), fill_value=-1, dtype=np.int32)
# one record per letter selector: [ls_row_id, n_search_space, elapsed time]
output_time_list = []

# start counting: index of the next unwritten row in output_list
anagram_pair_count = 0

# NOTE(review): ls_row_id is the position of the row within ls_index_array; the
# filter on ls_id_wg_id further down assumes the selector ids stored in its
# first column equal that position - confirm.
for ls_row_id, ls_row in enumerate(ls_index_array):    
    # progress indicator, printed every 100 letter selectors
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()    
        
    ##
    # SUBSET THE wchar_matrix by column selector:
    # keep the rows whose value is >= 1 in every column picked out by ls_row
    ##    
    outcome_indices = np.all(wchar_matrix[:, ls_row] >= 1, axis=1)
    
    # this is the sub-matrix from which to query
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]
        
    # the word group ids of the rows kept in ls_wchar_matrix, in the same order
    temp_wg_id_list = word_group_id_list[outcome_indices]
    
    # this is the number of word groups that meet the selector criteria.
    # for example, words that feature the letters: 'buc'    
    n_search_space = temp_wg_id_list.shape[0]        
    
    # the word group ids assigned to the current letter selector.
    # e.g. the words whose least common letters are 'buc'
    curr_wg_id_list = ls_id_wg_id[ls_id_wg_id[:, 0] == ls_row_id, 1]
     
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):    
            
        # position of the current word group within the subset (temp_wg_id_list);
        # [0][0] takes the first match and raises IndexError when there is none
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]
        #print(curr_wg_id, temp_wg_id)
        
        # ids of every row in the subset whose values are >= the current row's
        # values in all columns (the element-wise difference is non-negative).
        # NOTE(review): the subtraction assumes a signed dtype for wchar_matrix;
        # with an unsigned dtype negative differences would wrap around - confirm.
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]        
                
        n_from_words = outcome_word_id_list.shape[0]
        
        # NOTE(review): the current row compares equal to itself, so this count
        # looks like it is always at least 1 - confirm.
        if n_from_words > 0:
            # format_output_list is a project helper; presumably it pairs each
            # matching id with wg_id as an (n_from_words, 2) array - confirm
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)
                        
            # enumerate the from/parent words: end of the slice to write
            new_anagram_pair_count = anagram_pair_count + n_from_words
            
            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    # time spent on this letter selector (calc_time is a project helper)
    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_row_id, n_search_space, curr_time])

print('...time to find parent/child word relationships')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
# keep only the rows that were filled in (no -1 sentinel left in either column)
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
# elapsed time again, this time including the truncation step
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

using the existing splits
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
...time to find parent/child word relationships
Hours: 0 | minutes: 1 | seconds: 47.168
...truncating output list...
(73218235, 2)
Hours: 0 | minutes: 1 | seconds: 51.4087


In [40]:
# c2
# wall-clock timings noted for each configuration
# (the output recorded above, 1 minute 47 seconds with the existing splits, is from a different run):
# current technique: 1 minute, 19 seconds
# modified technique, with 40K cutoff, up to four characters: 1 minute, 34 seconds
# modified technique, with 35K cutoff, up to four characters: 1 minute, 57 seconds
# modified technique, with 30K cutoff, up to four characters: 1 minute, 30 seconds
# modified technique, with 40K cutoff, up to five characters: 1 minute, 33 seconds

In [41]:
# approximately 19 seconds
# build_counters is a project helper; presumably it counts how often each id
# appears on the from side and on the to side of output_list - confirm
from_word_counter, to_word_counter = build_counters(output_list = output_list)

In [42]:
# the number of from word groups
# spot check of one id against known counts: the values are still printed, and
# now also asserted so that a mismatch stops the run instead of relying on
# someone reading the output
print(from_word_counter[746]) # should be 26
print(to_word_counter[746]) # should be 329
assert from_word_counter[746] == 26, 'from_word_counter[746] should be 26'
assert to_word_counter[746] == 329, 'to_word_counter[746] should be 329'

26
329


In [43]:
# The takeaway: modestly adjusting the search space does not decrease processing time
# (every modified configuration timed above was slower than the current technique)