# Exp 06: Hard code NumPy data types
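NumPy's default integer dtype is 64-bit on most platforms.
Many of the arrays used here hold values that never exceed the 8-bit or 32-bit maximum, so this experiment casts them to narrower integer dtypes and measures whether the search runs faster.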

In [1]:
# standard libraries
from time import perf_counter_ns
import time

In [2]:
# external libraries
import numpy as np
import pandas as pd

# custom libraries
from _run_constants import *
from part_00_file_db_utils import *
from part_00_process_functions import *

# Load Data

In [3]:
# Load every input of the experiment with a single call. The seven results
# are unpacked by position, so the order of the names on the left must match
# the order in which load_input_data() returns them.
(
    word_df, wg_df, letter_dict, char_matrix,
    word_group_id_list, word_id_list, wchar_matrix,
) = load_input_data(
    db_path=rc.DB_PATH,
    db_name=rc.DB_NAME,
    in_file_path=rc.IN_FILE_PATH,
)

...loading words into a dataframe...
...query execution took: 1.17 seconds...
...loading word groups into a dataframe...
...query execution took: 1.14 seconds...
...loading the letter dictionary...
...loading the char matrix...
...subsetting the char matrix...


In [4]:
# Build the letter-selector frame from the word-group frame.
# ls_nchar=3 is the selector length requested from the helper; the ls_nchar
# value counts shown a few cells below list selectors of 1, 2 and 3 letters.
# The two *_col_name arguments name the selector column and its id column,
# which the later merge into wg_df relies on.
ls_df = build_letter_selector_df(
    df=wg_df,
    ls_nchar=3,
    letter_selector_col_name='letter_selector',
    letter_selector_id_col_name='letter_selector_id',
)

In [5]:
# Replace ls_df with the frame returned by get_ls_index().
# NOTE(review): presumably this is the step that adds the boolean `ls_index`
# column visible in the ls_df.head() preview further down -- confirm in the
# helper's definition.
ls_df = get_ls_index(df = ls_df)


...loading the letter dictionary...


In [6]:
# how many letter selectors there are for each distinct value of ls_nchar
ls_df['ls_nchar'].value_counts()

ls_nchar
3    2250
2     111
1      26
Name: count, dtype: int64

In [7]:
# ls_count is the count of lookups for each letter selector;
# describe() summarises its distribution (count, mean, std, quartiles, max)
ls_df['ls_count'].describe()

count    2387.000000
mean       90.423963
std       212.422095
min         1.000000
25%         3.000000
50%        14.000000
75%        75.000000
max      2544.000000
Name: ls_count, dtype: float64

In [8]:
# 99th percentile of ls_count: the lookup count that 99% of the letter
# selectors do not exceed (this is a value of ls_count, not a number of rows)
np.quantile(a = ls_df['ls_count'], q = .99)

np.float64(1038.5199999999977)

In [9]:
# load the total number of possible anagrams from the database;
# the search cell below uses it as the number of rows to pre-allocate
# in output_list
n_possible_anagrams = load_possible_anagrams(db_path=rc.DB_PATH,
                                             db_name=rc.DB_NAME)

...query execution took: 0.0 seconds...


In [10]:
# give ls_df a fresh 0..n-1 index; drop=True discards the old index instead
# of keeping it as a column
ls_df = ls_df.reset_index(drop = True)

In [11]:
# merge to identify each word's letter_selector
# No `on=` is passed, so pd.merge joins on every column name the two frames
# share, and the default how='inner' keeps only rows that find a match.
# NOTE(review): presumably the shared key is just 'letter_selector'. Passing
# on= explicitly would stop a re-run of this cell (when wg_df already carries
# letter_selector_id) from silently changing the join keys -- confirm first
# which columns wg_df has at this point.
col_names = ['letter_selector', 'letter_selector_id']
wg_df = pd.merge(left = wg_df, right = ls_df[col_names])

In [12]:
# preview the first rows of the merged frame
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count,letter_selector,n_records,letter_selector_id
0,A,a,1,a,0,0,a,a,1,a,1,0
1,aa,aa,2,a,1,1,a,a,1,a,1,0
2,aal,aal,3,a,2,2,al,la,2,la,1,1081
3,aalii,aalii,5,a,3,3,ail,lai,1,lai,1,1083
4,aam,aam,3,a,4,4,am,ma,2,ma,1,1114


In [13]:
# Build the two lookup arrays consumed by the search loop further down:
#   ls_id_wg_id    - indexed there as a 2-column array: column 0 is compared
#                    with the letter-selector row id, column 1 supplies the
#                    matching word group ids
#   ls_index_array - iterated row by row; each row is used to select columns
#                    of wchar_matrix
ls_id_wg_id, ls_index_array = build_ls_index_arrays(wg_df=wg_df, ls_df = ls_df)

In [14]:
# number of distinct letter_selector values in wg_df, shown as a 1-tuple shape
wg_df['letter_selector'].unique().shape

(2387,)

In [15]:
# Show the representable range of each signed NumPy integer width, narrowest
# first, as a reference for how far the arrays below can be down-cast.
print(
    *(np.iinfo(width) for width in (np.int8, np.int16, np.int32, np.int64)),
    sep='\n',
)

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------



In [16]:
# check the current dtype and the max value of the wchar_matrix to see how
# narrow an integer type it can be cast to
# (only the maximum is inspected here, not the minimum)
print(wchar_matrix.dtype)
print(wchar_matrix.max())

int32
8


In [17]:
# current dtype and max value of the word_group_id_list, to see how narrow an
# integer type it can be cast to
# (only the maximum is inspected here, not the minimum)
print(word_group_id_list.dtype)
print(word_group_id_list.max())

int64
215841


In [18]:
# the search loop below reads three arrays that are built before it runs
# (they are not created inside the loop): the wchar_matrix, the
# word_group_id_list, and the ls_id_wg_id
# convert the wchar_matrix to int8
# convert the word_group_id_list and the ls_id_wg_id to int32

In [19]:
def _downcast_int(arr, dtype):
    """Return ``arr`` cast to the integer ``dtype``, refusing lossy casts.

    ``ndarray.astype`` wraps out-of-range integers silently, which would
    corrupt counts or ids without raising anything. The value range is
    therefore checked before the cast.

    Args:
        arr: NumPy array to convert.
        dtype: target NumPy integer type, e.g. ``np.int8``.

    Returns:
        A new array with the requested dtype.

    Raises:
        OverflowError: if any value of ``arr`` lies outside the range of
            ``dtype``.
    """
    limits = np.iinfo(dtype)
    # an empty array has no min/max and is trivially safe to cast
    if arr.size and (arr.min() < limits.min or arr.max() > limits.max):
        raise OverflowError(
            f'values in [{arr.min()}, {arr.max()}] do not fit in {limits.dtype}')
    return arr.astype(dtype)


# wchar_matrix peaked at 8 and word_group_id_list at 215841 in the checks
# above, so int8 and int32 are wide enough for them. The range of ls_id_wg_id
# was not inspected in the cells above, so it is verified here before casting.
wchar_matrix = _downcast_int(wchar_matrix, np.int8)
word_group_id_list = _downcast_int(word_group_id_list, np.int32)
ls_id_wg_id = _downcast_int(ls_id_wg_id, np.int32)


In [20]:
# preview the first rows of ls_df, the frame that was passed to
# build_ls_index_arrays() above
ls_df.head()

Unnamed: 0,letter_selector,ls_count,ls_nchar_iter,ls_nchar,letter_selector_id,ls_index
0,a,2,3,1,0,"[True, False, False, False, False, False, Fals..."
1,ae,1,3,2,1,"[True, False, False, False, True, False, False..."
2,ai,1,3,2,2,"[True, False, False, False, False, False, Fals..."
3,b,1,3,1,3,"[False, True, False, False, False, False, Fals..."
4,ba,4,3,2,4,"[True, True, False, False, False, False, False..."


In [21]:
# run it!
# For every row of ls_index_array, narrow wchar_matrix to a candidate
# sub-matrix, then compare each word group that belongs to that letter selector
# against the candidates. Matching ids are written into one pre-allocated
# int32 array. Timing is recorded for the whole run and per letter selector.
run_start_time=perf_counter_ns()
# create the output list: n_possible_anagrams rows x 2 columns, pre-filled
# with -1 so rows that are never written can be recognised and dropped later
output_list = np.full(shape = (n_possible_anagrams , 2), fill_value=-1, dtype=np.int32)
# one [ls_row_id, n_search_space, curr_time] entry per letter selector
output_time_list = []

# start counting: number of rows of output_list written so far
anagram_pair_count = 0

for ls_row_id, ls_row in enumerate(ls_index_array):

    # progress marker every 100 letter selectors
    if ls_row_id % 100 == 0:
        print(ls_row_id)
    start_time = perf_counter_ns()

    ##
    # SUBSET THE wchar_matrix by column selector
    ##
    # True for each row whose value is >= 1 in every column picked by ls_row
    outcome_indices = np.all(wchar_matrix[:, ls_row] >= 1, axis=1)

    # sub-matrix that we will use to find parent words
    ls_wchar_matrix = wchar_matrix[outcome_indices, :]

    # this is the list of word group ids that correspond, row for row, to the
    # rows kept in the ls_wchar_matrix
    temp_wg_id_list = word_group_id_list[outcome_indices]

    # this is the number of word groups that meet certain criteria.
    # for example, words that feature the letters: 'buc'
    n_search_space = temp_wg_id_list.shape[0]

    # the current list of words featuring the set of least common letters.
    # these are the words that have the least common letters of 'buc'
    # (rows of ls_id_wg_id whose column 0 equals ls_row_id; column 1 is kept)
    curr_wg_id_list = ls_id_wg_id[ls_id_wg_id[:, 0] == ls_row_id, 1]
    # with a three-letter letter selector, ranges in size from 1 to 2544

    # NOTE(review): i_curr_wg_id is never read inside the loop body
    for i_curr_wg_id, curr_wg_id in enumerate(curr_wg_id_list):

        # get the re-alignment of the word group id: its row position inside
        # the sub-matrix. [0][0] raises IndexError if curr_wg_id is not
        # present in temp_wg_id_list.
        temp_wg_id = np.where(temp_wg_id_list == curr_wg_id)[0][0]

        # keep the ids of every candidate row that is >= the current row in
        # every column (element-wise difference is non-negative everywhere).
        # The current row always qualifies, since its difference is all zeros.
        # The subtraction happens in the dtype of wchar_matrix; a narrow
        # integer dtype wraps on overflow, so this relies on the values
        # staying small.
        outcome_word_id_list = temp_wg_id_list[np.all(a = (ls_wchar_matrix - ls_wchar_matrix[temp_wg_id, :]) >= 0, axis = 1)]

        n_from_words = outcome_word_id_list.shape[0]

        if n_from_words > 0:
            # the helper's result is assigned to an (n_from_words, 2) slice
            # below, so it must be broadcastable to that shape
            # NOTE(review): presumably it pairs each matched id with wg_id --
            # confirm column order in format_output_list
            outcome_word_id_list = format_output_list(outcome_word_id_list=outcome_word_id_list, wg_id=curr_wg_id)

            # enumerate the from/parent words
            new_anagram_pair_count = anagram_pair_count + n_from_words

            # write the new rows directly after the rows written so far
            output_list[anagram_pair_count:new_anagram_pair_count, :] = outcome_word_id_list

            # update the anagram pair count
            anagram_pair_count = new_anagram_pair_count

    # per-letter-selector timing, measured from start_time taken above
    curr_time = calc_time(time_start=start_time, round_digits=8)
    output_time_list.append([ls_row_id, n_search_space, curr_time])

print('...time to find parent/child word relationships')
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)
print('...truncating output list...')
# drop every row that still contains a negative value, i.e. the -1 filler
# NOTE(review): anagram_pair_count already marks the end of the written rows,
# so output_list[:anagram_pair_count] would presumably select the same rows
# without scanning the whole array -- valid only if every written id is >= 0
output_indices = np.all(output_list >= 0, axis=1)
output_list = output_list[output_indices,]
print(output_list.shape)
# second reading uses the same run_start_time, so it includes the truncation
time_proc = calc_time(time_start=run_start_time, round_digits=4)
compute_elapsed_time(seconds=time_proc)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
...time to find parent/child word relationships
Hours: 0 | minutes: 1 | seconds: 27.9575
...truncating output list...
(73218235, 2)
Hours: 0 | minutes: 1 | seconds: 31.6505


In [22]:
# c2
# current technique: 2 minutes, 37 seconds
# setting the numpy data types: 1 minute, 41 seconds
# being even more strict with the data types: 1 minute, 26 seconds

In [23]:
# count using numpy, and then create a Counter object
# build_counters() returns two lookups that the next cell indexes by an id.
# NOTE(review): presumably from_word_counter / to_word_counter hold, per id,
# the number of rows of output_list on the from / to side -- confirm in the
# helper's definition.
from_word_counter, to_word_counter = build_counters(output_list=output_list)
# author's timing note: this used to take 45 seconds, it now takes 6

In [24]:
# Sanity-check both counters against the known values for id 746.
# The values are printed first so they stay visible in the cell output, then
# asserted so that a regression fails loudly instead of depending on someone
# comparing the printed numbers with the comments by eye.
print(from_word_counter[746]) # should be 26
print(to_word_counter[746]) # should be 329

assert from_word_counter[746] == 26, \
    f'from_word_counter[746] is {from_word_counter[746]}, expected 26'
assert to_word_counter[746] == 329, \
    f'to_word_counter[746] is {to_word_counter[746]}, expected 329'

26
329
