# Mike Babb
# babbm@uw.edu
# Find Anagrams
## Part 1: Structure the data

In [1]:
# standard libraries - installed by default
import collections
import itertools
import os
import string

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
# custom, user-defined functions
from part_00_process_functions import save_pickle

### set input and output paths

In [4]:
# path and name of input data
in_file_path = '/git/finding_anagrams/data/'
in_file_name = 'words.txt'

In [5]:
# construct the input file path
in_fpn = os.path.join(in_file_path, in_file_name)

In [6]:
# paths to output directories
base_output_file_path = '/project/finding_anagrams'
data_output_file_path = os.path.join(base_output_file_path, 'data')
tabulation_output_file_path = os.path.join(base_output_file_path, 'tabulations')

In [7]:
# setup the data output path
if os.path.exists(data_output_file_path):
    pass
else:
    os.makedirs(data_output_file_path)

In [8]:
# setup the tabulation output path
if os.path.exists(tabulation_output_file_path):
    pass
else:
    os.makedirs(tabulation_output_file_path)

### import list of words, shape data

In [9]:
# use pandas to load the data
# htps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
print('...Reading in list of words...')
word_df = pd.read_csv(filepath_or_buffer = in_fpn, sep = ',', header = None)

...Reading in list of words...


In [10]:
# check the first few rows
word_df.head()

Unnamed: 0,0
0,A
1,a
2,aa
3,aal
4,aalii


In [11]:
# specify a a more appropriate column name
col_names = ['word']
word_df.columns = col_names

In [12]:
# how many words are we working with?
n_words = len(word_df)
print('...found', '{:,}'.format(n_words), 'words to find anagrams for...')

...found 235,886 words to find anagrams for...


In [13]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary. 'nan' is an internal python value.
# same with 'null'
word_df['word'] = word_df['word'].astype(str)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  word_df['word'] = word_df['word'].astype(np.str)


In [14]:
# create lower case values of the words
word_df['lcase'] = word_df['word'].str.lower()

In [15]:
# remove hyphens
word_df['lcase'] = word_df['lcase'].str.replace('-', '')

In [16]:
# and now drop duplicates, based on the lowercase version of each word
word_df = word_df.drop_duplicates('lcase')

In [17]:
# find word length
word_df['n_chars'] = word_df['lcase'].str.len()

In [18]:
# extract the first letter of each word
word_df['first_letter'] = word_df['lcase'].str[:1]

In [19]:
# create an index
word_df['word_id'] = range(0, len(word_df))

In [20]:
# add a hash id to capture the sorted letters in each word
# use map() with a lambda function to chain several operations together
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
word_df['hash_id'] = word_df['lcase'].map(lambda x: hash(''.join(sorted(x))))

In [21]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,hash_id
0,A,a,1,a,0,-9144218192760223962
2,aa,aa,2,a,1,4026245254588369834
3,aal,aal,3,a,2,8941231830840609756
4,aalii,aalii,5,a,3,5603671830354132068
5,aam,aam,3,a,4,-1994003346442777947


In [22]:
word_id_hash_id_df = word_df['hash_id'].drop_duplicates().to_frame()

In [23]:
word_id_hash_id_df['word_group_id'] = range(0, len(word_id_hash_id_df))

In [24]:
# create a dictionary of the hash values using zip
# https://docs.python.org/3/library/functions.html#zip
hash_id_dict = {hash_id:word_group_id for word_group_id, hash_id in zip(word_id_hash_id_df['word_group_id'], word_id_hash_id_df['hash_id'])}

In [25]:
word_df['word_group_id'] = word_df['hash_id'].map(hash_id_dict)

In [26]:
# drop the hash id, no longer needed
word_df = word_df.drop('hash_id', 1)

In [27]:
word_df.tail()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id
235881,zythem,zythem,6,z,234365,215837
235882,Zythia,zythia,6,z,234366,215838
235883,zythum,zythum,6,z,234367,215839
235884,Zyzomys,zyzomys,7,z,234368,215840
235885,Zyzzogeton,zyzzogeton,10,z,234369,215841


In [28]:
# use dictionary comprehension to store the letter and the
# index of the letter for fast look ups
letter_dict = {l:li for li, l in enumerate(string.ascii_lowercase)}

In [29]:
# generate a sorted list of letters from the dictionary keys
letters = sorted(letter_dict.keys())

In [30]:
# get the unique letters in each word and then sort those letters
word_df['letter_group'] = word_df['lcase'].map(lambda x: ''.join(sorted(set(x))))

In [31]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group
0,A,a,1,a,0,0,a
2,aa,aa,2,a,1,1,a
3,aal,aal,3,a,2,2,al
4,aalii,aalii,5,a,3,3,ail
5,aam,aam,3,a,4,4,am


### count letter frequency

In [32]:
# several versions of the anagram determination require subsetting by letters in each word. 
# generate those data and use a ranking technique to help with anagram group identification

In [33]:
# use a counter object to count the occurence of each letter
# counters are a special type of dictionary. 
# https://docs.python.org/3/library/collections.html#collections.Counter
# very fast
letter_counter = collections.Counter()
# enumerate each word and then each letter
for i_cw, curr_word in enumerate(word_df['lcase']):    
    for i_cl, cl in enumerate(curr_word):
        letter_counter[cl] += 1

In [34]:
# make a dataframe from the counter object and then order from low to high
letter_count_df = pd.DataFrame.from_dict(data=letter_counter, orient = 'index')

In [35]:
letter_count_df = letter_count_df.reset_index()

In [36]:
letter_count_df.columns = ['letter', 'letter_count']

In [37]:
letter_count_df = letter_count_df.sort_values(by = 'letter_count', ascending = False)

In [38]:
letter_count_df['rank'] = range(1, len(letter_count_df) + 1)

In [39]:
letter_count_df['letter_percent'] = letter_count_df['letter_count'] / letter_count_df['letter_count'].sum()

In [40]:
letter_count_df.head(n=30)
# j is the least common letter while e is the most common letter

Unnamed: 0,letter,letter_count,rank,letter_percent
14,e,234526,1,0.104326
2,i,200272,2,0.089089
0,a,198359,3,0.088238
10,o,170115,4,0.075674
5,r,160284,5,0.071301
4,n,158116,6,0.070336
13,t,152237,7,0.067721
19,s,138993,8,0.061829
1,l,129962,9,0.057812
12,c,103021,10,0.045828


In [41]:
# join with the count of words that start with a focal letter. 

In [42]:
word_count_df = word_df['first_letter'].groupby(word_df['first_letter']).agg(np.size).to_frame()

In [43]:
word_count_df.columns = ['n_words']

In [44]:
word_count_df = word_count_df.reset_index()

In [45]:
word_count_df['word_percent'] = word_count_df['n_words'] / word_count_df['n_words'].sum()

In [46]:
word_count_df.head()

Unnamed: 0,first_letter,n_words,word_percent
0,a,16974,0.072424
1,b,10963,0.046776
2,c,19783,0.084409
3,d,10849,0.04629
4,e,8703,0.037134


In [47]:
letter_count_df.head()

Unnamed: 0,letter,letter_count,rank,letter_percent
14,e,234526,1,0.104326
2,i,200272,2,0.089089
0,a,198359,3,0.088238
10,o,170115,4,0.075674
5,r,160284,5,0.071301


In [48]:
word_count_df = word_count_df.sort_values(by='n_words', ascending = False)

In [49]:
word_count_df['word_count_rank'] = range(1, len(word_count_df) + 1)

In [50]:
# joins
letter_count_df = pd.merge(left=letter_count_df, right = word_count_df,
                          left_on=['letter'], right_on = ['first_letter'])

In [51]:
letter_count_df = letter_count_df.drop('first_letter', axis = 1)

In [52]:
letter_count_df.head()

Unnamed: 0,letter,letter_count,rank,letter_percent,n_words,word_percent,word_count_rank
0,e,234526,1,0.104326,8703,0.037134,13
1,i,200272,2,0.089089,8786,0.037488,12
2,a,198359,3,0.088238,16974,0.072424,4
3,o,170115,4,0.075674,7830,0.033409,14
4,r,160284,5,0.071301,9613,0.041016,10


In [53]:
# sort and reorder the columns
letter_count_df = letter_count_df.sort_values(by = 'letter')
col_names = ['letter','letter_count','letter_percent','rank','n_words','word_percent','word_count_rank']
letter_count_df = letter_count_df[col_names]

In [54]:
# place the letter and its rank into a dictionary 
# as well as the rank and the corresponding letter
# {'k':21, 21:'k'}
letter_count_rank_dict = {}
for cl, clr in zip(letter_count_df['letter'], letter_count_df['rank']):
    letter_count_rank_dict[cl] = clr
    letter_count_rank_dict[clr] = cl

In [55]:
# what letter is ranked 21st?
letter_count_rank_dict[21]

'k'

In [56]:
# what is the rank of letter k?
letter_count_rank_dict['k']

21

In [57]:
# write a function to order the unique letters in each word by
# least common letter to most common letter
def get_least_common_letters(x):    
    if len(x) == 1:
        lcl = x
    else:
        # ranking of each letter
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in x]        
        # sort the ranking
        rank_list = sorted(rank_list, reverse = True)
        # generate the letters sorted by rank
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in rank_list]
        lcl = ''.join(rank_list)
    return lcl
    

In [58]:
# extract letters by ranking
word_df['letter_group_ranked'] = word_df['letter_group'].map(get_least_common_letters)

In [59]:
word_df.tail()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
235881,zythem,zythem,6,z,234365,215837,ehmtyz,zyhmte
235882,Zythia,zythia,6,z,234366,215838,ahityz,zyhtai
235883,zythum,zythum,6,z,234367,215839,hmtuyz,zyhmut
235884,Zyzomys,zyzomys,7,z,234368,215840,mosyz,zymso
235885,Zyzzogeton,zyzzogeton,10,z,234369,215841,egnotyz,zgytnoe


### generate the character matrix

In [60]:
# count the occurences of each letter in each word and store the results in a matrix
# populate the char_matrix and the word_id dictionary
# Use the apply function to the word_df. Effectively, apply a function to each row in the 
# dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# Upon intialization, the char_matrix is all zero.
# the entry for emit (as do the entriees for time, mite, item) has the following value:
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# we need to find all words that have matching rows with at least these values.
# for example, 'terminator'.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0]

# the zero-filled matrix will be populated once the 
# score_row() function is applied to the word_df
char_matrix = np.zeros(shape=(len(word_df), 26), dtype=np.int32)
# same with the word_dict.
word_dict = {}
def score_word(row):
    # get a word from the current row
    curr_word = row['lcase']    
    ri = row['word_id'] # row index / word index
    word_length = row['n_chars'] # number of character in each word
    first_letter = row['first_letter'] # first letter of the word
    letter_group = row['letter_group'] # letter group
    letter_group_ranked = row['letter_group_ranked'] # letter group ranked
    word_dict[ri] = (curr_word, word_length, first_letter, letter_group, letter_group_ranked)
    # populate the char matrix
    for i_letter, letter in enumerate(curr_word):
        if letter in letter_dict:
            # find the corresponding column index of that letter
            li = letter_dict[letter]
            # increment the count of letters in the current row and current column
            char_matrix[ri, li] += 1
    return None

# catch the output from the function and delete
output = word_df.apply(score_word, 1)
del output

In [61]:
# how many letters are in use in our words?
char_matrix.sum()

2248005

In [62]:
# what about if we wanted to see how many times the letter 'e' is used?
char_matrix[:, 4].sum()

234526

In [63]:
# this is the same as:
letter_counter['e']

234526

In [64]:
# what is the percentage of characters that feature the letter 'e'?
char_matrix[:, 4].sum() / char_matrix.sum()

0.10432628041307737

In [65]:
# let's see how many words have the letter 'a' in them or the letter 's'. 
for curr_letter, letter_index in letter_dict.items():    
    outcome = np.where(char_matrix[:, letter_index] > 0)
    n_rows = np.shape(outcome)[1]        
    print(curr_letter, n_rows)

a 144511
b 37353
c 85776
d 60650
e 157437
f 21743
g 43074
h 56937
i 144651
j 3073
k 15153
l 104989
m 62486
n 120538
o 122891
p 67333
q 3624
r 128458
s 103107
t 119832
u 75262
v 19346
w 13158
x 6870
y 48370
z 8012


### save data to disk

In [66]:
# save the char matrix
output_name = 'char_matrix.npy'
opn = os.path.join(data_output_file_path, output_name)
np.save(file = opn, arr = char_matrix)

In [67]:
# letter dictionary
output_name = 'letter_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = letter_dict)

In [68]:
# word dictionary
output_name = 'word_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = word_dict)

In [69]:
# save the word df
output_name = 'word_df.csv'
opn = os.path.join(data_output_file_path, output_name)
word_df.to_csv(path_or_buf = opn, sep = '\t', header = True, index = False)       

### count sub-matrices for processing option 2

In [70]:
# option 2: by letter count
df02 = word_df['n_chars'].groupby(word_df['n_chars']).agg(np.size).to_frame()
df02.columns = ['n_words']
df02 = df02.reset_index()

In [71]:
df02 = df02.sort_values(by='n_chars', ascending = False)

In [72]:
# this determines the number of rows in each submatrix
df02['total_words'] = df02['n_words'].cumsum()

In [73]:
df02 = df02.sort_values(by='n_chars')

In [74]:
df02.head()

Unnamed: 0,n_chars,n_words,total_words
0,1,26,234370
1,2,139,234344
2,3,1294,234205
3,4,4993,232911
4,5,9972,227918


### count sub-matrices for processing option 3

In [75]:
# extract the word_ids as an numpy arry
word_id_list = word_df['word_id'].to_numpy()

In [76]:
# option 3: by letter count and presence of first letter
first_letter_df = word_df[['n_chars', 'first_letter']].drop_duplicates()

In [77]:
# how many sub-matrices?
n_sub_matrices = len(first_letter_df)
print('...creating', n_sub_matrices, 'sub matrices')
output_list = []

# create dictionaries to expedite this. We only need to determine the sets of rows of each component
# once. After determination, we can store in a dictionary and then look up.
# this will hold the set of rows by word length
n_char_word_id_list_dict = {}
# this will hold the set of rows by presence of the first letter
fl_word_id_list_dict = {}

loop_count = 0
for n_chars, fl in zip(first_letter_df['n_chars'], first_letter_df['first_letter']):
        
    # word id by character length - check if the set of row ids have already been identified
    # if not, create it and store it
    if n_chars in n_char_word_id_list_dict:
        curr_n_char_word_id_set = n_char_word_id_list_dict[n_chars]
    else:
        # extract the row ids that meet the criteria,
        # use the word_df for this.
        curr_n_char_word_id_set  = word_df.loc[(word_df['n_chars']>=n_chars), 'word_id'].tolist()
        # create a set
        curr_n_char_word_id_set = set(curr_n_char_word_id_set)
        # store
        n_char_word_id_list_dict[n_chars] = curr_n_char_word_id_set
    
    # word id by letter match
    # use the char_matrix to identify these rows
    if fl in fl_word_id_list_dict:
        curr_letter_select_word_id_set = fl_word_id_list_dict[fl]
    else:                       
        # build the oolumn selector using list comprehension
        column_selector = [letter_dict[curr_letter] for curr_letter in fl]
        
        # create a true-false matrix where only certain columns, corresponding to
        # letter indices, have a value of 1 or more
        outcome = char_matrix[:, column_selector] > 0    
        
        # which rows in the above matrix evaluate to all True
        outcome_indices = np.all(a = outcome, axis = 1)
        
        # these indices match with the word_id_list, perform the subset        
        curr_letter_select_word_id_set = word_id_list[outcome_indices]
        curr_letter_select_word_id_set = set(curr_letter_select_word_id_set)
        fl_word_id_list_dict[fl] = curr_letter_select_word_id_set        
        
    # perform the intersection
    curr_word_id_set = curr_n_char_word_id_set.intersection(curr_letter_select_word_id_set)
    curr_nrows = len(curr_word_id_set)
    curr_list = [fl, n_chars, curr_nrows]
    output_list.append(curr_list)
    
    loop_count += 1
    if loop_count % 100 == 0:
        print(loop_count)          

...creating 543 sub matrices
100
200
300
400
500


In [78]:
df03 = pd.DataFrame(data=output_list, columns = ['focal_letter', 'n_chars', 'n_words'])

In [79]:
# make it wide
df03_wide =  pd.pivot_table(data = df03,
                            values = 'n_words',
                            index = 'focal_letter',
                            columns = 'n_chars',
                            fill_value = 0
                           )

In [80]:
df03_wide = df03_wide.reset_index()

In [81]:
df03_wide.head()

n_chars,focal_letter,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a,144511,144510,144475,144048,142022,137229,128533,115710,98781,...,10353,5664,2916,1397,676,297,123,54,21,0
1,b,37353,37352,37346,37224,36687,35455,33012,29423,24752,...,2477,1352,672,306,144,66,26,8,4,0
2,c,85776,85775,85773,85693,85170,83481,79817,73704,65225,...,7919,4473,2350,1151,555,252,110,50,0,0
3,d,60650,60649,60639,60486,59809,58269,54910,49604,41697,...,3739,2038,1064,516,243,115,53,23,0,0
4,e,157437,157436,157409,157105,155490,151370,141778,127876,108824,...,10385,5615,2851,1363,645,284,118,52,17,0


In [82]:
df03['n_words'].describe()

count       543.000000
mean      33400.215470
std       42634.064202
min           1.000000
25%        1135.500000
50%       11153.000000
75%       56838.500000
max      157437.000000
Name: n_words, dtype: float64

### count sub-matrices for processing option 4

In [83]:
# option 4: by letter count and presence of least two common letters
n_common_letters = 2
word_df['letter_selector'] = word_df['letter_group_ranked'].str[:n_common_letters]
letter_selector_df = word_df[['n_chars', 'letter_selector']].drop_duplicates()

In [84]:
n_sub_matrices = len(letter_selector_df)
print('...creating', n_sub_matrices, 'sub matrices')

output_list = []
# same as above - store the set of row ids that match the words of at least a given length
n_char_word_id_list_dict = {}
# the set of rows ids that contain the words with the n_common_letters
ls_word_id_list_dict = {}

loop_count = 0
for n_chars, ls in zip(letter_selector_df['n_chars'], letter_selector_df['letter_selector']):
        
    # word id by character length
    if n_chars in n_char_word_id_list_dict:
        curr_n_char_word_id_set = n_char_word_id_list_dict[n_chars]
    else:
        curr_n_char_word_id_set  = word_df.loc[(word_df['n_chars']>=n_chars) , 'word_id'].tolist()
        curr_n_char_word_id_set = set(curr_n_char_word_id_set)
        n_char_word_id_list_dict[n_chars] = curr_n_char_word_id_set
    
    # word id by presense of least common letters
    if ls in ls_word_id_list_dict:
        curr_letter_select_word_id_set = ls_word_id_list_dict[ls]
    else:                       
        # build the oolumn selector using list comprehension
        column_selector = [letter_dict[curr_letter] for curr_letter in ls]
        
        # create a true-false matrix where only certain columns, corresponding to
        # letter indices, have a value of 1 or more
        outcome = char_matrix[:, column_selector] > 0    
        
        # which rows in the above matrix evaluate to all True
        outcome_indices = np.all(a = outcome, axis = 1)
        
        # these indices match with the word_is_list, perform the subset        
        curr_letter_select_word_id_set = word_id_list[outcome_indices]
        curr_letter_select_word_id_set = set(curr_letter_select_word_id_set)
        ls_word_id_list_dict[ls] = curr_letter_select_word_id_set        
        
    # perform the intersection
    curr_word_id_set = curr_n_char_word_id_set.intersection(curr_letter_select_word_id_set)
    curr_nrows = len(curr_word_id_set)
    # store the counts in the list
    curr_list = [ls, n_chars, curr_nrows]
    output_list.append(curr_list)
    
    loop_count += 1
    if loop_count % 100 == 0:
        print(loop_count)          

...creating 3488 sub matrices
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


In [85]:
df04 = pd.DataFrame(data=output_list, columns = ['letter_group', 'n_chars', 'n_words'])

In [86]:
# make it wide
df04_wide =  pd.pivot_table(data = df04,
                            values = 'n_words',
                            index = 'letter_group',
                            columns = 'n_chars',
                            fill_value = 0
                           )
df04_wide = df04_wide.reset_index()

In [87]:
df04_wide.head()

n_chars,letter_group,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a,144511,144510,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ae,0,91948,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ai,0,88006,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,b,37353,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ba,0,24833,24831,24793,24582,23985,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### save the counts of sub-matrices to an excel file

In [88]:
# save the dataframes statistics to an excel file
e_writer_file_name = 'matrix_extraction_option_counts.xlsx'
e_writer_file_path_name = os.path.join(tabulation_output_file_path, e_writer_file_name)
e_writer = pd.ExcelWriter(path=e_writer_file_path_name)

In [89]:
df02.to_excel(excel_writer = e_writer, sheet_name = 'me_02', index = False)

In [90]:
df03.to_excel(excel_writer = e_writer, sheet_name = 'me_03', index = False)

In [91]:
df03_wide.to_excel(excel_writer = e_writer, sheet_name = 'me_03_wide', index = False)

In [92]:
df04.to_excel(excel_writer = e_writer, sheet_name = 'me_04', index = False)

In [93]:
df04_wide.to_excel(excel_writer = e_writer, sheet_name = 'me_04_wide', index = False)

In [94]:
# sort and reorder the the letter count df columns

In [95]:
letter_count_df = letter_count_df.sort_values(by = 'letter')

col_names = ['letter','letter_count','letter_percent','rank','n_words','word_percent','word_count_rank']

letter_count_df = letter_count_df[col_names]

In [96]:
letter_count_df.to_excel(excel_writer = e_writer, sheet_name = 'letter_rank', index = False)

In [97]:
# save and close the excel file object
e_writer.save()
e_writer.close()