# Mike Babb
# babbm@uw.edu
# Find Anagrams
## Part 1: Structure the data

In [1]:
# standard libraries - installed by default
import collections
import itertools
import os
import string

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
# custom, user-defined functions
from part_00_file_db_utils import *

### set input and output paths

In [4]:
# path and name of input data
in_file_path = '/git/finding_anagrams/data/'
in_file_name = 'words.txt'

In [5]:
# construct the input file path
in_fpn = os.path.join(in_file_path, in_file_name)

In [6]:
# paths to output directories
base_output_file_path = '/project/finding_anagrams'
data_output_file_path = os.path.join(base_output_file_path, 'data')
tabulation_output_file_path = os.path.join(base_output_file_path, 'tabulations')

In [7]:
# setup the data output path
if os.path.exists(data_output_file_path):
    pass
else:
    os.makedirs(data_output_file_path)

In [8]:
# setup the tabulation output path
if os.path.exists(tabulation_output_file_path):
    pass
else:
    os.makedirs(tabulation_output_file_path)

### import list of words, shape data

In [9]:
# use pandas to load the data
# htps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
print('...Reading in list of words...')
word_df = pd.read_csv(filepath_or_buffer = in_fpn, sep = ',', header = None)

...Reading in list of words...


In [10]:
# check the first few rows
word_df.head()

Unnamed: 0,0
0,A
1,a
2,aa
3,aal
4,aalii


In [11]:
# specify a a more appropriate column name
col_names = ['word']
word_df.columns = col_names

In [12]:
# how many words are we working with?
n_words = len(word_df)
print('...found', '{:,}'.format(n_words), 'words to find anagrams for...')

...found 235,886 words to find anagrams for...


In [13]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary. nan is an internal python value.
# same with 'null'
word_df['word'] = word_df['word'].astype(str)

In [14]:
# create lower case values of the words
word_df['lcase'] = word_df['word'].str.lower()

In [15]:
# remove hyphens
word_df['lcase'] = word_df['lcase'].str.replace('-', '')

In [16]:
# and now drop duplicates, based on the lowercase version of each word
word_df = word_df.drop_duplicates('lcase')

In [17]:
word_df.shape

(234370, 2)

In [18]:
# Approximately 234K words. That's a lot of words. 

In [19]:
# find word length
word_df['n_chars'] = word_df['lcase'].str.len()

In [20]:
# extract the first letter of each word
word_df['first_letter'] = word_df['lcase'].str[:1]

In [21]:
# create an id
word_df['word_id'] = range(0, len(word_df))

In [22]:
# add a hash id to capture the sorted letters in each word
# use map() with a lambda function to chain several operations together
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
# as an example of what this is doing...

In [23]:
sorted('example')

['a', 'e', 'e', 'l', 'm', 'p', 'x']

In [24]:
''.join(sorted('example'))

'aeelmpx'

In [25]:
# has an example of what this is doing...
hash(''.join(sorted('example')))

8279614307103089220

In [26]:
# now, do this for all 234K words. 
word_df['hash_id'] = word_df['lcase'].map(lambda x: hash(''.join(sorted(x))))

In [27]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,hash_id
0,A,a,1,a,0,6147221603990161248
2,aa,aa,2,a,1,-5287726351405729054
3,aal,aal,3,a,2,4964197477303848490
4,aalii,aalii,5,a,3,5392406340955304598
5,aam,aam,3,a,4,-4020726638139728665


In [28]:
# 234K words, but after sorting the letters in each word, there are about 216K unique words. 
word_df['hash_id'].unique().shape

(215842,)

In [29]:
# create a dataframe of the unique, hashed values
word_id_hash_id_df = word_df['hash_id'].drop_duplicates().to_frame()

In [30]:
word_id_hash_id_df['word_group_id'] = range(0, len(word_id_hash_id_df))

In [31]:
word_id_hash_id_df.shape

(215842, 2)

In [32]:
# create a dictionary using dictionary comprehension of the hash values using zip
# https://docs.python.org/3/library/functions.html#zip
hash_id_dict = {hash_id:word_group_id for word_group_id, hash_id in zip(word_id_hash_id_df['word_group_id'], word_id_hash_id_df['hash_id'])}

In [33]:
# apply the word group id to the 
word_df['word_group_id'] = word_df['hash_id'].map(hash_id_dict)

In [34]:
# drop the hash id, no longer needed
word_df = word_df.drop(labels = 'hash_id', axis = 1)

In [35]:
word_df.tail()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id
235881,zythem,zythem,6,z,234365,215837
235882,Zythia,zythia,6,z,234366,215838
235883,zythum,zythum,6,z,234367,215839
235884,Zyzomys,zyzomys,7,z,234368,215840
235885,Zyzzogeton,zyzzogeton,10,z,234369,215841


In [36]:
# use dictionary comprehension to store the letter and the
# we'll import the letters from string.ascii_lowercase 
# index of the letter for fast look ups
letter_dict = {l:li for li, l in enumerate(string.ascii_lowercase)}

In [37]:
# generate a sorted list of letters from the dictionary keys
letters = string.ascii_lowercase

In [38]:
# get the unique letters in each word and then sort those letters
word_df['letter_group'] = word_df['lcase'].map(lambda x: ''.join(sorted(set(x))))

In [39]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group
0,A,a,1,a,0,0,a
2,aa,aa,2,a,1,1,a
3,aal,aal,3,a,2,2,al
4,aalii,aalii,5,a,3,3,ail
5,aam,aam,3,a,4,4,am


### count letter frequency

In [40]:
# several versions of the anagram determination technique require subsetting by letters in each word. 
# generate those data and use a ranking technique to help with anagram group identification

In [41]:
# use a counter object to count the occurence of each letter
# counters are a special type of dictionary. 
# https://docs.python.org/3/library/collections.html#collections.Counter
# very fast
letter_counter = collections.Counter()
# enumerate each word and then each letter
for i_cw, curr_word in enumerate(word_df['lcase']):    
    for i_cl, cl in enumerate(curr_word):
        letter_counter[cl] += 1

In [42]:
# make a dataframe from the counter object and then order from low to high
letter_count_df = pd.DataFrame.from_dict(data=letter_counter, orient = 'index', columns = ['letter_count']).reset_index(names=['letter'])

In [43]:
letter_count_df.head()

Unnamed: 0,letter,letter_count
0,a,198359
1,l,129962
2,i,200272
3,m,70358
4,n,158116


In [44]:
# compute the letter rank
letter_count_df['letter_rank'] = letter_count_df['letter_count'].rank(ascending=False).astype(int)

In [45]:
letter_count_df.head()

Unnamed: 0,letter,letter_count,letter_rank
0,a,198359,3
1,l,129962,9
2,i,200272,2
3,m,70358,13
4,n,158116,6


In [46]:
# sort by letter count
letter_count_df = letter_count_df.sort_values(by = 'letter_count', ascending = False)

In [47]:
letter_count_df['letter_percent'] = letter_count_df['letter_count'] / letter_count_df['letter_count'].sum()

In [48]:
letter_count_df.head(n=26)
# 'j' is the least common letter while 'e' is the most common letter

Unnamed: 0,letter,letter_count,letter_rank,letter_percent
14,e,234526,1,0.104326
2,i,200272,2,0.089089
0,a,198359,3,0.088238
10,o,170115,4,0.075674
5,r,160284,5,0.071301
4,n,158116,6,0.070336
13,t,152237,7,0.067721
19,s,138993,8,0.061829
1,l,129962,9,0.057812
12,c,103021,10,0.045828


In [49]:
# across all words, how many letters are used?
letter_count_df['letter_count'].sum()

2248005

In [50]:
# join with the count of words that start with a focal letter. 

In [51]:
word_count_df = word_df['first_letter'].groupby(word_df['first_letter']).agg(np.size).to_frame(name = 'word_count').reset_index(names = ['first_letter'])

In [52]:
word_count_df.head()

Unnamed: 0,first_letter,word_count
0,a,16974
1,b,10963
2,c,19783
3,d,10849
4,e,8703


In [53]:
word_count_df['word_percent'] = word_count_df['word_count'] / word_count_df['word_count'].sum()

In [54]:
word_count_df.head()

Unnamed: 0,first_letter,word_count,word_percent
0,a,16974,0.072424
1,b,10963,0.046776
2,c,19783,0.084409
3,d,10849,0.04629
4,e,8703,0.037134


In [55]:
word_count_df['word_rank'] = word_count_df['word_count'].rank(ascending = False).astype(int)

In [56]:
# joins
word_letter_count_df = pd.merge(left=letter_count_df, right = word_count_df,
                          left_on=['letter'], right_on = ['first_letter'])

In [57]:
word_letter_count_df = word_letter_count_df.drop('first_letter', axis = 1)

In [58]:
word_letter_count_df.head()

Unnamed: 0,letter,letter_count,letter_rank,letter_percent,word_count,word_percent,word_rank
0,e,234526,1,0.104326,8703,0.037134,13
1,i,200272,2,0.089089,8786,0.037488,12
2,a,198359,3,0.088238,16974,0.072424,4
3,o,170115,4,0.075674,7830,0.033409,14
4,r,160284,5,0.071301,9613,0.041016,10


In [59]:
# sort and reorder the columns
word_letter_count_df = word_letter_count_df.sort_values(by = 'letter')
col_names = ['letter','letter_count','letter_percent','letter_rank','word_count','word_percent','word_rank']
word_letter_count_df = word_letter_count_df[col_names]

In [60]:
word_letter_count_df.head()

Unnamed: 0,letter,letter_count,letter_percent,letter_rank,word_count,word_percent,word_rank
2,a,198359,0.088238,3,16974,0.072424,4
17,b,40214,0.017889,18,10963,0.046776,8
9,c,103021,0.045828,10,19783,0.084409,3
13,d,67966,0.030234,14,10849,0.04629,9
0,e,234526,0.104326,1,8703,0.037134,13


In [61]:
# place the letter and its rank into a dictionary 
# as well as the rank and the corresponding letter
# {'k':21, 21:'k'}
letter_count_rank_dict = {}
for cl, clr in zip(word_letter_count_df['letter'], word_letter_count_df['letter_rank']):
    letter_count_rank_dict[cl] = clr
    letter_count_rank_dict[clr] = cl

In [62]:
# what letter is ranked 21st?
letter_count_rank_dict[21]

'k'

In [63]:
# what is the rank of letter k?
letter_count_rank_dict['k']

21

In [64]:
# write a function to order the unique letters in each word by
# least common letter to most common letter
def get_least_common_letters(x):    
    if len(x) == 1:
        lcl = x
    else:
        # ranking of each letter
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in x]        
        # sort the ranking
        rank_list = sorted(rank_list, reverse = True)
        # generate the letters sorted by rank
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in rank_list]
        lcl = ''.join(rank_list)
    return lcl
    

In [65]:
# extract letters by ranking
word_df['letter_group_ranked'] = word_df['letter_group'].map(get_least_common_letters)

In [66]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
0,A,a,1,a,0,0,a,a
2,aa,aa,2,a,1,1,a,a
3,aal,aal,3,a,2,2,al,la
4,aalii,aalii,5,a,3,3,ail,lai
5,aam,aam,3,a,4,4,am,ma


### generate the character matrix

In [67]:
# count the occurences of each letter in each word and store the results in a matrix
# populate the char_matrix and the word_id dictionary
# Aapply a function to each row in the dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# Upon intialization, the char_matrix is zero-filled.
# Each row in the char_matrix corresponds to a word.
# The char_matrix is 26 columns wide. Each column corresponds to a letter.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# Each cell is a count of the number of times each letter occurs in each word.  
# the entry for emit (as do the entriees for time, mite, item) has the following value:
# [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# we need to find all words that have matching rows with at least these values.
# for example, 'terminator'.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0]

# the zero-filled matrix will be populated once the 
# score_word() function is applied to the word_df
char_matrix = np.zeros(shape=(len(word_df), 26), dtype=int)
def score_word(row):
    # get a word from the current row
    curr_word = row['lcase']    
    ri = row['word_id'] # row index / word index    
    # populate the char matrix
    for i_letter, letter in enumerate(curr_word):
        if letter in letter_dict:
            # find the corresponding column index of that letter
            li = letter_dict[letter]
            # increment the count of letters in the current row and current column
            char_matrix[ri, li] += 1
    return None

# catch the output from the function and delete
output = word_df.apply(score_word, 1)
del output

In [68]:
# what does it look like?
char_matrix

array([[1, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 2, 2],
       [0, 0, 0, ..., 0, 1, 3]])

In [69]:
# how many letters are in use in our words?
char_matrix.sum()

2248005

In [70]:
# across all words, how many letters are used?
letter_count_df['letter_count'].sum()

2248005

In [71]:
# what about if we wanted to see how many times the letter 'e' is used?
char_matrix[:, 4].sum()

234526

In [72]:
# this is the same as:
letter_counter['e']

234526

In [73]:
# what is the percentage of characters that feature the letter 'e'?
char_matrix[:, 4].sum() / char_matrix.sum()

0.10432628041307737

In [74]:
word_letter_count_df.head(n=10)

Unnamed: 0,letter,letter_count,letter_percent,letter_rank,word_count,word_percent,word_rank
2,a,198359,0.088238,3,16974,0.072424,4
17,b,40214,0.017889,18,10963,0.046776,8
9,c,103021,0.045828,10,19783,0.084409,3
13,d,67966,0.030234,14,10849,0.04629,9
0,e,234526,0.104326,1,8703,0.037134,13
18,f,24104,0.010722,19,6836,0.029168,15
16,g,46889,0.020858,17,6771,0.02889,16
14,h,64115,0.028521,15,8992,0.038367,11
1,i,200272,0.089089,2,8786,0.037488,12
25,j,3112,0.001384,26,1603,0.00684,22


In [75]:
# let's see how many words have the letter 'a' in them or the letter 's'. 
# this is different than the number of times each letter is used
# we can save this to our dataframe
word_letter_count = []
for curr_letter, letter_index in letter_dict.items():    
    outcome = np.where(char_matrix[:, letter_index] > 0)
    n_rows = np.shape(outcome)[1]        
    print(curr_letter, n_rows)
    word_letter_count.append(n_rows)

a 144511
b 37353
c 85776
d 60650
e 157437
f 21743
g 43074
h 56937
i 144651
j 3073
k 15153
l 104989
m 62486
n 120538
o 122891
p 67333
q 3624
r 128458
s 103107
t 119832
u 75262
v 19346
w 13158
x 6870
y 48370
z 8012


In [76]:
word_letter_count_df['letter_word_count'] = word_letter_count

In [77]:
word_letter_count_df['letter_word_percent'] = word_letter_count_df['letter_word_count'] / word_df.shape[0]

In [78]:
word_letter_count_df['letter_word_rank'] = word_letter_count_df['letter_word_count'].rank(ascending = False).astype(int)

In [79]:
# we can see that 67% of words (two in every three!) feature the letter 'e'. Only 1.3% of words feature the letter 'j'. 

In [80]:
word_letter_count_df.head(n=26)

Unnamed: 0,letter,letter_count,letter_percent,letter_rank,word_count,word_percent,word_rank,letter_word_count,letter_word_percent,letter_word_rank
2,a,198359,0.088238,3,16974,0.072424,4,144511,0.616593,3
17,b,40214,0.017889,18,10963,0.046776,8,37353,0.159376,18
9,c,103021,0.045828,10,19783,0.084409,3,85776,0.365985,10
13,d,67966,0.030234,14,10849,0.04629,9,60650,0.258779,14
0,e,234526,0.104326,1,8703,0.037134,13,157437,0.671746,1
18,f,24104,0.010722,19,6836,0.029168,15,21743,0.092772,19
16,g,46889,0.020858,17,6771,0.02889,16,43074,0.183786,17
14,h,64115,0.028521,15,8992,0.038367,11,56937,0.242936,15
1,i,200272,0.089089,2,8786,0.037488,12,144651,0.617191,2
25,j,3112,0.001384,26,1603,0.00684,22,3073,0.013112,26


# Extract and save the word_group dataframes

In [81]:
# drop duplicates based on the word group. 
# by default, this will only keep the first record and it will drop all others
wg_df = word_df.drop_duplicates(subset = ['word_group_id']).copy()

In [82]:
word_group_counter = collections.Counter(word_df['word_group_id'])

In [83]:
wg_df['word_group_count'] = wg_df['word_group_id'].map(word_group_counter)

### save data to disk - first the char matrix and the letter dictionary

In [84]:
# save the char matrix
output_name = 'char_matrix.npy'
opn = os.path.join(data_output_file_path, output_name)
np.save(file = opn, arr = char_matrix)

In [85]:
# letter dictionary
output_name = 'letter_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = letter_dict)

In [86]:
# Now some dataframes

In [87]:
# save the word df to sqlite db

In [88]:
# base file path
base_file_path = '/project/finding_anagrams'

In [89]:
# input path
in_file_path = 'data'
in_file_path = os.path.join(base_file_path, in_file_path)

In [90]:
# output db path and name
db_path = 'db'
db_path = os.path.join(base_file_path, db_path)

In [91]:
db_name = 'words.db'

In [92]:
# create database connection objects
db_conn = build_db_conn(db_path = db_path, db_name = db_name)

In [93]:
word_df.to_sql(name='words', con=db_conn, if_exists='replace', index = False)    

234370

In [94]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
0,A,a,1,a,0,0,a,a
2,aa,aa,2,a,1,1,a,a
3,aal,aal,3,a,2,2,al,la
4,aalii,aalii,5,a,3,3,ail,lai
5,aam,aam,3,a,4,4,am,ma


In [95]:
wg_df.to_sql(name='word_groups', con=db_conn, if_exists='replace', index = False)    

215842

In [96]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,word_group_count
0,A,a,1,a,0,0,a,a,1
2,aa,aa,2,a,1,1,a,a,1
3,aal,aal,3,a,2,2,al,la,2
4,aalii,aalii,5,a,3,3,ail,lai,1
5,aam,aam,3,a,4,4,am,ma,2


In [97]:
# now, the word / letter count
word_letter_count_df.to_sql(name='word_letter_count', con=db_conn, if_exists='replace', index = False)    

26

In [98]:
db_conn.close()