# Mike Babb
# babbm@uw.edu
# Find Anagrams
## Part 1: Structure the data

In [1]:
# standard libraries - installed by default
import collections
import itertools
import os
import string

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
# custom, user-defined functions
from part_00_process_functions import query_db, save_pickle, execute_sql_statement, build_db_conn

### set input and output paths

In [4]:
# path and name of input data
in_file_path = '/git/finding_anagrams/data/'
in_file_name = 'words.txt'

In [5]:
# construct the input file path
in_fpn = os.path.join(in_file_path, in_file_name)

In [6]:
# paths to output directories
base_output_file_path = '/project/finding_anagrams'
data_output_file_path = os.path.join(base_output_file_path, 'data')
tabulation_output_file_path = os.path.join(base_output_file_path, 'tabulations')

In [7]:
# setup the data output path
if os.path.exists(data_output_file_path):
    pass
else:
    os.makedirs(data_output_file_path)

In [8]:
# setup the tabulation output path
if os.path.exists(tabulation_output_file_path):
    pass
else:
    os.makedirs(tabulation_output_file_path)

### import list of words, shape data

In [9]:
# use pandas to load the data
# htps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
print('...Reading in list of words...')
word_df = pd.read_csv(filepath_or_buffer = in_fpn, sep = ',', header = None)

...Reading in list of words...


In [10]:
# check the first few rows
word_df.head()

Unnamed: 0,0
0,A
1,a
2,aa
3,aal
4,aalii


In [11]:
# specify a a more appropriate column name
col_names = ['word']
word_df.columns = col_names

In [12]:
# how many words are we working with?
n_words = len(word_df)
print('...found', '{:,}'.format(n_words), 'words to find anagrams for...')

...found 235,886 words to find anagrams for...


In [13]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary. 'nan' is an internal python value.
# same with 'null'
word_df['word'] = word_df['word'].astype(str)

In [14]:
# create lower case values of the words
word_df['lcase'] = word_df['word'].str.lower()

In [15]:
# remove hyphens
word_df['lcase'] = word_df['lcase'].str.replace('-', '')

In [16]:
# and now drop duplicates, based on the lowercase version of each word
word_df = word_df.drop_duplicates('lcase')

In [17]:
# find word length
word_df['n_chars'] = word_df['lcase'].str.len()

In [18]:
# extract the first letter of each word
word_df['first_letter'] = word_df['lcase'].str[:1]

In [19]:
# create an index
word_df['word_id'] = range(0, len(word_df))

In [20]:
# add a hash id to capture the sorted letters in each word
# use map() with a lambda function to chain several operations together
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
word_df['hash_id'] = word_df['lcase'].map(lambda x: hash(''.join(sorted(x))))

In [21]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,hash_id
0,A,a,1,a,0,-7417365306412638318
2,aa,aa,2,a,1,8397826984338003052
3,aal,aal,3,a,2,4698928497154998752
4,aalii,aalii,5,a,3,3234713363651605398
5,aam,aam,3,a,4,5388998009005939743


In [22]:
word_id_hash_id_df = word_df['hash_id'].drop_duplicates().to_frame()

In [23]:
word_id_hash_id_df['word_group_id'] = range(0, len(word_id_hash_id_df))

In [24]:
# create a dictionary of the hash values using zip
# https://docs.python.org/3/library/functions.html#zip
hash_id_dict = {hash_id:word_group_id for word_group_id, hash_id in zip(word_id_hash_id_df['word_group_id'], word_id_hash_id_df['hash_id'])}

In [25]:
word_df['word_group_id'] = word_df['hash_id'].map(hash_id_dict)

In [26]:
# drop the hash id, no longer needed
word_df = word_df.drop(labels = 'hash_id', axis = 1)

In [27]:
word_df.tail()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id
235881,zythem,zythem,6,z,234365,215837
235882,Zythia,zythia,6,z,234366,215838
235883,zythum,zythum,6,z,234367,215839
235884,Zyzomys,zyzomys,7,z,234368,215840
235885,Zyzzogeton,zyzzogeton,10,z,234369,215841


In [28]:
# use dictionary comprehension to store the letter and the
# index of the letter for fast look ups
letter_dict = {l:li for li, l in enumerate(string.ascii_lowercase)}

In [29]:
# generate a sorted list of letters from the dictionary keys
letters = sorted(letter_dict.keys())

In [30]:
# get the unique letters in each word and then sort those letters
word_df['letter_group'] = word_df['lcase'].map(lambda x: ''.join(sorted(set(x))))

In [31]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group
0,A,a,1,a,0,0,a
2,aa,aa,2,a,1,1,a
3,aal,aal,3,a,2,2,al
4,aalii,aalii,5,a,3,3,ail
5,aam,aam,3,a,4,4,am


### count letter frequency

In [32]:
# several versions of the anagram determination technique require subsetting by letters in each word. 
# generate those data and use a ranking technique to help with anagram group identification

In [33]:
# use a counter object to count the occurence of each letter
# counters are a special type of dictionary. 
# https://docs.python.org/3/library/collections.html#collections.Counter
# very fast
letter_counter = collections.Counter()
# enumerate each word and then each letter
for i_cw, curr_word in enumerate(word_df['lcase']):    
    for i_cl, cl in enumerate(curr_word):
        letter_counter[cl] += 1

In [34]:
# make a dataframe from the counter object and then order from low to high
letter_count_df = pd.DataFrame.from_dict(data=letter_counter, orient = 'index')

In [35]:
letter_count_df = letter_count_df.reset_index()

In [36]:
letter_count_df.columns = ['letter', 'letter_count']

In [37]:
letter_count_df = letter_count_df.sort_values(by = 'letter_count', ascending = False)

In [38]:
letter_count_df['rank'] = range(1, len(letter_count_df) + 1)

In [39]:
letter_count_df['letter_percent'] = letter_count_df['letter_count'] / letter_count_df['letter_count'].sum()

In [40]:
letter_count_df.head(n=30)
# j is the least common letter while e is the most common letter

Unnamed: 0,letter,letter_count,rank,letter_percent
14,e,234526,1,0.104326
2,i,200272,2,0.089089
0,a,198359,3,0.088238
10,o,170115,4,0.075674
5,r,160284,5,0.071301
4,n,158116,6,0.070336
13,t,152237,7,0.067721
19,s,138993,8,0.061829
1,l,129962,9,0.057812
12,c,103021,10,0.045828


In [41]:
# across all words, how many letters are used?
letter_count_df['letter_count'].sum()

2248005

In [42]:
# join with the count of words that start with a focal letter. 

In [43]:
word_count_df = word_df['first_letter'].groupby(word_df['first_letter']).agg(np.size).to_frame()

In [44]:
word_count_df.columns = ['n_words']

In [45]:
word_count_df = word_count_df.reset_index()

In [46]:
word_count_df['word_percent'] = word_count_df['n_words'] / word_count_df['n_words'].sum()

In [47]:
word_count_df.head()

Unnamed: 0,first_letter,n_words,word_percent
0,a,16974,0.072424
1,b,10963,0.046776
2,c,19783,0.084409
3,d,10849,0.04629
4,e,8703,0.037134


In [48]:
letter_count_df.head()

Unnamed: 0,letter,letter_count,rank,letter_percent
14,e,234526,1,0.104326
2,i,200272,2,0.089089
0,a,198359,3,0.088238
10,o,170115,4,0.075674
5,r,160284,5,0.071301


In [49]:
word_count_df = word_count_df.sort_values(by='n_words', ascending = False)

In [50]:
word_count_df['word_count_rank'] = range(1, len(word_count_df) + 1)

In [51]:
# joins
letter_count_df = pd.merge(left=letter_count_df, right = word_count_df,
                          left_on=['letter'], right_on = ['first_letter'])

In [52]:
letter_count_df = letter_count_df.drop('first_letter', axis = 1)

In [53]:
letter_count_df.head()

Unnamed: 0,letter,letter_count,rank,letter_percent,n_words,word_percent,word_count_rank
0,e,234526,1,0.104326,8703,0.037134,13
1,i,200272,2,0.089089,8786,0.037488,12
2,a,198359,3,0.088238,16974,0.072424,4
3,o,170115,4,0.075674,7830,0.033409,14
4,r,160284,5,0.071301,9613,0.041016,10


In [54]:
# sort and reorder the columns
letter_count_df = letter_count_df.sort_values(by = 'letter')
col_names = ['letter','letter_count','letter_percent','rank','n_words','word_percent','word_count_rank']
letter_count_df = letter_count_df[col_names]

In [55]:
# place the letter and its rank into a dictionary 
# as well as the rank and the corresponding letter
# {'k':21, 21:'k'}
letter_count_rank_dict = {}
for cl, clr in zip(letter_count_df['letter'], letter_count_df['rank']):
    letter_count_rank_dict[cl] = clr
    letter_count_rank_dict[clr] = cl

In [56]:
# what letter is ranked 21st?
letter_count_rank_dict[21]

'k'

In [57]:
# what is the rank of letter k?
letter_count_rank_dict['k']

21

In [58]:
# write a function to order the unique letters in each word by
# least common letter to most common letter
def get_least_common_letters(x):    
    if len(x) == 1:
        lcl = x
    else:
        # ranking of each letter
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in x]        
        # sort the ranking
        rank_list = sorted(rank_list, reverse = True)
        # generate the letters sorted by rank
        rank_list = [letter_count_rank_dict[curr_letter] for curr_letter in rank_list]
        lcl = ''.join(rank_list)
    return lcl
    

In [59]:
# extract letters by ranking
word_df['letter_group_ranked'] = word_df['letter_group'].map(get_least_common_letters)

In [60]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
0,A,a,1,a,0,0,a,a
2,aa,aa,2,a,1,1,a,a
3,aal,aal,3,a,2,2,al,la
4,aalii,aalii,5,a,3,3,ail,lai
5,aam,aam,3,a,4,4,am,ma


### generate the character matrix

In [61]:
# count the occurences of each letter in each word and store the results in a matrix
# populate the char_matrix and the word_id dictionary
# Use the apply function to the word_df. Effectively, apply a function to each row in the 
# dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# Upon intialization, the char_matrix is all zero.
# the entry for emit (as do the entriees for time, mite, item) has the following value:
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# we need to find all words that have matching rows with at least these values.
# for example, 'terminator'.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0]

# the zero-filled matrix will be populated once the 
# score_row() function is applied to the word_df
char_matrix = np.zeros(shape=(len(word_df), 26), dtype=np.int32)
# same with the word_dict.
word_dict = {}
def score_word(row):
    # get a word from the current row
    curr_word = row['lcase']    
    ri = row['word_id'] # row index / word index
    word_length = row['n_chars'] # number of character in each word
    first_letter = row['first_letter'] # first letter of the word
    letter_group = row['letter_group'] # letter group
    letter_group_ranked = row['letter_group_ranked'] # letter group ranked
    word_dict[ri] = (curr_word, word_length, first_letter, letter_group, letter_group_ranked)
    # populate the char matrix
    for i_letter, letter in enumerate(curr_word):
        if letter in letter_dict:
            # find the corresponding column index of that letter
            li = letter_dict[letter]
            # increment the count of letters in the current row and current column
            char_matrix[ri, li] += 1
    return None

# catch the output from the function and delete
output = word_df.apply(score_word, 1)
del output

In [62]:
# what does it look like?
char_matrix

array([[1, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 2, 2],
       [0, 0, 0, ..., 0, 1, 3]])

In [63]:
# how many letters are in use in our words?
char_matrix.sum()

2248005

In [64]:
# what about if we wanted to see how many times the letter 'e' is used?
char_matrix[:, 4].sum()

234526

In [65]:
# this is the same as:
letter_counter['e']

234526

In [66]:
# what is the percentage of characters that feature the letter 'e'?
char_matrix[:, 4].sum() / char_matrix.sum()

0.10432628041307737

In [67]:
# let's see how many words have the letter 'a' in them or the letter 's'. 
for curr_letter, letter_index in letter_dict.items():    
    outcome = np.where(char_matrix[:, letter_index] > 0)
    n_rows = np.shape(outcome)[1]        
    print(curr_letter, n_rows)

a 144511
b 37353
c 85776
d 60650
e 157437
f 21743
g 43074
h 56937
i 144651
j 3073
k 15153
l 104989
m 62486
n 120538
o 122891
p 67333
q 3624
r 128458
s 103107
t 119832
u 75262
v 19346
w 13158
x 6870
y 48370
z 8012


In [68]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
0,A,a,1,a,0,0,a,a
2,aa,aa,2,a,1,1,a,a
3,aal,aal,3,a,2,2,al,la
4,aalii,aalii,5,a,3,3,ail,lai
5,aam,aam,3,a,4,4,am,ma


### save data to disk

In [69]:
# save the char matrix
output_name = 'char_matrix.npy'
opn = os.path.join(data_output_file_path, output_name)
np.save(file = opn, arr = char_matrix)

In [70]:
# letter dictionary
output_name = 'letter_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = letter_dict)

In [71]:
# word dictionary
output_name = 'word_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = word_dict)

In [72]:
# save the word df
output_name = 'word_df.csv'
opn = os.path.join(data_output_file_path, output_name)
word_df.to_csv(path_or_buf = opn, sep = '\t', header = True, index = False)       

In [73]:
# save the word df to sqlite db

In [74]:
# base file path
base_file_path = '/project/finding_anagrams'

In [75]:
# input path
in_file_path = 'data'
in_file_path = os.path.join(base_file_path, in_file_path)

In [76]:
# output db path and name
db_path = 'db'
db_path = os.path.join(base_file_path, db_path)

In [77]:
db_name = 'words.db'

In [78]:
# create database connection objects
db_conn = build_db_conn(db_path = db_path, db_name = db_name)

In [79]:
word_df.to_sql(name='words', con=db_conn, if_exists='replace', index = False)    

234370