# Mike Babb
# babbm@uw.edu
# Find Anagrams
## Part 1: Structure the data

In [None]:
# standard libraries - installed by default
import collections
import itertools
import os
import string

In [None]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [None]:
# custom, user-defined functions
from part_00_file_db_utils import *

### set input and output paths

In [None]:
# path and name of input data
in_file_path = '/git/finding_anagrams/data/'
in_file_name = 'words.txt'

In [None]:
# construct the input file path
in_fpn = os.path.join(in_file_path, in_file_name)

In [None]:
# paths to output directories
base_output_file_path = '/project/finding_anagrams'
data_output_file_path = os.path.join(base_output_file_path, 'data')
tabulation_output_file_path = os.path.join(base_output_file_path, 'tabulations')

In [None]:
# setup the data output path
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, so no separate, race-prone existence check is needed
os.makedirs(data_output_file_path, exist_ok=True)

In [None]:
# setup the tabulation output path
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, so no separate, race-prone existence check is needed
os.makedirs(tabulation_output_file_path, exist_ok=True)

### import list of words, shape data

In [None]:
# use pandas to load the data
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# By default read_csv treats strings such as 'nan', 'null', 'NA' and 'None' as
# missing values. Those are real words in a dictionary, so turn missing-value
# detection off and read the single column as text: every word is kept exactly
# as it is spelled in the file.
print('...Reading in list of words...')
word_df = pd.read_csv(filepath_or_buffer=in_fpn, sep=',', header=None,
                      names=['word'], dtype=str,
                      keep_default_na=False, na_filter=False)

In [None]:
# check the first few rows
word_df.head()

In [None]:
# how many words are we working with?
n_words = len(word_df)
print('...found', '{:,}'.format(n_words), 'words to find anagrams for...')

In [None]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary. nan is an internal python value.
# same with 'null'
word_df['word'] = word_df['word'].astype(str)

In [None]:
# create lower case values of the words
word_df['lcase'] = word_df['word'].str.lower()

In [None]:
# remove hyphens
# regex=False: '-' is a literal character here, not a pattern. Being explicit
# keeps the result the same across pandas versions, whose default for `regex`
# has changed.
word_df['lcase'] = word_df['lcase'].str.replace('-', '', regex=False)

In [None]:
# and now drop duplicates, based on the lowercase version of each word
word_df = word_df.drop_duplicates('lcase')

In [None]:
word_df.shape

In [None]:
# Approximately 234K words. That's a lot of words. 

In [None]:
# find word length
word_df['n_chars'] = word_df['lcase'].str.len()

In [None]:
# extract the first letter of each word
word_df['first_letter'] = word_df['lcase'].str[:1]

In [None]:
# create an id
word_df['word_id'] = range(0, len(word_df))

In [None]:
# add a hash id to capture the sorted letters in each word
# use map() with a lambda function to chain several operations together
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
# as an example of what this is doing...

In [None]:
sorted('example')

In [None]:
''.join(sorted('example'))

In [None]:
# hash the sorted letters - an example of what this is doing...
hash(''.join(sorted('example')))

In [None]:
# now, do this for all 234K words.
# Words made of the same letters produce the same sorted string and therefore
# the same hash value.
# NOTE: Python salts hash() of a str per interpreter session (see
# PYTHONHASHSEED), so the numbers differ from run to run. They are only used
# as a grouping key; the column is dropped once word_group_id is assigned.
word_df['hash_id'] = word_df['lcase'].map(lambda x: hash(''.join(sorted(x))))

In [None]:
word_df.head()

In [None]:
# 234K words, but after sorting the letters in each word, there are about 216K unique words. 
word_df['hash_id'].unique().shape

In [None]:
# what is the percentage?
word_df['hash_id'].unique().shape[0] / word_df.shape[0]

In [None]:
# create a dataframe of the unique, hashed values
word_id_hash_id_df = word_df['hash_id'].drop_duplicates().to_frame()

In [None]:
# add a unique id
word_id_hash_id_df['word_group_id'] = range(0, len(word_id_hash_id_df))

In [None]:
word_id_hash_id_df.shape

In [None]:
# build a hash_id -> word_group_id lookup by pairing the two columns with zip
# https://docs.python.org/3/library/functions.html#zip
hash_id_dict = dict(zip(word_id_hash_id_df['hash_id'],
                        word_id_hash_id_df['word_group_id']))

In [None]:
# apply the word group id to the word dataframe
word_df['word_group_id'] = word_df['hash_id'].map(hash_id_dict)

In [None]:
# drop the hash id, no longer needed
word_df = word_df.drop(labels = 'hash_id', axis = 1)

In [None]:
word_df.tail()

In [None]:
# use dictionary comprehension to store each letter and the
# index of the letter for fast look ups
# we'll import the letters from string.ascii_lowercase
letter_dict = {l:li for li, l in enumerate(string.ascii_lowercase)}

In [None]:
# generate a list of letters from the string.ascii_lowercase
letters = string.ascii_lowercase

In [None]:
# get the unique letters in each word and then sort those letters
word_df['letter_group'] = word_df['lcase'].map(lambda x: ''.join(sorted(set(x))))

In [None]:
word_df.head()

### count letter frequency

In [None]:
# several versions of the anagram determination technique require subsetting by letters in each word. 
# generate those data and use a ranking technique to help with anagram group identification

In [None]:
# two tallies, both built with collections.Counter (a dict subclass for counting)
# https://docs.python.org/3/library/collections.html#collections.Counter
#   total_letter_counter : total occurrences of each letter across all words
#   single_letter_counter: number of words that feature each letter
# chain.from_iterable streams every character of every string into the counter
total_letter_counter = collections.Counter(
    itertools.chain.from_iterable(word_df['lcase'].to_numpy()))

# letter_group holds the unique letters of a word, so each letter is
# counted at most once per word
single_letter_counter = collections.Counter(
    itertools.chain.from_iterable(word_df['letter_group'].to_numpy()))
    

In [None]:
# make a dataframe from the counter object and then order from low to high
letter_count_df = pd.DataFrame.from_dict(data=total_letter_counter, orient = 'index', columns = ['total_letter_count']).reset_index(names=['letter'])

In [None]:
letter_count_df

In [None]:
# 'a' is used 198,359 times. This is different than the number of words that feature the letter a.

In [None]:
letter_count_df['single_letter_count'] = letter_count_df['letter'].map(single_letter_counter)

In [None]:
letter_count_df.head()

In [None]:
# in this case, 'a' is featured in 144,511 words.

In [None]:
# compute the total letter rank and the single letter rank
# (rank 1 = highest count, because ascending=False)
# method='first' gives every letter its own integer rank; ties are broken by
# row order. The default method ('average') produces x.5 for tied counts,
# which astype(int) would truncate into duplicate ranks - and duplicate ranks
# overwrite each other in the rank -> letter lookup built from this column.
letter_count_df['total_letter_rank'] = (
    letter_count_df['total_letter_count']
    .rank(ascending=False, method='first')
    .astype(int)
)
letter_count_df['single_letter_rank'] = (
    letter_count_df['single_letter_count']
    .rank(ascending=False, method='first')
    .astype(int)
)

In [None]:
letter_count_df.head()

In [None]:
# sort by letter count
letter_count_df = letter_count_df.sort_values(by = 'total_letter_count', ascending = False)

In [None]:
letter_count_df['total_letter_percent'] = letter_count_df['total_letter_count'] / letter_count_df['total_letter_count'].sum()
# note the denominator - we are computing the share of words that have a letter; most words have multiple letters.
# two thirds of words feature the letter 'e'. Wow. 
letter_count_df['single_letter_percent'] = letter_count_df['single_letter_count'] / word_df.shape[0]

In [None]:
letter_count_df.head(n=26)
# 'j' is the least common letter while 'e' is the most common letter

In [None]:
# across all words, how many letters are used?
letter_count_df['total_letter_count'].sum()

In [None]:
# join with the count of words that start with a focal letter. 

In [None]:
# number of words per first letter: one row per letter, sorted by letter
fl_count_df = (
    word_df.groupby('first_letter')
    .size()
    .to_frame(name='first_letter_word_count')
    .reset_index(names=['letter'])
)

In [None]:
fl_count_df.head()

In [None]:
fl_count_df['first_letter_word_percent'] = fl_count_df['first_letter_word_count'] / fl_count_df['first_letter_word_count'].sum()

In [None]:
fl_count_df.head()

In [None]:
# rank 1 = the letter that starts the most words
# method='first' keeps the ranks distinct integers; the default 'average'
# yields x.5 for tied counts, which astype(int) would truncate into duplicates
fl_count_df['first_letter_rank'] = (
    fl_count_df['first_letter_word_count']
    .rank(ascending=False, method='first')
    .astype(int)
)

In [None]:
fl_count_df.head()

In [None]:
letter_count_df.head()

In [None]:
# join the first-letter counts onto the letter counts
# (inner join on the shared 'letter' column)
letter_count_df = letter_count_df.merge(fl_count_df, on='letter')

In [None]:
letter_count_df.head()

In [None]:
letter_count_df.columns.tolist()

In [None]:
# sort the records
letter_count_df = letter_count_df.sort_values(by = 'letter')

In [None]:
# reorder columns
col_names = ['letter',
'total_letter_count',
'single_letter_count',
'first_letter_word_count',
'total_letter_percent',
'single_letter_percent',
'first_letter_word_percent',
'total_letter_rank',
'single_letter_rank',
'first_letter_rank']
letter_count_df = letter_count_df[col_names]

In [None]:
letter_count_df.head()

In [None]:
# place the letter and its rank into a dictionary 
# as well as the rank and the corresponding letter
# {'k':21, 21:'k'}
letter_count_rank_dict = {}
for cl, clr in zip(letter_count_df['letter'], letter_count_df['total_letter_rank']):
    letter_count_rank_dict[cl] = clr
    letter_count_rank_dict[clr] = cl

In [None]:
# what letter is ranked 21st?
letter_count_rank_dict[21]

In [None]:
# what is the rank of letter k?
letter_count_rank_dict['k']

In [None]:
# write a function to order the unique letters in each word by
# least common letter to most common letter
def get_least_common_letters(x, rank_lookup=None):
    """Order the letters of x from least common to most common.

    x: string of letters (the notebook passes each word's 'letter_group').
    rank_lookup: optional mapping of letter -> rank, where a larger rank
        means a less common letter. Defaults to the notebook-level
        letter_count_rank_dict.

    Returns the letters of x joined into one string, largest rank first.
    Letters with equal rank keep their original relative order.
    Raises KeyError when x has more than one letter and one of them has no
    rank in the lookup.
    """
    # nothing to order for an empty string or a single letter
    if len(x) <= 1:
        return x
    if rank_lookup is None:
        rank_lookup = letter_count_rank_dict
    # Sort the letters themselves, keyed by rank. Translating letters to
    # ranks and back again would collapse letters that share a rank into a
    # single letter.
    ordered = sorted(x, key=lambda letter: rank_lookup[letter], reverse=True)
    return ''.join(ordered)
    

In [None]:
# extract letters by ranking
word_df['letter_group_ranked'] = word_df['letter_group'].map(get_least_common_letters)

In [None]:
word_df.head()

### generate the character matrix

In [None]:
# count the occurrences of each letter in each word and store the results in a matrix
# populate the char_matrix and the word_id dictionary
# Apply a function to each row in the dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# Upon initialization, the char_matrix is zero-filled.
# Each row in the char_matrix corresponds to a word.
# The char_matrix is 26 columns wide. Each column corresponds to a letter.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# Each cell is a count of the number of times each letter occurs in each word.  
# the entry for emit (as do the entries for time, mite, item) has the following value:
# [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# we need to find all words that have matching rows with at least these values.
# for example, 'terminator'.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0]

# the zero-filled matrix will be populated once the 
# fill_char_matrix() function is applied to the word_df
char_matrix = np.zeros(shape=(len(word_df), 26), dtype=int)
def fill_char_matrix(row, matrix=None, letter_index=None):
    """Add the letter counts of one word to its row of the character matrix.

    row: dataframe row (or any mapping) with 'lcase' (the word) and
        'word_id' (the matrix row to update).
    matrix: optional 2-D count array, updated in place. Defaults to the
        notebook-level char_matrix.
    letter_index: optional mapping of letter -> column index. Defaults to
        the notebook-level letter_dict.

    Characters that are not in the lookup are skipped.
    Returns None; the result is the in-place change to the matrix.
    """
    if matrix is None:
        matrix = char_matrix
    if letter_index is None:
        letter_index = letter_dict
    ri = row['word_id']  # row index / word index
    for letter in row['lcase']:
        # column index of the letter, or None for characters without a column
        li = letter_index.get(letter)
        if li is not None:
            # increment the count of letters in the current row and column
            matrix[ri, li] += 1
    return None

# catch the output from the function and delete
output = word_df.apply(fill_char_matrix, 1)
del output

In [None]:
# what does it look like?
char_matrix

In [None]:
# how many letters are in use in our words?
char_matrix.sum()

In [None]:
# across all words, how many letters are used?
letter_count_df['total_letter_count'].sum()

In [None]:
# what if we wanted to see how many times the letter 'e' is used?
char_matrix[:, 4].sum()

In [None]:
# this is the same as:
total_letter_counter['e']

In [None]:
# what is the percentage of characters that feature the letter 'e'?
char_matrix[:, 4].sum() / char_matrix.sum()

In [None]:
letter_count_df.head(n=10)

In [None]:
# let's use the char matrix to compute how many words have the letter 'a' in them or the letter 's'. 
# this is different than the number of times each letter is used
# we can save this to our dataframe
single_letter_count = []
for curr_letter, letter_index in letter_dict.items():    
    outcome = np.where(char_matrix[:, letter_index] > 0)
    n_rows = np.shape(outcome)[1]        
    print(curr_letter, n_rows)
    single_letter_count.append(n_rows)

In [None]:
# these are the same values as:
letter_count_df[['letter', 'single_letter_count']].head(n = 26)

# Extract and save the word_group dataframes

In [None]:
# drop duplicates based on the word group. 
# by default, this will only keep the first record and it will drop all others
wg_df = word_df.drop_duplicates(subset = ['word_group_id']).copy()

In [None]:
word_group_counter = collections.Counter(word_df['word_group_id'])

In [None]:
wg_df['word_group_count'] = wg_df['word_group_id'].map(word_group_counter)

### save data to disk - first the char matrix and the letter dictionary

In [None]:
# save the char matrix
output_name = 'char_matrix.npy'
opn = os.path.join(data_output_file_path, output_name)
np.save(file = opn, arr = char_matrix)

In [None]:
# letter dictionary
output_name = 'letter_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = letter_dict)

In [None]:
# Now, the dataframes

In [None]:
# save the word df to sqlite db

In [None]:
# base file path
base_file_path = '/project/finding_anagrams'

In [None]:
# input path
in_file_path = 'data'
in_file_path = os.path.join(base_file_path, in_file_path)

In [None]:
# output db path and name
db_path = 'db'
db_path = os.path.join(base_file_path, db_path)
# The data and tabulation directories are created earlier in the notebook;
# do the same for the db directory so the database file has a place to live.
# NOTE(review): write_data_to_sqlite is defined elsewhere and may already
# create this directory - the call is a no-op when it exists.
os.makedirs(db_path, exist_ok=True)

In [None]:
db_name = 'words.db'

In [None]:
write_data_to_sqlite(df = word_df, table_name = 'words', db_path = db_path, db_name = db_name)

In [None]:
write_data_to_sqlite(df = wg_df, table_name = 'word_groups', db_path = db_path, db_name = db_name)

In [None]:
# now, the word / letter count
write_data_to_sqlite(df = letter_count_df, table_name = 'letter_count', db_path = db_path, db_name = db_name)