# Mike Babb
# babbm@uw.edu
# Find Anagrams
## Part 1: Structure the data

In [None]:
# standard libraries - installed by default
import collections
import itertools
import os
import string

In [None]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [None]:
# custom, user-defined functions
from part_00_process_functions import save_pickle

### set input and output paths

In [None]:
# directory and file name of the input word list
# NOTE(review): absolute, machine-specific path - adjust to the local checkout
in_file_path = '/git/finding_anagrams/data/'
in_file_name = 'words.txt'

In [None]:
# construct the input file path: directory + file name
in_fpn = os.path.join(in_file_path, in_file_name)

In [None]:
# paths to output directories
# 'data' and 'tabulations' are sub-directories of the base output path
# NOTE(review): absolute, machine-specific path - adjust to the local setup
base_output_file_path = '/project/finding_anagrams'
data_output_file_path = os.path.join(base_output_file_path, 'data')
tabulation_output_file_path = os.path.join(base_output_file_path, 'tabulations')

In [None]:
# setup the data output path
# exist_ok=True creates the directory (and any missing parents) when needed
# and does nothing when it is already there
os.makedirs(data_output_file_path, exist_ok=True)

In [None]:
# setup the tabulation output path
# exist_ok=True creates the directory (and any missing parents) when needed
# and does nothing when it is already there
os.makedirs(tabulation_output_file_path, exist_ok=True)

### import list of words, shape data

In [None]:
# use pandas to load the data
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# read every entry as a plain string and switch off missing-value detection:
# words such as 'nan', 'null' and 'NA' are valid entries and would otherwise
# be parsed as missing values instead of as words
print('...Reading in list of words...')
word_df = pd.read_csv(filepath_or_buffer=in_fpn, sep=',', header=None,
                      dtype=str, keep_default_na=False, na_filter=False)

In [None]:
# check the first few rows
word_df.head()

In [None]:
# specify a more appropriate name for the single column
col_names = ['word']
word_df.columns = col_names

In [None]:
# report the number of words that were read in
n_words = word_df.shape[0]
print('...found {:,} words to find anagrams for...'.format(n_words))

In [None]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary. 'nan' is an internal python value.
# same with 'null'
# use the builtin str: the np.str alias has been removed from NumPy
word_df['word'] = word_df['word'].astype(str)

In [None]:
# create lower case values of the words; the 'word' column keeps the original spelling
word_df['lcase'] = word_df['word'].str.lower()

In [None]:
# remove hyphens
# regex=False treats '-' as a literal character regardless of the
# default that the installed pandas version uses for str.replace()
word_df['lcase'] = word_df['lcase'].str.replace('-', '', regex=False)

In [None]:
# keep the first occurrence of every lowercase word and discard the repeats
word_df = word_df.drop_duplicates(subset=['lcase'], keep='first')

In [None]:
# find word length, measured on the lowercase version of the word
word_df['n_chars'] = word_df['lcase'].str.len()

In [None]:
# extract the first letter of each word, taken from the lowercase version
word_df['first_letter'] = word_df['lcase'].str[:1]

In [None]:
# number the words consecutively, starting at zero
word_df['word_id'] = range(len(word_df))

In [None]:
# add a hash id that captures the sorted letters in each word:
# words made of the same letters sort to the same string and
# therefore receive the same hash
word_df['hash_id'] = [hash(''.join(sorted(curr_word))) for curr_word in word_df['lcase']]

In [None]:
word_df.head()

In [None]:
# pair every hash id with a word id using zip
# https://docs.python.org/3/library/functions.html#zip
# when several words share a hash id, the last word id seen is the one kept
hash_id_dict = dict(zip(word_df['hash_id'], word_df['word_id']))

In [None]:
# words that share a hash id (same sorted letters) receive the same word_group_id
word_df['word_group_id'] = word_df['hash_id'].map(hash_id_dict)

In [None]:
# drop the hash id, no longer needed
# name the column explicitly: passing the axis as a positional
# argument is not accepted by current versions of pandas
word_df = word_df.drop(columns='hash_id')

In [None]:
word_df.tail()

In [None]:
# map each lowercase letter to its position in the alphabet ('a' -> 0 ... 'z' -> 25)
# for fast look ups of the matching column index
letter_dict = dict(zip(string.ascii_lowercase, range(len(string.ascii_lowercase))))

In [None]:
# alphabetically sorted list of the letters (iterating a dictionary yields its keys)
letters = sorted(letter_dict)

In [None]:
# reduce each word to its distinct letters, in alphabetical order
word_df['letter_group'] = [''.join(sorted(set(curr_word))) for curr_word in word_df['lcase']]

In [None]:
word_df.head()

### count letter frequency

In [None]:
# several versions of the anagram determination require subsetting by letters in each word. 
# generate those data and use a ranking technique to help with anagram group identification

In [None]:
# use a counter object to count the occurrence of each letter
# counters are a special type of dictionary.
# https://docs.python.org/3/library/collections.html#collections.Counter
# very fast
letter_counter = collections.Counter()
# feed each word to the counter; update() tallies every character of the word
for curr_word in word_df['lcase']:
    letter_counter.update(curr_word)

In [None]:
# make a dataframe from the counter object: one row per letter, indexed by the letter
letter_count_df = pd.DataFrame.from_dict(data=letter_counter, orient = 'index')

In [None]:
# move the letters out of the index and into a regular column
letter_count_df = letter_count_df.reset_index()

In [None]:
# name the two columns: the letter and the number of times it occurs
letter_count_df.columns = ['letter', 'letter_count']

In [None]:
# order from the most frequent letter to the least frequent letter
letter_count_df = letter_count_df.sort_values(by = 'letter_count', ascending = False)

In [None]:
# number the rows in their current order, starting at 1
letter_count_df['rank'] = range(1, len(letter_count_df) + 1)

In [None]:
# share of all counted letters that each letter accounts for (a fraction between 0 and 1)
letter_count_df['letter_percent'] = letter_count_df['letter_count'] / letter_count_df['letter_count'].sum()

In [None]:
letter_count_df.head(n=30)
# j is the least common letter while e is the most common letter

In [None]:
# join with the count of words that start with a focal letter. 

In [None]:
# count the words that start with each letter
# groupby(...).size() is the direct way to count rows per group; handing
# np.size to agg() triggers a FutureWarning in recent versions of pandas
word_count_df = word_df.groupby('first_letter').size().to_frame(name='n_words')

In [None]:
# the single column holds the number of words per first letter
word_count_df.columns = ['n_words']

In [None]:
# move the first letter out of the index and into a regular column
word_count_df = word_count_df.reset_index()

In [None]:
# share of all words that start with each letter (a fraction between 0 and 1)
word_count_df['word_percent'] = word_count_df['n_words'] / word_count_df['n_words'].sum()

In [None]:
word_count_df.head()

In [None]:
letter_count_df.head()

In [None]:
# order from the most common first letter to the least common first letter
word_count_df = word_count_df.sort_values(by='n_words', ascending = False)

In [None]:
# number the rows in their current order, starting at 1
word_count_df['word_count_rank'] = range(1, len(word_count_df) + 1)

In [None]:
# attach the first-letter word counts to the letter frequencies.
# inner join: only letters present in both tables are kept
letter_count_df = letter_count_df.merge(word_count_df, how='inner',
                                        left_on='letter', right_on='first_letter')

In [None]:
# 'first_letter' duplicates 'letter' after the join, so it can go
letter_count_df = letter_count_df.drop(columns=['first_letter'])

In [None]:
letter_count_df.head()

In [None]:
# alphabetical row order and a fixed column order
col_names = ['letter','letter_count','letter_percent','rank','n_words','word_percent','word_count_rank']
letter_count_df = letter_count_df.sort_values(by='letter')[col_names]

In [None]:
# two-way lookup held in a single dictionary:
# letter -> rank and rank -> letter
# {'k':21, 21:'k'}
letter_count_rank_dict = {}
for cl, clr in zip(letter_count_df['letter'], letter_count_df['rank']):
    letter_count_rank_dict.update({cl: clr, clr: cl})

In [None]:
# what letter is ranked 21st?
letter_count_rank_dict[21]

In [None]:
# what is the rank of letter k?
letter_count_rank_dict['k']

In [None]:
# order the unique letters in each word from the
# least common letter to the most common letter
def get_least_common_letters(x):
    """Return the letters of x ordered from highest rank number to lowest.

    letter_count_rank_dict is used in both directions: first to turn each
    letter into its rank and then to turn each rank back into its letter.
    A string with exactly one character is returned as is, without a look up.
    """
    # nothing to order
    if len(x) == 1:
        return x
    # rank of every letter, largest rank number first
    ranks_descending = sorted((letter_count_rank_dict[curr_letter] for curr_letter in x),
                              reverse=True)
    # translate the ordered ranks back into letters
    return ''.join(letter_count_rank_dict[curr_rank] for curr_rank in ranks_descending)
    

In [None]:
# extract letters by ranking: reorder the unique letters of each word
# with get_least_common_letters()
word_df['letter_group_ranked'] = word_df['letter_group'].map(get_least_common_letters)

In [None]:
word_df.tail()

### generate the character matrix

In [None]:
# count the occurrences of each letter in each word and store the results in a matrix
# populate the char_matrix and the word_id dictionary
# Use the apply function to the word_df. Effectively, apply a function to each row in the 
# dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# Upon initialization, the char_matrix is all zero.
# the entry for emit (as do the entries for time, mite, item) has the following value:
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# we need to find all words that have matching rows with at least these values.
# for example, 'terminator'.
# ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
# [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0]

# both containers start out empty and are filled in once the
# score_word() function is applied to the word_df
# one row per word and one column per letter, every count starts at zero
char_matrix = np.zeros((len(word_df), 26), dtype=np.int32)
# keyed by word_id once populated
word_dict = dict()
def score_word(row):
    """Record one word in word_dict and count its letters in char_matrix.

    row must provide 'lcase', 'word_id', 'n_chars', 'first_letter',
    'letter_group' and 'letter_group_ranked'. Both module-level containers
    are modified in place; the function always returns None.
    """
    curr_word = row['lcase']
    # the word id doubles as the row index of the char matrix
    ri = row['word_id']
    # word, word length, first letter, letter group, letter group ranked
    word_dict[ri] = (
        curr_word,
        row['n_chars'],
        row['first_letter'],
        row['letter_group'],
        row['letter_group_ranked'],
    )
    # populate the char matrix
    for letter in curr_word:
        # column index of the letter; characters without a column are skipped
        li = letter_dict.get(letter)
        if li is not None:
            char_matrix[ri, li] += 1
    return None

# score_word() works through side effects and returns None for every row;
# bind the result of apply() to a name and then discard it
output = word_df.apply(score_word, axis=1)
del output

In [None]:
# how many letters are in use in our words?
char_matrix.sum()

In [None]:
# what about if we wanted to see how many times the letter 'e' is used?
char_matrix[:, 4].sum()

In [None]:
# this is the same as:
letter_counter['e']

In [None]:
# what is the percentage of characters that feature the letter 'e'?
char_matrix[:, 4].sum() / char_matrix.sum()

In [None]:
# for every letter, print the number of words that contain it at least once
for curr_letter, letter_index in letter_dict.items():
    has_letter = char_matrix[:, letter_index] > 0
    n_rows = int(has_letter.sum())
    print(curr_letter, n_rows)

### save data to disk

In [None]:
# write the char matrix to the data output directory in numpy's .npy format
output_name = 'char_matrix.npy'
opn = os.path.join(data_output_file_path, output_name)
np.save(opn, char_matrix)

In [None]:
# letter dictionary: the letter -> column index look up
# save_pickle() is imported from part_00_process_functions; presumably it
# pickles obj to file_path/file_name - confirm against that module
output_name = 'letter_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = letter_dict)

In [None]:
# word dictionary: word_id -> (word, length, first letter, letter group, letter group ranked)
# save_pickle() is imported from part_00_process_functions; presumably it
# pickles obj to file_path/file_name - confirm against that module
output_name = 'word_dict.pkl'
save_pickle(file_path = data_output_file_path, file_name = output_name, obj = word_dict)

In [None]:
# save the word df
# NOTE: the file is tab-separated even though the extension is .csv
output_name = 'word_df.csv'
opn = os.path.join(data_output_file_path, output_name)
word_df.to_csv(path_or_buf = opn, sep = '\t', header = True, index = False)

### count sub-matrices for processing option 2

In [None]:
# option 2: by letter count
# number of words for each word length, as the columns 'n_chars' and 'n_words'.
# groupby(...).size() is the direct way to count rows per group; handing
# np.size to agg() triggers a FutureWarning in recent versions of pandas
df02 = word_df.groupby('n_chars').size().reset_index(name='n_words')

In [None]:
# longest words first
df02 = df02.sort_values(by='n_chars', ascending = False)

In [None]:
# running total of n_words in the current row order
# this determines the number of rows in each submatrix
df02['total_words'] = df02['n_words'].cumsum()

In [None]:
# back to shortest words first
df02 = df02.sort_values(by='n_chars')

In [None]:
df02.head()

### count sub-matrices for processing option 3

In [None]:
# extract the word_ids as a numpy array
word_id_list = word_df['word_id'].to_numpy()

In [None]:
# option 3: by letter count and presence of first letter
# one row for every distinct combination of word length and first letter
first_letter_df = word_df[['n_chars', 'first_letter']].drop_duplicates()

In [None]:
# how many sub-matrices?
n_sub_matrices = len(first_letter_df)
print('...creating', n_sub_matrices, 'sub matrices')
output_list = []

# look up tables to expedite this: the set of rows for each component only has
# to be determined once, after that it is read back from a dictionary.
# word ids of the words with at least a given number of characters
n_char_word_id_list_dict = {}
# word ids of the words that contain a given first letter
fl_word_id_list_dict = {}

loop_count = 0
for n_chars, fl in zip(first_letter_df['n_chars'], first_letter_df['first_letter']):

    # word ids by character length - determined on first use, then stored
    curr_n_char_word_id_set = n_char_word_id_list_dict.get(n_chars)
    if curr_n_char_word_id_set is None:
        # use the word_df to find the words that are long enough
        long_enough = word_df['n_chars'] >= n_chars
        curr_n_char_word_id_set = set(word_df.loc[long_enough, 'word_id'].tolist())
        n_char_word_id_list_dict[n_chars] = curr_n_char_word_id_set

    # word ids by letter match - determined on first use, then stored
    curr_letter_select_word_id_set = fl_word_id_list_dict.get(fl)
    if curr_letter_select_word_id_set is None:
        # column indices of the letters to test for
        column_selector = [letter_dict[curr_letter] for curr_letter in fl]
        # True for the rows of the char_matrix in which each of the
        # selected letters occurs at least once
        outcome_indices = (char_matrix[:, column_selector] > 0).all(axis=1)
        # the rows line up with the word_id_list, so the mask subsets it directly
        curr_letter_select_word_id_set = set(word_id_list[outcome_indices])
        fl_word_id_list_dict[fl] = curr_letter_select_word_id_set

    # words that meet both criteria
    curr_word_id_set = curr_n_char_word_id_set & curr_letter_select_word_id_set
    curr_nrows = len(curr_word_id_set)
    output_list.append([fl, n_chars, curr_nrows])

    # progress indicator
    loop_count += 1
    if loop_count % 100 == 0:
        print(loop_count)

In [None]:
# one row per sub-matrix: the focal letter, the word length and the number of words
df03 = pd.DataFrame(data=output_list, columns = ['focal_letter', 'n_chars', 'n_words'])

In [None]:
# reshape to wide: one row per focal letter, one column per word length,
# zero where a combination does not occur
df03_wide = df03.pivot_table(values='n_words',
                             index='focal_letter',
                             columns='n_chars',
                             fill_value=0)

In [None]:
# move the focal letter out of the index and into a regular column
df03_wide = df03_wide.reset_index()

In [None]:
df03_wide.head()

In [None]:
df03['n_words'].describe()

### count sub-matrices for processing option 4

In [None]:
# option 4: by letter count and presence of the two least common letters
# the selector is made of the first n_common_letters characters of letter_group_ranked
n_common_letters = 2
word_df['letter_selector'] = word_df['letter_group_ranked'].str[:n_common_letters]
# one row for every distinct combination of word length and letter selector
letter_selector_df = word_df[['n_chars', 'letter_selector']].drop_duplicates()

In [None]:
n_sub_matrices = len(letter_selector_df)
print('...creating', n_sub_matrices, 'sub matrices')

output_list = []
# look up tables, filled on first use.
# word ids of the words with at least a given number of characters
n_char_word_id_list_dict = {}
# word ids of the words that contain all letters of a given letter selector
ls_word_id_list_dict = {}

loop_count = 0
for n_chars, ls in zip(letter_selector_df['n_chars'], letter_selector_df['letter_selector']):

    # word ids by character length - determined on first use, then stored
    curr_n_char_word_id_set = n_char_word_id_list_dict.get(n_chars)
    if curr_n_char_word_id_set is None:
        long_enough = word_df['n_chars'] >= n_chars
        curr_n_char_word_id_set = set(word_df.loc[long_enough, 'word_id'].tolist())
        n_char_word_id_list_dict[n_chars] = curr_n_char_word_id_set

    # word ids by presence of the selector letters - determined on first use, then stored
    curr_letter_select_word_id_set = ls_word_id_list_dict.get(ls)
    if curr_letter_select_word_id_set is None:
        # column indices of the letters to test for
        column_selector = [letter_dict[curr_letter] for curr_letter in ls]
        # True for the rows of the char_matrix in which each of the
        # selected letters occurs at least once
        outcome_indices = (char_matrix[:, column_selector] > 0).all(axis=1)
        # the rows line up with the word_id_list, so the mask subsets it directly
        curr_letter_select_word_id_set = set(word_id_list[outcome_indices])
        ls_word_id_list_dict[ls] = curr_letter_select_word_id_set

    # words that meet both criteria
    curr_word_id_set = curr_n_char_word_id_set & curr_letter_select_word_id_set
    curr_nrows = len(curr_word_id_set)
    # store the counts in the list
    output_list.append([ls, n_chars, curr_nrows])

    # progress indicator
    loop_count += 1
    if loop_count % 100 == 0:
        print(loop_count)

In [None]:
# one row per sub-matrix: the letter selector, the word length and the number of words
df04 = pd.DataFrame(data=output_list, columns = ['letter_group', 'n_chars', 'n_words'])

In [None]:
# reshape to wide: one row per letter group, one column per word length,
# zero where a combination does not occur; the letter group then becomes
# a regular column again
df04_wide = df04.pivot_table(values='n_words',
                             index='letter_group',
                             columns='n_chars',
                             fill_value=0).reset_index()

In [None]:
df04_wide.head()

### save the counts of sub-matrices to an excel file

In [None]:
# save the dataframe statistics to an excel file in the tabulation output directory
# the writer object is shared by the to_excel() calls that follow
e_writer_file_name = 'matrix_extraction_option_counts.xlsx'
e_writer_file_path_name = os.path.join(tabulation_output_file_path, e_writer_file_name)
e_writer = pd.ExcelWriter(path=e_writer_file_path_name)

In [None]:
# option 2: word counts by word length
df02.to_excel(excel_writer = e_writer, sheet_name = 'me_02', index = False)

In [None]:
# option 3: sub-matrix sizes, one row per sub-matrix
df03.to_excel(excel_writer = e_writer, sheet_name = 'me_03', index = False)

In [None]:
# option 3: sub-matrix sizes, wide layout
df03_wide.to_excel(excel_writer = e_writer, sheet_name = 'me_03_wide', index = False)

In [None]:
# option 4: sub-matrix sizes, one row per sub-matrix
df04.to_excel(excel_writer = e_writer, sheet_name = 'me_04', index = False)

In [None]:
# option 4: sub-matrix sizes, wide layout
df04_wide.to_excel(excel_writer = e_writer, sheet_name = 'me_04_wide', index = False)

In [None]:
# sort and reorder the the letter count df columns

In [None]:
# alphabetical row order and a fixed column order for the export
col_names = ['letter','letter_count','letter_percent','rank','n_words','word_percent','word_count_rank']
letter_count_df = letter_count_df.sort_values(by='letter').loc[:, col_names]

In [None]:
# letter frequencies, ranks and first-letter word counts
letter_count_df.to_excel(excel_writer = e_writer, sheet_name = 'letter_rank', index = False)

In [None]:
# save and close the excel file object
# close() writes the workbook to disk and releases the file handle; the
# separate ExcelWriter.save() call is not available in current versions of pandas
e_writer.close()