In [None]:
# mike babb
# 2022 03 03
# import and format data

In [None]:
# standard
import pickle
import os
from string import ascii_lowercase

In [None]:
# external
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
# import the word list: the csv has no header row, so the single column is named 'word'
df = pd.read_csv(filepath_or_buffer='words5.csv', header = None, names = ['word'])

In [None]:
df.head()

In [None]:
# make everything lowercase
df['lcase'] = df['word'].str.lower()

In [None]:
df.head()

In [None]:
# let's sort, and then create an ID
# sort_values keeps the old index labels, so renumber the rows here:
# the index (which becomes the word_id) then follows the sorted order
df = df.sort_values(by = ['lcase', 'word']).reset_index(drop = True)

In [None]:
df['word_id'] = df.index

In [None]:
df.head()

In [None]:
# now, we need to create a matrix that we will use to "score" each word

In [None]:
df.shape

In [None]:
char_matrix = np.zeros(shape = (len(df), 26))

In [None]:
# lookup table: letter -> its position in the alphabet (a is 0, z is 25)
letter_dict = dict(zip(ascii_lowercase, range(len(ascii_lowercase))))

In [None]:
# fill char_matrix one word at a time:
# the word_id of a row picks the matrix row, each letter picks the column
def pop_matrix(row):
    """Add the letter counts of one word to the global char_matrix.

    row: one row of df; row['word_id'] is used as the matrix row and
         row['lcase'] is the word whose letters are counted.
    Returns None; char_matrix is updated in place.
    """
    matrix_row = row['word_id']
    for letter in row['lcase']:
        # use our dictionary to find the column of this letter
        char_matrix[matrix_row, letter_dict[letter]] += 1

    return None

In [None]:
# pop_matrix adds to char_matrix in place, so clear it first:
# otherwise re-running this cell would count every word again
char_matrix.fill(0)
output = df.apply(pop_matrix, axis = 1)

In [None]:
char_matrix

In [None]:
# row sum to check!
# each row of char_matrix must add up to the number of letters in its word
rs_check = char_matrix.sum(1)
# line the matrix rows up with df through word_id before comparing
word_lengths = df['lcase'].str.len().to_numpy()
assert np.array_equal(rs_check[df['word_id'].to_numpy()], word_lengths), \
    'char_matrix row sums do not match the word lengths'

In [None]:
# okay! now... count how often each letter occurs across all words
# (a letter that repeats within a word is counted every time it occurs)
# do a column sum
letter_count = char_matrix.sum(0)

In [None]:
# should be 26!
letter_count.shape

In [None]:
(letter_count / letter_count.sum()) * 100

In [None]:
# pair every letter with its count in a dictionary
letter_rank_dict = dict(zip(ascii_lowercase, letter_count))

In [None]:
letter_rank_dict

In [None]:
letter_rank_df = pd.DataFrame.from_dict(data = letter_rank_dict, orient = 'index', 
                                       columns = ['score']).reset_index()

In [None]:
letter_rank_df.head()

In [None]:
letter_rank_df.columns = ['letter', 'score']

In [None]:
letter_rank_df['score_percent'] = letter_rank_df['score']* 100 / letter_rank_df['score'].sum()

In [None]:
letter_rank_df.head()

In [None]:
letter_rank_df['score_percent'].describe()

In [None]:
letter_rank_df['is_vowel'] = int(0)

In [None]:
letter_rank_df['colors'] = 'black'

In [None]:
# vowels has to exist before it is used here; it was only assigned in a later cell,
# which raises a NameError when the notebook is run top to bottom on a fresh kernel
vowels = ['a', 'e', 'i', 'o', 'u', 'y']
letter_rank_df.loc[letter_rank_df['letter'].isin(vowels), 'colors'] = 'red'

In [None]:
vowels = ['a', 'e', 'i', 'o', 'u', 'y']
letter_rank_df.loc[letter_rank_df['letter'].isin(vowels), 'is_vowel'] = 1

In [None]:
# plot the share of each letter as a bar chart and save it as a png
#plt.style.use('_mpl-gallery')
%matplotlib inline 
fig, ax = plt.subplots(tight_layout = True)

# one bar per letter, colored by the 'colors' column of letter_rank_df
ax.bar(x=letter_rank_df['letter'], height=letter_rank_df['score_percent'],
       width=1, edgecolor="white", linewidth=1, color = letter_rank_df['colors'])

# the y axis is fixed to 0-12, so a bar above 12 percent would be cut off
ax.set(xlim=(-1, 26), xticks=np.arange(0, 26),
       ylim=(0, 12), yticks=np.arange(0, 12))
ax.set_title('Frequency of Letter Occurrence in 5-letter words')
ax.set_xlabel('Letter')
ax.set_ylabel('Frequency (%)')

# NOTE(review): matplotlib documents pad_inches as applying only together with
# bbox_inches='tight' - confirm that the padding has an effect here
plt.savefig(fname = 'letter_score.png', format = 'png',
            dpi=96, pad_inches = .5, facecolor='white')
#plt.show()

In [None]:
letter_rank_df = letter_rank_df.sort_values(by = 'score', ascending=False)

In [None]:
letter_rank_df.head(10)

In [None]:
# now, score each word
df['n_unique_chars'] = df['lcase'].map(lambda x: len(set(x)))

In [None]:
# now, add up the score of each letter.
def sum_letters(x):
    """Return the sum of letter_rank_dict[letter] over every letter of x.

    A letter that occurs more than once in x is added once per occurrence.
    """
    return sum(letter_rank_dict[letter] for letter in x)

In [None]:
df['word_score'] = df['lcase'].map(sum_letters)

In [None]:
df.head()

In [None]:
sorted('bat')

In [None]:
# identify the word group: words built from the same letters (anagrams) share a key
# the sorted letters themselves are the key. hash() on strings is salted per Python
# process, so hashed ids would differ between kernel sessions (and in the pickled df)
df['word_group'] = df['lcase'].map(lambda x: ''.join(sorted(x)))

In [None]:
# now, sort
wdf = df.sort_values(by = ['n_unique_chars', 'word_score'], ascending = False)

In [None]:
wdf.head()

In [None]:
wg_df = wdf.loc[wdf['n_unique_chars'] == 5, ['word_group', 'word_score']].drop_duplicates()

In [None]:
wg_df = wg_df.sort_values(by = ['word_score'], ascending = False )

In [None]:
# NOTE: despite the name, this keeps the first ten word groups of wg_df, not five
top_five_wg = wg_df['word_group'].iloc[:10]

In [None]:
top_five_wg

In [None]:
# list the words that belong to each of the selected word groups
for wg in top_five_wg:
    in_group = wdf['word_group'] == wg
    curr_word = wdf.loc[in_group, 'lcase'].tolist()
    print(curr_word)
    

In [None]:
wg_df.head()

In [None]:
wg_df.head()

In [None]:
# aries, arise, raise, and serai are the best words to start with

In [None]:
# save this stuff
# the df, the letter_dict, the char_matrix, and the letter_rank_dict

In [None]:
df.to_pickle(path = 'word_df.pkl')

In [None]:
with open('letter_dict.pkl', 'wb') as handle:
    pickle.dump(letter_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('char_matrix.pkl', 'wb') as handle:
    pickle.dump(char_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('letter_rank_dict.pkl', 'wb') as handle:
    pickle.dump(letter_rank_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# now, let's try this another way: count the letters in each position
letter_matrix = np.zeros(shape = (26, 5))

In [None]:
def pop_letter_matrix(x):
    """Count every letter of the word x at its position in the global letter_matrix.

    The row of letter_matrix is the letter (looked up in letter_dict) and the
    column is the position of that letter within x.
    Returns None; letter_matrix is updated in place.
    """
    for position, letter in enumerate(x):
        # row is the letter, column is where it sits in the word
        letter_row = letter_dict[letter]
        letter_matrix[letter_row, position] += 1

    return None

In [None]:
# pop_letter_matrix adds to letter_matrix in place, so clear it first:
# otherwise re-running this cell would count every word again
letter_matrix.fill(0)
output = df['lcase'].map(pop_letter_matrix)

In [None]:
letter_matrix.shape

In [None]:
# now, we use argmax to find the column with the largest count in each row
# this gives, for each letter, the position in the word where it occurs most often
output = np.argmax(a = letter_matrix, axis = 1)

In [None]:
output