In [1]:
# mike babb
# 2022 03 03
# import and format data

In [71]:
# standard
import pickle
import os
from string import ascii_lowercase

In [3]:
# external
import pandas as pd
import numpy as np

In [4]:
# load the word list; the file has no header row, so the single column is named 'word'
df = pd.read_csv('words5.csv', header=None, names=['word'])

In [5]:
# preview the first rows of the loaded word list
df.head()

Unnamed: 0,word
0,aalii
1,Aaron
2,abaca
3,aback
4,abaff


In [6]:
# add a lowercase copy of every word so that letter lookups ignore case
df = df.assign(lcase=df['word'].str.lower())

In [7]:
# check that the new lcase column looks right
df.head()

Unnamed: 0,word,lcase
0,aalii,aalii
1,Aaron,aaron
2,abaca,abaca
3,aback,aback
4,abaff,abaff


In [8]:
# sort case-insensitively (ties broken by the original spelling), then renumber
# the rows: without the reset the index keeps the input-file order, and any ID
# derived from df.index afterwards would not follow the sorted order
df = df.sort_values(by=['lcase', 'word']).reset_index(drop=True)

In [9]:
# use each row's index label as that word's ID
df = df.assign(word_id=df.index)

In [10]:
# check the new word_id column
df.head()

Unnamed: 0,word,lcase,word_id
0,aalii,aalii,0
1,Aaron,aaron,1
2,abaca,abaca,2
3,aback,aback,3
4,abaff,abaff,4


In [11]:
# now, we need to create a matrix that we will use to "score" each word

In [12]:
# one row per word, one column per letter of the alphabet; every cell starts at zero
char_matrix = np.zeros((len(df), 26))

In [14]:
# lookup from each lowercase letter to its 0-based position in the alphabet
# ('a' -> 0, ..., 'z' -> 25); used as the letter's index in the count matrices
letter_dict = {letter: position for position, letter in enumerate(ascii_lowercase)}

In [15]:
# populate the word-by-letter count matrix one DataFrame row at a time;
# a word's word_id is the row of char_matrix that holds its counts
def pop_matrix(row):
    """Add the letter counts of one word to the global ``char_matrix``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'word_id'`` (the row of ``char_matrix`` to update) and
        ``'lcase'`` (the word whose letters are counted).

    Returns
    -------
    None
        The function works by side effect: ``char_matrix`` is modified in
        place, so calling it twice for the same row counts the word twice.

    Raises
    ------
    KeyError
        If the word contains a character that is not a key of ``letter_dict``.
    """
    curr_index = row['word_id']
    for cl in row['lcase']:
        # use our dictionary to find the column that belongs to this letter
        char_matrix[curr_index, letter_dict[cl]] += 1

    return None

In [16]:
# zero the matrix first so that re-running this cell does not double-count
char_matrix.fill(0)
# pop_matrix updates char_matrix in place; the returned Series of None is unused
output = df.apply(pop_matrix, axis=1)

In [18]:
# display the populated matrix (rows are words, columns are the letters a-z)
char_matrix

array([[2., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.],
       [3., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 1., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.]])

In [19]:
# sanity check: total letters counted per word (sum across the letter columns)
rs_check = char_matrix.sum(axis=1)

In [25]:
# total number of times each letter occurs across all words
# (column sums: a letter repeated within a word is counted every time)
letter_count = char_matrix.sum(axis=0)

In [26]:
# one entry per letter, so the shape should be (26,)
letter_count.shape

(26,)

In [28]:
# each letter's share of all letter occurrences, expressed as a percentage
letter_count / letter_count.sum() * 100

array([11.28559968,  2.67950261,  3.59406338,  3.28920979,  9.62896109,
        1.45206578,  2.52908143,  3.10870437,  6.49418371,  0.46730846,
        2.15403129,  5.45326915,  3.16887284,  5.39711191,  6.4921781 ,
        2.9081428 ,  0.19655034,  6.93943041,  5.64380265,  5.33894906,
        4.540714  ,  1.08503811,  1.48415564,  0.46730846,  3.58804653,
        0.61371841])

In [29]:
# map each letter to its total occurrence count (a higher count means a more common letter)
letter_rank_dict = dict(zip(ascii_lowercase, letter_count))

In [30]:
# inspect the letter -> total count mapping
letter_rank_dict

{'a': 5627.0,
 'b': 1336.0,
 'c': 1792.0,
 'd': 1640.0,
 'e': 4801.0,
 'f': 724.0,
 'g': 1261.0,
 'h': 1550.0,
 'i': 3238.0,
 'j': 233.0,
 'k': 1074.0,
 'l': 2719.0,
 'm': 1580.0,
 'n': 2691.0,
 'o': 3237.0,
 'p': 1450.0,
 'q': 98.0,
 'r': 3460.0,
 's': 2814.0,
 't': 2662.0,
 'u': 2264.0,
 'v': 541.0,
 'w': 740.0,
 'x': 233.0,
 'y': 1789.0,
 'z': 306.0}

In [33]:
# count the distinct letters in each word
df['n_unique_chars'] = df['lcase'].map(set).map(len)

In [42]:
# score a word by adding up the counts of its letters
def sum_letters(x):
    """Return the sum of the ``letter_rank_dict`` values of every character in ``x``.

    A character that occurs more than once is added once per occurrence, and
    an empty ``x`` yields 0. Raises ``KeyError`` if a character is not a key
    of the global ``letter_rank_dict``.
    """
    return sum(letter_rank_dict[ch] for ch in x)

In [43]:
# score every word from the letter counts of its characters
df['word_score'] = df['lcase'].apply(sum_letters)

In [44]:
# check the new n_unique_chars and word_score columns
df.head()

Unnamed: 0,word,lcase,word_id,n_unique_chars,word_score
0,aalii,aalii,0,3,20449.0
1,Aaron,aaron,1,4,20642.0
2,abaca,abaca,2,3,20009.0
3,aback,aback,3,4,15456.0
4,abaff,abaff,4,3,14038.0


In [48]:
# rank the words: most distinct letters first, then highest letter score
wdf = df.sort_values(['n_unique_chars', 'word_score'], ascending=[False, False])

In [49]:
# top of the ranking: most distinct letters first, then highest score
wdf.head()

Unnamed: 0,word,lcase,word_id,n_unique_chars,word_score
568,Aries,aries,568,5,19940.0
572,arise,arise,572,5,19940.0
6726,raise,raise,6726,5,19940.0
7430,serai,serai,7430,5,19940.0
592,arose,arose,592,5,19939.0


In [None]:
# aries, arise, raise, and serai are the best words to start with

In [None]:
# save this stuff
# the df, the letter_dict, the char_matrix, and the letter_rank_dict

In [70]:
# persist the scored word table
df.to_pickle('word_df.pkl')

In [72]:
# persist the letter -> alphabet-position lookup
with open('letter_dict.pkl', mode='wb') as pkl_file:
    pickle.dump(letter_dict, pkl_file, pickle.HIGHEST_PROTOCOL)

In [73]:
# persist the word-by-letter count matrix
with open('char_matrix.pkl', mode='wb') as pkl_file:
    pickle.dump(char_matrix, pkl_file, pickle.HIGHEST_PROTOCOL)

In [74]:
# persist the letter -> total occurrence count mapping
with open('letter_rank_dict.pkl', mode='wb') as pkl_file:
    pickle.dump(letter_rank_dict, pkl_file, pickle.HIGHEST_PROTOCOL)

In [50]:
# second approach: tally how often each letter appears in each position of a word
# rows are the 26 letters, columns are the 5 positions; every cell starts at zero
letter_matrix = np.zeros((26, 5))

In [54]:
def pop_letter_matrix(x):
    """Tally the letters of word ``x`` by position in the global ``letter_matrix``.

    For the character at position ``p`` of ``x``, the cell at row
    ``letter_dict[character]`` and column ``p`` is incremented by one.
    The matrix is modified in place and nothing is returned, so calling the
    function twice with the same word counts it twice.
    """
    for position, letter in enumerate(x):
        # row = the letter's index from letter_dict, column = position in the word
        letter_matrix[letter_dict[letter], position] += 1

In [56]:
# reset the tallies so that re-running this cell does not double-count
letter_matrix.fill(0)
# pop_letter_matrix works by side effect; the mapped Series of None is not used
output = df['lcase'].map(pop_letter_matrix)

In [59]:
# confirm the dimensions: 26 letters by 5 positions
letter_matrix.shape

(26, 5)

In [67]:
# for each letter (row), find the position (column) where it occurs most often
# across all words; on a tie argmax reports the earliest position
output = letter_matrix.argmax(axis=1)

In [68]:
# entry i is the 0-based position at which the i-th letter of the alphabet is most common
output

array([1, 0, 0, 4, 4, 0, 0, 1, 1, 0, 4, 2, 0, 2, 1, 0, 0, 2, 0, 4, 1, 2,
       0, 4, 4, 3], dtype=int64)

AttributeError: 'numpy.ndarray' object has no attribute 'head'