# Module Imports

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import words

# Record the library versions this notebook was run with, one per line.
print(
    f'pandas: {pd.__version__}',
    f're: {re.__version__}',
    f'nltk: {nltk.__version__}',
    sep='\n',
)

pandas: 1.4.1
re: 2.2.1
nltk: 3.7


# Global Variables

In [7]:
# Exact number of characters a word must have to be kept; the length filters
# applied to each word source in this notebook compare against this value.
word_length = 5

# Kaggle Dataset
Obtained From:
https://www.kaggle.com/rtatman/english-word-frequency  
Accessed: 03/07/2022

In [8]:
# Load the Kaggle unigram-frequency table.
# keep_default_na=False stops pandas from parsing word strings such as
# "nan" or "null" as missing values; with the default NA handling those rows
# would be read as NaN and then turned into the literal text "NAN" by the
# string conversion below.
# NOTE(review): this also disables NA parsing for the count column — assumes
# that column has no blank cells; confirm against the file.
kaggle = pd.read_csv('Data-Original/unigram_freq.csv', keep_default_na=False)
print(len(kaggle))
# Upper-case the words (the other word sources in this notebook are
# upper-cased as well, so they can be compared directly).
kaggle['word'] = kaggle['word'].astype('str').str.upper()
kaggle.head()

333333


Unnamed: 0,word,count
0,THE,23135851162
1,OF,13151942776
2,AND,12997637966
3,TO,12136980858
4,A,9081174698


In [9]:
# Keep only the rows whose word has exactly `word_length` characters.
mask = kaggle['word'].map(len) == word_length
kaggle = kaggle.loc[mask]
print(kaggle.shape[0])
# Seed the master list from the filtered Kaggle rows, renaming the frequency
# column to 'wordFreq'.
master_list = kaggle.rename(columns={'count': 'wordFreq'})
master_list.head()

39933


Unnamed: 0,word,wordFreq
35,ABOUT,1226734006
45,OTHER,978481319
56,WHICH,810514085
57,THEIR,782849411
62,THERE,701170205


# Import wordfrequency.info Data
Obtained From:
https://www.wordfrequency.info/samples.asp  
Accessed: 03/06/2022  

In [10]:
# Read the wordfrequency.info sample workbook, report its row count and
# preview the first rows.
lemma_workbook = 'Data-Original/lemmas_60k_words.xlsx'
word_freq = pd.read_excel(lemma_workbook)
print(word_freq.shape[0])
word_freq.head()

10125


Unnamed: 0,lemRank,lemma,PoS,lemFreq,wordFreq,word
0,5,of,i,23159162,23159162,of
1,15,do,v,8186412,4501047,do
2,15,do,v,8186412,1889734,did
3,15,do,v,8186412,964997,does
4,15,do,v,8186412,461455,doing


## Filter and Transform

In [11]:
# Keep only the word and its frequency, with a fixed dtype for each and the
# words upper-cased.
# NOTE(review): after dropping the other columns and stripping punctuation the
# same word may appear on several rows; duplicates are not removed here —
# confirm whether that is intended.
word_freq = word_freq[['word', 'wordFreq']].copy()
word_freq['wordFreq'] = word_freq['wordFreq'].astype('int')
word_freq['word'] = word_freq['word'].astype('str').str.upper()

# Remove punctuation (every character that is neither a word character nor
# whitespace)
word_freq['word'] = word_freq['word'].str.replace(r'[^\w\s]', '', regex=True)
# Filter by word length
mask = word_freq['word'].str.len() == word_length
word_freq = word_freq[mask]
print(len(word_freq))
# Filter out words master_list already has. isin() performs the lookup in one
# vectorised pass instead of rebuilding list(master_list['word']) for every row.
mask = word_freq['word'].isin(master_list['word'])
word_freq = word_freq[~mask]
print(len(word_freq))

word_freq.head()

680
15


Unnamed: 0,word,wordFreq
5215,UNSAY,19
6408,EXCOP,239
6664,FWORD,231
6815,WILED,8
7051,NIQAB,197


In [12]:
# Append the remaining wordfrequency.info rows to the master list, printing
# the row count before and after.
print(master_list.shape[0])
master_list = pd.concat((master_list, word_freq), ignore_index=True)
print(master_list.shape[0])

39933
39948


# NLTK Data
Import  
Filter and Transform

In [13]:
# Word list from NLTK's `words` corpus.
nltk_list = words.words()
print(len(nltk_list))

# Keep words of the target length, upper-cased.
# NOTE(review): upper-casing can make case variants collide; duplicates are
# not removed here — confirm whether that is intended.
nltk_list = [x.upper() for x in nltk_list if len(x) == word_length]
print(len(nltk_list))
# Build the lookup set once; the previous form rebuilt
# list(master_list['word']) and scanned it linearly for every candidate word.
known_words = set(master_list['word'])
nltk_list = [x for x in nltk_list if x not in known_words]
print(len(nltk_list))
nltk_list[:10]

236736
10422
3919


['AALII',
 'ABAFF',
 'ABAMA',
 'ABASH',
 'ABASK',
 'ABAVE',
 'ABAZE',
 'ABDAT',
 'ABEAR',
 'ABILO']

## Combine Data

In [14]:
# The NLTK-only words have no frequency information, so they are added with a
# frequency of 0.
nltk_rows = pd.DataFrame(data={'word': nltk_list, 'wordFreq': [0] * len(nltk_list)})
combined = pd.concat([master_list, nltk_rows], ignore_index=True)
# Order the combined table alphabetically by word.
master_list = combined.sort_values(by='word')

print(master_list.shape[0])
print(master_list.head())
print(master_list.tail())

43867
        word  wordFreq
17094  AAAAH     52821
22206  AAAAI     33992
15752  AAACE     60117
8831   AAACN    161080
30023  AAAHH     20882
        word  wordFreq
18002  ZYVEX     48519
27222  ZYVOX     24931
4277   ZYXEL    591121
24718  ZZINE     28813
21533  ZZRIL     35796


# Add Wordle Wordlist
Obtained From: 
https://www.nytimes.com/games/wordle/index.html  
Inspect -> Application -> vars Ma and Oa  
Accessed: 03/10/22

In [15]:
# Load the Wordle word list: strip newlines, double quotes and commas,
# upper-case the text and split it on single spaces.
# NOTE(review): assumes entries in the file are separated by exactly one
# space once the quotes and commas are removed — confirm against the file.
with open('Data-Original/wordle_words_03_10_22.txt', 'r') as file:
    wordle_words = file.read().replace('\n', '').replace('"', '').replace(',', '').upper().split(' ')

# Keep only the master_list rows whose word is in the Wordle list. isin()
# replaces a per-row linear scan of the Python list, and filtering first
# avoids copying the whole master list before discarding most of it.
mask = master_list['word'].isin(wordle_words)
only_wordle_words = master_list[mask].copy()
print(len(only_wordle_words))

9531


# Save Preprocessed Data

In [16]:
# Write the full table and the Wordle-only subset to CSV, leaving out the
# DataFrame index.
exports = (
    (master_list, 'Data-Preprocessed/word_freq.csv'),
    (only_wordle_words, 'Data-Preprocessed/word_freq_wordle_only.csv'),
)
for frame, destination in exports:
    frame.to_csv(destination, index=False)