In [1]:
import numpy as np
import glob
import sys
import time
from itertools import chain

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/tigress/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tigress/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tigress/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/tigress/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# A bunch of utility functions.

def depunctuate(word, unwanted_chars="_'.,/;[]\-=`~_+{}|:<>?()@#$%^&* "):
    '''
    Remove all instances of each char in <unwanted_chars> from <word>.
    <unwanted_chars> can be either a string or a list.
    '''
    
    newWord = ''.join(i for i in word if not i in unwanted_chars)
    return newWord

def isAllDigits(word):
    '''
    Return True if the word consists solely of numbers.
    '''
    for letter in word:
        if letter not in "0123456789":
            return False
    return True

# 1. Combine words from a bunch of datasets.

In [21]:
unfiltered = []

## WordNet

In [22]:
# https://stackoverflow.com/a/33538878
lemmas_in_wn = set(chain(*[ss.lemma_names() for ss in wn.all_synsets()]))
nLemmas = len(lemmas_in_wn)

# Get rid of numbers.
temp = []
for lemma in lemmas_in_wn:
    if not isAllDigits(lemma): temp.append(lemma)
lemmas_in_wn = temp

# Get rid of words less than 3 chars long.
lemmas = [word for word in lemmas_in_wn if len(word) > 2]
wordLengths = [len(word) for word in lemmas]
minLength = min(wordLengths)
maxLength = max(wordLengths)
print(len(lemmas))

# unfiltered += lemmas

148140


In [26]:
# Save just the WordNet words into 'scripts/prefilter_data/_.txt'
incr = 1000
for i in range(len(lemmas) // incr):
    with open('./prefilter_data/wordnet_%d.txt' %i, 'w') as f:
        for word in lemmas[incr*i:incr*(i+1)]:
            f.write(word)
            f.write("\n")

## Broda

In [3]:
broda_list = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    score = int(score)
    # Most of the smaller words are junk. Would rather just manually add my own.
    if (len(word) >= 5 and score >= 50): broda_list.append(word)
        
# unfiltered += broda_list

In [4]:
# Get rid of duplicates.
temp = [depunctuate(word).upper() for word in broda_list]
temp = list(set(broda_list))
broda_list = temp
print(len(broda_list))

509740


In [6]:
# Save the Broda words into 'scripts/prefilter_data/_.txt'
broda_list = sorted(broda_list, key=len)
incr = 1000
for i in range(len(broda_list) // incr):
    with open('./prefilter_data/broda_%d.txt' %i, 'w') as f:
        for word in broda_list[incr*i:incr*(i+1)]:
            f.write(word)
            f.write("\n")

# Generate final wordlist

In [3]:
# Save de-punctuated, capitalized words from the filtered lists (the files I manually compiled)
# into files sorted by category and wordlength (in that order of hierarchy) 
# under "data" (this is supposed to be the final version.)

files = glob.glob('./filtered_data/*/*.txt', recursive = True)

all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

print("Number of words: %d" %len(all_words))

all_words = [depunctuate(word).upper() for word in all_words]
all_words = set(all_words)
print("Number of words: %d" %len(all_words))

# No categories for now. Just sort by wordlength.
wordLengths = [len(word) for word in all_words]
minLength = min(wordLengths)
maxLength = max(wordLengths)
print("min length: %d\t max length: %d" %(minLength,maxLength))

for i in range(minLength,maxLength+1):
    n = len([word for word in all_words if len(word)==i])
    print("# of words of length %d: %d" %(i, n))

for i in range(minLength, maxLength+1):
    filename = "../data/words_%d.txt" %i
    words = [word for word in all_words if len(word)==i]
    
    with open(filename, 'w') as f:
        for word in words:
            f.write(word)
            f.write("\n")

Number of words: 8656
Number of words: 6665
min length: 0	 max length: 31
# of words of length 0: 1
# of words of length 1: 0
# of words of length 2: 8
# of words of length 3: 388
# of words of length 4: 391
# of words of length 5: 487
# of words of length 6: 792
# of words of length 7: 898
# of words of length 8: 967
# of words of length 9: 590
# of words of length 10: 544
# of words of length 11: 385
# of words of length 12: 318
# of words of length 13: 194
# of words of length 14: 328
# of words of length 15: 100
# of words of length 16: 69
# of words of length 17: 48
# of words of length 18: 22
# of words of length 19: 18
# of words of length 20: 6
# of words of length 21: 10
# of words of length 22: 5
# of words of length 23: 3
# of words of length 24: 2
# of words of length 25: 3
# of words of length 26: 1
# of words of length 27: 25
# of words of length 28: 34
# of words of length 29: 16
# of words of length 30: 11
# of words of length 31: 1


# Auxiliary

In [33]:
# Get only Broda words of length 4. Compare against the finalized words we have so far.

N = 4

files = glob.glob('./filtered_data/*/*.txt', recursive = True)
all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

all_words = [depunctuate(word).upper() for word in all_words]
all_words = [word for word in all_words if len(word)==N]
all_words = set(all_words)
print("Number of words: %d" %len(all_words))

broda_words = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    score = int(score)
    if (len(word) == N): broda_words.append(word)

broda_words = set(broda_words)
print("Number of words: %d" %len(broda_words))

final_words = []
for word in broda_words:
    if word in all_words: continue
    final_words.append(word)
    
print("Number of words: %d" %len(final_words))

# incr = 1000
# offset = 4000
# nFiles = len(final_words) // incr + 1
# for i in range(nFiles):
#     with open('./prefilter_data/broda_%d.txt' %(offset+i), 'w') as f:
#         if i == nFiles-1: 
#             for word in final_words[incr*i:]:
#                 f.write(word)
#                 f.write("\n")
#         else:
#             for word in final_words[incr*i:incr*(i+1)]:
#                 f.write(word)
#                 f.write("\n")

# Get the last words past 12000 that I forgot before
# incr = 1000
# offset = 4000
# nFiles = len(final_words) // incr + 1
# i = nFiles-1
# with open('./prefilter_data/broda_4012.txt', 'w') as f:
#     for word in final_words[incr*i:]:
#         f.write(word)
#         f.write("\n")

Number of words: 1305
Number of words: 12442
Number of words: 11137


In [21]:
# After filtering some Broda words, determine how accurate the given scores are.

N = 4
# Get all filtered 4-letter words.
files = glob.glob('./filtered_data/broda/filtered_broda_4???.txt', recursive = True)
all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

all_words = [depunctuate(word).upper() for word in all_words]
all_words = [word for word in all_words if len(word)==N]
all_words = set(all_words)
print("Number of words: %d" %len(all_words))

broda_words = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    score = int(score)
    if (len(word) == N and score >= 50): broda_words.append(word)

broda_words = set(broda_words)
print("Number of words: %d" %len(broda_words))

nFilteredWithHighScores = 0
filteredWithLowScores = []
nScoredWereFiltered = 0
for word in broda_words:
    if word in all_words: nScoredWereFiltered += 1
for word in all_words:
    if word in broda_words: nFilteredWithHighScores += 1
    else: filteredWithLowScores.append(word)

print("nScoredWereFiltered: %d" %nScoredWereFiltered)
print("nFilteredWithHighScores: %d" %nFilteredWithHighScores)
print("Precision: %f" %(nScoredWereFiltered/len(broda_words)))
print("Recall: %f" %(nFilteredWithHighScores/len(all_words)))
print(filteredWithLowScores)

Number of words: 914
Number of words: 9295
nScoredWereFiltered: 912
nFilteredWithHighScores: 912
Precision: 0.098117
Recall: 0.997812
['USPS', 'HEME']


In [34]:
# Winnow down the remaining un-filtered Broda words to ones with scores >= 50,
# since it was determined above that all words that I like have scores at least 50.

N = 4
# Get all filtered 4-letter words.
files = glob.glob('./prefilter_data/broda_4???.txt', recursive = True)
all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

all_words = [depunctuate(word).upper() for word in all_words]
all_words = [word for word in all_words if len(word)==N]
all_words = set(all_words)
print("Number of words: %d" %len(all_words))

broda_words = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    score = int(score)
    if (len(word) == N and (word in all_words) and score >= 50): broda_words.append(word)

broda_words = list(set(broda_words))
print("Number of words: %d" %len(broda_words))

incr = 1000
offset = 4013
nFiles = len(broda_words) // incr + 1
for i in range(nFiles):
    with open('./prefilter_data/broda_%d.txt' %(offset+i), 'w') as f:
        if i == nFiles-1: 
            for word in broda_words[incr*i:]:
                f.write(word)
                f.write("\n")
        else:
            for word in broda_words[incr*i:incr*(i+1)]:
                f.write(word)
                f.write("\n")

Number of words: 5067
Number of words: 3687


In [38]:
# Re-save rest of Broda words, but add 50+ score filter

# Get all filtered words so far.
files = glob.glob('./filtered_data/broda/filtered_*.txt', recursive = True)
all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

all_words = [depunctuate(word).upper() for word in all_words]
all_words = [word for word in all_words if len(word)==N]
all_words = set(all_words)
print("Number of filtered words: %d" %len(all_words))


# Get remaining Broda words.
broda_list = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    score = int(score)
    # Most of the smaller words are junk. Would rather just manually add my own.
    if (len(word) >= 6 and (word not in all_words) and score >= 50): broda_list.append(word)
        
# Get rid of duplicates.
temp = [depunctuate(word).upper() for word in broda_list]
temp = list(set(broda_list))
broda_list = temp
print(len(broda_list))
        
broda_list = sorted(broda_list, key=len)
incr = 1000
nFiles = len(broda_words) // incr + 1
for i in range(len(broda_list) // incr):
    with open('./prefilter_data/broda_%d.txt' %i, 'w') as f:
        if i == nFiles-1: 
            for word in broda_words[incr*i:]:
                f.write(word)
                f.write("\n")
        else:
            for word in broda_words[incr*i:incr*(i+1)]:
                f.write(word)
                f.write("\n")

Number of filtered words: 1278
416104
