In [6]:
import numpy as np
import glob
import sys
import time
from itertools import chain

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/tigress/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tigress/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tigress/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/tigress/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [34]:
# A bunch of utility functions.

def depunctuate(word, unwanted_chars="_'.,/;[]\-=`~_+{}|:<>?()@#$%^&* "):
    '''
    Remove all instances of each char in <unwanted_chars> from <word>.
    <unwanted_chars> can be either a string or a list.
    '''
    
    newWord = ''.join(i for i in word if not i in unwanted_chars)
    return newWord

def isAllDigits(word):
    '''
    Return True if the word consists solely of numbers.
    '''
    for letter in word:
        if letter not in "0123456789":
            return False
    return True

# 1. Combine words from a bunch of datasets.

In [21]:
unfiltered = []

## WordNet

In [22]:
# https://stackoverflow.com/a/33538878
lemmas_in_wn = set(chain(*[ss.lemma_names() for ss in wn.all_synsets()]))
nLemmas = len(lemmas_in_wn)

# Get rid of numbers.
temp = []
for lemma in lemmas_in_wn:
    if not isAllDigits(lemma): temp.append(lemma)
lemmas_in_wn = temp

# Get rid of words less than 3 chars long.
lemmas = [word for word in lemmas_in_wn if len(word) > 2]
wordLengths = [len(word) for word in lemmas]
minLength = min(wordLengths)
maxLength = max(wordLengths)
print(len(lemmas))

unfiltered += lemmas

148140


In [26]:
# Save just the WordNet words into 'scripts/prefilter_data/_.txt'
incr = 1000
for i in range(len(lemmas) // incr):
    with open('./prefilter_data/wordnet_%d.txt' %i, 'w') as f:
        for word in lemmas[incr*i:incr*(i+1)]:
            f.write(word)
            f.write("\n")

## Broda

In [23]:
broda_list = []
f = open('./raw_data/peter-broda-wordlist__scored.txt',"r")
lines = f.read().splitlines() 
f.close()

for line in lines:
    (word, score) = line.split(";")
    # Most of the smaller words are junk. Would rather just manually add my own.
    if (len(word) >= 5): broda_list.append(word)
        
unfiltered += broda_list

In [24]:
# Get rid of duplicates.
temp = [depunctuate(word).upper() for word in unfiltered]
temp = set(unfiltered)
unfiltered = temp
print(len(unfiltered))

657826


In [None]:
# Output original words to a bunch of files containing 1000 words each, 
# each titled "scripts/raw_data/words_%n.txt".

In [39]:
# Save de-punctuated, capitalized words from the filtered lists (the files I manually compiled)
# into files sorted by category and wordlength (in that order of hierarchy) 
# under "data" (this is supposed to be the final version.)

files = glob.glob('./filtered_data/*/*.txt', recursive = True)

all_words = []
for file in files:
    f = open(file, 'r')
    lines = f.read().splitlines() 
    f.close()
    all_words += lines

print("Number of words: %d" %len(all_words))

all_words = [depunctuate(word).upper() for word in all_words]
all_words = set(all_words)
print("Number of words: %d" %len(all_words))

# No categories for now. Just sort by wordlength.
wordLengths = [len(word) for word in all_words]
minLength = min(wordLengths)
maxLength = max(wordLengths)
print("min length: %d\t max length: %d" %(minLength,maxLength))

for i in range(minLength, maxLength+1):
    filename = "../data/words_%d.txt" %i
    words = [word for word in all_words if len(word)==i]
    
    with open(filename, 'w') as f:
        for word in words:
            f.write(word)
            f.write(" ")

Number of words: 4272
Number of words: 4138
min length: 3	 max length: 25
