In [1]:
import numpy as np
import glob
import sys
import time

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/tigress/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tigress/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tigress/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tigress/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/tigress/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [11]:
def unpunctuate(word, unwanted_chars):
    '''
    Return <word> with every occurrence of each entry of <unwanted_chars>
    stripped out.

    <unwanted_chars> is an iterable of strings, e.g. ['_', ',', ...]; the
    entries are removed one after another, in iteration order.
    '''
    cleaned = word
    for unwanted in unwanted_chars:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def unpunctuate2(word, unwanted_chars):
    '''
    Return <word> without any character that is a member of <unwanted_chars>.

    <unwanted_chars> may be a string or a list; each character of <word> is
    tested for membership individually.
    '''
    kept = [ch for ch in word if ch not in unwanted_chars]
    return ''.join(kept)

def isAllDigits(word):
    '''
    Return True when every character of <word> is one of the ASCII digits
    0-9, and False as soon as any other character is present.

    An empty <word> has no offending character, so it yields True.
    '''
    return all(ch in "0123456789" for ch in word)

def isNamedEntity(text):
    '''
    https://stackoverflow.com/a/19495220

    Heuristic named-entity check built on NLTK's POS tagger and NE chunker.
    Intended to catch people's names and organizations; per the author's
    observations it also catches Latin scientific names and, unfortunately,
    a lot of decent words (for example "color_TV").

    Returns True when the number of named-entity chunks found in <text>
    equals the number of underscore-separated parts of <text>.
    '''
    # Tokenize on spaces, tag parts of speech, then chunk named entities.
    tokens = SpaceTokenizer().tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    chunk_tree = nltk.ne_chunk(tagged_tokens)

    # Named entities appear as subtrees; plain tokens stay as tuples.
    entities = []
    for node in chunk_tree:
        if isinstance(node, nltk.tree.Tree):
            entities.append(' '.join(leaf[0] for leaf in node.leaves()))

    expected_parts = len(text.split('_'))
    return len(entities) == expected_parts

def isProperNoun(word):
    '''
    Return True when NLTK's POS tagger, given <word> on its own as a
    one-token sentence, labels it 'NNP' (singular proper noun).
    '''
    first_tag = nltk.pos_tag([word])[0][1]
    return first_tag == 'NNP'

# Parsing WordNet

In [33]:
# Characters to be "x-ed" (stripped) out of every lemma.
# The backslash is escaped explicitly: the bare "\-" used before is an invalid
# escape sequence, which Python flags with a warning. The resulting string is
# unchanged (it still contains a backslash followed by a hyphen).
xchars = "_'.,/;[]\\-=`~_+{}|:<>?"
xchars_list = list(xchars)

In [30]:
# https://stackoverflow.com/a/33538878
# Collect every distinct lemma name across all WordNet synsets.
# A set comprehension replaces the original chain(*[...]) call: `chain` is not
# imported by any cell visible in this notebook, so that call raised NameError
# on a fresh kernel. The comprehension also avoids building one list per synset.
lemmas_in_wordnet = {
    name
    for ss in wn.all_synsets()
    for name in ss.lemma_names()
}
nLemmas = len(lemmas_in_wordnet)

In [104]:
# Detect proper nouns; output to file; manually filter.
# Every lemma that isProperNoun() flags becomes a candidate for removal.
namedEntities = [lemma for lemma in lemmas_in_wordnet if isProperNoun(lemma)]
print(len(namedEntities))

# Output to file, one candidate per line.
with open('./wordlists/wordnet_properNouns.txt', 'w') as f:
    f.writelines(word + "\n" for word in namedEntities)

## Break: Go to the file that was just saved and delete the *good* words! Save it as `wordnet_unwanted.txt`.

2876


In [111]:
## Run after saving `wordnet_unwanted.txt`.
# Read the manually curated list of unwanted lemmas, one per line.
# The `with` block closes the file even if reading fails.
with open('./wordlists/wordnet_unwanted.txt', "r") as f:
    lines = f.read().splitlines()

# difference_update() ignores entries that are not in the set, so re-running
# this cell (or a stray/blank line in the file) no longer raises KeyError the
# way set.remove() did.
lemmas_in_wordnet.difference_update(lines)

In [123]:
# Run a much coarser comb for now. Just remove everything isNamedEntity() flags.
# Unfortunately, this removes a bunch of decent words, while still missing a bunch of junk words and phrases.
# The matches are gathered into a list first so the set is not modified while
# it is being iterated.
namedEntities = [lemma for lemma in lemmas_in_wordnet if isNamedEntity(lemma)]
lemmas_in_wordnet.difference_update(namedEntities)

['Pectinibranchia', 'Orpington', 'Babar', 'Enid', 'Mercurochrome', 'Grossulariaceae', 'Woodward', 'Cariama', 'Neanderthalian', 'Trema', 'Serranus', 'Claudius', 'Thalictrum', 'IJsselmeer', 'Leukeran', 'Grace', 'Enceliopsis', 'Platalea', 'Ceroxylon', 'Juncus', 'Tilia', 'Stalin', 'Cu', 'Noel', 'Atsugewi', 'Tarheel', 'Berne', 'Nurnberg', 'Oldfield', 'Martian', 'Massine', 'Mantinea', 'Foucault', 'Strindberg', 'Lentibulariaceae', 'Uppsala', 'XVI', 'Indonesia', 'Cebuan', 'GCA', 'Texas', 'Shreveport', 'Arctonyx', 'Burk', 'Tussaud', 'SWbW', 'Czech', 'Thatch', 'Bradypodidae', 'Iroquoian', 'Susian', 'Picea', 'Martyniaceae', 'Latinise', 'Grenada', 'Brahui', 'Comptonia', 'Nash', 'Gonorhynchidae', 'Novgorod', 'Faust', 'Racine', 'Christiania', 'Malopterurus', 'Metazoa', 'Neruda', 'Algerian', 'Sax', 'Cercocebus', 'Carter', 'Mississippi', 'Almaty', 'Hefa', 'NEbE', 'Palaeolithic', 'Waterbury', 'Sihasapa', 'Arcturus', 'Bohemia', 'MHz', 'Elanus', 'Azactam', 'Pholiota', 'Artemis', 'Buck', 'MMPI', 'Nymphali

In [126]:
# Get rid of unwanted chars and upper-case every lemma.
# Building a set collapses lemmas that become identical once the punctuation
# has been removed.
unique_cleaned = {unpunctuate2(lemma, xchars).upper() for lemma in lemmas_in_wordnet}

# Get rid of numbers (entries for which isAllDigits() is True).
cleaned_lemmas = [word for word in unique_cleaned if not isAllDigits(word)]

# TODO: Get rid of all scientific names, acronyms.

In [129]:
# Inspect the words: total count, followed by the first 11 entries.
print(len(cleaned_lemmas))
for word in cleaned_lemmas[:11]:
    print(word)

132067
HELIANTHUSLAETIFLORUS
PROCTOR
LONGISLANDSOUND
COMFORTSTATION
PROPRIETARYCOLONY
HARVARDUNIVERSITY
RUBBERNECKING
DICYNODONT
EGG
PARAMNESIA
FITCH


In [130]:
# Get rid of words less than 3 chars long, then report the length range and
# how many words remain.
lemmas = [word for word in cleaned_lemmas if len(word) >= 3]
wordLengths = list(map(len, lemmas))
minLength, maxLength = min(wordLengths), max(wordLengths)
print(minLength, maxLength)
print(len(lemmas))

3 64
131808


In [131]:
# Write one file per word length; lengths with no words produce no file.
for n in range(minLength, maxLength+1):
    words = [word for word in lemmas if len(word) == n]
    if not words:
        continue

    # Every word is followed by a single space, including the last one.
    with open('./wordlists/wordnet_%d.txt' % n, 'w') as f:
        f.write(''.join(word + " " for word in words))

# Broda's list

In [23]:
# Load Broda's scored wordlist; each line has the form "WORD;SCORE".
# The `with` block closes the file even if reading fails.
with open('./raw_data/peter-broda-wordlist__scored.txt', "r") as f:
    lines = f.read().splitlines()

print(len(lines))

broda_list = []
for line in lines:
    # Skip blank lines, which cannot be unpacked into (word, score).
    if not line.strip(): continue
    (word, score) = line.split(";")
    score = int(score)
    # Keep well-scored words, plus every word shorter than 5 letters regardless
    # of score. Author's impression: many short words get a poor score
    # automatically, and the short words that do score better are often the
    # worse ones (rarer, but really just bad abbreviations etc.), so the score
    # is not trusted for short words.
    # (An unfinished `if (len(word) < )` line that made this cell a SyntaxError
    # has been removed.)
    if (score >= 51) or (len(word) < 5): broda_list.append(word)

print(len(broda_list))
broda_set = set(broda_list)
print(len(broda_set))

527372
179349
179280


In [24]:
# Get rid of proper nouns. isProperNoun() seems to work well enough.
named = []
for word in broda_set:
    if isProperNoun(word): named.append(word)
        
print(len(named))

3790


In [19]:
# Drop the words collected in `named` from the Broda set.
# difference_update() tolerates words that are already gone, so this cell can
# be re-run without the KeyError that set.remove() raised on the second run.
broda_set.difference_update(named)
print(len(broda_set))

162378


In [25]:
# Get rid of words less than 3 chars long, then report the length range.
broda_list = [word for word in broda_set if len(word) >= 3]
wordLengths = list(map(len, broda_list))
minLength, maxLength = min(wordLengths), max(wordLengths)
print(minLength, maxLength)

3 54


In [26]:
# Save into files, organized by word length; lengths with no words produce
# no file.
for n in range(minLength, maxLength+1):
    words = [word for word in broda_list if len(word) == n]
    if not words:
        continue

    # Every word is followed by a single space, including the last one.
    with open('./wordlists/broda_%d.txt' % n, 'w') as f:
        f.write(''.join(word + " " for word in words))

# Combine wordlists