In [1]:
%pip install \
    nltk \
    matplotlib \
    --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
from typing import List
from pathlib import Path
from collections import Counter
import urllib
import re

import nltk
from nltk import pos_tag, word_tokenize

In [3]:
# Download from nltk to perform tokenizing and pos tagging
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('gutenberg', quiet=True)
nltk.download('webtext', quiet=True)
nltk.download('brown', quiet=True)

True

In [4]:
if (pth := Path().home() / Path('english-words/words_alpha.txt')).exists():
    with open(pth) as f:
        english_words = f.read()
else:
    URL = 'https://raw.githubusercontent.com/mbkupfer/english-words/master/words_alpha.txt'
    resp = urllib.request.urlopen(URL)
    english_words = resp.read().decode('utf-8')


## Build out proper Wordle vocabulary

Constraints:
- only 5 letters
- no proper nouns
- nothing offensive (more on this one later)

In [5]:
valid_vocab = []
for word, pos in pos_tag(word_tokenize(english_words)):
    if len(word) == 5 and pos != 'NNP':
        valid_vocab.append(word)

In [6]:
print(f'Total words: {len(english_words):,}')
valid_vocab_word_count = len(valid_vocab)
print(f'Total valid wordle words: {valid_vocab_word_count:,}')

Total words: 3,864,812
Total valid wordle words: 15,842


In [7]:
def ngrams_freq_dist(vocab: List[str], n: int) -> nltk.FreqDist:
    fdist = nltk.FreqDist()
    for word in vocab:
        for ngram in nltk.ngrams(word, n=n):
            fdist[''.join(ngram)] += 1
    return fdist

In [16]:
print('''
Top 5 ngrams
============
''')
for n in range(1, 5):
    print(f'{n}-gram\n-----------')
    fdist = ngrams_freq_dist(valid_vocab, n=n)
    cnt = fdist.total()
    print(f'Total {n}-grams: {cnt:,}')
    print('Top 5:')
    top_5 = fdist.most_common(5)
    for gram, gram_cnt in top_5:
        print(f'{gram}\t{gram_cnt:,}\t{gram_cnt / cnt:.1%}')


Top 5 ngrams

1-gram
-----------
Total 1-grams: 79,210
Top 5:
a	8,343	10.5%
e	7,775	9.8%
s	6,524	8.2%
o	5,186	6.5%
r	5,130	6.5%
2-gram
-----------
Total 2-grams: 63,368
Top 5:
er	911	1.4%
ar	889	1.4%
an	831	1.3%
es	768	1.2%
re	760	1.2%
3-gram
-----------
Total 3-grams: 47,526
Top 5:
ing	89	0.2%
ave	88	0.2%
ine	88	0.2%
res	87	0.2%
are	86	0.2%
4-gram
-----------
Total 4-grams: 31,684
Top 5:
aver	17	0.1%
ills	17	0.1%
ears	14	0.0%
acks	13	0.0%
ails	13	0.0%


In [9]:
words_with_no_top_5_letter = [w for w in valid_vocab if not set(w) & set(['a', 'e', 's', 'o', 'r'])]
print(f'Total words that do not have any of the top 5 letters: {len(words_with_no_top_5_letter)}')
print(f'First 25:\n')
print(words_with_no_top_5_letter[:25])

Total words that do not have any of the top 5 letters: 785
First 25:

['bhili', 'bibby', 'bichy', 'biddy', 'biffy', 'bifid', 'biggy', 'bight', 'bigly', 'bilbi', 'bilby', 'bilch', 'bilgy', 'bilic', 'bilin', 'billy', 'bindi', 'bingy', 'bynin', 'binit', 'binny', 'bitch', 'bitty', 'bivvy', 'bixin']


Not many common words in there, and in fact some of the words could be considered offensive given the context.

Instead, let's use a common corpus to quickly rule out valid, but likely rare words.

In [10]:
from nltk.corpus import brown

In [11]:
def only_words_from_corpus(vocab, corpus_words):
    return set(vocab) & set(corpus_words) 

In [12]:
brown_words_with_no_top_5 = only_words_from_corpus(words_with_no_top_5_letter, brown.words(categories='news'))

In [13]:
len(brown_words_with_no_top_5)

45

In [14]:
brown_words_with_no_top_5

{'blind',
 'blunt',
 'buddy',
 'build',
 'built',
 'bulky',
 'child',
 'civil',
 'cubic',
 'dying',
 'fifth',
 'fifty',
 'fight',
 'filly',
 'filmy',
 'fluid',
 'flung',
 'fully',
 'guilt',
 'gully',
 'hitch',
 'hubby',
 'imply',
 'jumpy',
 'light',
 'limit',
 'lucky',
 'lunch',
 'lying',
 'might',
 'night',
 'ninth',
 'pitch',
 'quick',
 'thigh',
 'thing',
 'think',
 'tight',
 'tying',
 'unify',
 'unity',
 'until',
 'vivid',
 'which',
 'windy'}

Let's create a more generalized function to get hards words by eliminating all words that have a common ngram.

In [15]:
def get_hard_words(vocab, ngram_cutoffs):
    """Only get words that don't have a top ngram

    vocab: List[str]
        Valid words to pick from
    ngram_cutoffs: List[int]
        A list of numbers that correspond to the most common cutoffs for each ngram. The first element
        corresponds to 1-grams, second is the 2-grams cutoff, etc.
    """
    vocab = set(vocab) & set(brown.words(categories='news'))
    for n, cutoff in enumerate(ngram_cutoffs, start=1):
        fdist = ngrams_freq_dist(vocab, n)
        top = [w for w, _ in fdist.most_common(cutoff)]
        exclude = [w for w in vocab if set(''.join(i) for i in nltk.ngrams(w, n=n)) & set(top)]
        vocab = vocab - set(exclude) 
    return vocab

get_hard_words(valid_vocab, [5,5,5])

{'buddy',
 'bulky',
 'combo',
 'comic',
 'coyly',
 'cubic',
 'filmy',
 'fluid',
 'hubby',
 'imply',
 'jumpy',
 'onion',
 'quick',
 'which',
 'widow'}