In [1]:
import requests, collections, string
import numpy as np
import plotly.graph_objects as go

# Import Answer Spaces
Import the original Wordle answer space, the NYT Wordle answer space, and an answer space of 5-letter English words.

In [2]:
# URLs to pull from
WORDLE_ANSWERS_URL = 'https://gist.githubusercontent.com/cfreshman/a03ef2cba789d8cf00c08f767e0fad7b/raw/28804271b5a226628d36ee831b0e36adef9cf449/wordle-answers-alphabetical.txt'
NYT_WORDS_URL = 'https://gist.githubusercontent.com/cfreshman/a7b776506c73284511034e63af1017ee/raw/845966807347a7b857d53294525263408be967ce/wordle-nyt-answers-alphabetical.txt'

# Seconds to wait on each download before failing instead of hanging the kernel
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_word_list(url):
    """Download a newline-delimited word list and return its non-blank entries.

    Args:
        url: Address of a plain-text file with one word per line.

    Returns:
        A list of the words in file order, with surrounding whitespace removed
        and blank lines dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never parsed as if it were a word list.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    # splitlines() handles '\r\n' endings and a trailing newline; split('\n')
    # would turn those into junk entries such as 'word\r' or ''.
    stripped_lines = (line.strip() for line in response.text.splitlines())
    return [word for word in stripped_lines if word]


# Instantiate the wordle answer spaces
wordle_answer_space = _fetch_word_list(WORDLE_ANSWERS_URL)
nyt_answer_space = _fetch_word_list(NYT_WORDS_URL)

# Instantiate an answer space constrained only by word length
english_word_answer_space = set()
data_paths = [
    '../data/wordle-allowed-guesses.txt',
    '../data/words5-from-OSPD4',
    '../data/words5-from-sgb'
]
for path in data_paths:
    with open(path, 'r') as f:
        for line in f:
            word = line.strip()
            # Skip blank lines so the empty string never becomes a "word"
            if word:
                english_word_answer_space.add(word)

# Dictionary with all answer spaces (display name -> collection of words)
answer_spaces = {
    'Wordle' : wordle_answer_space,
    'NYT'    : nyt_answer_space,
    'English': english_word_answer_space
}

# How is the Wordle answer space constrained?

# Overall Letter Occurrence Probabilities
Assuming a random (uniform) draw from an answer space, what is the probability of seeing a letter in the randomly drawn word?

In [10]:
# Build one bar trace per answer space showing how often each letter occurs,
# normalised so that the 26 values of a trace sum to 1.
probs, bar_gos, sorted_order = [], [], None
for space_name, words in answer_spaces.items():
    # Get letter counts (every occurrence counts, including repeats within a word)
    counter = collections.Counter()
    for word in words:
        counter.update(word)

    # Normalize into a probability distribution.
    # Counter returns 0 for letters it never saw, so no membership test is needed.
    letter_counts = np.array([counter[c] for c in string.ascii_lowercase])
    probs.append(letter_counts / letter_counts.sum())

    # Arbitrary sorted order: letters are ranked by the first answer space and
    # that ranking is reused for the others so the bars line up across traces.
    if sorted_order is None:
        sorted_order = np.argsort(-letter_counts)

    # Plot
    bar_gos.append(
        go.Bar(
            x=[string.ascii_lowercase[idx] for idx in sorted_order],
            y=[probs[-1][idx] for idx in sorted_order],
            name=space_name
        )
    )

# Plot
fig = go.Figure(bar_gos)
fig.update_layout(
    title='Probability of Letter in Word Given Random Draw from Corpus',
    xaxis_title='Letter',
    yaxis_title='Probability'
)
fig.show()

# MLE of letter probabilities based on position in word

In [25]:
# One grouped bar chart per letter position (five positions; w_pos is 0-based).
for w_pos in range(5):
    probs, bar_gos, sorted_order = [], [], None
    for space_name, words in answer_spaces.items():
        # Get letter counts for this position only. Entries too short to have
        # this position (e.g. a blank entry left behind by a trailing newline in
        # a word list) are skipped instead of raising IndexError.
        counter = collections.Counter(
            word[w_pos] for word in words if len(word) > w_pos
        )

        # Normalize into a probability distribution.
        # Counter returns 0 for letters it never saw, so no membership test is needed.
        letter_counts = np.array([counter[c] for c in string.ascii_lowercase])
        probs.append(letter_counts / letter_counts.sum())

        # Arbitrary sorted order: letters are ranked by the first answer space
        # and that ranking is reused so the bars line up across traces.
        if sorted_order is None:
            sorted_order = np.argsort(-letter_counts)

        # Plot
        bar_gos.append(
            go.Bar(
                x=[string.ascii_lowercase[idx] for idx in sorted_order],
                y=[probs[-1][idx] for idx in sorted_order],
                name=space_name
            )
        )

    # Plot (titles show the position 1-based)
    fig = go.Figure(bar_gos)
    fig.update_layout(
        title=r'$P(letter|position='+ str(w_pos+1) + r')\textit{ Given Random Draw from Corpus}$',
        xaxis_title='$Letter$',
        yaxis_title='$P(letter|position)$'
    )
    fig.show()

The plot of $P(letter|position=5)$ leads to an interesting observation. Because the Wordle answer corpus is constrained to exclude plural nouns, there is a very low probability of observing the letter "s" in the last position. The corpus of 5-letter English words is not constrained in this way, so plurals have a pronounced effect on $P(letter|position=5)$ for the English word corpus.