In [26]:
import itertools
from collections import defaultdict

import pandas as pd
import numpy as np
import csv

In [18]:
def load_wordle_words(path='wordle_words.txt'):
    """Read a one-word-per-line text file into a single-column DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'wordle_words.txt'
        Location of the word list. The default keeps the original
        zero-argument call working unchanged.

    Returns
    -------
    pandas.DataFrame
        One column named ``word`` holding the words in file order, with
        surrounding whitespace removed and blank lines skipped.
    """
    with open(path, encoding='utf-8') as fr:
        # Iterate the handle directly (no intermediate readlines() list) and
        # drop lines that are empty after stripping: an empty "word" would
        # make the pairwise comparison in later cells fail on indexing.
        stripped_lines = (line.strip() for line in fr)
        words = [word for word in stripped_lines if word]

    return pd.DataFrame(words, columns=['word'])

In [19]:
# Load the word list once; later cells read it via `df.word` and `df.shape`.
df = load_wordle_words()

In [20]:
# Every possible 5-position feedback tuple over the values {-1, 0, 1}
# (3**5 = 243 tuples). Not referenced by the other cells visible here.
patterns = [*itertools.product((-1, 0, 1), repeat=5)]

# Nested table: word -> {feedback pattern -> number}. Missing entries at
# either level are created on first access and start at 0.
words_with_pattern_probs = defaultdict(lambda: defaultdict(int))

In [21]:
def pattern_diff(w1, w2):
    """Return the per-letter feedback for guess ``w1`` against answer ``w2``.

    Each position of ``w1`` is scored as:

    *  ``1`` - same letter at the same position in ``w2``;
    *  ``0`` - letter occurs in ``w2`` at a position not already accounted
       for by another scored letter;
    * ``-1`` - no unaccounted occurrence of the letter remains in ``w2``.

    Exact matches are resolved first so that they take priority over
    misplaced matches when a letter is repeated.

    Parameters
    ----------
    w1 : str
        The guessed word.
    w2 : str
        The word being guessed. Must have the same length as ``w1``.

    Returns
    -------
    tuple of int
        One value from ``{-1, 0, 1}`` per letter of ``w1``.

    Raises
    ------
    ValueError
        If the two words differ in length.
    """
    if len(w1) != len(w2):
        raise ValueError(
            f'words must have the same length, got {len(w1)} and {len(w2)}'
        )

    # How many copies of each answer letter are still available to match.
    remaining = defaultdict(int)
    for letter in w2:
        remaining[letter] += 1

    # Start from "absent" everywhere; the two passes below upgrade positions.
    pattern = [-1] * len(w1)

    # Pass 1: exact-position matches consume their letter first.
    for i, (guess_letter, answer_letter) in enumerate(zip(w1, w2)):
        if guess_letter == answer_letter:
            pattern[i] = 1
            remaining[guess_letter] -= 1

    # Pass 2: left to right, a non-exact letter is "misplaced" only while
    # unconsumed copies of it remain in the answer.
    for i, guess_letter in enumerate(w1):
        if pattern[i] != 1 and remaining[guess_letter] > 0:
            pattern[i] = 0
            remaining[guess_letter] -= 1

    return tuple(pattern)

In [22]:
%%time

# Start from an empty table so that re-running this cell does not add to
# counts (or to already-normalised values) left over from an earlier run.
words_with_pattern_probs.clear()

# For every guess w1, tally how many candidate answers w2 yield each
# feedback pattern.
for w1 in df.word:
    # Look up the per-guess row once instead of on every inner iteration.
    pattern_counts = words_with_pattern_probs[w1]
    for w2 in df.word:
        pattern = pattern_diff(w1, w2)
        pattern_counts[pattern] += 1

CPU times: user 18.5 s, sys: 45.1 ms, total: 18.5 s
Wall time: 18.5 s


In [23]:
# Size of the word list; each count is divided by this to give a probability.
num_words = len(df.index)

# Replace every raw count with its probability, rounded to 6 decimal places.
# NOTE: the counts are overwritten in place, so running this cell again
# without rebuilding the table would divide the values by num_words a
# second time.
for word, pattern_dict in words_with_pattern_probs.items():
    for pattern, count in pattern_dict.items():
        pattern_dict[pattern] = round(count / num_words, 6)

# One [word, entropy] row per word, where entropy is the sum of
# p * log2(1 / p) over that word's pattern probabilities, rounded to
# 4 decimal places.
word_entropy = []

for word, pattern_dict in words_with_pattern_probs.items():
    # Skip zero probabilities so that 1 / p is always defined.
    positive_probs = [p for p in pattern_dict.values() if p > 0]
    bits = sum(p * np.log2(1 / p) for p in positive_probs)
    word_entropy.append([word, round(bits, 4)])

In [27]:
# Header row for the output file.
fields = ['word', 'entropy']

# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with an extra
# '\r' per row, which shows up as blank lines between records.
with open('wordle_words_entropy.csv', 'w', newline='') as fw:
    writer = csv.writer(fw)
    writer.writerow(fields)
    # word_entropy is a list of [word, entropy] rows built in the cell above.
    writer.writerows(word_entropy)