In [1]:
import pandas as pd
import numpy as np
import itertools

from string import ascii_lowercase
from collections import defaultdict

In [2]:
WORD_LEN = 5
LETTERS = set(ascii_lowercase)

In [3]:
def load_common_words():
    df = pd.read_csv('unigrams5_freq.csv')
    df = df[['word', 'count']]
    return df

def load_words():
    five_letter_words = []

    with open('words.txt') as fr:
        for line in fr.readlines():
            word = line.strip().lower()
            intersect = set(word).intersection(LETTERS)
            if len(word) == 5 and len(intersect) == len(set(word)):
                five_letter_words.append(word)

    df = pd.DataFrame(five_letter_words, columns=['word'])
    return df


def load_wordle_words():
    with open('wordle_words.txt') as fr:
        words = [line.strip() for line in fr.readlines()]

    df = pd.DataFrame(words, columns=['word'])
    return df

In [4]:
df = load_wordle_words()
common_df = load_common_words()
wordle_entropy_df = pd.read_csv('wordle_words_entropy.csv')
df = pd.merge(df, common_df, how='left', on='word').fillna(0)
df = pd.merge(df, wordle_entropy_df, how='inner', on='word')
df.drop_duplicates(inplace=True)

In [5]:
df['disqualified'] = 0

In [6]:
df.head()

Unnamed: 0,word,count,entropy,disqualified
0,aback,0.0,3.5159,0
1,abase,0.0,4.5873,0
2,abate,0.0,4.6336,0
3,abbey,4224864.0,3.9329,0
4,abbot,0.0,3.988,0


## Play Game

In [7]:
class Wordled:
    def __init__(self, df):
        self.disqualified = set([])
        self.pinned = [None] * WORD_LEN
        self.unpinned = [set([]) for _ in range(WORD_LEN)]
        self.guesses = []
        self.must_have = set([])
        self.df = df
        self.patterns = list(itertools.product([-1, 0, 1], repeat=5))
        self.calculate_score()
        
    @staticmethod
    def sigmoid(n):
        return 1 / (1 + np.exp(-n))
        
    @staticmethod
    def pattern_diff(w1, w2):
        pattern = [None] * 5
        remaining_count = 5
        remaining = defaultdict(int)

        for l in w2:
            remaining[l] += 1

        for i, l in enumerate(w1):
            if w1[i] == w2[i]:
                pattern[i] = 1
                remaining[w1[i]] -= 1
                remaining_count -= 1

        if remaining_count > 0:
            for i, l in enumerate(w1):
                if w1[i] != w2[i]:
                    if remaining[w1[i]] > 0:
                        pattern[i] = 0
                        remaining[w1[i]] -= 1
                    else:
                        pattern[i] = -1

        return tuple(pattern)
    
    @staticmethod
    def entropy(probs):
        return round(sum([p * np.log2(1 / p) for p in probs if p > 0]), 4)

    def get_word_bits(self, word):
        bits = []
        for i, l in enumerate(word):
            p = self.letter_freq_with_pos[i][l] / self.letter_freq_with_pos[i]['total']
            bits.append(p * np.log2(1 / p))

        return round(sum(bits), 4)
        
    def calculate_letter_frequency(self):
        letter_freq_with_pos = defaultdict(lambda: defaultdict(int))

        for word in self.df.word:
            for i, l in enumerate(word):
                letter_freq_with_pos[i][l] += 1
                letter_freq_with_pos[i]['total'] += 1
        
        return letter_freq_with_pos
    
    def calculate_entropy(self):
        words_with_pattern_probs = defaultdict(lambda: defaultdict(int))

        for w1 in self.df.word:
            for w2 in self.df.word:
                pattern = Wordled.pattern_diff(w1, w2)
                words_with_pattern_probs[w1][pattern] += 1

        num_words = self.df.shape[0]

        for word, pattern_dict in words_with_pattern_probs.items():
            for pattern, count in pattern_dict.items():
                words_with_pattern_probs[word][pattern] = round(count / num_words, 6)

        entropies = [Wordled.entropy(patts.values()) for patts in words_with_pattern_probs.values()]
            
        return entropies

            
    def calculate_score(self):
        if self.df['count'].sum() > 0:
            self.df['pop_score'] = (round(self.df['count'] / self.df['count'].sum(), 6)) * 100
            self.df['pop_score'] = self.df.pop_score.apply(Wordled.sigmoid)
        
        self.letter_freq_with_pos = self.calculate_letter_frequency()
        self.df.loc[:, 'bits'] = self.df.word.apply(self.get_word_bits)
        score = 1 * self.df['entropy'] + 1 * self.df['pop_score'] + 1 * self.df['bits']
        self.df.loc[:, 'score'] = round(score, 4)


        
    def disqualify(self, word):
        for idx in range(WORD_LEN):
            if self.pinned[idx] and word[idx] != self.pinned[idx]:
                return 1
            if word[idx] in self.disqualified:
                return 1
            if word[idx] in self.unpinned[idx]:
                return 1

        return len(self.must_have.intersection(word)) != len(self.must_have)
    
    
    def record_guess(self, word, feedback):
        pins = [(i, l) for i, l in enumerate(word) if feedback[i] == 1]
        unpins = [(i, l) for i, l in enumerate(word) if feedback[i] == 0]
        present_letters = set(self.pinned + [l for i, l in pins] + [l for i, l in unpins])
        disqualified = [
            word[i] for i in range(WORD_LEN)
            if feedback[i] == -1
            and word[i] not in present_letters
        ]
        self.disqualified.update(set(disqualified))
                
        for idx, pin in pins:
            self.pinned[idx] = pin
        
        for idx, unpin in unpins:
            self.unpinned[idx].add(unpin)
            self.must_have.add(unpin)
            
        self.guesses.append(word)
        self.df.loc[:, 'disqualified'] = self.df.word.apply(self.disqualify)
        self.df = self.df.loc[self.df.disqualified == 0, :]
        self.df.loc[:, 'entropy'] = self.calculate_entropy()
        self.calculate_score()
        
    def guess(self, common_only=False, score_col='score'):
        if common_only:
            results = self.df[self.df.pop_score > 0] \
                .sort_values(by=score_col, ascending=False)
        else:
            results = self.df.sort_values(by=score_col, ascending=False)
        
        if len(results) > 0:
            return results.iloc[0].word
        
        return None
    
    def show_all_candidates(self, common_only=False, score_col='score'):
        if common_only:
            return self.df[self.df.pop_score > 0].sort_values(by=score_col, ascending=False)

        return self.df.sort_values(by=score_col, ascending=False)

In [113]:
game = Wordled(df.copy())
word = game.guess()
print(word)

slate


In [109]:
game.record_guess(word=word, feedback=[-1, -1, 0, -1, -1])
word = game.guess()
print(word)

manor


In [112]:
game.record_guess(word=word, feedback=[0, 1, -1, -1, 0])
word = game.guess()
print(word)

karma


In [106]:
game.record_guess(word=word, feedback=[-1, -1, -1, 1, 1])
word = game.guess()
print(word)

wheel


In [96]:
game.record_guess(word=word, feedback=[-1, -1, -1, 1, 1])
word = game.guess()
print(word)

excel


In [98]:
game.record_guess(word=word, feedback=[-1, -1, -1, 1, 1])
word = game.guess()
print(word)

model


In [111]:
game.show_all_candidates() # .head(10)

Unnamed: 0,word,count,entropy,disqualified,pop_score,bits,score
1198,manor,7084688.0,5.1389,False,0.598808,2.0900,7.8277
337,carry,33056477.0,4.5444,False,0.866331,2.0596,7.4703
331,canon,41144763.0,4.4765,False,0.911024,2.0616,7.4491
152,baron,3504549.0,4.9782,False,0.549364,1.9158,7.4434
324,cairn,0.0,5.0512,False,0.500000,1.8864,7.4376
...,...,...,...,...,...,...,...
1071,kayak,2628993.0,3.0667,False,0.537082,1.2422,4.8460
1069,kappa,2793667.0,2.8544,False,0.539393,1.2997,4.6935
1424,pizza,14763787.0,2.3357,False,0.697348,1.0264,4.0594
1050,jazzy,0.0,2.1907,False,0.500000,1.2107,3.9014
