# Wordle

### Strategy 1
* Find the most common letters in words with N letters
* Find words that are made up only of the most common letters and contain no repeated letters
* Score these words and submit the top result as a guess
* Filter or re-score based on feedback and continue with next guess
#### Notes
* Build backtesting harness
* Test an ML approach against the standard algorithm above

In [1]:
# Load IPython's autoreload extension
%load_ext autoreload

# Autoreload re-imports changed modules before each cell is executed.
# Available modes:
# 0: disable
# 1: reload modules imported with %aimport
# 2: reload all modules, except those excluded by %aimport
# Mode 2 is selected below.
%autoreload 2

In [2]:
import os
import nltk

from nltk.corpus import words

# Parent of the current working directory, resolved to a canonical path.
_parent_of_cwd = os.path.join(os.path.curdir, '..')
SRC_DIR = os.path.realpath(_parent_of_cwd)

# Add the NLTK data directory kept under SRC_DIR to nltk's search path.
NLTK_DIR = os.path.join(SRC_DIR, 'src/makewords/nltk_data')
nltk.data.path.append(NLTK_DIR)

In [3]:
# Prepare for analysis

import string
from itertools import chain

import numpy as np
import pandas as pd

N = 5  # length of the words being analysed

# Column labels of the counting table:
#   zs -> 'z0' .. 'z<N-1>', one per letter position (0-based index)
#   ns -> 'n1' .. 'n<N-1>', for counting n appearances of a letter per word
zs = [f'z{position}' for position in range(N)]
ns = [f'n{count}' for count in range(1, N)]


def get_clean_df():
    """Return a zero-filled counting table with one row per lowercase letter.

    The frame is indexed by 'letters' ('a' .. 'z'). Its columns are 'total',
    followed by every label in ``zs`` and then every label in ``ns``.
    """
    column_labels = ['total', *zs, *ns]
    letter_index = pd.Index(list(string.ascii_lowercase), name='letters')
    return pd.DataFrame(0, index=letter_index, columns=column_labels)


df = get_clean_df()
df.head(4)

Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,0,0,0,0,0,0,0,0,0,0
b,0,0,0,0,0,0,0,0,0,0
c,0,0,0,0,0,0,0,0,0,0
d,0,0,0,0,0,0,0,0,0,0


In [7]:
# Begin analysis
import makewords.makewords as make
import makewords.score as score

# Collect the candidate words of length N.
# NOTE(review): the recorded output shows makewords cleaning the nltk 'en'
# wordlist -- confirm the exact source and filtering in makewords.
all_words = make.possible_words(length=N)
# Fill the counting table from those words; the returned frame replaces df.
df = score.from_words(df, all_words)
# Preview the first four rows of the scored table.
df.head(4)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,4469,634,1406,872,877,680,3304,547,23,0
b,1162,628,55,261,154,64,989,82,3,0
c,1547,635,153,237,374,148,1368,86,2,0
d,1399,383,69,288,228,431,1231,78,4,0


In [8]:
M = 12  # how many of the most frequent letters to keep

# The M letters with the highest overall count, most frequent first.
# (chars is not used again in this cell.)
totals_desc = df['total'].sort_values(axis='index', ascending=False)
chars = totals_desc.head(M).index.to_list()

# For those same M letters, rank every (letter, register) pair by the count
# stored in the per-position columns (zs), highest count first. Only the
# ranked (letter, register) labels are kept, not the counts themselves.
top_letter_rows = df.sort_values(by='total', ascending=False).head(M)
position_counts = top_letter_rows[zs].stack()
occurrences_by_register = (
    position_counts.sort_values(axis='index', ascending=False).index
)

# Show the ranking, then the full counting table for reference.
print(occurrences_by_register)

print(df)

MultiIndex([('a', 'z1'),
            ('e', 'z4'),
            ('y', 'z4'),
            ('e', 'z3'),
            ('o', 'z1'),
            ('s', 'z0'),
            ('e', 'z1'),
            ('a', 'z3'),
            ('a', 'z2'),
            ('r', 'z2'),
            ('i', 'z1'),
            ('i', 'z3'),
            ('u', 'z1'),
            ('i', 'z2'),
            ('o', 'z2'),
            ('r', 'z1'),
            ('a', 'z4'),
            ('t', 'z4'),
            ('r', 'z4'),
            ('c', 'z0'),
            ('a', 'z0'),
            ('n', 'z2'),
            ('t', 'z0'),
            ('e', 'z2'),
            ('n', 'z4'),
            ('l', 'z4'),
            ('n', 'z3'),
            ('l', 'z1'),
            ('l', 'z2'),
            ('r', 'z3'),
            ('l', 'z3'),
            ('o', 'z3'),
            ('u', 'z2'),
            ('t', 'z3'),
            ('s', 'z3'),
            ('t', 'z2'),
            ('r', 'z0'),
            ('c', 'z3'),
            ('u', 'z3'),
            ('s', 'z4'),


In [9]:

# Build the first guess solely from the ranked (letter, register) pairs:
# walking the pairs from most to least frequent, each register takes the first
# letter offered to it that has not already been placed in another register,
# so the guess never repeats a letter.
guess = {}
for letter, reg in occurrences_by_register:
    if reg in guess or letter in guess.values():
        continue
    guess[reg] = letter

import operator

# Order the registers by their numeric suffix ('z0', 'z1', ...). A plain
# string sort would put 'z10' before 'z2' and scramble the word once N > 10.
out = sorted(guess.items(), key=lambda item: int(item[0][1:]))
res = ''.join(map(operator.itemgetter(1), out))

# The result is assembled position by position and is not checked against the
# word list, so it may not be a real word.
print(res)

sarie
