# Wordle

### Strategy 1
* Find the most common letters in words with N letters
* Find words built only from the most common letters, with no letter repeated
* Score these words and submit the top result as a guess
* Filter or re-score based on feedback and continue with next guess
#### Notes
* **TODO:** Build a backtesting harness
* **TODO:** Test an ML approach against the standard algorithm above

In [32]:
# Load the autoreload extension
%load_ext autoreload

# Autoreload reloads modules before executing code
# 0: disable
# 1: reload modules imported with %aimport
# 2: reload all modules, except those excluded by %aimport
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import os
import nltk

from nltk.corpus import brown as words

# Directory one level above the current working directory, resolved to an
# absolute path; the NLTK data directory is located relative to it.
SRC_DIR = os.path.realpath(os.path.join(os.path.curdir, '..'))
NLTK_DIR = os.path.join(SRC_DIR, 'src/makewords/nltk_data')

# Register the directory only once: without the membership check every re-run
# of this cell appends another duplicate entry to NLTK's data search path.
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

In [41]:
# Set up the analysis: build the base letter table for N-letter words
import pandas as pd

import makewords.score as score

# Number of letters in the words we are targeting
N = 5

df = score.get_base_df(n=N)

# Preview the first four rows of the freshly built table
df.head(n=4)

Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,0,0,0,0,0,0,0,0,0,0
b,0,0,0,0,0,0,0,0,0,0
c,0,0,0,0,0,0,0,0,0,0
d,0,0,0,0,0,0,0,0,0,0


In [42]:
# Begin analysis: fill the letter table from every candidate word of length N
import makewords.makewords as make

all_words = make.possible_words(length=N)

# Feed score.lfreq a freshly built base table instead of the current `df`.
# The original `df = score.lfreq(df, all_words)` consumed its own previous
# output whenever the cell was re-run; starting from score.get_base_df(n=N)
# gives the same input on every execution.
# NOTE(review): this matters if score.lfreq accumulates into (or mutates) the
# frame it receives — confirm against makewords.score.
df = score.lfreq(score.get_base_df(n=N), all_words)
df.head(4)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,1179,162,471,343,149,54,1065,57,0,0
b,331,217,12,57,38,7,294,17,1,0
c,528,245,38,79,135,31,476,26,0,0
d,593,137,24,79,118,235,521,33,2,0


In [36]:
M = 12  # how many of the highest-`total` rows (letters) to keep

# Column labels "z0" .. "z<N-1>" of `df`. `zs` was referenced below but never
# defined anywhere in the notebook, so this cell raised NameError on a fresh
# kernel.
# NOTE(review): the z-columns look like per-position letter counts (they sum
# to `total` in the displayed rows) — confirm against makewords.score.
zs = [f"z{pos}" for pos in range(N)]

# Find words containing the most common letters with no repeats:
# take the M rows with the largest `total`, flatten their z-columns into a
# (letter, column) Series, and keep the index ordered by descending count.
occurrences_by_register = (
    df.sort_values(by='total', ascending=False)
      .head(M)[zs]
      .stack()
      .sort_values(ascending=False)
      .index
)

In [37]:
import collections
import string  # was missing: the filter below raised NameError on a fresh kernel

# Lower-cased Brown-corpus tokens that are exactly N characters long and made
# up solely of ASCII letters (tokens with digits, punctuation, etc. are dropped).
allowed_letters = set(string.ascii_lowercase)
base_words = (
    w.lower()
    for w in words.words()
    if len(w) == N and set(w.lower()) <= allowed_letters
)

# Occurrence count of each surviving word in the corpus
wfreq = dict(collections.Counter(base_words))
df_wbase = pd.DataFrame.from_dict(wfreq, orient="index", columns=["freq"])

# Keep words seen more than once, most frequent first. Selecting rows with a
# boolean mask on the `freq` column (instead of masking the whole frame and
# dropping NaN) keeps the counts as integers rather than upcasting to float.
out = (
    df_wbase.loc[df_wbase["freq"] > 1]
    .sort_values(by="freq", ascending=False)
)
out

Unnamed: 0,freq
which,3561.0
there,2728.0
would,2714.0
their,2669.0
about,1815.0
...,...
inept,2.0
spurt,2.0
horde,2.0
sails,2.0


In [38]:
# Score candidate words from the letter table
scores = score.top(df, k=12, l=5, m=25, additional="rates")

# Turn the {word: score} mapping into a table ordered from best to worst,
# with the row position exposed as the "rank" axis.
df_scoring = pd.DataFrame.from_dict(scores, orient="index", columns=["score"])
df_scoring = df_scoring.sort_values(by="score", ascending=False).reset_index()
df_scoring = df_scoring.rename_axis("rank").rename(columns={"index": "word"})

# Observation: many of the top-ranked words are completely unfamiliar.
# A baseline word-selection score is needed before ranking.
df_scoring.head(25)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,word,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
0,sores,2471
1,sales,2446
2,soles,2420
3,boies,2346
4,sages,2338
5,cares,2332
6,saves,2329
7,cores,2306
8,bores,2278
9,fares,2269


In [39]:
# Apply the feedback from the first guess, then re-score what is left.
# NOTE(review): include / exclude / mask presumably carry the letters known to
# be present, known to be absent, and the known positions — confirm against
# make.possible_words.
newwords = make.possible_words(
    include="lre",
    exclude="vinpuats",
    mask="el.er",
    length=5,
)

# NOTE(review): `df` already holds counts from the full word list; check
# whether score.lfreq expects a fresh base table here.
df1 = score.lfreq(df, words=newwords)
newscores = score.top(df1, words=newwords)

# Same presentation as the first ranking: best score first, "rank" axis.
newdf = pd.DataFrame.from_dict(newscores, orient="index", columns=["score"])
newdf = newdf.sort_values(by="score", ascending=False).reset_index()
newdf = newdf.rename_axis("rank").rename(columns={"index": "word"})
newdf.head(25)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,word,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
0,elder,1115
