# Wordle

### Strategy 1
* Find the most common letters in words with N letters
* Find words made up of the most common letters, with no letter repeated
* Score these words and submit the top result as a guess
* Filter or re-score based on feedback and continue with next guess
#### Notes
* Build a backtesting harness
* Compare an ML approach against the standard algorithm above

In [76]:
# Load the autoreload extension
%load_ext autoreload

# Autoreload reloads modules before executing code
# 0: disable
# 1: reload modules imported with %aimport
# 2: reload all modules, except those excluded by %aimport
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
import os
import nltk

from nltk.corpus import words

# Parent of the current working directory, with symlinks resolved.
# NOTE(review): presumably the repository root, given the 'src/...' join below — confirm.
SRC_DIR = os.path.realpath(os.path.join(os.path.curdir, '..'))
# Directory holding the NLTK data bundled with the makewords package.
NLTK_DIR = os.path.join(SRC_DIR, 'src/makewords/nltk_data')
# nltk.data.path is a list of search directories; register ours only once so
# that re-running this cell does not keep appending duplicate entries.
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DIR)

In [78]:
# Prepare for analysis

import string
from itertools import chain

import numpy as np
import pandas as pd

# Word length the analysis targets.
N = 5
# Per-position column labels z0 .. z(N-1): appearances of a letter at each index.
zs = [f'z{position}' for position in range(N)]
# Per-word count column labels n1 .. n(N-1): a letter appearing that many times in a word.
ns = [f'n{count}' for count in range(1, N)]

def get_clean_df():
    """Return a zero-filled letter-frequency table.

    The frame has one row per lowercase ASCII letter, with the index named
    ``letters``. Its columns are ``total`` followed by the module-level
    ``zs`` labels and then the module-level ``ns`` labels. Every cell is 0.
    """
    column_labels = ['total', *zs, *ns]
    alphabet = pd.Index(list(string.ascii_lowercase), name='letters')
    return pd.DataFrame(0, index=alphabet, columns=column_labels)

# Create the all-zero frequency table and preview its first four rows.
df = get_clean_df()
df.head(4)

Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,0,0,0,0,0,0,0,0,0,0
b,0,0,0,0,0,0,0,0,0,0
c,0,0,0,0,0,0,0,0,0,0
d,0,0,0,0,0,0,0,0,0,0


In [79]:
# Begin analysis
import makewords.makewords as make
import makewords.score as score

# Candidate words of length N, as produced by the makewords package.
all_words = make.possible_words(length=N)
# Pass a fresh zeroed table instead of feeding `df` back into itself, so the
# cell gives the same result no matter how many times it is re-run.
# NOTE(review): whether score.lfreq accumulates into the frame it is given is
# not visible from here; starting from a clean table is safe either way.
df = score.lfreq(get_clean_df(), all_words)
df.head(4)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,total,z0,z1,z2,z3,z4,n1,n2,n3,n4
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a,4467,633,1406,872,877,679,3304,547,23,0
b,1162,628,55,261,154,64,989,82,3,0
c,1546,635,152,237,374,148,1368,86,2,0
d,1399,383,69,288,228,431,1231,78,4,0


In [80]:
# Number of highest-`total` letters to keep.
M = 12

# Take the M letters with the largest `total`, keep only their per-position
# counts, and list the (letter, position-column) pairs ordered from the
# highest count to the lowest.
occurrences_by_register = (
    df.sort_values('total', ascending=False)
      .iloc[:M]
      .loc[:, zs]
      .stack()
      .sort_values(ascending=False)
      .index
)

In [100]:
# Score candidate words with the project's scoring helper.
# NOTE(review): k=12 and l=5 equal the M and N constants set in earlier
# cells — confirm whether they are meant to track those values.
scores = score.top(df, k=12, l=5, m=25, additional="rates")

# Turn the {word: score} mapping into a table ordered from best to worst,
# with the positional index labelled "rank".
df_scoring = pd.DataFrame.from_dict(scores, orient="index", columns=["score"])
df_scoring = df_scoring.sort_values("score", ascending=False).reset_index()
df_scoring = df_scoring.rename(columns={"index": "word"}).rename_axis("rank")
df_scoring.head(25)

[makewords] Cleaning 'en' wordlist from nltk.


Unnamed: 0_level_0,word,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
0,soree,5539
1,salay,5163
2,boree,5036
3,sairy,4986
4,saily,4985
5,cooee,4941
6,teaey,4929
7,solay,4900
8,soary,4891
9,shree,4857
