<a href="https://colab.research.google.com/github/pjconnell/Wordle_Solver/blob/main/Wordle_Solver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import urllib
import requests

In [4]:
def ltr_freq(temp):
    # calculate frequency of letters in each position for remaining words
    ltr_den = np.zeros((26,5))
    
    alphabet = ['A', 'B','C','D','E','F',
                'G','H','I','J','K','L',
                'M','N','O','P','Q','R',
                'S','T','U','V','W','X',
                'Y','Z']
    positions = ['pos_1','pos_2','pos_3','pos_4','pos_5']
    
    # df to hold density counts
    df = pd.DataFrame(ltr_den, columns = positions,index=alphabet)

    #iterate through temp to count letter freq in each position
    for i in tqdm(range(len(temp))):
        for j in range(0,5):
            for k in range(len(alphabet)):
                if temp[i][j] == alphabet[k]:
                    df[positions[j]][k]+=1.0
    return df

In [117]:
def update_wd_list(contains, excludes, pos_str, non_pos, temp):
    for i in range(len(contains)):
        temp = [word for word in temp if contains[i] in word]

    for i in range(len(excludes)):
        temp = [word for word in temp if excludes[i] not in word]

    for i in range(len(pos_str)):
        if pos_str[i] != '?':
            temp = [word for word in temp if word[i] == pos_str[i]]

    for i in range(len(non_pos)):
        if len(non_pos[i]) >0:
            for j in range(len(non_pos[i])):
                if non_pos[i][j] != '?':
                    temp = [word for word in temp if word[j] != non_pos[i][j]]
    return temp


In [193]:
def guess_wd(temp):
  # set df with updated word list and column to hold value of guessing word
  temp2 = pd.DataFrame(temp,columns =['word'])
  temp2['guess_val']=0.0

  for i in tqdm(range(temp2.shape[0])):
    wd = temp2.word[i]
    for j in range(len(temp2.word[i])):
      # calculate expected value of letter in that position
      ev_1 = (df[positions[j]][wd[j].upper()]/temp2.shape[0])*(temp2.shape[0]-df[positions[j]][wd[j].upper()])

      if j == 0:
        # calculate expected value of letter in word, but not in that position
        ev_2 = ((df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()])/(temp2.shape[0]))*(temp2.shape[0]-(df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()]))

        # calculate expected value of the letter not being in that word
        ev_3 = ((temp2.shape[0]-df.sum(axis=1)[wd[j].upper()])/(temp2.shape[0]))*(df.sum(axis=1)[wd[j].upper()])
      
      if j > 0: # don't want to double count the benefit of ruling out letters that occur more than once
        # to check if letter is repeated, define word without letter
        w2 = wd[:j]+wd[j+1:]
        
        if wd[j] in w2:
          ev_2 = 0
          ev_3 =0
        elif wd[j] not in w2:
          # calculate expected value of letter in word, but not in that position
          ev_2 = ((df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()])/(temp2.shape[0]))*(temp2.shape[0]-(df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()]))

          # calculate expected value of the letter not being in that word
          ev_3 = ((temp2.shape[0]-df.sum(axis=1)[wd[j].upper()])/(temp2.shape[0]))*(df.sum(axis=1)[wd[j].upper()])

      # add to total expected value
      ltr_ev = ev_1+ev_2+ev_3
      temp2.guess_val[i] += ltr_ev
      
  return temp2.word[temp2.guess_val.idxmax()]

In [269]:
# # load df from pickle
# # be sure to have the scrabble dictionary text file saved to your Google drive
use_df = pd.read_pickle('/content/drive/MyDrive/Wordle Solver/wds_mc.pickle') # get our list of valid guesses
use_df.columns = ['word','mc']
ans_df = use_df.sort_values(by=['mc'], ascending=False)
ans_df = ans_df.head(4000) # get our list of potential answers
ans_df = ans_df.reset_index(drop=True)

# store alphabet as a list
alphabet = ['A', 'B','C','D','E','F',
            'G','H','I','J','K','L',
            'M','N','O','P','Q','R',
            'S','T','U','V','W','X',
            'Y','Z']

# store potential letter positions as a list
positions = ['pos_1','pos_2','pos_3','pos_4','pos_5']

# create initial density chart
df = ltr_freq(ans_df['word'])

# create initial answer list
temp = ans_df['word']


In [273]:
#### Interface - rerun this cell to generate each new guess

# ID known letters
input_contains = input("Enter letters the word contains separated by a space: ")
contains = input_contains.split()

# ID known excluded letters
input_excludes = input("Enter excluded letters separated by a space: ")
excludes = input_excludes.split()

# ID known positions
pos_str = input("Enter known positions (e.g., S??A?): ")

# ID known non-positions
non_pos =["","","","",""]
for i in range(len(contains)):
    if contains[i] not in pos_str:
        non_pos[i] = input(f"Enter known nonpositions for letter {contains[i]} (e.g., {contains[i]}??{contains[i]}?): ")

# update frequency counts, remaining word list and generate new guess
temp = update_wd_list(contains, excludes, pos_str, non_pos, temp)
df = ltr_freq(temp)
guess = guess_wd(temp)
# if len(guess)<5:
#     guess = temp[0]
print(f"You should try: {guess}")

Enter letters the word contains separated by a space: L E A T
Enter excluded letters separated by a space: R S D C
Enter known positions (e.g., S??A?): ?LEAT


100%|██████████| 2/2 [00:00<00:00, 645.48it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 2/2 [00:00<00:00, 100.53it/s]

You should try: BLEAT





In [274]:
temp # run this if you want to see which words are still potential answers

['BLEAT', 'PLEAT']

In [267]:
### note - this takes a couple hours to run and is how the 'wds_mc.pickle' file was created
# # read scrabble dictionary to wordlist
# with open("/content/drive/MyDrive/Wordle Solver/scrabble_dict.txt") as f:
#     wordlist = f.read().splitlines()[2:]

# # filter for 5 letter words / no plurals (Wordle doesn't use them)
# wordlist = [word for word in wordlist if len(word) == 5 ]
# wordlist = [word for word in temp if word[4] != 'S' or word[3] == 'S']

# # made df to hold wordlist and wordcounts (to down-weight unlikely scrabble wds)
# use_df = pd.DataFrame(wordlist,columns=['words'])
# use_df['mc'] = 0

# for i in tqdm(range(use_df.shape[0])):
#   # do the request from phrasefinder.io to get the number of mentions in corpus
#   encoded_query = urllib.parse.quote(temp[i])
#   params = {'corpus': 'eng-us', 'query': encoded_query, 'topk': 1}
#   params = '&'.join('{}={}'.format(name, value) for name, value in params.items())
#   response = requests.get('https://api.phrasefinder.io/search?' + params)
#   assert response.status_code == 200

#   # add mention count to use_df
#   try:
#     use_df['mc'][i] = response.json()['phrases'][0]['mc']
#   except:
#     use_df['mc'][i] = 0

# use_df.to_pickle('wds_mc.pickle')