<a href="https://colab.research.google.com/github/pjconnell/Wordle_Solver/blob/main/Wordle_Solver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [89]:
def ltr_freq(temp):
    # calculate frequency of letters in each position for remaining words
    ltr_den = np.zeros((26,5))
    
    alphabet = ['A', 'B','C','D','E','F',
                'G','H','I','J','K','L',
                'M','N','O','P','Q','R',
                'S','T','U','V','W','X',
                'Y','Z']
    positions = ['pos_1','pos_2','pos_3','pos_4','pos_5']
    
    # df to hold density counts
    df = pd.DataFrame(ltr_den, columns = positions,index=alphabet)

    #iterate through temp to count letter freq in each position
    for i in tqdm(range(len(temp))):
        for j in range(0,5):
            for k in range(len(alphabet)):
                if temp[i][j] == alphabet[k]:
                    df[positions[j]][k]+=1.0
    return df

In [5]:
def update_wd_list(contains, excludes, pos_str, non_pos, temp):
    for i in range(len(contains)):
        temp = [word for word in temp if contains[i] in word]

    for i in range(len(excludes)):
        temp = [word for word in temp if excludes[i] not in word]

    for i in range(len(pos_str)):
        if pos_str[i] != '?':
            temp = [word for word in temp if word[i] == pos_str[i]]

    for i in range(len(non_pos)):
        if len(non_pos[i]) >0:
            for j in range(len(non_pos[i])):
                if non_pos[i][j] != '?':
                    temp = [word for word in temp if word[j] != non_pos[i][j]]
    return temp


In [81]:
def guess_wd(temp):
  # set df with updated word list and column to hold value of guessing word
  temp2 = pd.DataFrame(temp,columns =['word'])
  temp2['guess_val']=0.0

  for i in tqdm(range(temp2.shape[0])):
    wd = temp2.word[i]
    for j in range(len(temp2.word[i])):
      # calculate expected value of letter in that position
      ev_1 = (df[positions[j]][wd[j].upper()]/temp2.shape[0])*(temp2.shape[0]-df[positions[j]][wd[j].upper()])

      if j == 0:
        # calculate expected value of letter in word, but not in that position
        ev_2 = ((df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()])/(temp2.shape[0]))*(temp2.shape[0]-(df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()]))

        # calculate expected value of the letter not being in that word
        ev_3 = ((temp2.shape[0]-df.sum(axis=1)[wd[j].upper()])/(temp2.shape[0]))*(df.sum(axis=1)[wd[j].upper()])
      
      if j > 0: # don't want to double count the benefit of ruling out letters that occur more than once
        # to check if letter is repeated, define word without letter
        w2 = wd[:j]+wd[j+1:]
        
        if wd[j] in w2:
          ev_2 = 0
          ev_3 =0
        elif wd[j] not in w2:
          # calculate expected value of letter in word, but not in that position
          ev_2 = ((df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()])/(temp2.shape[0]))*(temp2.shape[0]-(df.sum(axis=1)[wd[j].upper()]-df[positions[j]][wd[j].upper()]))

          # calculate expected value of the letter not being in that word
          ev_3 = ((temp2.shape[0]-df.sum(axis=1)[wd[j].upper()])/(temp2.shape[0]))*(df.sum(axis=1)[wd[j].upper()])

      # add to total expected value
      ltr_ev = ev_1+ev_2+ev_3
      temp2.guess_val[i] += ltr_ev

  return temp2.word[temp2.guess_val.idxmax()]

In [105]:
# read scrabble dictionary to wordlist
with open("/content/drive/MyDrive/Wordle Solver/scrabble_dict.txt") as f:
    wordlist = f.read().splitlines()[2:]

# filter for 5 letter words / no plurals (Wordle doesn't use them)
temp = [word for word in wordlist if len(word) == 5 ]
temp = [word for word in temp if word[4] != 'S' or word[3] == 'S']


# store alphabet as a list
alphabet = ['A', 'B','C','D','E','F',
            'G','H','I','J','K','L',
            'M','N','O','P','Q','R',
            'S','T','U','V','W','X',
            'Y','Z']

# store potential letter positions as a list
positions = ['pos_1','pos_2','pos_3','pos_4','pos_5']

# create initial density chart
df = ltr_freq(temp)

100%|██████████| 9049/9049 [00:04<00:00, 1973.81it/s]


In [106]:
#### Interface - rerun this cell to generate each new guess

# ID known letters
input_contains = input("Enter letters the word contains separated by a space: ")
contains = input_contains.split()

# ID known excluded letters
input_excludes = input("Enter excluded letters separated by a space: ")
excludes = input_excludes.split()

# ID known positions
pos_str = input("Enter known positions (e.g., S??A?): ")

# ID known non-positions
non_pos =["","","","",""]
for i in range(len(contains)):
    if contains[i] not in pos_str:
        non_pos[i] = input(f"Enter known nonpositions for letter {contains[i]} (e.g., R??R?): ")

# update frequency counts, remaining word list and generate new guess
temp = update_wd_list(contains, excludes, pos_str, non_pos, temp)
df = ltr_freq(temp)
guess = guess_wd(temp)
if len(guess)<5:
    guess = temp[0]
print(f"You should try: {guess}")

Enter letters the word contains separated by a space: R A
Enter excluded letters separated by a space: O T E
Enter known positions (e.g., S??A?): ??A??
Enter known nonpositions for letter R (e.g., R??R?): R?R??


100%|██████████| 130/130 [00:00<00:00, 2090.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 130/130 [00:01<00:00, 124.55it/s]

You should try: CRANK



