In [1]:
# mike babb
# 2022 03 03 
# solve wordle

In [2]:
# standard
import pickle
import os
import sqlite3
from string import ascii_lowercase

In [3]:
import pandas as pd
import numpy as np

In [4]:
# define a function to load a pickle
def load_pickle(file_name):
    if os.path.exists(file_name):
        with open(file_name, 'rb') as handle:
            de_pickle = pickle.load(handle)
    else:
        de_pickle = None
        print("file does not exist")
    return de_pickle

In [5]:
char_matrix = load_pickle(file_name = 'char_matrix.pkl')
letter_dict = load_pickle(file_name = 'letter_dict.pkl')
letter_rank_dict = load_pickle(file_name ='letter_rank_dict.pkl')
word_df = load_pickle(file_name = 'word_df.pkl')

In [6]:
word_df.head()

Unnamed: 0,word,lcase,word_id,n_unique_chars,word_score,word_group
0,aalii,aalii,0,3,20449.0,-2134795976539657997
1,Aaron,aaron,1,4,20642.0,5168226787426640364
2,abaca,abaca,2,3,20009.0,7298090979578241273
3,aback,aback,3,4,15456.0,6926502400987240042
4,abaff,abaff,4,3,14038.0,7931255010571806167


In [7]:
word_df = word_df.sort_values(by = ['n_unique_chars', 'word_score'], ascending=[False, False])

In [8]:
word_df

Unnamed: 0,word,lcase,word_id,n_unique_chars,word_score,word_group
568,Aries,aries,568,5,19940.0,-7593473510551836896
572,arise,arise,572,5,19940.0,-7593473510551836896
6726,raise,raise,6726,5,19940.0,-7593473510551836896
7430,serai,serai,7430,5,19940.0,-7593473510551836896
592,arose,arose,592,5,19939.0,7873129116485683232
...,...,...,...,...,...,...
231,Ajaja,ajaja,231,2,17347.0,-8296895309048215284
5141,mamma,mamma,5141,2,15994.0,2651141554109999925
9096,ululu,ululu,9096,2,12230.0,7653841412289328249
1934,cocco,cocco,1934,2,11850.0,4163598073502918443


In [9]:
# gather the word id
word_id_list = word_df['word_id'].to_numpy()

In [10]:
def suggest_words(toss_letters = None, 
                  known_pos = None, known_anti_pos = None, verbose = True):
    """
    toss_letters: list. Letters known to be in the word
    known_pos: list. Letters in the correct position
    known_anti_pos: list. Letters in the incorrect position
    
    entering no values returns a list of suggested starting words, ranked by the frequency
    of each letter's occurence. Higher ranked words contain letters that occur more
    frequently across all five-digit words.    
    """    
       
    # intialize lists if necessary
    blank_count = 0
    
    if toss_letters is None:
        toss_letters = ['']*5
        blank_count += 1
    else:
        toss_letters = [tl for tl in toss_letters]
        print(toss_letters)

    if known_pos is None:
        known_pos = ['']*5
        blank_count += 1    
    if known_anti_pos is None:
        known_anti_pos = ['']*5
        blank_count += 1
        
    for i_kap in range(len(known_anti_pos)):
        if known_pos[i_kap] != '':
            known_anti_pos[i_kap] = ''
    
    # check if the user supplied values
    if blank_count <3:
    
        # use this list to winnow down selections
        zero_id_list = np.zeros(shape = word_id_list.shape, dtype = int)
        
        # build the list of letters that are known
        keep_letters = set(known_pos)
        for kap in known_anti_pos:
            keep_letters = keep_letters.union(kap)
                               
        for kl in keep_letters:
            if kl != '':
                for jl in kl:
                    zero_id_list[char_matrix[:, letter_dict[jl]]>0] += 1
                
        # remove a word when there is an incorrect letter
        for tl in toss_letters:
            if tl != '':
                zero_id_list[char_matrix[:, letter_dict[tl]]>0] -= 1
        
        # this is the list of word_ids that have letters of interest
        curr_word_five_id_list = word_id_list[zero_id_list >= (len(keep_letters) - 1)]

        # select from the data frame based on the word id
        curr_df = word_df.loc[word_df['word_id'].isin(curr_word_five_id_list),
                          ['lcase', 'word_score', 'n_unique_chars']]

        # the list of possible words, after removing words with incorrect letters
        pos_words = curr_df['lcase'].values        

        # we're going to make use of the lists of letters with known positions
        # and letters known to be in the word, but in the incorrect post
        output_list = np.full(shape = pos_words.shape, fill_value=True)
        #print(known_pos)
        #print(known_anti_pos)
        # enumerate each word        
        for i_pos_word, pos_word in enumerate(pos_words):
            #print(pw)            
            # enumerate each character in each word
            for ii_pwl, i_pwl in enumerate(pos_word):
                # known positions
                # based on the index, remove the word if the letters do not match
                if known_pos[ii_pwl] != '':
                    #print(kp[ii_pw], i_pw)
                    if known_pos[ii_pwl] != i_pwl:
                        output_list[i_pos_word] = False
                # known anti-positions                
                # based on the index, remove the word if the letters do match
                if known_anti_pos[ii_pwl] != '':
                    curr_known_anti_pos = known_anti_pos[ii_pwl]
                    for jj_pwl in curr_known_anti_pos:
                        #print(kap[ii_pwl], i_pwl)
                        #known_anti_pos[ii_pwl]
                        if jj_pwl == i_pwl:
                            output_list[i_pos_word] = False
        
        # the list of possible words
        pos_words = pos_words[output_list]
        
        wdf = word_df.loc[word_df['lcase'].isin(pos_words), ['lcase', 'n_unique_chars', 'word_score']]
        wdf = wdf.sort_values(by = ['n_unique_chars', 'word_score'],
                                 ascending=False)
                
    else:
        # the initial list of words to choose from, sorted by unique characters and word score
        wdf = word_df.sort_values(by = ['n_unique_chars', 'word_score'],
                                 ascending=False)
    
    pos_words = wdf['lcase'].tolist()
    if verbose:
        print(pos_words)

    return len(pos_words)

In [11]:
# letters known to not be in the word
# can be a list any length long
toss_letters = 'heus'
# letters in the correct position
# list with length five
kp = ['','','','','']
#kp = None

# letters in the word, but in the incorrect spot
# list with 
kap = ['', '', '', '', '']
#kap = None

# iterate until a solution is found
suggest_words(toss_letters = toss_letters, known_pos = kp, known_anti_pos = kap, verbose=True)

['h', 'e', 'u', 's']
['aries', 'arise', 'raise', 'serai', 'arose', 'oreas', 'ariel', 'orate', 'arles', 'arsle', 'rasen', 'aster', 'serta', 'strae', 'alert', 'artel', 'later', 'ratel', 'taler', 'telar', 'retan', 'aisle', 'elias', 'saite', 'alien', 'anile', 'elain', 'laine', 'tinea', 'atone', 'oaten', 'serau', 'urase', 'aeric', 'ceria', 'ocrea', 'alure', 'ureal', 'urnae', 'urate', 'aider', 'deair', 'oared', 'oread', 'aimer', 'marie', 'haire', 'antes', 'stean', 'aueto', 'opera', 'ental', 'leant', 'carse', 'caser', 'scrae', 'resay', 'sayer', 'seary', 'barie', 'erbia', 'rebia', 'ceral', 'relay', 'enray', 'caret', 'carte', 'crate', 'creat', 'creta', 'dares', 'tayer', 'arion', 'noria', 'asher', 'share', 'lader', 'redan', 'detar', 'tread', 'namer', 'reman', 'asper', 'prase', 'spaer', 'spare', 'earth', 'hater', 'rathe', 'aleut', 'parel', 'pearl', 'perla', 'relap', 'besra', 'saber', 'adiel', 'pater', 'peart', 'prate', 'taper', 'terap', 'diane', 'idean', 'anode', 'deota', 'todea', 'amole', 'maleo

2547

In [None]:
outcome = suggest_words(toss_letters = toss_letters, known_pos = kp, known_anti_pos = kap, verbose=True)

In [None]:
outcome

In [None]:
word_df['word_group'].unique().shape

In [None]:
temp_df = word_df.drop_duplicates(subset = ['word_group']).sort_values(by = ['lcase'])

In [None]:
temp_df.shape

In [None]:
word_df = word_df.sort_values(by = 'lcase')

In [None]:
remove_word_list = []
kp = ['','','','','']
kap = ['', '', '', '', '']
prev_letter = word_df['lcase'].iloc[0][0]
print(prev_letter)
for cw in word_df['lcase'].tolist():
    curr_letter = cw[0]
    if prev_letter != curr_letter:
        print(curr_letter)
        curr_letter = prev_letter    

    #print(cw)
    outcome = suggest_words(toss_letters = cw, known_pos = kp, known_anti_pos = kap, verbose = False)
    remove_word_list.append([cw, outcome])


In [None]:
# what is the best word to start with?
best_start_word = pd.DataFrame(data = remove_word_list, columns = ['lcase', 'n_next_word'])


In [None]:
best_start_word = best_start_word.sort_values(by = 'n_next_word')

In [None]:
best_start_word.head()