In [1]:
import pandas as pd
from string import ascii_uppercase
from itertools import product
from src.helper_methods import *

In [2]:
# Load the preprocessed word list (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('Data-Preprocessed/word_freq_wordle_only.csv')
df.head()

Unnamed: 0,word,wordFreq
0,AALII,0
1,AARGH,71592
2,AARTI,63273
3,ABACA,79841
4,ABACK,272920


# Letter Distributions

In [3]:
def get_counts(letters: list, places: list, df: pd.DataFrame) -> pd.DataFrame:
    """Count, for every (letter, place) pair, the words having that letter at that place.

    Args:
        letters: Letters to look for. Each one is inserted verbatim into a
            regular expression, so regex metacharacters are not escaped.
        places: Zero-based positions to inspect. May be any subset of positions,
            e.g. ``[1]`` or ``[2, 4]``.
        df: Frame with a ``word`` column of strings.

    Returns:
        DataFrame with columns ``letter``, ``place`` and ``letterFreq``, one row
        per (letter, place) pair, ordered by letter and then by place.
    """
    # The pattern must be long enough to reach the largest requested position.
    # Using len(places) alone broke for subsets such as places=[1].
    # For a full 0..n-1 list both terms agree, so the pattern is unchanged.
    width = max(len(places), max(places, default=-1) + 1)

    records = []
    for letter in letters:
        for place in places:
            # One wildcard per position, with the letter pinned at `place`.
            pattern = ['.'] * width
            pattern[place] = letter
            # na=False treats missing words as non-matches instead of producing
            # a mask that contains NaN.
            hits = df['word'].str.match(''.join(pattern), na=False)
            records.append({'letter': letter, 'place': place, 'letterFreq': int(hits.sum())})

    # Build the frame once; concatenating inside the loop copies all previous
    # rows on every iteration.
    return pd.DataFrame(records, columns=['letter', 'place', 'letterFreq'])

In [4]:
# Count every uppercase letter at each of the five word positions.
tracker = get_counts(
    letters=list(ascii_uppercase),
    places=list(range(5)),
    df=df,
)
tracker.head()

Unnamed: 0,letter,place,letterFreq
0,A,0,549
1,A,1,1664
2,A,2,927
3,A,3,771
4,A,4,559


In [5]:
# Collapse the per-position counts into one total per letter, most frequent first.
tracker_summary = (
    tracker
    .drop(columns='place')
    .groupby('letter')
    .sum()
    .sort_values(by='letterFreq', ascending=False)
)
tracker_summary.head()

Unnamed: 0_level_0,letterFreq
letter,Unnamed: 1_level_1
E,4901
A,4470
S,4337
O,3262
R,3247


## Determine Top Words From Letter Frequencies

In [6]:
# letter_freq = df['word'].apply(lambda x: sum([tracker_summary.loc[letter] for letter in set(x)]))
# words = pd.concat([df, letter_freq], axis=1).sort_values(by='letterFreq', ascending=False)
# words.head()

# Repeated Letters

In [14]:
# For each word, record the highest number of times any single letter occurs in it
# (1 means all letters are distinct).
words_w_repeats = df.copy()
words_w_repeats['repeats'] = words_w_repeats['word'].apply(
    lambda word: max(word.count(ch) for ch in word)
)
words_w_repeats.head()

Unnamed: 0,word,wordFreq,repeats
0,AALII,0,2
1,AARGH,71592,2
2,AARTI,63273,2
3,ABACA,79841,3
4,ABACK,272920,2


In [15]:
# Summary statistics of the per-word maximum letter multiplicity.
words_w_repeats['repeats'].describe()

count    9531.000000
mean        1.363655
std         0.502415
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: repeats, dtype: float64

# Information Approach (In Progress)
For each candidate guess, estimate how much it would narrow down the word list. The goal is to evaluate every combination of Wordle's per-letter responses (exact match, inexact match, excluded letter).

In [42]:
def get_regex(letter: str, place: int, word_length: int) -> str:
    """Build a pattern of ``word_length`` wildcards with ``letter`` pinned at ``place``.

    Example: ``get_regex('A', 1, 3)`` returns ``'.A.'``.

    ``place`` follows list-indexing rules: negative values count from the end,
    and an out-of-range value raises ``IndexError``.
    """
    slots = ['.'] * word_length
    slots[place] = letter
    return ''.join(slots)


def fp(word: str, words: pd.DataFrame) -> dict:
    """Tally how many rows of ``words`` survive each single-letter filter built from ``word``.

    For every (position, letter) pair of ``word`` three filters are applied to the
    full ``words`` frame, and the length of each result is added to a running total:

    * ``num_post_exclude``: ``filter_exclude`` with the letter as the only excluded one.
    * ``num_post_exact``: ``filter_exact`` with a pattern pinning the letter at its position.
    * ``num_post_inexact``: ``filter_inexact`` with the letter mapped to its position.

    The three ``filter_*`` helpers are not defined in this notebook
    (presumably they come from ``src.helper_methods``; confirm).

    Args:
        word: The candidate guess.
        words: The word list passed to each filter.

    Returns:
        A dict with keys ``word``, ``num_post_exclude``, ``num_post_exact`` and
        ``num_post_inexact``. The three counts are sums over all positions.
    """
    # TODO: redo this for all possible outcomes: (exact, inexact), (inexact, exclude), etc.
    word_length = len(word)
    res = {'word': word, 'num_post_exclude': 0, 'num_post_exact': 0, 'num_post_inexact': 0}
    for place, letter in enumerate(word):
        # Exclusions
        res['num_post_exclude'] += len(filter_exclude(exclude={letter}, words=words))
        # Exact matches
        regex = get_regex(letter=letter, place=place, word_length=word_length)
        res['num_post_exact'] += len(filter_exact(exact=regex, words=words))
        # Inexact matches
        res['num_post_inexact'] += len(
            filter_inexact(inexact={letter: {place}}, words=words, word_length=word_length)
        )
    return res

def get_potential_fp(words: pd.DataFrame) -> pd.DataFrame:
    """Score every word in ``words`` with ``fp`` and rank by the smallest non-zero count.

    Args:
        words: Frame with a ``word`` column. An empty frame yields an empty result.

    Returns:
        DataFrame sorted ascending by ``min_post_num`` with these columns, in order:
        ``word``; placeholder columns named ``<outcome><i>_<outcome><j>`` for the
        outcome combinations (currently never filled, so all NaN); the three
        ``num_post_*`` counts returned by ``fp``; and ``min_post_num``, the smallest
        of the three counts per row after zeros are replaced by the overall maximum.
    """
    metric_cols = ['num_post_exclude', 'num_post_exact', 'num_post_inexact']

    # Placeholder columns for the planned pairwise-outcome analysis.
    # NOTE(review): the index range 0-2 is hard-coded and independent of the word
    # length; confirm whether it should follow the length of the words.
    outcomes = ['exact', 'exclude', 'inexact']
    tagged = [str(outcome) + str(idx) for outcome, idx in product(outcomes, range(3))]
    placeholder_cols = [
        first + "_" + second for first, second in product(tagged, tagged) if first != second
    ]

    # Collect one record per word and build the frame once; concatenating inside
    # the loop copies all previous rows on every iteration.
    rows = [fp(word=word, words=words) for word in words['word']]
    df_res = pd.DataFrame(rows, columns=['word'] + placeholder_cols + metric_cols)

    for col in metric_cols:
        df_res[col] = df_res[col].astype('int')
    df_res['word'] = df_res['word'].astype('str')

    # Replace zeros with the overall maximum so they are not picked as the row minimum.
    res_max = df_res[metric_cols].max(axis=1).max()
    df_res['min_post_num'] = df_res[metric_cols].replace(0, res_max).min(axis=1).astype('int')
    return df_res.sort_values(by='min_post_num')

In [43]:
# Smoke-test the scoring on a tiny two-letter vocabulary.
df_test = pd.DataFrame({
    'word': ['AB', 'BA', 'BC', 'CD'],
    'wordFreq': [3, 4, 5, 5],
})

# get_potential_fp returns rows sorted by min_post_num; sort_index restores the
# input order before the three counts are totalled per word.
df_test = get_potential_fp(words=df_test).sort_index()
df_test['sum'] = df_test.loc[:, ['num_post_exclude', 'num_post_exact', 'num_post_inexact']].sum(axis=1)
df_test

Unnamed: 0,word,exact0_exact1,exact0_exact2,exact0_exclude0,exact0_exclude1,exact0_exclude2,exact0_inexact0,exact0_inexact1,exact0_inexact2,exact1_exact0,...,inexact2_exclude0,inexact2_exclude1,inexact2_exclude2,inexact2_inexact0,inexact2_inexact1,num_post_exclude,num_post_exact,num_post_inexact,min_post_num,sum
0,AB,,,,,,,,,,...,,,,,,3,2,3,2,8
1,BA,,,,,,,,,,...,,,,,,3,3,2,2,8
2,BC,,,,,,,,,,...,,,,,,3,3,2,2,8
3,CD,,,,,,,,,,...,,,,,,5,2,1,1,8
