In [26]:
import pandas as pd
import seaborn as sns
from itertools import product
from src.helper_methods import *

In [3]:
# Load the preprocessed word/frequency table (columns used below: 'word', 'wordFreq')
# and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='Data-Preprocessed/word_freq_wordle_only.csv',
)
df.head()

Unnamed: 0,word,wordFreq
0,AALII,0
1,AARGH,71592
2,AARTI,63273
3,ABACA,79841
4,ABACK,272920


# Vowels

In [4]:
# Letters treated as vowels in this analysis (Y is included).
vowels = list('AEIOUY')
# Zero-based letter positions of a five-letter word.
places = list(range(5))
# Empty result table: one row per (letter, place) pair will be recorded here.
tracker = pd.DataFrame(
    columns=[
        'letter',
        'place',
        'letterFreq',
    ],
)
tracker

Unnamed: 0,letter,place,letterFreq


## Distributions

In [5]:
def get_counts(letters, places, tracker: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Count the words in ``df`` that have a given letter at a given place.

    One row is produced for every (letter, place) combination, in the order
    letters-outer / places-inner, and appended after the rows already in
    ``tracker``.

    Args:
        letters: Iterable of letters (regex fragments) to look for.
        places: Sequence of zero-based positions to test. Its length also
            sets the length of the regex pattern, so it should list every
            position of the word (e.g. ``[0, 1, 2, 3, 4]``).
        tracker: Existing results with columns 'letter', 'place', 'letterFreq'.
            It is not modified.
        df: Word table with a string column 'word'.

    Returns:
        A new DataFrame holding the rows of ``tracker`` followed by the new
        counts, with a fresh 0..n-1 index. If there is nothing to count,
        ``tracker`` itself is returned.
    """
    words = df['word']  # hoisted: the same column is scanned for every pattern
    pattern_length = len(places)

    records = []
    for letter in letters:
        for place in places:
            # Build a pattern such as '.A...' : any character everywhere
            # except `letter` at index `place`.
            pattern = ['.'] * pattern_length
            pattern[place] = letter
            regex = ''.join(pattern)
            # na=False: missing words count as "no match" instead of
            # producing a NaN-containing mask.
            n_matches = int(words.str.match(regex, na=False).sum())
            records.append({'letter': letter, 'place': place, 'letterFreq': n_matches})

    if not records:
        return tracker

    # Build all rows at once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic anyway.
    new_rows = pd.DataFrame(records)
    if tracker.empty:
        # Avoid concatenating with an empty frame; just keep its column order.
        columns = list(tracker.columns) + [c for c in new_rows.columns if c not in tracker.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([tracker, new_rows], ignore_index=True)

In [6]:
# tracker = get_counts(letters=vowels, places=places, tracker=tracker, df=df)
# tracker.head()

In [7]:
# sns.scatterplot(data=tracker, x='place', y='letterFreq', hue='letter')

### Results
Most frequent (vowel, position) pairs, with positions 0-indexed as in `places`:  
A, 1  
O, 1  
E, 1  
E, 3  
E, 4  

# Information Approach
Score each word by the information that guessing it could potentially provide.  
* How many words can be potentially ruled out using exact letter matching?
  - sum(if match on each letter, proportion of words ruled out)
* How many words can be potentially ruled out using letter exclusions?
  - sum(if each letter is excluded, proportion of words ruled out)
* How many words can be potentially ruled out using inexact matches?
  - sum(if each letter is inexact match, proportion of words ruled out)

1. Calculate potential Filtering Power (FP) for each word based on remaining words.
   * For each word, look at each (letter, index) = ($l_i$, $i$)
2. Choose the word with the highest potential FP.

\begin{equation}
\text{FP}(w_k) = \sum_i \Big( \text{FP}_{\text{exclude}}(l_i) + \text{FP}_{\text{match-exact}}(l_i, i) + \text{FP}_{\text{match-inexact}}(l_i, i) \Big)
\end{equation}
\begin{equation}
\text{FP}(l_i, i) = \text{len}(\text{unfiltered corpus}) - \text{len}(\text{filtered corpus}(l_i, i))
\end{equation}

Equivalently, because the length of the unfiltered corpus is the same for every word, just sum the lengths of the filtered corpus at each step and look for the minimum.  

After guessing a word and receiving feedback on it, how much is that feedback expected to reduce the word pool?

In [21]:
def get_regex(letter: str, place: int, word_length: int) -> str:
    """Build a positional pattern: ``letter`` at index ``place``, '.' elsewhere.

    Args:
        letter: Text to put at the chosen position.
        place: Index of the position to fix (list indexing rules apply).
        word_length: Total number of positions in the pattern.

    Returns:
        The pattern as a string, e.g. ``'.A...'`` for ('A', 1, 5).
    """
    wildcard_slots = ['.'] * word_length
    wildcard_slots[place] = letter
    return ''.join(wildcard_slots)


def fp(word: str, words: pd.DataFrame) -> dict:
    """Accumulate, per feedback type, the sizes of the filtered corpus for ``word``.

    For every (letter, place) of ``word`` the corpus ``words`` is filtered
    three ways (letter excluded, letter exactly at ``place``, letter as an
    inexact match at ``place``) and the length of each filter result is added
    to a running total.

    NOTE(review): ``filter_exclude``, ``filter_exact`` and ``filter_inexact``
    come from ``src.helper_methods`` and are not visible here; presumably each
    returns the subset of ``words`` that survives the filter - confirm.

    Args:
        word: Candidate guess to score.
        words: Corpus passed through to the filter helpers.

    Returns:
        A dict with keys 'word', 'num_post_exclude', 'num_post_exact' and
        'num_post_inexact' (the original annotation said DataFrame, but a
        plain dict has always been returned).
    """
    # TODO: redo this for all possible outcomes: (exact, inexact), (inexact, exclude), etc.
    word_length = len(word)
    res = {'word': word, 'num_post_exclude': 0, 'num_post_exact': 0, 'num_post_inexact': 0}
    for place, letter in enumerate(word):
        # Exclusions
        res['num_post_exclude'] += len(filter_exclude(exclude={letter}, words=words))
        # Exact matches
        regex = get_regex(letter=letter, place=place, word_length=word_length)
        res['num_post_exact'] += len(filter_exact(exact=regex, words=words))
        # Inexact matches
        res['num_post_inexact'] += len(
            filter_inexact(inexact={letter: {place}}, words=words, word_length=word_length)
        )
    return res

def get_potential_fp(words: pd.DataFrame) -> pd.DataFrame:
    """Score every word in ``words`` with ``fp`` and rank the results.

    Args:
        words: Corpus with a 'word' column; each word is scored against the
            whole corpus.

    Returns:
        A DataFrame with columns 'word', 'num_post_exclude', 'num_post_exact',
        'num_post_inexact' and 'min_post_num', sorted ascending by
        'min_post_num'. The index holds each word's position in ``words``
        (0..n-1), so ``.sort_index()`` restores the input order.
        'min_post_num' is the smallest of the three counts in a row after
        zeros have been replaced by the largest count in the whole table.
    """
    count_cols = ['num_post_exclude', 'num_post_exact', 'num_post_inexact']
    result_cols = ['word'] + count_cols

    # NOTE: fp() takes only `word` and `words`; the previous call also passed
    # `columns=...`, which raised TypeError. The per-outcome-pair columns
    # (e.g. 'exact0_exclude1') that were sketched here are not produced by
    # fp() yet - reintroduce them together with the corresponding fp() change.
    records = [fp(word=word, words=words) for word in words['word']]
    if not records:
        # Nothing to score: return an empty table with the expected columns.
        return pd.DataFrame(columns=result_cols + ['min_post_num'])

    # Build the table once instead of concatenating one row per word.
    df_res = pd.DataFrame(records, columns=result_cols)
    df_res[count_cols] = df_res[count_cols].astype('int')
    df_res['word'] = df_res['word'].astype('str')

    # A count of 0 is swapped for the global maximum so that it can never be
    # picked as a row's minimum.
    res_max = df_res[count_cols].max(axis=1).max()
    df_res['min_post_num'] = df_res[count_cols].replace(0, res_max).min(axis=1).astype('int')
    return df_res.sort_values(by='min_post_num')

In [24]:
# Tiny two-letter corpus used to sanity-check get_potential_fp by hand.
df_test = pd.DataFrame(
    data={
        'word': ['AB', 'BA', 'BC', 'CD'],
        'wordFreq': [3, 4, 5, 5],
    }
)

# Score the corpus, restore the input order, and add the row total of the
# three per-feedback counts.
df_test = (
    get_potential_fp(words=df_test)
    .sort_index()
    .assign(
        sum=lambda scored: scored[
            ['num_post_exclude', 'num_post_exact', 'num_post_inexact']
        ].sum(axis=1)
    )
)
df_test

Unnamed: 0,word,num_post_exclude,num_post_exact,num_post_inexact,min_post_num,sum
0,AB,3,2,3,2,8
1,BA,3,3,2,2,8
2,BC,3,3,2,2,8
3,CD,5,2,1,1,8
