In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the corpus: every non-empty line holds a word followed by its count,
# separated by whitespace. `words` / `counts` keep file order so the
# DataFrame index matches the line position in the file.
words = []
counts = []
# `with` guarantees the handle is closed even if a line fails to parse.
with open("hw1_word_counts_05.txt", "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        words.append(fields[0])
        counts.append(int(fields[1]))
# Dict for O(1) count lookup by word; DataFrame for sorting/inspection.
word_count = dict(zip(words, counts))
word_count_df = pd.DataFrame({'Word': words, 'Count': counts})

### Fifteen most frequent words

In [3]:
# Order rows from highest to lowest count, then show the first fifteen words.
word_count_df.sort_values(by='Count', ascending=False)['Word'].head(15)

5821    THREE
5102    SEVEN
1684    EIGHT
6403    WOULD
18      ABOUT
5804    THEIR
6320    WHICH
73      AFTER
1975    FIRST
1947    FIFTY
4158    OTHER
2073    FORTY
6457    YEARS
5806    THERE
5250    SIXTY
Name: Word, dtype: object

### Fifteen least frequent words

In [4]:
# Order rows from lowest to highest count, then show the first fifteen words.
word_count_df.sort_values(by='Count', ascending=True)['Word'].head(15)

3554    MAPCO
712     BOSAK
895     CAIXA
4160    OTTIS
5985    TROUP
1107    CLEFT
2041    FOAMY
977     CCAIR
5093    SERNA
6443    YALOM
5872    TOCOR
3978    NIAID
4266    PAXON
1842    FABRI
719     BOTTS
Name: Word, dtype: object

## Hangman Game

In [5]:
def hangman(correct_word, word_count, user_guesses=None):
    """Play one game of hangman against ``correct_word``, printing every turn.

    Each turn prints the guessed letter, then the board (unrevealed positions
    shown as ``_``), then a blank line. Letters are taken from
    ``user_guesses`` in order; once those run out (or if none are given) the
    next letter is chosen by ``guess_next_letter``.

    Parameters
    ----------
    correct_word : str
        The word to be discovered. Any length is supported; the board is
        sized from this word.
    word_count : dict
        Maps each candidate word to its count. The keys form the initial
        candidate list and the values are summed for the prior denominator.
    user_guesses : sequence of str, optional
        Letters to play before falling back to ``guess_next_letter``.

    Returns
    -------
    None
        The game is reported through printed output only.
    """
    # Letters that have not been guessed yet.
    current_alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    target_word = list(correct_word)
    # '' marks an unrevealed position (guess_next_letter tests for this value).
    # Sized from the target so words of any length work, not only 5 letters.
    current_word = [''] * len(target_word)
    # Candidate words still consistent with everything observed so far.
    updated_dict = list(word_count.keys())

    # Total count, used as the denominator of Pr(W=w).
    total_word_count = float(sum(word_count.values()))

    guess_num = 0
    while target_word != current_word:
        # Scripted guesses take priority; afterwards the solver picks.
        if user_guesses is not None and guess_num < len(user_guesses):
            guess = user_guesses[guess_num]
        else:
            guess = guess_next_letter(current_word, updated_dict, word_count,
                                      current_alphabet, total_word_count)
        guess_num += 1

        print("GUESS: " + guess)
        current_alphabet = [letter for letter in current_alphabet if letter != guess]

        if guess in target_word:
            # Reveal the guessed letter at every position where it occurs.
            current_word = [guess if guess == target_word[i] else current_word[i]
                            for i in range(len(target_word))]
            # Keep only words that (a) have the same length as the board,
            # (b) agree with every revealed letter, and (c) do not contain the
            # guessed letter at any still-hidden position -- had it been
            # there, that position would have been revealed just now.
            updated_dict = [
                word for word in updated_dict
                if len(word) == len(current_word)
                and all(
                    (word[i] == shown) if shown != '' else (word[i] != guess)
                    for i, shown in enumerate(current_word)
                )
            ]
        else:
            # Wrong guess: no surviving candidate may contain this letter.
            updated_dict = [word for word in updated_dict if guess not in word]

        # Show the board after this turn.
        print("".join(("_" if letter == '' else letter) + " " for letter in current_word))
        print("")

In [6]:
def guess_next_letter(current_word, updated_dict, word_count, current_alphabet, total_word_count):
    """Pick the unguessed letter most likely to appear in a hidden position.

    For every letter l in ``current_alphabet`` this computes

        P(l in some hidden position | E) = sum_w  1[l in hidden letters of w] * P(W=w | E)

    where the sum runs over ``updated_dict`` and P(W=w | E) is the word's
    prior count share renormalised over ``updated_dict`` (every word in that
    list is treated as fully consistent with the evidence, i.e. P(E|w) = 1).
    The highest probability is printed, and the corresponding letter is
    returned; ties go to the letter that comes first in ``current_alphabet``.

    Parameters
    ----------
    current_word : list of str
        The board; '' marks a position that has not been revealed.
    updated_dict : list of str
        Candidate words still under consideration.
    word_count : dict
        Maps word -> count; must contain every word in ``updated_dict``.
    current_alphabet : list of str
        Letters that have not been guessed yet.
    total_word_count : float
        Denominator for the prior P(W=w). It cancels in the normalisation
        but is kept for interface compatibility.

    Returns
    -------
    str
        The chosen letter from ``current_alphabet``.

    Raises
    ------
    ValueError
        If ``current_alphabet`` is empty (nothing left to guess).
    """
    if not current_alphabet:
        raise ValueError("no letters left to guess: current_alphabet is empty")

    spots_to_guess = [i for i in range(len(current_word)) if current_word[i] == '']

    # Posterior over candidate words. It does not depend on the letter being
    # scored, so it is computed once rather than once per alphabet letter.
    priors = [float(word_count[w]) / total_word_count for w in updated_dict]
    posterior = np.array(priors) / sum(priors)

    # Letters each candidate would place in the still-hidden positions.
    hidden_letters = [set(w[spot] for spot in spots_to_guess) for w in updated_dict]

    prob_L_given_E = []
    for l in current_alphabet:
        # Same term order as a plain per-word loop, so the floating-point
        # result is unchanged.
        prob_L_given_E.append(sum([
            (1.0 if l in hidden_letters[i] else 0.0) * posterior[i]
            for i in range(len(posterior))
        ]))

    # Report the confidence of the selected guess.
    print(max(prob_L_given_E))
    index = np.argmax(prob_L_given_E)
    return current_alphabet[index]

In [7]:
# Let the solver play unaided against "THREE", the first entry of the most-frequent list.
hangman(correct_word="THREE", word_count=word_count)

0.5394172389647948
GUESS: E
_ _ _ E E 

0.9975410864383392
GUESS: R
_ _ R E E 

0.9813674863535253
GUESS: T
T _ R E E 

0.9999450734550994
GUESS: H
T H R E E 



In [8]:
# Let the solver play unaided against "MAPCO", the first entry of the least-frequent list.
hangman(correct_word="MAPCO", word_count=word_count)

0.5394172389647948
GUESS: E
_ _ _ _ _ 

0.47645908946574006
GUESS: O
_ _ _ _ O 

0.8258395494884568
GUESS: A
_ A _ _ O 

0.8259289176090467
GUESS: R
_ A _ _ O 

0.34106728538283076
GUESS: C
_ A _ C O 

0.7743589743589743
GUESS: N
_ A _ C O 

0.803030303030303
GUESS: S
_ A _ C O 

0.7692307692307692
GUESS: F
_ A _ C O 

1.0
GUESS: M
M A _ C O 

1.0
GUESS: P
M A P C O 

