# Homework #4 - Bigram Model
## Name: Mark Kim
## Class: CSC 620 - NLT

This little program takes a word as input and, if that word appears in the
corpus, lists the words that follow it in the corpus together with their bigram
probabilities.  The `Autocorrect` class also keeps a single-word lookup
(`check_word`) that reports an exact match or, if there is none, the five
closest matches; for every match it provides the edit distance and the relative
frequency of the word in the corpus.

### NOTE: This application ignores case! If you want it to be case-sensitive, remove the `.lower()` calls in `Autocorrect.strip_punctuation` and in `bigram_ui`.

In [61]:
import re
from collections import Counter
from time import sleep

from nltk.metrics.distance import edit_distance

class Autocorrect:
    """Unigram and bigram frequency tables built from a tokenized corpus.

    The corpus is an iterable of string tokens.  Tokens that consist solely
    of punctuation are discarded and the remaining tokens are lower-cased
    before any counting takes place.

    Instance state:
        _words:       list of normalized tokens, in corpus order.
        _word_count:  number of entries in ``_words``.
        _word_freq:   ``{word: {"cnt": int, "rel": float}}`` where ``rel`` is
                      ``cnt / _word_count``.
        _bigram_freq: ``{(first, second): {"cnt": int, "rel": float}}`` where
                      ``rel`` is ``cnt`` divided by the unigram count of
                      ``first``.
    """

    # A token is treated as punctuation only when EVERY character belongs to
    # this set; compiled once for the class instead of on every call.
    _PUNCTUATION = re.compile(r'([\'.,\(\):;\"\`])+')

    def __init__(self, corpus):
        """Build the frequency tables for ``corpus`` (an iterable of tokens)."""
        self.update_corpus(corpus)

    def update_corpus(self, corpus):
        """Replace the current corpus and rebuild every derived table."""
        self._words = self.strip_punctuation(corpus)
        self._word_count = len(self._words)
        # The bigram table divides by unigram counts, so the unigram table
        # must be rebuilt first.
        self._word_freq = self.get_word_freq()
        self._bigram_freq = self.get_bigram_freq()

    def get_word_freq(self) -> dict:
        """Return the unigram table ``{word: {"cnt": n, "rel": n / total}}``.

        Keys appear in order of first occurrence in the corpus.  An empty
        corpus yields an empty dict (no division takes place).
        """
        total = self._word_count
        return {
            word: {"cnt": count, "rel": count / total}
            for word, count in Counter(self._words).items()
        }

    def get_bigram_freq(self) -> dict:
        """Return the bigram table keyed by ``(first, second)`` word pairs.

        ``cnt`` is how often ``second`` immediately follows ``first``;
        ``rel`` is ``cnt`` divided by the unigram count of ``first``.  Keys
        appear in order of first occurrence in the corpus.
        """
        words = self._words
        pair_counts = Counter(zip(words, words[1:]))
        return {
            pair: {
                "cnt": count,
                "rel": count / self._word_freq[pair[0]]["cnt"],
            }
            for pair, count in pair_counts.items()
        }

    def check_word(self, string) -> list:
        """Look ``string`` up in the corpus vocabulary.

        Returns a list of ``(distance, word, relative_frequency)`` tuples.
        An exact match gives a single tuple with distance 0.  Otherwise the
        five smallest tuples are returned, ordered by distance and then by
        word, with distances computed by ``edit_distance`` using
        ``substitution_cost=2``.
        """
        if string in self._word_freq:
            return [(0, string, self._word_freq[string]["rel"])]
        candidates = sorted(
            (edit_distance(string, word, substitution_cost=2), word, stats["rel"])
            for word, stats in self._word_freq.items()
        )
        return candidates[:5]

    def check_bigram(self, string) -> list:
        """Return the words seen right after ``string`` with their probabilities.

        The result is a list of ``(next_word, probability)`` tuples sorted by
        descending probability; ties keep the order in which the bigrams
        first occurred.  An empty list is returned when ``string`` is not in
        the corpus or is never followed by another word.
        """
        if string not in self._word_freq:
            return []
        followers = [
            (second, stats["rel"])
            for (first, second), stats in self._bigram_freq.items()
            if first == string
        ]
        # list.sort is stable, including with reverse=True, so equal
        # probabilities stay in first-occurrence order.
        followers.sort(key=lambda item: item[1], reverse=True)
        return followers

    def strip_punctuation(self, corpus):
        """Return the lower-cased tokens of ``corpus`` minus punctuation tokens.

        ``fullmatch`` is used on purpose: a start-anchored ``match`` would
        also throw away real words that merely begin with a punctuation
        character (for example ``'em`` or ``.22``).
        """
        is_punctuation = self._PUNCTUATION.fullmatch
        return [token.lower() for token in corpus if not is_punctuation(token)]

# Bigram UI for accessing the Autocorrect class/object.
def bigram_ui(acObj, corpus):
    """Run an interactive prompt that lists likely next words for each input.

    Args:
        acObj: object whose ``check_bigram(word)`` returns a list of
            ``(next_word, probability)`` tuples (empty when there is nothing
            to report).
        corpus: name of the corpus, used only in the printed messages.

    The loop ends when the user enters ``##`` or when input is exhausted
    (EOF).  Empty input is ignored.  Input is lower-cased before lookup.
    """
    # ANSI escape sequences used to highlight the word and the corpus name.
    green, bold, reset = '\033[92m', '\033[1m', '\033[0m'
    sentinel = "##"

    print("Welcome to the bigram model probability application where you provide")
    print("me with a word and I will tell you what the possible next word will ")
    print("be with the probabilities of that next word. Enter ## to end.")
    print("-" * 73)
    # NOTE(review): presumably gives the banner time to appear before the
    # first prompt in a notebook front end -- confirm before removing.
    sleep(0.1)

    while True:
        try:
            user_input = input("\n>")
        except EOFError:
            # No more input: echo the sentinel and stop as if it had been typed.
            print(sentinel)
            break
        if user_input == sentinel:
            break
        if not user_input:
            continue

        user_input = user_input.lower()
        res = acObj.check_bigram(user_input)
        if not res:
            print(f'\n{green}{bold}{user_input}{reset}'
                  f' does not exist in the {green}{bold}{corpus}{reset} corpus.')
            continue

        print(f'{green}\n{bold}{user_input}{reset}'
              f' has matches in the {green}{bold}{corpus}{reset}'
              ' corpus, and here are the possible next words:\n')
        print('Word\t\tProbability')
        print('-' * 47)
        for next_word, probability in res:
            print(f'{next_word: <15}\t{probability:.10f}')
    print("\nThank you for trying this out!")

In [62]:
from nltk.corpus import brown

# Identifier handed to brown.words below; the same string is later shown in
# the messages printed by bigram_ui.
corpus = "ca01"

# Tokens returned by brown.words for that identifier.
ca01 = brown.words(corpus)

# Build the unigram and bigram frequency tables from those tokens.
ac = Autocorrect(ca01)

In [59]:
# Start the interactive prompt using the model and corpus name defined earlier.
bigram_ui(ac, corpus)

Welcome to the bigram model probability application where you provide
me with a word and I will tell you what the possible next word will 
be with the probabilities of that next word. Enter ## to end.
-------------------------------------------------------------------------
[92m
[1mfulton[0m has a matche in the [92m[1mca01[0m corpus, and here are the possible next words:

Word		Probability
-----------------------------------------------
county         	0.4285714286
superior       	0.1428571429
legislators    	0.1428571429
taxpayers      	0.0714285714
ordinary's     	0.0714285714
tax            	0.0714285714
health         	0.0714285714
[92m
[1mwho[0m has a matche in the [92m[1mca01[0m corpus, and here are the possible next words:

Word		Probability
-----------------------------------------------
became         	0.1666666667
attended       	0.1666666667
has            	0.1666666667
is             	0.1666666667
would          	0.1666666667
defeated       	0.1666666667
[92m
