# Homework #4 - Bigram Model
## Name: Mark Kim
## Class: CSC 620 - NLT

This little program takes a string as input, compares it to the words in a
corpus, and tells you whether there is an exact match or, if not, the five
closest matches.  For every match, the program reports the edit distance
and the relative frequency of the word in the corpus.

### NOTE: This application ignores case! If you want it to be case-sensitive, remove the `.lower()` calls in `Autocorrect.strip_punctuation` and in the UI functions.

In [43]:
from nltk.metrics.distance import edit_distance
from time import sleep
import re

class Autocorrect:
    """Unigram and bigram frequency model built from a sequence of tokens.

    The corpus is lower-cased and stripped of punctuation tokens, then two
    lookup tables are derived from it:

    * ``_word_freq``   maps ``word`` to ``{"cnt": int, "rel": float}`` where
      ``rel`` is ``cnt`` divided by the number of kept tokens.
    * ``_bigram_freq`` maps ``(first, second)`` to ``{"cnt": int, "rel": float}``
      where ``rel`` is ``cnt`` divided by the unigram count of ``first``.
    """

    # A token is dropped when it *starts* with one of these characters
    # (``match`` anchors at the beginning of the token only).
    _PUNCTUATION = re.compile(r'([\'.,\(\):;\"\`])+')

    def __init__(self, corpus):
        """Build all frequency tables from ``corpus`` (an iterable of str)."""
        self.update_corpus(corpus)

    def update_corpus(self, corpus):
        """Replace the corpus and rebuild every derived table.

        The word count and the bigram table are refreshed together with the
        word table so that relative frequencies never refer to a previous
        corpus.
        """
        self._words = self.strip_punctuation(corpus)
        self._word_count = len(self._words)
        # Order matters: the bigram table divides by the unigram counts.
        self._word_freq = self.get_word_freq()
        self._bigram_freq = self.get_bigram_freq()

    def get_word_freq(self) -> dict:
        """Return ``{word: {"cnt": occurrences, "rel": occurrences / total}}``.

        ``total`` is the number of tokens currently held in ``self._words``.
        An empty corpus yields an empty dict.
        """
        counts = {}
        for word in self._words:
            counts[word] = counts.get(word, 0) + 1
        total = len(self._words)
        return {
            word: {"cnt": cnt, "rel": cnt / total}
            for word, cnt in counts.items()
        }

    def get_bigram_freq(self) -> dict:
        """Return ``{(w1, w2): {"cnt": n, "rel": n / count(w1)}}``.

        Bigrams are adjacent pairs in ``self._words``.  ``self._word_freq``
        must already describe the same token list.
        """
        words = self._words
        counts = {}
        for pair in zip(words, words[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return {
            pair: {"cnt": cnt, "rel": cnt / self._word_freq[pair[0]]["cnt"]}
            for pair, cnt in counts.items()
        }

    def check_word(self, string) -> list:
        """Look ``string`` up in the corpus vocabulary.

        Returns a list of ``(distance, word, relative_frequency)`` tuples:
        a single entry with distance 0 when ``string`` is in the vocabulary,
        otherwise up to five entries with the smallest edit distance
        (substitutions cost 2), ties broken by word and then frequency.
        """
        if string in self._word_freq:
            return [(0, string, self._word_freq[string]["rel"])]
        candidates = [
            (edit_distance(string, word, substitution_cost=2), word, info["rel"])
            for word, info in self._word_freq.items()
        ]
        candidates.sort()
        return candidates[:5]

    def check_bigram(self, string) -> list:
        """Return ``(next_word, probability)`` pairs for words following ``string``.

        The list is sorted by probability, highest first.  It is empty when
        ``string`` is not in the vocabulary or is never followed by a word.
        """
        if string not in self._word_freq:
            return []
        followers = [
            (pair[1], info["rel"])
            for pair, info in self._bigram_freq.items()
            if pair[0] == string
        ]
        followers.sort(key=lambda item: item[1], reverse=True)
        return followers

    def strip_punctuation(self, corpus):
        """Return the tokens of ``corpus`` lower-cased, without punctuation tokens.

        A token is discarded when it begins with one of the characters
        ``' . , ( ) : ; " ```; all other tokens are kept in order.
        """
        return [
            token.lower()
            for token in corpus
            if not self._PUNCTUATION.match(token)
        ]
            
def check_words_ui(acObj, corpus):
    """Run the interactive spell-check prompt until the user enters ``##``.

    Args:
        acObj: object exposing ``check_word(str)`` that returns a list of
            ``(distance, word, relative_frequency)`` tuples.
        corpus: name of the corpus, used only in the printed messages.

    Each non-empty line is lower-cased and looked up.  A result whose first
    entry has distance 0 is reported as an exact match; anything else is
    listed as the closest matches.  End of input is treated like ``##``.
    """
    print("Welcome to the autocorrect application, where the string/word you enter")
    print("are compared to a corpus assigned to this app.  Please enter in your")
    print("string and I will let you know if there is an exact match.  If there is")
    print("an exact match, I will provide you the word and its relative frequency.")
    print("If there are no matches, I will provide you with the 5 closest matches by")
    print("Levenshtein Distance and their relative frequency! Enter '##' to end.")
    print("-" * 73)
    # Short pause so the banner is flushed before the input prompt appears.
    sleep(0.1)

    sentinel = "##"
    green, bold, reset = '\033[92m', '\033[1m', '\033[0m'

    while True:
        try:
            user_input = input("\n>")
        except EOFError:
            # No more input available: echo the sentinel and stop.
            print(sentinel)
            break
        if user_input == sentinel:
            break
        if not user_input:
            continue

        user_input = user_input.lower()
        res = acObj.check_word(user_input)
        # Distance 0 is the only reliable sign of an exact match; the result
        # can also have length 1 when the vocabulary holds a single word.
        if res and res[0][0] == 0:
            print(f'\n{green}{bold}{user_input}{reset}'
                  f' is a complete and correct word as per corpus {green}'
                  f'{bold}{corpus}{reset}, and its probability is '
                  f'{green}{res[0][2]}{reset}')
        else:
            print(f'{green}\n{bold}{user_input}{reset}'
                  f' does not have a match in {green}{bold}'
                  f'{corpus}{reset}, and here are the five closest matches:\n')
            print('Dist\tWord\t\tRelative Frequency')
            print('-' * 47)
            for dist, word, rel in res:
                print(f' {dist}\t{word: <15}\t{rel:.10f}')
    print("\nThank you for trying this out!")

def bigram_ui(acObj, corpus):
    """Run the interactive next-word prompt until the user enters ``##``.

    Args:
        acObj: object exposing ``check_bigram(str)`` that returns a list of
            ``(next_word, probability)`` tuples, most probable first.
        corpus: name of the corpus, used only in the printed messages.

    Each non-empty line is lower-cased and passed to ``check_bigram``; the
    followers and their probabilities are printed as a table.  End of input
    is treated like ``##``.
    """
    print("Welcome to the bigram model probability application where you provide")
    print("me with a word and I will tell you what the possible next word will ")
    print("be with the probabilities of that next word. Enter ## to end.")
    print("-" * 73)
    # Short pause so the banner is flushed before the input prompt appears.
    sleep(0.1)

    sentinel = "##"
    green, bold, reset = '\033[92m', '\033[1m', '\033[0m'

    while True:
        try:
            user_input = input("\n>")
        except EOFError:
            # No more input available: echo the sentinel and stop.
            print(sentinel)
            break
        if user_input == sentinel:
            break
        if not user_input:
            continue

        word = user_input.lower()
        # Use the bigram table here; the word lookup belongs to the
        # autocorrect prompt, not to this one.
        res = acObj.check_bigram(word)
        if not res:
            print(f'{green}\n{bold}{word}{reset}'
                  f' has no recorded next word in {green}{bold}'
                  f'{corpus}{reset}')
            continue

        print(f'\nWords that follow {green}{bold}{word}{reset}'
              f' in {green}{bold}{corpus}{reset}:\n')
        print('Next Word\tProbability')
        print('-' * 47)
        for next_word, prob in res:
            print(f' {next_word: <15}\t{prob:.10f}')
    print("\nThank you for trying this out!")


In [44]:
from nltk.corpus import brown

# Brown corpus file id to analyze; also passed to the UI functions for display.
corpus = "ca01"

# Token sequence read from the Brown corpus for that file id.
ca01 = brown.words(corpus)

# Build the word and bigram frequency tables for those tokens.
ac = Autocorrect(ca01)

In [46]:
# List the words that follow "the" in the corpus, most probable first.
ac.check_bigram("the")

[('jury', 0.08387096774193549),
 ('fulton', 0.03870967741935484),
 ('election', 0.03870967741935484),
 ('state', 0.03870967741935484),
 ('city', 0.025806451612903226),
 ('house', 0.025806451612903226),
 ('atlanta', 0.01935483870967742),
 ('new', 0.01935483870967742),
 ('petition', 0.01935483870967742),
 ("mayor's", 0.01935483870967742),
 ('republicans', 0.01935483870967742),
 ('resolution', 0.01935483870967742),
 ('polls', 0.01935483870967742),
 ('number', 0.012903225806451613),
 ('grand', 0.012903225806451613),
 ('jurors', 0.012903225806451613),
 ('couple', 0.012903225806451613),
 ('audience', 0.012903225806451613),
 ('county', 0.012903225806451613),
 ('georgia', 0.012903225806451613),
 ('highway', 0.012903225806451613),
 ('senate', 0.012903225806451613),
 ('action', 0.012903225806451613),
 ('coolest', 0.012903225806451613),
 ('praise', 0.0064516129032258064),
 ('manner', 0.0064516129032258064),
 ('september-october', 0.0064516129032258064),
 ('hard-fought', 0.0064516129032258064),
 (

In [None]:
# Start the interactive spell-check prompt; enter '##' to leave it.
check_words_ui(ac, corpus)

Welcome to the autocorrect application, where the string/word you enter
are compared to a corpus assigned to this app.  Please enter in your
string and I will let you know if there is an exact match.  If there is
an exact match, I will provide you the word and its relative frequency.
If there are no matches, I will provide you with the 5 closest matches by
Levenshtein Distance and their relative frequency! Enter '##' to end.
-------------------------------------------------------------------------

>the

[92m[1mthe[0m is a complete and correct word as per corpus [92m[1mca01[0m, and its probability is [92m0.07781124497991967[0m

>a

[92m[1ma[0m is a complete and correct word as per corpus [92m[1mca01[0m, and its probability is [92m0.02710843373493976[0m


In [None]:
import nltk

# Fetch the Brown corpus used by the cells above. A bare nltk.download()
# opens the interactive downloader instead of fetching a specific package.
nltk.download("brown")