# In [None]:
import math
from collections import defaultdict
from collections import defaultdict, Counter
from nltk.corpus import brown

# Reused from the previous assignment: a bigram model, used here to
# estimate the tag-to-tag transition probabilities.
class Bigram:
    """Bigram model with add-alpha smoothing over sequences of tokens.

    Counts are accumulated by train(); the query methods only read them.
    Lookups deliberately go through ``dict.get`` so that asking about an
    unseen token never inserts a zero-count key into the defaultdict
    counters (which would make the token look "known" afterwards).
    """

    def __init__(self):
        # Smoothing constant added to every bigram count.
        self.alpha = 0.0001
        # Not used by this class; kept so existing attribute access still works.
        self.prob = defaultdict(float)
        self.counter_unigram = defaultdict(float)
        self.counter_bigram = defaultdict(float)

    def train(self, trainingSentences):
        """Accumulate unigram and bigram counts from trainingSentences.

        trainingSentences is an iterable of token sequences.  Empty
        sentences contribute nothing.  One count is added to the pseudo
        token 'UNK' on every call, after which ``uni_total`` (sum of all
        unigram counts) and ``vocab_size`` (number of distinct unigram
        keys, including 'UNK') are recomputed.
        """
        self.uni_total = 0

        for sentence in trainingSentences:
            # Nothing to count, and there is no first token to start from.
            if not sentence:
                continue
            for token in sentence:
                self.counter_unigram[token] += 1
            for prev, curr in zip(sentence, sentence[1:]):
                self.counter_bigram[(prev, curr)] += 1

        self.counter_unigram['UNK'] += 1
        self.uni_total = sum(self.counter_unigram.values())
        self.vocab_size = len(self.counter_unigram)

    def getBigramProbability(self, word1, word2):
        """Return the smoothed probability of word2 following word1.

        If word1 has no unigram count the result is uniform,
        ``1 / vocab_size``.  Otherwise it is
        ``(count(word1, word2) + alpha) / (count(word1) + alpha * vocab_size)``,
        where an unseen pair simply has count 0.
        """
        context_count = self.counter_unigram.get(word1, 0.0)
        if context_count == 0:
            return 1.0 / self.vocab_size

        pair_count = self.counter_bigram.get((word1, word2), 0.0)
        return (pair_count + self.alpha) / (context_count + (self.vocab_size * self.alpha))

    def getUnigramProbability(self, word):
        """Return count(word) / uni_total, using the 'UNK' count for unknown words."""
        count = self.counter_unigram.get(word)
        if count is None:
            count = self.counter_unigram.get('UNK', 0.0)
        return count / self.uni_total
        
#get data from brown corpus
def getData(corpus=None):
    """Split a corpus of sentences into a training part and a test part.

    Args:
        corpus: a sliceable sequence of sentences.  When omitted, the
            tagged sentences of the Brown corpus (``brown.tagged_sents()``)
            are used.

    Returns:
        A ``(train_set, test_set)`` pair: the first 80% of the sentences
        and the last 10%.  The 10% in between is the development split; it
        is held out and not returned.
    """
    if corpus is None:
        corpus = brown.tagged_sents()

    total = len(corpus)
    train_len = int(total * 0.8)
    dev_len = int(total * 0.9)

    # corpus[train_len:dev_len] is the (currently unused) development split.
    return corpus[:train_len], corpus[dev_len:]

#creating HMM
class HMM:
    """Hidden Markov Model tagger: hidden states are tags, observations are words.

    The data comes from getData(); every sentence is a sequence of
    (word, tag) pairs.  Three tables are estimated by train():

    * ``initial[tag]``          -- probability that a sentence starts with tag
    * ``transition[prev][tag]`` -- probability that tag follows prev
    * ``emission[tag][word]``   -- probability that tag produces word

    Transition and emission entries that were never seen in training fall
    back to ``1.0 / vocab_size``.
    """

    def __init__(self):
        #alpha is for data smoothing
        self.alpha = 0.001
        self.train_set, self.test_set = getData()
        self.bigram = Bigram()

        self.tag_seq = self.get_sequence(self.train_set, 1)
        self.pos_freq = self._counter(self.tag_seq)
        self.states = self.pos_freq.keys()

        # Total number of tag tokens in the training data (a token count,
        # not a number of distinct tags).
        self.vocab_size = sum(self.pos_freq.values())

        self.initial = defaultdict(float)
        self.transition = defaultdict(lambda: defaultdict(lambda: 1.0/self.vocab_size))
        self.emission = defaultdict(lambda: defaultdict(lambda : 1.0/self.vocab_size))

    def _counter(self, nested_list):
        """Return a Counter of the items found in a list of lists."""
        # Count straight from a generator; no need to build one huge flat list.
        return Counter(item for inner in nested_list for item in inner)

    def get_sequence(self, seq, index):
        """Project every tuple of every sentence in seq onto position index.

        With (word, tag) pairs, index 0 yields the word sequences and
        index 1 the tag sequences.
        """
        return [[pair[index] for pair in sent] for sent in seq]

    def train(self):
        """Estimate the initial, transition and emission tables."""
        print('training initial and transition probability')
        self.initial_and_transit_prob()
        print('training emission probability')
        self.emission_prob()
        print('training finished')

    def initial_and_transit_prob(self):
        """Fill ``self.initial`` and ``self.transition`` from the training tags.

        Transitions come from the bigram model and are stored for every tag
        pair that occurs in the training data.  The initial distribution is
        the add-alpha smoothed relative frequency of each tag in
        sentence-first position, so every known state gets a non-zero
        start probability.
        """
        self.bigram.train(self.tag_seq)

        start_counts = Counter()
        seen_pairs = set()
        for sent in self.tag_seq:
            # An empty sentence has neither a first tag nor any transition.
            if not sent:
                continue
            start_counts[sent[0]] += 1
            for pair in zip(sent, sent[1:]):
                # The probability depends only on the pair, so compute it once.
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                prev, curr = pair
                self.transition[prev][curr] = self.bigram.getBigramProbability(prev, curr)

        states = list(self.states)
        denominator = sum(start_counts.values()) + self.alpha * len(states)
        if denominator > 0:
            for state in states:
                self.initial[state] = (start_counts[state] + self.alpha) / denominator

        # The cached log-transition table used by viterbi() is now stale.
        self._log_columns = None

    def emission_prob(self):
        """Fill ``self.emission`` with smoothed P(word | tag) for every seen pair."""
        pair_counts = self._counter(self.train_set)
        smoothing_mass = self.alpha * self.vocab_size

        for pair, count in pair_counts.items():
            #pair = (word, tag)
            word, tag = pair[0], pair[1]
            self.emission[tag][word] = (self.alpha + count) / (self.pos_freq[tag] + smoothing_mass)

    def vit_helper(self, delta):
        """Return ``(key, value)`` for the largest value in the mapping delta.

        Works for any real scores, including the non-positive
        log-probabilities used by viterbi().  On ties the first key in
        iteration order wins.  An empty mapping yields ``("", 0.0)``.
        """
        if not delta:
            return "", 0.0
        best_key = max(delta, key=delta.get)
        return best_key, delta[best_key]

    @staticmethod
    def _safe_log(prob):
        """Natural log of prob, with log(0) mapped to -inf instead of raising."""
        return math.log(prob) if prob > 0 else float('-inf')

    def _log_emission(self, state, word):
        """Log P(word | state); reads the table without inserting new keys."""
        row = self.emission.get(state)
        prob = row.get(word) if row is not None else None
        if prob is None:
            prob = 1.0 / self.vocab_size
        return self._safe_log(prob)

    def _log_transition_columns(self, states):
        """Return ``columns`` with ``columns[j][i] = log P(states[j] | states[i])``.

        The table has len(states)**2 entries, so it is built once and
        cached on the instance; initial_and_transit_prob() clears the cache.
        """
        cached = getattr(self, '_log_columns', None)
        if cached is not None and cached[0] == states:
            return cached[1]

        default = 1.0 / self.vocab_size
        columns = []
        for target in states:
            column = []
            for source in states:
                row = self.transition.get(source)
                prob = row.get(target, default) if row is not None else default
                column.append(self._safe_log(prob))
            columns.append(column)

        self._log_columns = (states, columns)
        return columns

    def viterbi(self, obs):
        """Return the most probable state sequence for the observations obs.

        This is the full Viterbi recursion: for every position the best
        score of *each* state is kept together with a backpointer, and the
        path is recovered by backtracking from the best final state.
        Scores are sums of log-probabilities, so long sentences do not
        underflow to zero.

        Args:
            obs: sequence of observed words.

        Returns:
            A list of states with the same length as obs (empty for empty obs).

        Raises:
            ValueError: if the model has no states.

        Cost is O(len(obs) * N**2) for N states.
        """
        if len(obs) == 0:
            return []

        states = list(self.states)
        if not states:
            raise ValueError('cannot decode: the model has no states')

        columns = self._log_transition_columns(states)

        # scores[i] is the best log-probability of any path ending in states[i].
        scores = [
            self._safe_log(self.initial.get(state, 0.0)) + self._log_emission(state, obs[0])
            for state in states
        ]
        backpointers = []

        for t in range(1, len(obs)):
            word = obs[t]
            new_scores = []
            pointers = []
            for state, column in zip(states, columns):
                candidates = [prev + step for prev, step in zip(scores, column)]
                best = max(candidates)
                pointers.append(candidates.index(best))
                new_scores.append(best + self._log_emission(state, word))
            scores = new_scores
            backpointers.append(pointers)

        last_state, _ = self.vit_helper(dict(zip(states, scores)))
        index = states.index(last_state)
        path = [last_state]
        for pointers in reversed(backpointers):
            index = pointers[index]
            path.append(states[index])
        path.reverse()

        return path

    def test(self):
        """Return the fraction of test-set tokens whose predicted tag matches the gold tag.

        Returns 0.0 when the test set contains no tokens.
        """
        sentences = self.get_sequence(self.test_set, 0)
        gold = self.get_sequence(self.test_set, 1)

        correct = 0.0
        test_num = 0.0

        for words, gold_tags in zip(sentences, gold):
            predicted = self.viterbi(words)
            test_num += len(predicted)
            correct += sum(1 for guess, truth in zip(predicted, gold_tags) if guess == truth)

        if test_num == 0:
            return 0.0
        return float(correct) / test_num

# Driver: build the tagger (HMM() loads its data through getData()),
# estimate the probability tables, then print the value returned by
# test(), i.e. the fraction of test-set tags predicted correctly.
test = HMM()
test.train()
print(test.test())