# CSE710 Advanced Aritificial Intelligence (Group Project)

## Bangla Parts of Speech tagging using Hidden Markov Model

---

By 

---

*   **Mosarrat Rumman - 20266007**
*   **Abu Nayeem Tasneem - 20266002**

---

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip install pomegranate



In [6]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple, OrderedDict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import os
from io import BytesIO
from itertools import chain
import random
import pandas as pd

In [7]:

Sentence = namedtuple("Sentence", "words tags")

def read_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
    return OrderedDict(((s[0], Sentence(*zip(*[l.strip().split("\t")
                        for l in s[1:]]))) for s in sentence_lines if s[0]))


def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

Sentence = namedtuple("Sentence", "words tags")

def read_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
    return OrderedDict(((s[0], Sentence(*zip(*[l.strip().split("\t")
                        for l in s[1:]]))) for s in sentence_lines if s[0]))

def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    def __new__(cls, sentences, keys):
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        wordset = frozenset(chain(*word_sequences))
        tagset = frozenset(chain(*tag_sequences))
        N = sum(1 for _ in chain(*(sentences[k].words for k in keys)))
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, {k: sentences[k] for k in keys}, keys, wordset, word_sequences,
                               tagset, tag_sequences, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())


class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        # split data into train/test sets
        _keys = list(keys)
        if seed is not None: random.seed(seed)
        random.shuffle(_keys)
        split = int(train_test_split * len(_keys))
        training_data = Subset(sentences, _keys[:split])
        testing_data = Subset(sentences, _keys[split:])
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences, tagset,
                               tag_sequences, training_data, testing_data, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [8]:
data = Dataset("/content/drive/MyDrive/Colab Notebooks/data/tags-universal.txt", "/content/drive/MyDrive/Colab Notebooks/data/DATAPOSV1.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

There are 78 sentences in the corpus.
There are 62 sentences in the training set.
There are 16 sentences in the testing set.


In [9]:
x = 's8'
print("Sentence: {}".format(x))
print("words:\n\t{!s}".format(data.sentences[x].words))
print("tags:\n\t{!s}".format(data.sentences[x].tags))

Sentence: s8
words:
	('তিনি', 'নিজেই', 'তৈরি', 'করেন', 'বিতর্ক', '.')
tags:
	('PRON', 'ADV', 'VERB', 'VERB', 'ADJ', '.')


In [10]:
print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(data.testing_set.N, len(data.testing_set.vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(data.testing_set.vocab - data.training_set.vocab)))

assert data.N == data.training_set.N + data.testing_set.N, \
       "The number of training + test samples should sum to the total number of samples"

There are a total of 881 samples of 529 unique words in the corpus.
There are 681 samples of 429 unique words in the training set.
There are 200 samples of 164 unique words in the testing set.
There are 100 words in the test set that are missing in the training set.


In [11]:
print(data.vocab)

frozenset({'সিডনিতে', '.', 'পাওয়ায়', 'ইচ্ছে', 'অগ্রিম', 'আমার', 'দলের', 'শ্রেণীর', 'সিডনি', 'রুখে', 'দাঁড়ানোকে', 'প্রথম', 'পায়', 'অবস্থা', 'অজিরা', 'টস', 'রানআউটে', 'হবে', 'অস্ট্রেলিয়া-শ্রীলঙ্কার', 'শর্তানুযায়ী', '‘এল', 'জুটিতে', 'নির্ধারিত', 'হতে', 'এতে', 'দিতে', 'নামে', 'ডামি', 'সহযোগিতায়', 'পাঁচ', 'শিকারির', 'হাসির', 'খেললেও', 'দিন', 'লঙ্কান', 'লাল-সবুজরা', 'বিতর্ক', 'শ্রীলঙ্কা', 'অন', 'বারবারের', 'করে', 'জিতে', 'সান্তিয়াগো', '"ফেসবুক,"', 'কঠোর', 'বাছাই', 'পথে', 'আসরের', 'তরুণরা', 'ব্যাটিং', 'জয়', 'ঋণ', 'ভালো', 'ওয়ানডেতে', 'বিদায়', 'হয়ে', 'ঝড়', 'পরিচালনা', 'প্রতিপক্ষ', 'সংস্কারের', 'তালিকায়', 'ঘণ্টার', 'ঝটকায়', 'পাগলপারা', 'দু’দল', 'প্রতিবাদের', 'গোল', 'রয়েছে', 'শীর্ষেও', 'কলকাতার', 'সংস্থাটির', 'জনগণের', 'বৈদ্যুতিক', 'বিশেষভাবে', 'তৃতীয়', 'নাট্যকার', 'আনুষ্ঠানিক', 'হয়নি', 'ফেরানোর', '১৩.২', 'ড্র', 'রদ্রিগুয়েজ', '৭', 'কিছুই', '২০১২', 'নৈপুণ্যে', 'জোন', 'সার্ভিসের', 'গণমাধ্যমগুলোর', 'সিরিজেও', 'বার্নাব্যুতে', 'পাস', 'সমঝোতা', 'জানিয়েছে', 'হাইনেস', 'দাঁড়', 'মাইক', 'নির্ধারণী', 'ওয়ানডে

In [12]:
# accessing words with Dataset.X and tags with Dataset.Y 
for i in range(2):    
    print("Sentence {}:".format(i + 1), data.X[i])
    print()
    print("Labels {}:".format(i + 1), data.Y[i])
    print()

Sentence 1: ('আমি', ',', 'তাকে ', 'আসলে', 'কাজ', 'করা', 'কমিয়ে', 'দিয়েছি', '.')

Labels 1: ('PRON', '.', 'PRON', 'ADJ', 'VERB', 'VERB', 'ADV', 'VERB', '.')

Sentence 2: ('এ', 'সময়', 'লোকজনের', 'মধ্যে', 'আতঙ্ক', 'ছড়িয়ে', 'পড়ে', '.')

Labels 2: ('PRON', 'NOUN', 'NOUN', 'PSP', 'NOUN', 'VERB', 'VERB', '.')



In [13]:
# use Dataset.stream() (word, tag) samples for the entire corpus
print("\nStream (word, tag) pairs:\n")
for i, pair in enumerate(data.stream()):
    print("\t", pair)
    #if i > 3: break


Stream (word, tag) pairs:

	 ('আমি', 'PRON')
	 (',', '.')
	 ('তাকে ', 'PRON')
	 ('আসলে', 'ADJ')
	 ('কাজ', 'VERB')
	 ('করা', 'VERB')
	 ('কমিয়ে', 'ADV')
	 ('দিয়েছি', 'VERB')
	 ('.', '.')
	 ('এ', 'PRON')
	 ('সময়', 'NOUN')
	 ('লোকজনের', 'NOUN')
	 ('মধ্যে', 'PSP')
	 ('আতঙ্ক', 'NOUN')
	 ('ছড়িয়ে', 'VERB')
	 ('পড়ে', 'VERB')
	 ('.', '.')
	 ('তবে', 'CONJ')
	 ('এ', 'PRON')
	 ('আগুনে', 'NOUN')
	 ('কেউ', 'PRON')
	 ('হতাহত', 'NOUN')
	 ('হয়নি', 'VERB')
	 ('.', '.')
	 ('বৈদ্যুতিক', 'ADJ')
	 ('শর্টসার্কিট', 'NOUN')
	 ('থেকে', 'PSP')
	 ('আগুনের', 'NOUN')
	 ('সূত্রপাত', 'ADV')
	 ('হয়েছে', 'VERB')
	 ('বলে', 'VERB')
	 ('ফায়ার', 'NOUN')
	 ('সার্ভিসের', 'NOUN')
	 ('সদস্যরা', 'NOUN')
	 ('জানিয়েছে', 'VERB')
	 ('.', '.')
	 ('আমি', 'PRON')
	 ('এবং ', 'CONJ')
	 ('তুমি', 'PRON')
	 ('দ্বিতীয়', 'ADJ')
	 ('.', '.')
	 ('দ্বিতীয়', 'ADJ')
	 ('খেলাটি', 'NOUN')
	 ('বৃষ্টির', 'NOUN')
	 ('জন্য', 'PSP')
	 ('মাঠে', 'NOUN')
	 ('গড়াতে', 'VERB')
	 ('পারেনি', 'VERB')
	 ('.', '.')
	 ('আমি', 'PRON')
	 ('ওকে', 'PRON')
	 ('অনেক', 'A

### IMPLEMENTATION: Pair Counts

In [14]:
def pair_counts(tags, words):
    d = defaultdict(lambda: defaultdict(int))
    for tag, word in zip(tags, words):
        d[tag][word] += 1
        
    return d
tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
words = [word for i, (word, tag) in enumerate(data.training_set.stream())]
print(tags)
print(words)

['NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'ADJ', 'NOUN', 'PRON', 'NOUN', 'ADJ', 'NOUN', '.', 'NOUN', 'ADJ', 'NOUN', 'ADJ', 'VERB', 'NOUN', 'VERB', 'NOUN', 'ADV', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'ADJ', 'NOUN', 'PRON', 'PSP', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'NOUN', '.', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'NOUN', 'PSP', 'ADJ', 'NOUN', 'PRON', 'CONJ', 'PSP', '.', 'PSP', 'CONJ', '.', 'NOUN', 'VERB', 'ADJ', 'NOUN', 'VERB', 'VERB', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'NOUN', '.', 'ADJ', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'VERB', 'VERB', 'NOUN', 'NOUN', 'CONJ', 'NOUN', 'PSP', 'PRON', 'ADJ', 'PRON', 'NOUN', 'VERB', 'VERB', '.', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'NOUN', '.', 'PRON', 'ADV', 'ADJ', 'VERB', 'ADV', 'ADJ', 'CONJ', 'PRON', 'ADJ', 'ADV', '.', 'PRON', 'NOUN', 'NOUN', 'VERB', 'VERB', 'PSP', 'ADJ', 'VERB', 'ADJ', 'VERB', 'ADJ', 'ADJ', 'NOUN', 'ADJ', 'NOUN', '.', 'ADJ', 'NOUN', 'AD

### Making Predictions with a Model

In [15]:
def replace_unknown(sequence):
    
    return [w if w in data.training_set.vocab else 'nan' for w in sequence]

def simplify_decoding(X, model):
    
    _, state_path = model.viterbi(replace_unknown(X))
    return [state[1].name for state in state_path[1:-1]]

### Evaluating Model Accuracy

In [16]:
def accuracy(X, Y, model):
    
    correct = total_predictions = 0
    for observations, actual_tags in zip(X, Y):
        
        # The model.viterbi call in simplify_decoding will return None if the HMM
        # raises an error (for example, if a test sentence contains a word that
        # is out of vocabulary for the training set). Any exception counts the
        # full sentence as an error (which makes this a conservative estimate).
        try:
            most_likely_tags = simplify_decoding(observations, model)
            correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
        except:
            pass
        total_predictions += len(observations)
    return correct / total_predictions

#### Evaluate the accuracy of the MFC tagger

### Build an HMM tagger

In [17]:
def unigram_counts(sequences):

    return Counter(sequences)

tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
tag_unigrams = unigram_counts(tags)

### Bigram Counts

In [18]:
def bigram_counts(sequences):

    d = Counter(sequences)
    return d

tags = [tag for i, (word, tag) in enumerate(data.stream())]
o = [(tags[i],tags[i+1]) for i in range(0,len(tags)-2,2)]
tag_bigrams = bigram_counts(o)

### Sequence Starting Counts

In [19]:
def starting_counts(sequences):
    
    d = Counter(sequences)
    return d

tags = [tag for i, (word, tag) in enumerate(data.stream())]
starts_tag = [i[0] for i in data.Y]
tag_starts = starting_counts(starts_tag)

###  Sequence Ending Counts

In [20]:
def ending_counts(sequences):
    
    d = Counter(sequences)
    return d

end_tag = [i[len(i)-1] for i in data.Y]
tag_ends = ending_counts(end_tag)

### IMPLEMENTATION: Basic HMM Tagger

In [21]:
basic_model = HiddenMarkovModel(name="base-hmm-tagger")

#for calculating emission probability
tags = [tag for i, (word, tag) in enumerate(data.stream())]
words = [word for i, (word, tag) in enumerate(data.stream())]

tags_count=unigram_counts(tags)
tag_words_count=pair_counts(tags,words) #for calculating emission probability

starting_tag_list=[i[0] for i in data.Y]
ending_tag_list=[i[-1] for i in data.Y]

starting_tag_count=starting_counts(starting_tag_list)#the number of times a tag occured at the start
ending_tag_count=ending_counts(ending_tag_list)      #the number of times a tag occured at the end



to_pass_states = []
for tag, words_dict in tag_words_count.items():
    total = float(sum(words_dict.values()))
    distribution = {word: count/total for word, count in words_dict.items()}
    tag_emissions = DiscreteDistribution(distribution)
    tag_state = State(tag_emissions, name=tag)
    to_pass_states.append(tag_state)


basic_model.add_states()    
    
#for calculating the transition probability
start_prob={}

for tag in tags:
    start_prob[tag]=starting_tag_count[tag]/tags_count[tag]

for tag_state in to_pass_states :
    basic_model.add_transition(basic_model.start,tag_state,start_prob[tag_state.name])    

end_prob={}

for tag in tags:
    end_prob[tag]=ending_tag_count[tag]/tags_count[tag]
for tag_state in to_pass_states :
    basic_model.add_transition(tag_state,basic_model.end,end_prob[tag_state.name])
    


transition_prob_pair={}

for key in tag_bigrams.keys():
    transition_prob_pair[key]=tag_bigrams.get(key)/tags_count[key[0]]
for tag_state in to_pass_states :
    for next_tag_state in to_pass_states :
        basic_model.add_transition(tag_state,next_tag_state,transition_prob_pair[(tag_state.name,next_tag_state.name)]) #transition probability pair

basic_model.bake()

In [22]:
hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y, basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

training accuracy basic hmm model: 96.62%
testing accuracy basic hmm model: 67.50%


### Example Decoding Sequences with the HMM Tagger

In [23]:
for key in data.testing_set.keys[:3]:
    print("Sentence Key: {}\n".format(key))
    print("Predicted labels:\n-----------------")
    print(data.sentences[key].words)
    print(simplify_decoding(data.sentences[key].words, basic_model))
    print()
    print("Actual labels:\n--------------")
    print(data.sentences[key].tags)
    print("\n")

Sentence Key: s45

Predicted labels:
-----------------
('পালেকেল্লের', 'তৃতীয়', 'ওয়ানডেতে', 'দুই', 'ঘণ্টার', 'বেশি', 'সময়', 'টানা', 'বৃষ্টির', 'পর', 'বাংলাদেশের', 'সমীকরণ', 'দাঁড়ায়', 'অবশিষ্ট', '১৩.২', 'ওভারে', '১০৫', 'রান', '.')
['PRON', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', '.']

Actual labels:
--------------
('NOUN', 'ADJ', 'NOUN', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'ADJ', 'NOUN', 'ADJ', 'NOUN', '.')


Sentence Key: s1s

Predicted labels:
-----------------
('তবে', 'এ', 'আগুনে', 'কেউ', 'হতাহত', 'হয়নি', '.')
['CONJ', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOUN', '.']

Actual labels:
--------------
('CONJ', 'PRON', 'NOUN', 'PRON', 'NOUN', 'VERB', '.')


Sentence Key: s18

Predicted labels:
-----------------
('আগের', 'দিন', 'বিনা', 'উইকেটে', '২৩', 'রান', ',', 'নিয়ে', 'প্রথম', 'ইনিংসে', ',', 'গতকাল', 'এবং', 'দ্বিতীয়', 'দিনের', 'মতো', 'ব্যাট', 'করতে', 'নাম

In [24]:
text='কাল','থেকে','শুরু','একটা','ঈদ','.'
print(simplify_decoding(text, basic_model))

['ADJ', 'PSP', 'VERB', 'ADJ', 'NOUN', '.']
