Assingment 3 - NLP 201 - Parsa Mazaheri

In [8]:
import os
from collections import defaultdict

import nltk
nltk.download('averaged_perceptron_tagger')
import numpy as np
from sklearn import metrics
from tqdm import tqdm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
def evaluate(test_sentences, tagged_test_sentences):
    gold = [str(tag) for sentence in test_sentences for token, tag in sentence]
    pred = [str(tag) for sentence in tagged_test_sentences for token, tag in sentence]
    print(metrics.classification_report(gold, pred, zero_division=0))


def get_token_tag_tuples(sent):
    return [nltk.tag.str2tuple(t) for t in sent.split()]


def get_tagged_sentences(text):
    sentences = []

    blocks = text.split("======================================")
    for block in blocks:
        sents = block.split("\n\n")
        for sent in sents:
            sent = sent.replace("\n", "").replace("[", "").replace("]", "")
            if sent != "":
                sentences.append(sent)
    return sentences


def load_treebank_splits(datadir):
    train = []
    dev = []
    test = []

    print("Loading treebank data...")

    for subdir, dirs, files in os.walk(datadir):
        for filename in files:
            if filename.endswith(".pos"):
                filepath = subdir + os.sep + filename
                with open(filepath, "r") as fh:
                    text = fh.read()
                    if int(subdir.split(os.sep)[-1]) in range(0, 19):
                        train += get_tagged_sentences(text)

                    if int(subdir.split(os.sep)[-1]) in range(19, 22):
                        dev += get_tagged_sentences(text)

                    if int(subdir.split(os.sep)[-1]) in range(22, 25):
                        test += get_tagged_sentences(text)

    print("Train set size: ", len(train))
    print("Dev set size: ", len(dev))
    print("Test set size: ", len(test))

    return train, dev, test


##### read the data

In [None]:
# drop data.zip into colab
!unzip /content/data.zip -d /content/

In [9]:
datadir = r'/content/data/penn-treeban3-wsj/wsj'
# r'C:\Users\Parsa Mazaheri\Desktop\Canvas\NLP 201\HW\HW3\data\penn-treeban3-wsj\wsj'
# os.path.join("data", "penn-treebank3-wsj", "wsj")
    
train, dev, test = load_treebank_splits(datadir)

train_sentences = [get_token_tag_tuples(sent) for sent in train]
dev_sentences = [get_token_tag_tuples(sent) for sent in dev]
test_sentences = [get_token_tag_tuples(sent) for sent in test]

tagged_test_sentences = [nltk.pos_tag([token for token, tag in sentence]) for sentence in test_sentences]
# evaluate(test_sentences, tagged_test_sentences, zero_division=0)

Loading treebank data...
Train set size:  51681
Dev set size:  7863
Test set size:  9046


In [10]:
k = 10

print(test[k])

print('sentence:')
print([token for token, tag in test_sentences[k]])

print('ground truth:')
print(test_sentences[k])

print("predicted:")
print(tagged_test_sentences[k])

Under/IN  these/DT deals/NNS ,/,  the/DT RTC/NNP sells/VBZ just/RB  the/DT deposits/NNS and/CC  the/DT healthy/JJ assets/NNS ./. 
sentence:
['Under', 'these', 'deals', ',', 'the', 'RTC', 'sells', 'just', 'the', 'deposits', 'and', 'the', 'healthy', 'assets', '.']
ground truth:
[('Under', 'IN'), ('these', 'DT'), ('deals', 'NNS'), (',', ','), ('the', 'DT'), ('RTC', 'NNP'), ('sells', 'VBZ'), ('just', 'RB'), ('the', 'DT'), ('deposits', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('healthy', 'JJ'), ('assets', 'NNS'), ('.', '.')]
predicted:
[('Under', 'IN'), ('these', 'DT'), ('deals', 'NNS'), (',', ','), ('the', 'DT'), ('RTC', 'NNP'), ('sells', 'VBZ'), ('just', 'RB'), ('the', 'DT'), ('deposits', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('healthy', 'JJ'), ('assets', 'NNS'), ('.', '.')]


##### Hidden Markov Model + Viterbi Algorithm

In [11]:
LOW_PROB = 1e-10; 

def get_prob(prob):
    return np.log(prob) if prob != 0.0 else np.log(LOW_PROB)
    

class HMM:
    def __init__(self):
        self.transitions = {}
        self.emissions = {}
        self.words = {}
        self.tags = {}  
        

    @staticmethod
    def add_start_stop(sentence):
        return [("<start>", "START")]+ sentence + [("<stop>", "STOP")]


    def get_words_and_tags(self, sentences):
        words, tags = {}, {}
        for sentence in sentences:
            for token, tag in sentence:
                tags[tag] = tags.get(tag, 0) + 1
                words[token] = words.get(token, 0) + 1
        return words, tags

    
    def get_transition_matrix(self, sentences, alpha=0.0):
        print("> Building transition matrix ...")
        # Initialize the transition counts.
        transitions = defaultdict(lambda: 0)
        N = len(self.tags)

        # Count the transitions (bigrams).
        for sentence in sentences:
            for i in range(len(sentence) - 1):
                t1, t2 = sentence[i][1], sentence[i + 1][1]
                transitions[(t1, t2)] += 1
            
        
        # Calculate the transition probabilities.
        for (tag1, tag2), count in transitions.items():
            transitions[(tag1, tag2)] = (count + alpha) / (self.tags[tag1] + alpha * N)

        return transitions

    
    def get_emission_matrix(self, sentences, alpha=0.0):
        print("> Building emission matrix ...")
        # Initialize the emission counts.
        emissions = defaultdict(lambda: 0)
        emissions_count = defaultdict(lambda: 0)
        N = len(self.words)

        # Count the emissions (bigrams).
        for sentence in sentences:
            for token, tag in sentence:
                emissions_count[(token, tag)] += 1
        
        # Calculate the emission probabilities.
        for (token, tag), count in emissions_count.items():
            emissions[(token, tag)] = (count + alpha) / (self.tags[tag] + alpha * N)

        return emissions

   
    def train(self, sentences, alpha=1):
        print("Training ...")
        # training: get tags, words, and build transition and emission matrices
        sentences = [self.add_start_stop(sentence) for sentence in sentences]

        self.words, self.tags = self.get_words_and_tags(sentences)
        self.transitions = self.get_transition_matrix(sentences, alpha)
        self.emissions = self.get_emission_matrix(sentences, alpha)

        print("> Done")

    
    def viterbi(self, sentence):
        """
        Viterbi algorithm for finding the most likely sequence of tags
        given a sequence of observations.
        """
        # Initialize the viterbi matrix and backpointer matrix.
        tags = list(self.tags.keys())
        viterbi = np.zeros((len(tags), len(sentence)))
        backpointer = np.zeros((len(tags), len(sentence)))

        # calculate the probs for the sentence 
        for w in range(1, len(sentence)):
            for t in range(len(tags)):
                # dynamic programming: calculate the total prob of the prev tag and the current tag
                total_prob, prev_tag = -1e8, 0
                for j in range(len(tags)):
                    prob = viterbi[j, w-1] + \
                        get_prob(self.transitions[(tags[j], tags[t])]) + \
                        get_prob(self.emissions[(sentence[w][0], tags[t])])
                    
                    if prob > total_prob:
                        total_prob = prob
                        prev_tag = j
                
                backpointer[t, w] = prev_tag
                viterbi[t, w] = total_prob

        # Find the most likely sequence of tags.
        path = []
        for j in range(len(sentence)-1, -1, -1):
            best_choice = np.argmax(viterbi[:, j])
            path.append(
                (sentence[j][0], tags[best_choice])
            )
        path = path[1:-1]   # remove <start> and <stop>
        return path[::-1]   # reverse the path
            

    # predict the tags for the sentences
    def predict(self, sentences):
        print("Predicting ...")
        predicted = []
        sentences = [self.add_start_stop(sentence) for sentence in sentences]
        for sentence in tqdm(sentences):
            predicted.append(self.viterbi(sentence))
        return predicted


    # evaluate the model
    def evaluate(self, gold_sentences, predicted_sentences):
        print("Evaluating ...")
        gold = [str(tag) for sentence in gold_sentences for token, tag in sentence]
        pred = [str(tag) for sentence in predicted_sentences for token, tag in sentence]
        return metrics.classification_report(gold, pred, zero_division=0)

    
    # get the confusion matrix
    def get_confusion_matrix(self, gold_sentence, predicted_sentence):
        print("Confusion Matrix ...")
        gold = [str(tag) for token, tag in gold_sentence]
        pred = [str(tag) for token, tag in predicted_sentence]
        return metrics.confusion_matrix(gold, pred)



In [12]:
# train the model
hmm = HMM()
hmm.train(train_sentences, alpha=1)

Training ...
> Building transition matrix ...
> Building emission matrix ...
> Done


In [13]:
# mode 
mode = 'test'

if mode == 'dev':
    sentences = dev_sentences[0:20]
elif mode == 'test':    
    sentences = test_sentences

In [14]:
# predict the test sentences
predicted = hmm.predict(sentences)

Predicting ...


100%|██████████| 9046/9046 [47:12<00:00,  3.19it/s]


In [15]:
# evaluate the model
results = hmm.evaluate(sentences, predicted)
print(results)

Evaluating ...
              precision    recall  f1-score   support

           #       1.00      0.91      0.95        22
           $       1.00      1.00      1.00      1138
          ''       1.00      0.99      1.00      1423
           (       1.00      1.00      1.00       249
           )       1.00      1.00      1.00       252
           ,       1.00      1.00      1.00      9056
           .       1.00      1.00      1.00      7035
           :       1.00      1.00      1.00       983
          CC       0.99      1.00      1.00      4289
          CD       0.99      0.93      0.96      6023
          DT       0.92      0.99      0.96     14946
          EX       0.96      0.45      0.62       174
          FW       0.60      0.08      0.14        38
          IN       0.91      0.99      0.95     18147
          JJ       0.88      0.87      0.87     10704
         JJR       0.68      0.91      0.78       581
     JJR|RBR       0.00      0.00      0.00         4
         JJS

In [16]:
# confusion matrix

confusion_matrix = hmm.get_confusion_matrix(sentences[0], predicted[0])
print(confusion_matrix)

Confusion Matrix ...
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 4 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 3 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]


In [None]:
# nltk_prediction = [nltk.pos_tag([token for token, tag in sentence]) for sentence in sentences]
# evaluate(sentences, nltk_prediction)

##### BaseModel

In [17]:
class BaseModel:
    def __init__(self):
        self.words_freq = {}
        self.words_tag_freq = {}
    
    @staticmethod
    def add_start_stop(sentence):
        return [("<start>", "START")]+ sentence + [("<stop>", "STOP")]

    def get_words_and_tags(self, sentences):
        words_freq, words_tag_freq = {}, {}
        # count the number of times each word appears with each tag
        for sentence in sentences:
            for word, tag in sentence:
                if word in words_freq:
                    if tag in words_freq[word]:
                        words_freq[word][tag] += 1
                    else:
                        words_freq[word][tag] = 1
                else:
                    words_freq[word] = {tag: 1}
        
        # get the most frequent tag for each word
        for word, tag_freq in words_freq.items():
            words_tag_freq[word] = max(tag_freq, key=tag_freq.get)

        return words_freq, words_tag_freq


    def train(self, sentences):
        print("Training ...")
        sentences = [self.add_start_stop(sentence) for sentence in sentences]
        self.words_freq, self.words_tag_freq = self.get_words_and_tags(sentences)
        print("> Done")

    
    def get_prediction(self, sentence):
        pred = []
        for word, tag in sentence:
            if word in self.words_tag_freq:
                pred.append((word, self.words_tag_freq[word]))
            else:
                pred.append((word, "<unk>"))
        return pred[1:-1]   # remove <start> and <stop>


    def predict(self, sentences):
        print("Predicting ...")
        predicted = []
        sentences = [self.add_start_stop(sentence) for sentence in sentences]
        
        for sentence in tqdm(sentences):
            predicted.append(self.get_prediction(sentence))
        return predicted


    def evaluate(self, gold_sentences, predicted_sentences):
        gold = [str(tag) for sentence in gold_sentences for token, tag in sentence]
        pred = [str(tag) for sentence in predicted_sentences for token, tag in sentence]
        print(metrics.classification_report(gold, pred, zero_division=0))
    




In [18]:
base = BaseModel()

base.train(train_sentences)
predicted = base.predict(sentences)
base.evaluate(sentences, predicted)



Training ...
> Done
Predicting ...


100%|██████████| 9046/9046 [00:00<00:00, 107651.84it/s]


              precision    recall  f1-score   support

           #       1.00      1.00      1.00        22
           $       1.00      1.00      1.00      1138
          ''       1.00      0.99      1.00      1423
           (       1.00      1.00      1.00       249
           )       1.00      1.00      1.00       252
           ,       1.00      1.00      1.00      9056
           .       1.00      1.00      1.00      7035
           :       1.00      1.00      1.00       983
       <unk>       0.00      0.00      0.00         0
          CC       1.00      1.00      1.00      4289
          CD       0.99      0.90      0.94      6023
          DT       0.99      0.99      0.99     14946
          EX       0.89      1.00      0.94       174
          FW       0.35      0.24      0.28        38
          IN       0.94      0.98      0.96     18147
          JJ       0.88      0.86      0.87     10704
         JJR       0.66      0.95      0.78       581
     JJR|RBR       0.00    