# COMP5046 Group Project

## 1. Data Download and Load


This project uses the re3d dataset from the Defence Science and Technology Laboratory (Dstl), U.K., which focuses on named entity extraction relevant to somebody operating in the role of a defence and security intelligence analyst.

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '16HRH0MKKq08lbBOzPa5ctpOcA9VLXv0p'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train.csv')

id = '1c0hihZpv6r3Ldi5lSEmEoW0aeua5LkPf'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('val.csv')

id = '1sFb9c2n4UuleIwjuojc6w3YiTraVfSJw'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test_without_labels.csv')

# Mount the Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd

# Read the three CSV splits into DataFrames (same order as the file names).
df_train, df_val, df_test_without_labels = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ('train.csv', 'val.csv', 'test_without_labels.csv')
)

# Sentences: one raw string per row, for every split.
train_sents = list(df_train["sents"])
val_sents = list(df_val["sents"])
test_sents = list(df_test_without_labels["sents"])

# Labels: only the train/validation splits carry them. Each row is a
# space-separated tag string that is split into one tag per token.
train_labels = [row.split(' ') for row in df_train["labels"]]
val_labels = [row.split(' ') for row in df_val["labels"]]

## Data Preprocessing - Extract Word Features And Tokenising

In [None]:
!python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load("en_core_web_sm")

def extract_features(sents):
    """Run the spaCy pipeline ``nlp`` over each sentence and collect token features.

    Args:
        sents: iterable of sentence strings.

    Returns:
        A 4-tuple ``(pos, lemma, dep, ent)``. Each element is a list with one
        entry per sentence, and each entry is a list with one string per spaCy
        token: ``tag_``, ``lemma_``, ``dep_`` and ``ent_type_`` respectively.
    """
    pos, lemma, dep, ent = [], [], [], []

    for sent in sents:
        doc = nlp(sent)
        pos.append([token.tag_ for token in doc])
        lemma.append([token.lemma_ for token in doc])
        dep.append([token.dep_ for token in doc])
        ent.append([token.ent_type_ for token in doc])

    return pos, lemma, dep, ent

# Extract per-token features for every split. Each result is a list with one
# entry per sentence, holding one string per spaCy token.
train_pos, train_lemma, train_dep, train_ent = extract_features(train_sents)
val_pos, val_lemma, val_dep, val_ent = extract_features(val_sents)
test_pos, test_lemma, test_dep, test_ent = extract_features(test_sents)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


## Data Preprocessing - Encoding and Padding

In [None]:
import numpy as np

# Special symbols shared by the word vocabulary and the label set.
# START/STOP mark the sequence boundaries used by the CRF transition matrix;
# PAD and UNKNOWN are appended to the word vocabulary below.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
PAD_TAG = "<PAD>"
UNKNOWN_TAG = "<UNKNOWN>"

#### Text Encoding

In [None]:
def _build_index(sentences):
    """Map every distinct token to an integer id in first-seen order.

    Args:
        sentences: iterable of token lists.

    Returns:
        dict mapping token -> id, where ids are 0..n-1 in order of first
        appearance.
    """
    index = {}
    for sentence in sentences:
        for token in sentence:
            # len(index) is evaluated before insertion, so a new token
            # receives the next free id and a known token is left untouched.
            index.setdefault(token, len(index))
    return index


# Word (lemma) vocabulary over all three splits, followed by the special tags.
word_to_ix = _build_index(train_lemma + val_lemma + test_lemma)
for special_tag in (START_TAG, STOP_TAG, PAD_TAG, UNKNOWN_TAG):
    word_to_ix[special_tag] = len(word_to_ix)

word_list = list(word_to_ix.keys())

# Vocabularies for the syntactic features (no special tags are added here).
pos_to_ix = _build_index(train_pos + val_pos + test_pos)
dep_to_ix = _build_index(train_dep + val_dep + test_dep)
ent_to_ix = _build_index(train_ent + val_ent + test_ent)

#### Label Encoding

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

#-----------------------------------
#|Encoded|   Entity                |
#-----------------------------------
#|   0   |   'B-DocumentReference' |
#|   1   |   'B-Location'          |
#|   2   |   'B-MilitaryPlatform'  |
#|   3   |   'B-Money'             |
#|   4   |   'B-Nationality'       |
#|   5   |   'B-Organisation'      |
#|   6   |   'B-Person'            |
#|   7   |   'B-Quantity'          |
#|   8   |   'B-Temporal'          |
#|   9   |   'B-Weapon'            |
#|  10   |   'I-DocumentReference' |
#|  11   |   'I-Location'          |
#|  12   |   'I-MilitaryPlatform'  |
#|  13   |   'I-Money'             |
#|  14   |   'I-Nationality'       |
#|  15   |   'I-Organisation'      |
#|  16   |   'I-Person'            |
#|  17   |   'I-Quantity'          |
#|  18   |   'I-Temporal'          |
#|  19   |   'I-Weapon'            |
#|  20   |   'O'                   |
#|  21   |   '<START>'             |
#|  22   |   '<STOP>'              |
#|  23   |   '<PAD>'               |
#-----------------------------------

# Collect the distinct tag strings straight from the tokenised label
# sequences. Flattening with a comprehension does not depend on how numpy
# would shape a ragged list-of-lists (a set of equal-length sentences would
# otherwise become a 2-D string array and be iterated character by character).
unique_labels = np.unique(
    [tag for label_seq in train_labels + val_labels for tag in label_seq]
)

lEnc = LabelEncoder()
lEnc.fit(unique_labels)
classes = list(lEnc.classes_)
print(classes)

# Tag -> id in the encoder's (sorted) class order, then the CRF/padding tags.
label_to_idx = {t: i for i, t in enumerate(classes)}
label_to_idx[START_TAG] = len(label_to_idx)
label_to_idx[STOP_TAG] = len(label_to_idx)
label_to_idx[PAD_TAG] = len(label_to_idx)
print(label_to_idx)

# One integer array per sentence.
train_labels_encoded = [lEnc.transform(label_seq) for label_seq in train_labels]
val_labels_encoded = [lEnc.transform(label_seq) for label_seq in val_labels]

['B-DocumentReference', 'B-Location', 'B-MilitaryPlatform', 'B-Money', 'B-Nationality', 'B-Organisation', 'B-Person', 'B-Quantity', 'B-Temporal', 'B-Weapon', 'I-DocumentReference', 'I-Location', 'I-MilitaryPlatform', 'I-Money', 'I-Nationality', 'I-Organisation', 'I-Person', 'I-Quantity', 'I-Temporal', 'I-Weapon', 'O']
{'B-DocumentReference': 0, 'B-Location': 1, 'B-MilitaryPlatform': 2, 'B-Money': 3, 'B-Nationality': 4, 'B-Organisation': 5, 'B-Person': 6, 'B-Quantity': 7, 'B-Temporal': 8, 'B-Weapon': 9, 'I-DocumentReference': 10, 'I-Location': 11, 'I-MilitaryPlatform': 12, 'I-Money': 13, 'I-Nationality': 14, 'I-Organisation': 15, 'I-Person': 16, 'I-Quantity': 17, 'I-Temporal': 18, 'I-Weapon': 19, 'O': 20, '<START>': 21, '<STOP>': 22, '<PAD>': 23}


#### Align With The Labels

NOTE: **You may need to run this snippet of code more than once.** Re-run it whenever the lengths of the sentences are **not the same** as the lengths of the labels.

In [None]:
def align_with_labels(lemma, pos, dep, ent, labels):
    """Trim per-token feature lists in place so they are no longer than the labels.

    For every sentence whose feature lists hold more tokens than its label
    sequence, the surplus tokens are cut from the end of all four feature
    lists. The cut goes straight to the label length, so a single call is
    enough and repeated calls are no-ops. Sentences that are already the same
    length as, or shorter than, their labels are left untouched (trimming
    them would only widen the gap).

    NOTE(review): cutting from the end assumes the extra tokens sit at the
    tail of the sentence - confirm against the tokenisation differences
    between the tokeniser and the label files.

    Args:
        lemma, pos, dep, ent: parallel lists (one entry per sentence) of
            per-token lists; modified in place.
        labels: list (one entry per sentence) of label sequences.

    Returns:
        None. Prints whether the number of sentences matches the number of
        label sequences.
    """
    print(len(lemma) == len(labels))

    for i in range(min(len(lemma), len(labels))):
        target_len = len(labels[i])
        if len(lemma[i]) > target_len:
            lemma[i] = lemma[i][:target_len]
            pos[i] = pos[i][:target_len]
            dep[i] = dep[i][:target_len]
            ent[i] = ent[i][:target_len]
          
def to_index(data, to_ix):
    """Translate nested token sequences into nested id sequences.

    Args:
        data: iterable of sequences of tokens.
        to_ix: mapping from token to integer id; a token missing from the
            mapping raises ``KeyError``.

    Returns:
        A list with one list of ids per input sequence.
    """
    return [[to_ix[token] for token in sequence] for sequence in data]

# Bring the spaCy-derived features in line with the label sequences.
align_with_labels(train_lemma, train_pos, train_dep, train_ent, train_labels_encoded)
align_with_labels(val_lemma, val_pos, val_dep, val_ent, val_labels_encoded)

# Encode every training stream as integer ids.
train_input_index = to_index(train_lemma, word_to_ix)
train_output_index = to_index(train_labels, label_to_idx)
train_ent_index =  to_index(train_ent, ent_to_ix)
train_dep_index = to_index(train_dep, dep_to_ix)
train_pos_index =  to_index(train_pos, pos_to_ix)
print(len(train_input_index))
# Show only the first few sentences; printing the whole nested list floods
# the notebook output.
print(train_pos_index[:3])

# Same encoding for the validation split.
val_input_index = to_index(val_lemma, word_to_ix)
val_output_index = to_index(val_labels, label_to_idx)
val_ent_index =  to_index(val_ent, ent_to_ix)
val_dep_index = to_index(val_dep, dep_to_ix)
val_pos_index =  to_index(val_pos, pos_to_ix)
print(val_ent_index[:3])

True
True
573
[[0, 0, 0, 1, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 0, 4, 5, 6, 7, 3, 8, 9, 3, 10, 9, 7, 0, 11, 12, 13, 6, 3, 9, 7, 10, 14, 6, 3, 15, 14, 6, 7, 3, 16], [6, 3, 17, 10, 9, 0, 15, 0, 3, 18, 6, 3, 3, 15, 3, 3, 16], [6, 3, 5, 7, 9, 9, 19, 5, 6, 7, 7, 3, 3, 9, 6, 0, 0, 0, 16], [6, 5, 20, 7, 3, 9, 0, 15, 6, 7, 3, 12, 13, 15, 13, 7, 10, 9, 6, 0, 15, 0, 9, 6, 3, 9, 10, 7, 9, 19, 6, 16], [6, 3, 5, 8, 9, 0, 0, 0, 0, 21, 0, 16, 22], [15, 23, 18, 9, 14, 18, 19, 24, 13, 6, 16, 22, 22, 22], [15, 3, 24, 13, 23, 16], [6, 10, 9, 6, 0, 0, 25, 8, 9, 6, 3, 9, 7, 0, 0, 0, 12, 13, 6, 7, 0, 0, 21, 0, 16], [0], [19, 25, 20, 26, 10, 9, 6, 10, 9, 6, 10, 15, 3, 9, 6, 7, 3, 9, 27, 6, 8, 16], [6, 3, 9, 0, 9, 3, 15, 7, 7, 7, 10, 5, 23, 11, 10, 23, 16], [28, 6, 3, 9, 14, 20, 3, 17, 23, 8, 9, 10, 17, 23, 10, 9, 3, 23, 16], [28, 6, 3, 22, 3, 3, 17, 29, 19, 25, 23, 23, 18, 22, 5, 0, 0, 0, 0, 18, 7, 0, 0, 0], [6, 3, 17, 9, 3, 16], [0, 0, 0, 0, 3, 9, 6, 0, 23, 14, 8, 3, 9, 7, 0, 16], [6, 3, 5

<br>
<br>
<br>

## 2. Input Embedding

### *2.1* Semantic Textual Feature Embedding

Here, several input embedding models were taken into consideration:
- ~~ELMo(ELMo only captures the meaning of the words)~~
- ~~BERT~~

**NOTE: sudden crash due to RAM overflow**

- ~~Word2Vec~~
- Glove(glove-wiki-gigaword-50)
- ~~FastText~~

In contrast to BERT, XLNET, and ALBERT which are trained on masking random words in a sentence, ELMo is trained on predicting the next word in a sequence. ELMo is relying on bidirectional LSTM’s under the hood and is not transformer-based, like BERT, XLNET, ALBERT, and USE. 

NOTE: The PoS tag for each word comes from **SpaCy**. The dependency tag is not needed and can be very costly.


In [None]:
import gensim.downloader as api
# Pre-trained word vectors fetched through gensim's downloader.
# NOTE(review): the markdown above names "glove-wiki-gigaword-50" as the
# chosen model, but the identifier loaded here is "glove-twitter-50" -
# confirm which one is intended.
word_emb_model = api.load("glove-twitter-50")

# Width of one word vector; used for the all-zero rows of out-of-vocabulary words.
SEM_EMBEDDING_DIM = 50



In [None]:
# Build the (vocab_size, SEM_EMBEDDING_DIM) matrix whose row i is the
# pre-trained vector of word_list[i].
sem_embedding_matrix = []

for word in word_list:
    if word in word_emb_model:
        sem_embedding_matrix.append(word_emb_model[word])
    elif word.lower() in word_emb_model:
        # The lookup is case-sensitive, so a capitalised lemma (typically a
        # proper noun - exactly the tokens NER cares about) would otherwise
        # fall through to an all-zero row. Retry with the lowercase form.
        # NOTE(review): assumes the embedding vocabulary is lowercased, as
        # GloVe releases usually are - confirm for the loaded model.
        sem_embedding_matrix.append(word_emb_model[word.lower()])
    else:
        # Out-of-vocabulary words (and the special tags) get a zero vector.
        sem_embedding_matrix.append([0] * SEM_EMBEDDING_DIM)

sem_embedding_matrix = np.array(sem_embedding_matrix)
print(sem_embedding_matrix.shape)
print(sem_embedding_matrix)

(3480, 50)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### 2.2 Syntactic Textual Feature Embedding



In [None]:
# Map additional features to index
import numpy as np 

# NOTE: All the properties are encoded as one-hot arrays.
word_list = list(word_to_ix)

# One identity matrix per feature vocabulary: row i is the one-hot vector of id i.
pos_embedding, dep_embedding, ent_embedding = (
    np.eye(len(feature_to_ix))
    for feature_to_ix in (pos_to_ix, dep_to_ix, ent_to_ix)
)

print(pos_to_ix)
print(ent_to_ix)
print(ent_embedding)

{'NNP': 0, '-LRB-': 1, ':': 2, 'NN': 3, '-RRB-': 4, 'VBD': 5, 'DT': 6, 'JJ': 7, 'VBN': 8, 'IN': 9, 'NNS': 10, 'CD': 11, 'TO': 12, 'VB': 13, 'VBG': 14, 'CC': 15, '.': 16, 'VBZ': 17, ',': 18, 'PRP': 19, 'PRP$': 20, 'HYPH': 21, "''": 22, 'RB': 23, 'MD': 24, 'VBP': 25, 'JJS': 26, 'PDT': 27, '``': 28, 'WRB': 29, 'RBR': 30, 'WDT': 31, 'WP': 32, 'POS': 33, 'RBS': 34, 'NNPS': 35, 'RP': 36, 'UH': 37, 'EX': 38, 'JJR': 39, 'SYM': 40, 'NFP': 41, 'ADD': 42, 'FW': 43, 'LS': 44, '$': 45, 'WP$': 46, 'XX': 47}
{'': 0, 'LANGUAGE': 1, 'DATE': 2, 'NORP': 3, 'ORG': 4, 'ORDINAL': 5, 'GPE': 6, 'PERSON': 7, 'CARDINAL': 8, 'TIME': 9, 'LOC': 10, 'PRODUCT': 11, 'WORK_OF_ART': 12, 'FAC': 13, 'QUANTITY': 14, 'LAW': 15, 'MONEY': 16, 'EVENT': 17, 'PERCENT': 18}
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.

### 2.3 Domain Feature Embedding [Optional]

<br>
<br>
<br>

## 3. NER Model

### 3.1 The Bi-LSTM + Self Attention + CRF Model

**NOTE: The model takes a long time to train (nearly 30 minutes), so you had better load the saved models instead.**

In [None]:
# Re-run the alignment and re-encode the splits before training (this
# repeats the preprocessing cell so the model section can be run on its own).
align_with_labels(train_lemma, train_pos, train_dep, train_ent, train_labels_encoded)
align_with_labels(val_lemma, val_pos, val_dep, val_ent, val_labels_encoded)

# Encode every training stream as integer ids.
train_input_index = to_index(train_lemma, word_to_ix)
train_output_index = to_index(train_labels, label_to_idx)
train_ent_index =  to_index(train_ent,ent_to_ix)
train_dep_index = to_index(train_dep,dep_to_ix)
train_pos_index =  to_index(train_pos,pos_to_ix)
print(len(train_input_index))
# Show only the first few sentences; printing the whole nested list floods
# the notebook output.
print(train_pos_index[:3])

# Same encoding for the validation split.
val_input_index = to_index(val_lemma, word_to_ix)
val_output_index = to_index(val_labels, label_to_idx)
val_ent_index =  to_index(val_ent,ent_to_ix)
val_dep_index = to_index(val_dep,dep_to_ix)
val_pos_index =  to_index(val_pos,pos_to_ix)
print(val_ent_index[:3])

True
True
573
[[0, 0, 0, 1, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 0, 4, 5, 6, 7, 3, 8, 9, 3, 10, 9, 7, 0, 11, 12, 13, 6, 3, 9, 7, 10, 14, 6, 3, 15, 14, 6, 7, 3, 16], [6, 3, 17, 10, 9, 0, 15, 0, 3, 18, 6, 3, 3, 15, 3, 3, 16], [6, 3, 5, 7, 9, 9, 19, 5, 6, 7, 7, 3, 3, 9, 6, 0, 0, 0, 16], [6, 5, 20, 7, 3, 9, 0, 15, 6, 7, 3, 12, 13, 15, 13, 7, 10, 9, 6, 0, 15, 0, 9, 6, 3, 9, 10, 7, 9, 19, 6, 16], [6, 3, 5, 8, 9, 0, 0, 0, 0, 21, 0, 16, 22], [15, 23, 18, 9, 14, 18, 19, 24, 13, 6, 16, 22, 22, 22], [15, 3, 24, 13, 23, 16], [6, 10, 9, 6, 0, 0, 25, 8, 9, 6, 3, 9, 7, 0, 0, 0, 12, 13, 6, 7, 0, 0, 21, 0, 16], [0], [19, 25, 20, 26, 10, 9, 6, 10, 9, 6, 10, 15, 3, 9, 6, 7, 3, 9, 27, 6, 8, 16], [6, 3, 9, 0, 9, 3, 15, 7, 7, 7, 10, 5, 23, 11, 10, 23, 16], [28, 6, 3, 9, 14, 20, 3, 17, 23, 8, 9, 10, 17, 23, 10, 9, 3, 23, 16], [28, 6, 3, 22, 3, 3, 17, 29, 19, 25, 23, 23, 18, 22, 5, 0, 0, 0, 0, 18, 7, 0, 0, 0], [6, 3, 17, 9, 3, 16], [0, 0, 0, 0, 3, 9, 6, 0, 23, 14, 8, 3, 9, 7, 0, 16], [6, 3, 5

In [None]:
# NOTE: You should test your NER model with CRF/ without CRF.
# Lab 9
import math
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from functools import reduce

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's random number generator (weights and initial hidden states
# below are drawn from it).
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the column index of the largest entry of ``vec``.

    ``vec`` must have exactly one row, because the index tensor produced by
    the reduction over dimension 1 is converted with ``.item()``.
    """
    best = vec.max(dim=1)
    return best.indices.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` over all entries as a 0-dim tensor.

    ``torch.logsumexp`` performs the max-shift internally, so the result is
    stable for large magnitudes, stays differentiable, and avoids the
    ``.item()`` host round-trip of a hand-rolled argmax on every call.

    Args:
        vec: tensor of scores; callers in this file pass shape ``(1, n)``.

    Returns:
        0-dim tensor holding the log-sum-exp of the entries.
    """
    return torch.logsumexp(vec.reshape(-1), dim=0)

class BiLSTM_CRF(nn.Module):
    """Bi-LSTM tagger with optional attention and a CRF output layer.

    Word ids are embedded with the pre-trained ``sem_embedding_matrix`` and,
    depending on the module-level flags ``apply_pos`` / ``apply_dep`` /
    ``apply_ent``, concatenated with one-hot PoS / dependency / entity
    embeddings. The concatenation feeds a bidirectional LSTM whose output
    (optionally combined with attention over the final hidden state) is
    projected to per-tag emission scores. A learned transition matrix turns
    those scores into a CRF.

    Besides the flags above, the class reads these names from the enclosing
    notebook at call time: ``attn_type``, ``attn_num``, the ``ATTN_TYPE_*``
    constants, ``device``, ``START_TAG`` / ``STOP_TAG``, ``argmax`` and
    ``log_sum_exp``.
    """

    # Message raised (verbatim) when the apply_* flags form an unsupported combination.
    _CONFIG_ERROR = "Only 4 cases are allowed: word embedding + pos, word embedding + pos + dep, word embedding + pos + dep + ent"

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
        """Build the embeddings, LSTM, projection layer and CRF transitions.

        Args:
            embedding_dim: width of the concatenated LSTM input (word vector
                plus whichever one-hot features are enabled).
            hidden_dim: total LSTM output width; each direction gets
                ``hidden_dim // 2`` units.
            num_layers: number of stacked LSTM layers.
            vocab_size: stored on the instance; the word embedding table is
                sized from ``sem_embedding_matrix`` instead.
            tag_to_ix: mapping from tag string to id, including START/STOP.
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        # The Word Embedding: sized from the pre-trained matrix itself rather
        # than a hard-coded width, then initialised with its values.
        self.word_embeds = nn.Embedding(sem_embedding_matrix.shape[0], sem_embedding_matrix.shape[1])
        self.word_embeds.weight.data.copy_(torch.from_numpy(sem_embedding_matrix))

        # The PoS Tag Embedding (initialised to one-hot rows)
        if (apply_pos):
            self.pos_embeds = nn.Embedding(pos_embedding.shape[0], pos_embedding.shape[0])
            self.pos_embeds.weight.data.copy_(torch.from_numpy(pos_embedding))

        # The Dependency Embedding (initialised to one-hot rows)
        if (apply_dep):
            self.dep_embeds = nn.Embedding(dep_embedding.shape[0], dep_embedding.shape[0])
            self.dep_embeds.weight.data.copy_(torch.from_numpy(dep_embedding))

        # The Entities Embedding (initialised to one-hot rows)
        if (apply_ent):
            self.ent_embeds = nn.Embedding(ent_embedding.shape[0], ent_embedding.shape[0])
            self.ent_embeds.weight.data.copy_(torch.from_numpy(ent_embedding))

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=num_layers, bidirectional=True)

        # Maps the output of the LSTM into tag space. With attention the
        # attention output is concatenated to the LSTM output, doubling the width.
        if attn_type != ATTN_TYPE_NONE:
            self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)
        else:
            self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random ``(h_0, c_0)`` pair for a batch of one sequence."""
        # Use the instance's own layer count instead of a same-named global.
        state_shape = (2 * self.num_layers, 1, self.hidden_dim // 2)
        return (torch.randn(*state_shape).to(device),
                torch.randn(*state_shape).to(device))

    def _active_feature_names(self):
        """Return the enabled extra features, in concatenation order.

        The module-level ``apply_*`` flags are interpreted exactly as the
        original branch chains did: all three -> ent, pos, dep; pos and dep ->
        pos, dep; pos (with anything else) -> pos only; none -> no extras.

        Raises:
            Exception: when ent and/or dep is enabled without pos.
        """
        if apply_ent and apply_pos and apply_dep:
            return ('ent', 'pos', 'dep')
        if apply_pos and apply_dep:
            return ('pos', 'dep')
        if apply_pos:
            return ('pos',)
        if not apply_ent and not apply_dep:
            return ()
        raise Exception(self._CONFIG_ERROR)

    def _select_features(self, kw):
        """Pick the enabled feature tensors out of ``kw`` (``KeyError`` if one is missing)."""
        return {name: kw[name] for name in self._active_feature_names()}

    def _forward_alg(self, feats):
        """Compute the log partition function of the CRF with the forward algorithm.

        Args:
            feats: emission scores, one row per token and one column per tag.

        Returns:
            Log-sum-exp of the scores of all tag paths.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _score_sentence(self, feats, tags):
        """Return the CRF score of the given tag sequence (emissions + transitions)."""
        score = torch.zeros(1).to(device)
        # Prepend START so tags[i] is the previous tag of token i.
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])

        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for the given emission scores.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of tag
            ids, one per token.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, **kw):
        """Return the CRF negative log-likelihood of ``tags`` for ``sentence``.

        Args:
            sentence: 1-D tensor of word ids.
            tags: 1-D tensor of gold tag ids.
            **kw: feature id tensors under ``pos`` / ``dep`` / ``ent``; only
                the ones enabled by the ``apply_*`` flags are read.
        """
        feats = self._get_lstm_features(sentence, **self._select_features(kw))
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def _get_lstm_features(self, sentence, **kw):
        """Compute per-token emission scores for one sentence.

        Args:
            sentence: 1-D tensor of word ids.
            **kw: feature id tensors under ``pos`` / ``dep`` / ``ent``; only
                the ones enabled by the ``apply_*`` flags are read.

        Returns:
            Tensor of emission scores with ``tagset_size`` columns.
        """
        # Validate the configuration before touching any state.
        extra_features = self._select_features(kw)

        self.hidden = self.init_hidden()

        word_embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)

        # Word vector first, then ent / pos / dep in that order.
        pieces = [word_embeds]
        for name, feature in extra_features.items():
            embedder = getattr(self, name + "_embeds")
            pieces.append(embedder(feature).view(len(feature), 1, -1))
        embeds = word_embeds if len(pieces) == 1 else torch.cat(pieces, 2)

        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        if attn_type != ATTN_TYPE_NONE:
            # Final hidden state as a (2 * num_layers, hidden_dim // 2) matrix;
            # the per-direction width is taken from hidden_dim instead of a
            # hard-coded 25.
            # NOTE(review): the state is concatenated with itself to reach
            # hidden_dim columns - preserved as-is, confirm this is intended.
            half_dim = self.hidden_dim // 2
            final_h = self.hidden[0].view(2 * self.num_layers, 1, half_dim)[:, 0, :]
            hidden_out = torch.cat((final_h, final_h), 1)
            hidden_out = hidden_out.unsqueeze(0)

            att_out = self.cal_attention(lstm_out, hidden_out, attn_type)
            att_out = att_out.view(-1, self.hidden_dim * 2)

            lstm_feats = self.hidden2tag(att_out)
        else:
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
            lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def cal_attention(self, lstm_out, hidden_out, method):
        """Attend from every LSTM output step over the final hidden state.

        Args:
            lstm_out: LSTM output with the sequence on dimension 0.
            hidden_out: hidden-state matrix with a leading dimension of 1.
            method: one of the ``ATTN_TYPE_*`` constants.

        Returns:
            The attention output concatenated with ``lstm_out`` along dimension 1.

        Raises:
            ValueError: if ``method`` is not supported for the current
                ``attn_num`` (previously an ``UnboundLocalError``).
        """
        seq_len = lstm_out.size()[0]

        # One copy of the hidden matrix (and of its transpose) per time step,
        # built in a single call instead of concatenating inside a loop.
        hidden_out_t = torch.transpose(hidden_out, 1, 2).repeat(seq_len, 1, 1)
        hidden_out = hidden_out.repeat(seq_len, 1, 1)

        if attn_num == 1:
            # Dot-product attention: (Luong 2015)
            if method == ATTN_TYPE_DOT_PRODUCT:
                # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
                attn_weights = F.softmax(torch.bmm(lstm_out, hidden_out_t), dim=-1)

            # Scaled dot-product attention: (Vaswani 2017)
            elif method == ATTN_TYPE_SCALE_DOT_PRODUCT:
                # NOTE(review): len(hidden_out) is the sequence length here, not
                # the key dimension used in the paper - preserved as-is so
                # existing checkpoints behave the same; confirm the intent.
                attn_weights = F.softmax((torch.bmm(lstm_out, hidden_out_t) / math.sqrt(len(hidden_out))), dim=-1)

            # Content based attention: score(st,hi)=cosine[st,hi] (Graves 2014)
            elif method == ATTN_TYPE_CONTENT_BASE:
                # cosine_similarity: https://pytorch.org/docs/stable/nn.functional.html
                attn_weights = F.softmax(torch.cosine_similarity(lstm_out, hidden_out, dim=2, eps=1e-6), dim=-1)
                # Insert a singleton middle dimension so bmm can be applied.
                attn_weights = torch.unsqueeze(attn_weights, 1)

            else:
                raise ValueError("Unsupported attention method for attn_num == 1: {!r}".format(method))

            attn_output = torch.bmm(attn_weights, hidden_out)
            return torch.cat((attn_output, lstm_out), 1)

        # Multi-head attention: only the content-based (cosine) score is implemented.
        if method != ATTN_TYPE_CONTENT_BASE:
            raise ValueError("Unsupported attention method for attn_num != 1: {!r}".format(method))

        # Both tensors are split into attn_num chunks along dimension 0.
        split_h_out = torch.tensor_split(hidden_out, attn_num)
        split_lstm_out = torch.tensor_split(lstm_out, attn_num)

        attn_output_list = []
        for i in range(len(split_h_out)):
            attn_weights = F.softmax(torch.cosine_similarity(split_lstm_out[i], split_h_out[i], dim=2, eps=1e-6), dim=-1)
            attn_weights = torch.unsqueeze(attn_weights, 1)
            attn_output_list.append(torch.bmm(attn_weights, split_h_out[i]))

        attn_output_final = torch.cat(attn_output_list, 0)
        return torch.cat((attn_output_final, lstm_out), 1)

    def forward(self, sentence, **kw):  # dont confuse this with _forward_alg above.
        """Decode the best tag sequence for one sentence.

        Args:
            sentence: 1-D tensor of word ids.
            **kw: feature id tensors under ``pos`` / ``dep`` / ``ent``; only
                the ones enabled by the ``apply_*`` flags are read.

        Returns:
            ``(score, tag_seq)`` from Viterbi decoding.
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, **self._select_features(kw))

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

'''
Code of the model is modified from the pytorch tutorial:
https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
The idea of the attention position is from 
https://github.com/JohnnyPeng123/Attention_Based_Bi-LSTM_NER_with_CRF/blob/master/Code.ipynb,
but the code was modified because the dimensions of the hidden outputs are different.
Another modification is there are also a parameter available to choose the attention method.
'''

'\nCode of the model is modified from the pytorch tutorial:\nhttps://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html\nThe idea of the attention position is from \nhttps://github.com/JohnnyPeng123/Attention_Based_Bi-LSTM_NER_with_CRF/blob/master/Code.ipynb,\nbut the code was modified because the dimensions of the hidden outputs are different.\nAnother modification is there are also a parameter available to choose the attention method.\n'

In [None]:
import numpy as np
def cal_acc(model, input_index, output_index, ent_index, pos_index, dep_index):
    """Decode every sentence with ``model`` and compute token-level accuracy.

    Which extra features are passed to ``model.forward`` is controlled by the
    module-level ``embedding_config`` list, unpacked here as
    ``[apply_ent, apply_dep, apply_pos]``:

    * no flag set          -> ``model.forward(sentence)``
    * pos only             -> ``model.forward(sentence, pos=...)``
    * pos and dep          -> ``model.forward(sentence, pos=..., dep=...)``
    * pos, dep and ent     -> ``model.forward(sentence, ent=..., pos=..., dep=...)``

    (If pos is set and dep is not, the ent flag is ignored.)

    Args:
        model: object whose ``forward`` returns ``(score, tag_sequence)``.
        input_index: list of sentences, each a list of word indices.
        output_index: list of gold label-index lists.
        ent_index, pos_index, dep_index: per-sentence feature index lists;
            only those enabled by ``embedding_config`` are read.

    Returns:
        ``(predicted, ground_truth, accuracy)`` where the first two are the
        flattened predicted and gold label lists, and ``accuracy`` is the
        fraction of gold positions whose prediction matches. ``accuracy`` is
        ``0.0`` when there are no gold labels.

    Raises:
        Exception: if pos is disabled while ent or dep is enabled.
    """
    apply_ent, apply_dep, apply_pos = embedding_config

    def to_tensor(indices):
        return torch.tensor(indices, dtype=torch.long).to(device)

    def feature_kwargs(i):
        # Build the keyword arguments for sentence i from the enabled flags.
        if not apply_pos:
            if apply_ent or apply_dep:
                raise Exception("Only 4 cases are allowed: word embedding + pos, word embedding + pos + dep, word embedding + pos + dep + ent")
            return {}
        features = {'pos': to_tensor(pos_index[i])}
        if apply_dep:
            features['dep'] = to_tensor(dep_index[i])
            if apply_ent:
                features['ent'] = to_tensor(ent_index[i])
        return features

    predicted = []
    # Decoding needs no gradients; disabling autograd avoids building a graph
    # for every sentence.
    with torch.no_grad():
        for i, idxs in enumerate(input_index):
            sentence_in = to_tensor(idxs)
            _, s_pred = model.forward(sentence_in, **feature_kwargs(i))
            predicted += s_pred

    ground_truth = []
    for seq in output_index:
        ground_truth += seq

    # Guard against an empty evaluation set instead of dividing by zero.
    if not ground_truth:
        return predicted, ground_truth, 0.0

    count = sum(1 for i in range(len(ground_truth)) if predicted[i] == ground_truth[i])
    accuracy = count / len(ground_truth)

    return predicted, ground_truth, accuracy

'''
The code is modified from lab 9.
'''

'\nThe code is modified from lab 9.\n'

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime

def train_crf(model, optimizer, train_input_index, train_output_index, val_input_index, val_output_index):
    """Train ``model`` one sentence at a time and print metrics after each epoch.

    Besides its arguments, this function reads the following module-level
    names: ``epochs``, ``embedding_config``, ``device``, ``cal_acc`` and the
    feature index lists ``train_ent_index``, ``train_pos_index``,
    ``train_dep_index``, ``val_ent_index``, ``val_pos_index``,
    ``val_dep_index``.

    ``embedding_config`` is unpacked as ``[apply_ent, apply_dep, apply_pos]``
    and selects which features are passed to ``model.neg_log_likelihood``:
    none, pos, pos + dep, or pos + dep + ent. (If pos is set and dep is not,
    the ent flag is ignored.)

    Args:
        model: module exposing ``neg_log_likelihood(sentence, tags, **features)``.
        optimizer: optimizer stepped once per training sentence.
        train_input_index / train_output_index: word-index and label-index
            lists for the training sentences.
        val_input_index / val_output_index: the same for validation.

    Returns:
        None. One summary line is printed per epoch.

    Raises:
        Exception: if pos is disabled while ent or dep is enabled.
    """
    apply_ent, apply_dep, apply_pos = embedding_config

    def to_tensor(indices):
        return torch.tensor(indices, dtype=torch.long).to(device)

    def feature_kwargs(i, ent_index, pos_index, dep_index):
        # Build the keyword arguments for sentence i from the enabled flags.
        if not apply_pos:
            if apply_ent or apply_dep:
                raise Exception("Only 4 cases are allowed: word embedding + pos, word embedding + pos + dep, word embedding + pos + dep + ent")
            return {}
        features = {'pos': to_tensor(pos_index[i])}
        if apply_dep:
            features['dep'] = to_tensor(dep_index[i])
            if apply_ent:
                features['ent'] = to_tensor(ent_index[i])
        return features

    for epoch in range(epochs):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        for i, idxs in enumerate(train_input_index):
            # PyTorch accumulates gradients, so clear them before each sentence.
            model.zero_grad()

            sentence_in = to_tensor(idxs)
            targets = to_tensor(train_output_index[i])
            features = feature_kwargs(i, train_ent_index, train_pos_index, train_dep_index)

            # Compute the loss and gradients, then update the parameters.
            loss = model.neg_log_likelihood(sentence_in, targets, **features)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()

        _, _, train_acc = cal_acc(model, train_input_index, train_output_index, train_ent_index, train_pos_index, train_dep_index)
        _, _, val_acc = cal_acc(model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)

        val_loss = 0
        # The validation loss is only reported, never back-propagated, so no
        # autograd graph is needed.
        with torch.no_grad():
            for i, idxs in enumerate(val_input_index):
                sentence_in = to_tensor(idxs)
                targets = to_tensor(val_output_index[i])
                features = feature_kwargs(i, val_ent_index, val_pos_index, val_dep_index)

                loss = model.neg_log_likelihood(sentence_in, targets, **features)
                val_loss += loss.item()

        time2 = datetime.datetime.now()

        print("Epoch: %d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

'''
The code is modified from lab 9.
'''

'\nThe code is modified from lab 9.\n'

In [None]:
# Configuration and training run for the full model (word + pos + dep + ent
# features with content-based attention).
#
# Three types of attentions are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

num_layers = 1
SEM_EMBEDDING_DIM = 50 # Width of the word embedding alone, without pos tag
# Total input width: word embedding plus shape[0] of each feature embedding.
# NOTE(review): assumes shape[0] is the per-token width of these matrices — TODO confirm.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]

HIDDEN_DIM = 50
epochs = 10  # read as a global by train_crf
# attn_type / attn_num are not passed to the constructor below; presumably
# BiLSTM_CRF reads them as globals — TODO confirm.
attn_type = ATTN_TYPE_CONTENT_BASE
attn_num = 1

# Read as a global by train_crf and cal_acc to decide which features to feed.
embedding_config = [True, True, True] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_crf = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer_crf = optim.Adam(model_crf.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_crf, optimizer_crf, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 10462.77, train acc: 0.8534, val loss: 2685.54, val acc: 0.7823, time: 98.47s
Epoch: 2, Training loss: 4586.10, train acc: 0.8997, val loss: 2326.13, val acc: 0.7984, time: 99.04s
Epoch: 3, Training loss: 3051.16, train acc: 0.9239, val loss: 2394.95, val acc: 0.7964, time: 98.87s
Epoch: 4, Training loss: 2421.32, train acc: 0.9282, val loss: 2805.39, val acc: 0.7895, time: 100.04s
Epoch: 5, Training loss: 2187.65, train acc: 0.9292, val loss: 2972.29, val acc: 0.7874, time: 99.17s
Epoch: 6, Training loss: 1909.39, train acc: 0.9251, val loss: 3414.92, val acc: 0.7873, time: 99.14s
Epoch: 7, Training loss: 1832.98, train acc: 0.9433, val loss: 3202.15, val acc: 0.7924, time: 99.23s
Epoch: 8, Training loss: 1656.74, train acc: 0.9513, val loss: 3191.36, val acc: 0.7884, time: 98.88s
Epoch: 9, Training loss: 1859.00, train acc: 0.9489, val loss: 3387.04, val acc: 0.7977, time: 98.93s
Epoch: 10, Training loss: 1720.47, train acc: 0.9559, val loss: 3564.69, val acc

In [None]:
# Save the model
def save(model, name, directory="/content/gdrive/My Drive"):
    """Serialize ``model`` with ``torch.save`` to ``<directory>/<name>``.

    Args:
        model: object to serialize; it is handed to ``torch.save`` as-is.
        name: file name of the checkpoint, e.g. ``'our_model.pt'``.
        directory: destination folder. Defaults to the mounted Google Drive
            root, so existing ``save(model, name)`` calls write to the same
            path as before. The folder must already exist.
    """
    path = f"{directory}/{name}"
    torch.save(model, path)

In [None]:
# Local copy in the working directory; the evaluation cells reload it with
# torch.load('our_model.pt').
torch.save(model_crf, 'our_model.pt')
# Second copy written through save(), i.e. under the mounted Google Drive.
save(model_crf, 'our_model.pt')






























### 3.2 Additional Components [Optional]

<br>
<br>
<br>

## 4. Evaluation and Testing

NOTE: Each model will take **15-20 minutes** to train and evaluate.

### 4.1 Performance Comparison

#### 4.1.1 The Baseline Model

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

def argmax(vec):
    """Return the column index of the largest entry of ``vec`` as a Python int.

    ``vec`` is reduced along dimension 1, and ``.item()`` requires the result
    to hold exactly one element, so ``vec`` must have a single row.
    """
    best = torch.max(vec, 1)
    return best[1].item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` for a one-row 2-D tensor.

    The largest entry is subtracted before exponentiating and added back
    afterwards, which keeps the computation numerically stable.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.sum(torch.exp(shifted)))

class Baseline(nn.Module):
    """Baseline BiLSTM-CRF sequence tagger.

    Word indices are embedded (weights initialised from the module-level
    ``base_embedding_matrix``), passed through a single-layer bidirectional
    LSTM, projected to per-tag emission scores, and combined with a learned
    tag-transition matrix for CRF scoring and Viterbi decoding.

    Also relies on the module-level names ``START_TAG``, ``STOP_TAG`` and
    ``device``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the embedding, LSTM, projection and transition parameters.

        Args:
            vocab_size: number of rows in the word-embedding table.
            tag_to_ix: mapping from tag to index; must contain ``START_TAG``
                and ``STOP_TAG``.
            embedding_dim: width of the word embeddings.
            hidden_dim: total LSTM output width (``hidden_dim // 2`` per direction).
        """
        super(Baseline, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(base_embedding_matrix))

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random LSTM state, each tensor of shape ``(2, 1, hidden_dim // 2)``.

        NOTE: the state is drawn with ``torch.randn`` on every call, and
        ``_get_lstm_features`` calls this for every sentence, so emission
        scores are stochastic even in eval mode unless the RNG is seeded.
        """
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        """Run the forward algorithm and return the log partition function.

        Args:
            feats: per-token emission scores; iterated one token at a time,
                each row indexed by tag.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Start the recursion from the initial alphas
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return emission scores of shape ``(len(sentence), tagset_size)``.

        The hidden state is re-initialised for every sentence, and the
        embeddings are reshaped to ``(len(sentence), 1, -1)`` (batch of one)
        before being fed to the LSTM.
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the CRF score of the given tag sequence.

        The score sums, per token, the transition into its tag and the
        emission for that tag, and finally the transition into ``STOP_TAG``.
        ``START_TAG`` is prepended to ``tags`` so the first transition is defined.
        """
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return ``(path_score, best_path)`` for the emission scores ``feats``.

        ``best_path`` is a list of tag indices, one per token, with the start
        tag removed.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the training loss: log partition function minus the gold-path score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Return ``(score, tag_seq)``: the best Viterbi path score and its tag indices."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [None]:
import gensim.downloader as api
base_word_emb_model = api.load("glove-twitter-25")

# Build the baseline embedding matrix: one 25-dimensional GloVe row per entry
# of word_list, in the same order.
base_embedding_matrix = []
for word in word_list:
    try:
        # Index the loaded vectors directly. The old `.wv` accessor is
        # deprecated, and with a bare `except:` its removal would have turned
        # every row into zeros without any error.
        base_embedding_matrix.append(base_word_emb_model[word])
    except KeyError:
        # Word missing from the GloVe vocabulary: use an all-zero vector.
        base_embedding_matrix.append([0]*25)
base_embedding_matrix = np.array(base_embedding_matrix)
base_embedding_matrix.shape

'''
Code of the baseline model is from lab 9.
'''



  import sys


'\nCode of the baseline model is from lab 9.\n'

In [None]:
# Configuration and training run for the baseline model (word embeddings
# only, no attention).
#
# Three types of attentions are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

num_layers = 1
SEM_EMBEDDING_DIM = 25 # Width of the glove-twitter-25 word vectors

HIDDEN_DIM = 50
epochs = 20 # The original configuration; read as a global by train_crf
attn_type = ATTN_TYPE_NONE

# All extra features off, so train_crf / cal_acc call the model with the
# sentence only, which matches Baseline's single-argument methods.
embedding_config = [False, False, False] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

EMBEDDING_DIM = SEM_EMBEDDING_DIM

# Pass EMBEDDING_DIM rather than repeating the literal 25, so the model width
# follows the configuration above.
base_model = Baseline(len(word_to_ix), label_to_idx, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(base_model.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(base_model, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 15010.39, train acc: 0.7238, val loss: 4790.61, val acc: 0.6763, time: 115.75s
Epoch: 2, Training loss: 9804.34, train acc: 0.7589, val loss: 3956.37, val acc: 0.7148, time: 117.77s
Epoch: 3, Training loss: 8091.52, train acc: 0.7821, val loss: 3505.46, val acc: 0.7308, time: 118.42s
Epoch: 4, Training loss: 7024.61, train acc: 0.7971, val loss: 3149.58, val acc: 0.7421, time: 118.35s
Epoch: 5, Training loss: 6221.92, train acc: 0.8109, val loss: 2858.82, val acc: 0.7497, time: 121.44s
Epoch: 6, Training loss: 5593.91, train acc: 0.8312, val loss: 2656.78, val acc: 0.7600, time: 121.12s
Epoch: 7, Training loss: 5073.71, train acc: 0.8361, val loss: 2577.77, val acc: 0.7662, time: 121.08s
Epoch: 8, Training loss: 4591.77, train acc: 0.8511, val loss: 2459.27, val acc: 0.7755, time: 121.75s
Epoch: 9, Training loss: 4185.82, train acc: 0.8662, val loss: 2352.06, val acc: 0.7799, time: 121.70s
Epoch: 10, Training loss: 3800.04, train acc: 0.8764, val loss: 2323.39,

In [None]:
# Persist the trained baseline through save() (writes under the mounted Google Drive).
save(base_model, 'baseline_model.pt')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### 4.1.2 Our Model

In [None]:
# Evaluate the saved full model on the validation set.
# cal_acc reads embedding_config as a global, so it must be switched back to
# "all features" after the baseline run turned them off.
embedding_config = [True, True, True]
# NOTE(review): EMBEDDING_DIM and attn_type are reassigned here but are not
# passed to torch.load or cal_acc; SEM_EMBEDDING_DIM is still 25 from the
# baseline cell at this point. TODO confirm whether the loaded model reads
# either global at inference time.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE

# Reload the checkpoint written earlier by torch.save(model_crf, 'our_model.pt').
our_model = torch.load('our_model.pt')

# NOTE(review): the saved outputs of the cells that evaluate this same
# checkpoint report slightly different accuracies, so evaluation is not
# deterministic — TODO confirm the cause (e.g. a randomly initialised LSTM state).
_, _, val_acc = cal_acc(our_model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)
print("Accuracy: " + str(round(val_acc * 100, 2)) + "%")

Accuracy: 79.71%


### 4.2 Ablation Study - different input embedding model

In [None]:
# Shared settings for the input-embedding ablation (sections 4.2.x).
# They restore SEM_EMBEDDING_DIM = 50 and epochs = 10 after the baseline
# section set them to 25 and 20.
#
# Three types of attentions are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

num_layers = 1
SEM_EMBEDDING_DIM = 50 # Width of the word embedding alone, without pos tag

HIDDEN_DIM = 50
epochs = 10  # read as a global by train_crf
# Not passed to any constructor; presumably read as globals by BiLSTM_CRF — TODO confirm.
attn_type = ATTN_TYPE_CONTENT_BASE
attn_num = 1

#### 4.2.1 Word Embedding

In [None]:
# Ablation 4.2.1: word embedding only, no pos / dep / ent features.
embedding_config = [False, False, False] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

EMBEDDING_DIM = SEM_EMBEDDING_DIM

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_sem = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_sem.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_sem, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 10726.34, train acc: 0.8585, val loss: 2847.94, val acc: 0.7613, time: 121.81s
Epoch: 2, Training loss: 4316.34, train acc: 0.9162, val loss: 2502.80, val acc: 0.7823, time: 123.25s
Epoch: 3, Training loss: 2603.91, train acc: 0.9366, val loss: 2775.41, val acc: 0.7783, time: 121.82s
Epoch: 4, Training loss: 2029.57, train acc: 0.9444, val loss: 2993.74, val acc: 0.7780, time: 120.64s
Epoch: 5, Training loss: 1605.54, train acc: 0.9551, val loss: 3239.98, val acc: 0.7630, time: 120.81s
Epoch: 6, Training loss: 1437.58, train acc: 0.9563, val loss: 3544.06, val acc: 0.7749, time: 121.19s
Epoch: 7, Training loss: 1321.26, train acc: 0.9587, val loss: 3734.16, val acc: 0.7717, time: 121.34s
Epoch: 8, Training loss: 1192.50, train acc: 0.9666, val loss: 3855.21, val acc: 0.7736, time: 127.42s
Epoch: 9, Training loss: 1026.22, train acc: 0.9740, val loss: 4322.08, val acc: 0.7766, time: 126.89s
Epoch: 10, Training loss: 928.29, train acc: 0.9687, val loss: 4735.40, 

In [None]:
# Persist the word-embedding-only model through save() (mounted Google Drive).
save(model_sem, 'model_sem.pt')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### 4.2.2 Word Embedding + PoS Tagging

In [None]:
# Ablation 4.2.2: word embedding + PoS feature.
embedding_config = [False, False, True] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Input width grows by shape[0] of the PoS embedding.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0]

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_sem_pos = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_sem_pos.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_sem_pos, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 14442.39, train acc: 0.8446, val loss: 3177.79, val acc: 0.7759, time: 126.57s
Epoch: 2, Training loss: 4832.65, train acc: 0.9049, val loss: 2643.78, val acc: 0.7861, time: 124.58s
Epoch: 3, Training loss: 2986.43, train acc: 0.9321, val loss: 2587.58, val acc: 0.7941, time: 123.78s
Epoch: 4, Training loss: 2133.67, train acc: 0.9374, val loss: 3010.04, val acc: 0.7819, time: 125.41s
Epoch: 5, Training loss: 1677.84, train acc: 0.9532, val loss: 3045.98, val acc: 0.7941, time: 125.56s
Epoch: 6, Training loss: 1529.34, train acc: 0.9573, val loss: 3196.31, val acc: 0.7928, time: 122.17s
Epoch: 7, Training loss: 1348.26, train acc: 0.9579, val loss: 3410.50, val acc: 0.7886, time: 119.81s
Epoch: 8, Training loss: 1315.07, train acc: 0.9611, val loss: 3357.65, val acc: 0.7857, time: 118.59s
Epoch: 9, Training loss: 1233.01, train acc: 0.9535, val loss: 3764.49, val acc: 0.7736, time: 118.96s
Epoch: 10, Training loss: 1367.30, train acc: 0.9624, val loss: 4109.25,

In [None]:
# Persist the word + PoS model through save() (mounted Google Drive).
save(model_sem_pos, 'model_sem_pos.pt')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### 4.2.3 Word Embedding + PoS Tagging + Dependency

In [None]:
# Ablation 4.2.3: word embedding + PoS + dependency features.
embedding_config = [False, True, True] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Input width grows by shape[0] of the PoS and dependency embeddings.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0]

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_sem_pos_dep = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_sem_pos_dep.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_sem_pos_dep, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 11537.83, train acc: 0.8511, val loss: 2936.48, val acc: 0.7753, time: 119.70s
Epoch: 2, Training loss: 4641.92, train acc: 0.8956, val loss: 2330.46, val acc: 0.7994, time: 118.41s
Epoch: 3, Training loss: 3020.86, train acc: 0.9127, val loss: 2627.27, val acc: 0.7895, time: 119.26s
Epoch: 4, Training loss: 2305.39, train acc: 0.9081, val loss: 3029.41, val acc: 0.7666, time: 120.06s
Epoch: 5, Training loss: 2093.75, train acc: 0.9416, val loss: 2931.60, val acc: 0.7763, time: 119.39s
Epoch: 6, Training loss: 1814.11, train acc: 0.9395, val loss: 3135.58, val acc: 0.7854, time: 119.76s
Epoch: 7, Training loss: 1688.02, train acc: 0.9387, val loss: 3345.52, val acc: 0.7753, time: 119.70s
Epoch: 8, Training loss: 1559.84, train acc: 0.9625, val loss: 3360.65, val acc: 0.7888, time: 118.07s
Epoch: 9, Training loss: 1506.06, train acc: 0.9549, val loss: 3566.86, val acc: 0.7912, time: 118.06s
Epoch: 10, Training loss: 1358.25, train acc: 0.9629, val loss: 3552.95,

In [None]:
# Persist the word + PoS + dependency model through save() (mounted Google Drive).
save(model_sem_pos_dep, 'model_sem_pos_dep.pt')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### 4.2.4 Word Embedding + PoS Tagging + Dependency + Entity (Our Model)

In [None]:
# Ablation 4.2.4: the full model is not retrained; the saved checkpoint is
# reloaded and evaluated on the validation set.
# cal_acc reads embedding_config as a global, so enable all features again.
embedding_config = [True, True, True]
# NOTE(review): EMBEDDING_DIM and attn_type are not passed to torch.load or
# cal_acc below — TODO confirm whether the loaded model reads them.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE

# Reload the checkpoint written earlier by torch.save(model_crf, 'our_model.pt').
our_model = torch.load('our_model.pt')
_, _, val_acc = cal_acc(our_model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)
print("Accuracy: " + str(round(val_acc * 100, 2)) + "%")

Accuracy: 79.43%


### 4.3 Ablation Study - different attention strategy

In [None]:
# Shared settings for the attention ablation (sections 4.3.x): all input
# features enabled; each sub-section then picks its own attn_type.
#
# Three types of attentions are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

num_layers = 1
SEM_EMBEDDING_DIM = 50 # Width of the word embedding alone, without pos tag
# Total input width: word embedding plus shape[0] of each feature embedding.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]

HIDDEN_DIM = 50
epochs = 10  # read as a global by train_crf

# Read as a global by train_crf and cal_acc to decide which features to feed.
embedding_config = [True, True, True] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Not passed to any constructor; presumably read as a global by BiLSTM_CRF — TODO confirm.
attn_num = 1

#### 4.3.0 No Attention

In [None]:
# Ablation 4.3.0: same inputs as the full model, attention disabled.
# attn_type is not a constructor argument; presumably BiLSTM_CRF reads the
# global when it is built — TODO confirm.
attn_type = ATTN_TYPE_NONE

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_no_attn = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_no_attn.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_no_attn, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 9506.67, train acc: 0.8590, val loss: 2623.50, val acc: 0.7867, time: 103.91s
Epoch: 2, Training loss: 4209.53, train acc: 0.8967, val loss: 2272.01, val acc: 0.8032, time: 103.80s
Epoch: 3, Training loss: 2893.37, train acc: 0.9164, val loss: 2415.11, val acc: 0.8039, time: 104.84s
Epoch: 4, Training loss: 2300.79, train acc: 0.9380, val loss: 2410.50, val acc: 0.8015, time: 104.01s
Epoch: 5, Training loss: 1887.07, train acc: 0.9321, val loss: 2848.22, val acc: 0.7948, time: 104.21s
Epoch: 6, Training loss: 1758.94, train acc: 0.9407, val loss: 2929.00, val acc: 0.7871, time: 105.19s
Epoch: 7, Training loss: 1700.94, train acc: 0.9545, val loss: 2872.68, val acc: 0.8026, time: 105.54s
Epoch: 8, Training loss: 1518.99, train acc: 0.9608, val loss: 2813.65, val acc: 0.7994, time: 104.73s
Epoch: 9, Training loss: 1382.17, train acc: 0.9463, val loss: 3371.29, val acc: 0.7859, time: 105.03s
Epoch: 10, Training loss: 1383.01, train acc: 0.9575, val loss: 3343.39, 

In [None]:
# Persist the no-attention model through save() (mounted Google Drive).
save(model_no_attn, 'model_no_attn.pt')

#### 4.3.1 Dot-Product Attention

In [None]:
# Ablation 4.3.1: dot-product attention.
# attn_type is not a constructor argument; presumably BiLSTM_CRF reads the
# global when it is built — TODO confirm.
attn_type = ATTN_TYPE_DOT_PRODUCT

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_dot_prod = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_dot_prod.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_dot_prod, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 11569.67, train acc: 0.8574, val loss: 3003.09, val acc: 0.7935, time: 104.81s
Epoch: 2, Training loss: 4728.40, train acc: 0.8970, val loss: 2508.46, val acc: 0.7886, time: 106.08s
Epoch: 3, Training loss: 3086.33, train acc: 0.9240, val loss: 2804.63, val acc: 0.7928, time: 106.80s
Epoch: 4, Training loss: 2388.01, train acc: 0.9369, val loss: 2669.07, val acc: 0.7911, time: 107.55s
Epoch: 5, Training loss: 1898.64, train acc: 0.9405, val loss: 2950.74, val acc: 0.7971, time: 107.03s
Epoch: 6, Training loss: 1748.82, train acc: 0.9446, val loss: 3188.11, val acc: 0.8017, time: 106.42s
Epoch: 7, Training loss: 1874.92, train acc: 0.9434, val loss: 3220.46, val acc: 0.7933, time: 106.41s
Epoch: 8, Training loss: 1727.14, train acc: 0.9511, val loss: 3197.54, val acc: 0.8007, time: 106.38s
Epoch: 9, Training loss: 1667.40, train acc: 0.9580, val loss: 3510.20, val acc: 0.8003, time: 105.94s
Epoch: 10, Training loss: 1568.44, train acc: 0.9491, val loss: 3722.73,

In [None]:
# Persist the dot-product-attention model through save() (mounted Google Drive).
save(model_dot_prod, 'model_dot_prod.pt')

#### 4.3.2 Scaled Dot-Product Attention

In [None]:
# Ablation 4.3.2: scaled dot-product attention.
# attn_type is not a constructor argument; presumably BiLSTM_CRF reads the
# global when it is built — TODO confirm.
attn_type = ATTN_TYPE_SCALE_DOT_PRODUCT

# Constructor signature for reference:
# def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix):
model_s_dot_prod = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
optimizer = optim.Adam(model_s_dot_prod.parameters(), lr=0.01, weight_decay=1e-4)

train_crf(model_s_dot_prod, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 11544.38, train acc: 0.8523, val loss: 2851.70, val acc: 0.7783, time: 105.69s
Epoch: 2, Training loss: 4821.00, train acc: 0.8977, val loss: 2498.97, val acc: 0.8043, time: 106.06s
Epoch: 3, Training loss: 3217.47, train acc: 0.9179, val loss: 2514.90, val acc: 0.7933, time: 107.06s
Epoch: 4, Training loss: 2330.40, train acc: 0.9363, val loss: 2792.36, val acc: 0.7914, time: 106.48s
Epoch: 5, Training loss: 1785.24, train acc: 0.9473, val loss: 3041.39, val acc: 0.7929, time: 106.53s
Epoch: 6, Training loss: 1657.50, train acc: 0.9452, val loss: 2884.56, val acc: 0.7926, time: 107.28s
Epoch: 7, Training loss: 1835.62, train acc: 0.9380, val loss: 3355.47, val acc: 0.7986, time: 107.27s
Epoch: 8, Training loss: 1712.29, train acc: 0.9485, val loss: 3271.46, val acc: 0.7931, time: 106.23s
Epoch: 9, Training loss: 1728.55, train acc: 0.9485, val loss: 3599.71, val acc: 0.7918, time: 107.13s
Epoch: 10, Training loss: 1608.15, train acc: 0.9534, val loss: 3500.35,

In [None]:
# Persist the scaled-dot-product-attention model through save() (mounted Google Drive).
save(model_s_dot_prod, 'model_s_dot_prod.pt')

#### 4.3.3 Content-Based Attention (Our Model)

In [None]:
# Ablation 4.3.3: content-based attention is the full model, so the saved
# checkpoint is reloaded and evaluated instead of being retrained.
# cal_acc reads embedding_config as a global.
embedding_config = [True, True, True]
# NOTE(review): EMBEDDING_DIM and attn_type are not passed to torch.load or
# cal_acc below — TODO confirm whether the loaded model reads them.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE

# Reload the checkpoint written earlier by torch.save(model_crf, 'our_model.pt').
our_model = torch.load('our_model.pt')

_, _, val_acc = cal_acc(our_model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)
print("Accuracy: " + str(round(val_acc * 100, 2)) + "%")

Accuracy: 78.86%


#### 4.3.4 1 Attention (Single-head Attention)

In [None]:
# Evaluate the saved reference model with a single attention head (attn_num = 1)
# on the validation set.
embedding_config = [True, True, True]  # [apply_ent, apply_dep, apply_pos]
# Word-embedding width plus shape[0] of each extra feature embedding matrix.
# Assumes those matrices are square (e.g. one-hot), so shape[0] is the per-token width — TODO confirm.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE
attn_num = 1

# map_location lets a checkpoint that was written on a GPU runtime load on a CPU-only one.
our_model = torch.load('our_model.pt', map_location=device)
# Put the module in inference mode so train-only layers such as dropout (if any) cannot
# make the reported accuracy vary between runs of the same checkpoint.
# NOTE(review): harmless if cal_acc already does this.
our_model.eval()

_, _, val_acc = cal_acc(our_model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)
print("Accuracy: " + str(round(val_acc * 100, 2)) + "%")

Accuracy: 79.37%


#### 4.3.5 2 Attentions (Multi-head Attention)

In [None]:
# Attention ablation run: content-based attention with 2 attention heads.
# Flag order is [apply_ent, apply_dep, apply_pos].
# NOTE(review): unlike the configuration cells, this cell does not re-unpack
# apply_ent / apply_dep / apply_pos from embedding_config — confirm which of the two the model reads.
embedding_config = [True, True, True]
# Word-embedding width plus shape[0] of each extra feature embedding matrix.
# Assumes those matrices are square (e.g. one-hot), so shape[0] is the per-token width — TODO confirm.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE
# Number of attention heads; a notebook-level global, not passed to BiLSTM_CRF explicitly.
attn_num = 2

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_attn_2 = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used by the other runs in this section.
optimizer = optim.Adam(model_attn_2.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_attn_2, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 10061.31, train acc: 0.8479, val loss: 2651.95, val acc: 0.7810, time: 99.86s
Epoch: 2, Training loss: 4464.67, train acc: 0.8962, val loss: 2416.99, val acc: 0.7933, time: 100.59s
Epoch: 3, Training loss: 2907.85, train acc: 0.9277, val loss: 2540.85, val acc: 0.7945, time: 100.86s
Epoch: 4, Training loss: 2256.57, train acc: 0.9368, val loss: 2586.97, val acc: 0.7901, time: 100.57s
Epoch: 5, Training loss: 2111.71, train acc: 0.9374, val loss: 3097.03, val acc: 0.7884, time: 99.94s
Epoch: 6, Training loss: 1925.89, train acc: 0.9448, val loss: 2983.19, val acc: 0.7827, time: 100.62s
Epoch: 7, Training loss: 1658.45, train acc: 0.9494, val loss: 3087.65, val acc: 0.7876, time: 100.08s
Epoch: 8, Training loss: 1670.27, train acc: 0.9472, val loss: 3475.97, val acc: 0.7816, time: 100.12s
Epoch: 9, Training loss: 1585.41, train acc: 0.9573, val loss: 3204.14, val acc: 0.7914, time: 100.32s
Epoch: 10, Training loss: 1520.89, train acc: 0.9539, val loss: 3479.64, v

In [None]:
# Hand the 2-head attention model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_attn_2.pt' — confirm against its definition.
save(model_attn_2, 'model_attn_2.pt')

#### 4.3.6 4 Attentions (Multi-head Attention)

In [None]:
# Attention ablation run: content-based attention with 4 attention heads.
# Flag order is [apply_ent, apply_dep, apply_pos].
# NOTE(review): unlike the configuration cells, this cell does not re-unpack
# apply_ent / apply_dep / apply_pos from embedding_config — confirm which of the two the model reads.
embedding_config = [True, True, True]
# Word-embedding width plus shape[0] of each extra feature embedding matrix.
# Assumes those matrices are square (e.g. one-hot), so shape[0] is the per-token width — TODO confirm.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]
attn_type = ATTN_TYPE_CONTENT_BASE
# Number of attention heads; a notebook-level global, not passed to BiLSTM_CRF explicitly.
attn_num = 4

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_attn_4 = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used by the other runs in this section.
optimizer = optim.Adam(model_attn_4.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_attn_4, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 9499.52, train acc: 0.8456, val loss: 2522.55, val acc: 0.7863, time: 100.86s
Epoch: 2, Training loss: 4303.27, train acc: 0.9011, val loss: 2220.68, val acc: 0.8032, time: 102.27s
Epoch: 3, Training loss: 3017.09, train acc: 0.9169, val loss: 2433.04, val acc: 0.7947, time: 102.75s
Epoch: 4, Training loss: 2347.64, train acc: 0.9309, val loss: 2602.25, val acc: 0.7975, time: 102.90s
Epoch: 5, Training loss: 2094.71, train acc: 0.9389, val loss: 2747.24, val acc: 0.7922, time: 100.54s
Epoch: 6, Training loss: 1751.99, train acc: 0.9461, val loss: 2886.18, val acc: 0.7920, time: 101.58s
Epoch: 7, Training loss: 1729.54, train acc: 0.9492, val loss: 3183.56, val acc: 0.7859, time: 100.90s
Epoch: 8, Training loss: 1763.81, train acc: 0.9575, val loss: 3170.85, val acc: 0.7964, time: 99.95s
Epoch: 9, Training loss: 1434.41, train acc: 0.9611, val loss: 3357.13, val acc: 0.7990, time: 100.04s
Epoch: 10, Training loss: 1286.02, train acc: 0.9582, val loss: 3626.91, v

In [None]:
# Hand the 4-head attention model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_attn_4.pt' — confirm against its definition.
save(model_attn_4, 'model_attn_4.pt')

### 4.4 Ablation Study - Different Numbers of Stacked Layers (Encoder/Decoder Strategy)

In [None]:
# Shared configuration for the stacked-layer ablation (section 4.4).
# Every run in this section uses the values below and only changes num_layers.
#
# Three types of attention are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

# Width of the word (semantic) embedding alone, before any extra feature embeddings are added.
SEM_EMBEDDING_DIM = 50 # Without pos tag
# Word-embedding width plus shape[0] of each extra feature embedding matrix.
# Assumes those matrices are square (e.g. one-hot), so shape[0] is the per-token width — TODO confirm.
EMBEDDING_DIM = SEM_EMBEDDING_DIM + pos_embedding.shape[0] + dep_embedding.shape[0] + ent_embedding.shape[0]

HIDDEN_DIM = 50
epochs = 10

# All three extra feature embeddings are switched on for this ablation.
embedding_config = [True, True, True] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Content-based attention with a single head.
attn_type = ATTN_TYPE_CONTENT_BASE
attn_num = 1

#### 4.4.1 With 1 Stacked Layer (Our Model)

In [None]:
# Evaluate the saved reference model (1 stacked layer) on the validation set.
# map_location lets a checkpoint that was written on a GPU runtime load on a CPU-only one.
our_model = torch.load('our_model.pt', map_location=device)
# Put the module in inference mode so train-only layers such as dropout (if any) cannot
# make the reported accuracy vary between runs of the same checkpoint.
# NOTE(review): harmless if cal_acc already does this.
our_model.eval()

_, _, val_acc = cal_acc(our_model, val_input_index, val_output_index, val_ent_index, val_pos_index, val_dep_index)
print("Accuracy: " + str(round(val_acc * 100, 2)) + "%")

Accuracy: 79.14%


#### 4.4.2 With 2 Stacked Layers

In [None]:
# Stacked-layer ablation run: 2 layers. Everything else comes from the section's config cell.
num_layers = 2

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_2 = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used by the other runs in this section.
optimizer = optim.Adam(model_2.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_2, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 12200.74, train acc: 0.7914, val loss: 2997.48, val acc: 0.7421, time: 114.32s
Epoch: 2, Training loss: 5962.80, train acc: 0.8627, val loss: 2251.65, val acc: 0.7873, time: 115.62s
Epoch: 3, Training loss: 4195.30, train acc: 0.8895, val loss: 2090.15, val acc: 0.7865, time: 116.47s
Epoch: 4, Training loss: 3225.31, train acc: 0.9080, val loss: 2233.94, val acc: 0.7979, time: 115.74s
Epoch: 5, Training loss: 2712.09, train acc: 0.9094, val loss: 2530.53, val acc: 0.7903, time: 115.34s
Epoch: 6, Training loss: 2545.04, train acc: 0.9236, val loss: 2375.53, val acc: 0.7907, time: 115.96s
Epoch: 7, Training loss: 2392.34, train acc: 0.9137, val loss: 2687.86, val acc: 0.7840, time: 116.22s
Epoch: 8, Training loss: 2302.19, train acc: 0.9375, val loss: 2466.24, val acc: 0.7941, time: 115.32s
Epoch: 9, Training loss: 2109.84, train acc: 0.9293, val loss: 2683.61, val acc: 0.7795, time: 116.14s
Epoch: 10, Training loss: 1890.90, train acc: 0.9478, val loss: 2829.54,

In [None]:
# Hand the 2-layer model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_2.pt' — confirm against its definition.
save(model_2, 'model_2.pt')

#### 4.4.3 With 4 Stacked Layers

In [None]:
# Stacked-layer ablation run: 4 layers. Everything else comes from the section's config cell.
num_layers = 4

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_4 = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used by the other runs in this section.
optimizer = optim.Adam(model_4.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_4, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 14196.10, train acc: 0.7371, val loss: 3542.23, val acc: 0.6871, time: 132.13s
Epoch: 2, Training loss: 7629.90, train acc: 0.7446, val loss: 2894.79, val acc: 0.6898, time: 131.56s
Epoch: 3, Training loss: 6380.14, train acc: 0.7864, val loss: 2611.17, val acc: 0.7321, time: 133.31s
Epoch: 4, Training loss: 5538.39, train acc: 0.8019, val loss: 2708.97, val acc: 0.7349, time: 133.98s
Epoch: 5, Training loss: 4985.87, train acc: 0.8323, val loss: 2488.33, val acc: 0.7505, time: 133.23s
Epoch: 6, Training loss: 4588.07, train acc: 0.8495, val loss: 2569.72, val acc: 0.7584, time: 133.91s
Epoch: 7, Training loss: 4353.65, train acc: 0.8458, val loss: 2559.48, val acc: 0.7584, time: 134.34s
Epoch: 8, Training loss: 4036.16, train acc: 0.8556, val loss: 2544.93, val acc: 0.7539, time: 133.83s
Epoch: 9, Training loss: 3861.35, train acc: 0.8552, val loss: 2809.23, val acc: 0.7594, time: 133.55s
Epoch: 10, Training loss: 3846.70, train acc: 0.8610, val loss: 2539.34,

In [None]:
# Hand the 4-layer model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_4.pt' — confirm against its definition.
save(model_4, 'model_4.pt')

#### 4.4.4 With 8 Stacked Layers

In [None]:
# Stacked-layer ablation run: 8 layers. Everything else comes from the section's config cell.
num_layers = 8

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_8 = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used by the other runs in this section.
optimizer = optim.Adam(model_8.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_8, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 17362.52, train acc: 0.6999, val loss: 5567.75, val acc: 0.6485, time: 166.34s
Epoch: 2, Training loss: 12418.12, train acc: 0.6999, val loss: 4864.92, val acc: 0.6485, time: 166.32s
Epoch: 3, Training loss: 11491.93, train acc: 0.6711, val loss: 4504.19, val acc: 0.6088, time: 167.99s
Epoch: 4, Training loss: 11261.05, train acc: 0.6906, val loss: 4398.35, val acc: 0.6369, time: 166.99s
Epoch: 5, Training loss: 11065.94, train acc: 0.6370, val loss: 4338.61, val acc: 0.5694, time: 167.17s
Epoch: 6, Training loss: 10912.08, train acc: 0.5707, val loss: 4443.54, val acc: 0.5309, time: 166.60s
Epoch: 7, Training loss: 10822.95, train acc: 0.6467, val loss: 4361.04, val acc: 0.6022, time: 167.73s
Epoch: 8, Training loss: 10709.92, train acc: 0.6338, val loss: 4337.43, val acc: 0.5641, time: 167.55s
Epoch: 9, Training loss: 10549.57, train acc: 0.5906, val loss: 4248.03, val acc: 0.5434, time: 168.27s
Epoch: 10, Training loss: 10471.50, train acc: 0.5682, val loss:

In [None]:
# Hand the 8-layer model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_8.pt' — confirm against its definition.
save(model_8, 'model_8.pt')

### 4.5 Ablation Study - with/without CRF

#### 4.5.1 The Plain Bi-LSTM Model


In [None]:
# NOTE: The preprocessing code is from lab 6.
# Pad to max_length
# max_length is the length of the longest sequence in train_input_index; every
# training and validation sequence is padded or truncated to this width below.
max_length = len(max(train_input_index, key=len))
print(max_length) 

def pad_sequence(seq_list, max_length, index_dict):
    """Bring every sequence to exactly ``max_length`` items and stack them.

    Args:
        seq_list: iterable of index lists (one list per sentence).
        max_length: target length of every row in the result.
        index_dict: mapping that holds the padding index under the key ``'<PAD>'``.

    Returns:
        ``np.ndarray`` built from the fitted rows. Sequences longer than
        ``max_length`` are cut at the end; shorter ones are right-padded with
        ``index_dict['<PAD>']``. The input lists are never modified.
    """
    def _fit(seq):
        missing = max_length - len(seq)
        if missing < 0:
            # Too long: keep only the leading max_length items.
            return seq[:max_length]
        # Copy first so the caller's list is left untouched, then append the filler.
        return seq[:] + [index_dict['<PAD>']] * missing

    return np.array([_fit(seq) for seq in seq_list])

# Pad/truncate inputs and labels to the same width so they can be stacked into arrays.
# Inputs are padded with word_to_ix['<PAD>'] and labels with label_to_idx['<PAD>'].
train_input_index_pad = pad_sequence(train_input_index, max_length, word_to_ix)
val_input_index_pad = pad_sequence(val_input_index, max_length, word_to_ix)
train_output_index_pad = pad_sequence(train_output_index, max_length, label_to_idx)
val_output_index_pad = pad_sequence(val_output_index, max_length, label_to_idx)



import torch
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load batches
# Pair each padded training input row with its padded label row.
from torch.utils.data import TensorDataset
# More detailed info about the TensorDataset: https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataset.html#TensorDataset
train_data = TensorDataset(torch.from_numpy(train_input_index_pad), torch.from_numpy(train_output_index_pad))

from torch.utils.data import DataLoader
# More detailed info about the DataLoader: https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataloader.html
# Mini-batches of 128 sentences; the last batch of an epoch may be smaller.
batch_size = 128
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True) 
# shuffle (bool, optional) – set to True to have the data reshuffled at every epoch (default: False).

'''
Code of padding and batching is from lab 6.
'''

154


'\nCode of padding and batching is from lab 6.\n'

In [None]:
# NOTE: You should test your NER model with CRF/ without CRF.
# Lab 9
import math
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ATTN_TYPE_NONE = None

class Bi_LSTM(nn.Module):
    """Plain bidirectional LSTM tagger used as the "without CRF" baseline.

    The embedding table is initialised from the notebook-level
    ``sem_embedding_matrix``; an LSTM runs over each sentence and a linear
    layer maps every time step to per-tag scores.

    Args:
        embedding_dim: width of the word embeddings; must match the number of
            columns of ``sem_embedding_matrix``.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        vocab_size: accepted for signature parity only; the embedding table is
            sized from ``sem_embedding_matrix`` instead.
        tagset_size: number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, vocab_size, tagset_size):
        super(Bi_LSTM, self).__init__()
        self.word_embeddings = nn.Embedding(sem_embedding_matrix.shape[0], embedding_dim)
        self.word_embeddings.weight.data.copy_(torch.from_numpy(sem_embedding_matrix))

        # The batches fed to this model are (batch, seq_len) index tensors, so the
        # LSTM must be told that the batch axis comes first. Without batch_first=True
        # it would recur across the sentences of a batch instead of across the
        # tokens of each sentence.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True)
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2.
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        """Return per-token tag log-probabilities.

        Args:
            sentence: integer tensor of word indices, shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch, seq_len, tagset_size) holding
            log-probabilities over the tags (each last-axis slice sums to 1
            after ``exp``), which is the input ``nn.NLLLoss`` expects.
        """
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        # NLLLoss needs log-probabilities; raw linear scores give an unbounded
        # negative loss. log_softmax keeps the argmax unchanged.
        return torch.log_softmax(tag_space, dim=-1)

'''
Code of the plain bi-lstm model is from lab 6.
'''

'\nCode of the plain bi-lstm model is from lab 6.\n'

In [None]:
# Configuration for the plain Bi-LSTM baseline (section 4.5.1): word embeddings only,
# one LSTM layer, no extra feature embeddings.
#
# Three types of attention are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

SEM_EMBEDDING_DIM = 50 # Without pos tag
# Only the word embedding is used here, so the model input width equals its width.
EMBEDDING_DIM = SEM_EMBEDDING_DIM

HIDDEN_DIM = 50
epochs = 10

num_layers = 1

embedding_config = [False, False, False] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# Value kept as in the original cell. NOTE(review): the Bi_LSTM class of this section
# takes no attention argument, so this global looks unused by the baseline — confirm.
attn_type = ATTN_TYPE_CONTENT_BASE

# CrossEntropyLoss applies log_softmax itself before the negative log-likelihood, so it
# gives a proper non-negative loss whether the model emits raw scores or log-probabilities
# (log_softmax is idempotent). nn.NLLLoss on unnormalised scores is unbounded below, which
# shows up as an ever more negative "training loss".
loss_function = nn.CrossEntropyLoss()

In [None]:
# Build the plain Bi-LSTM baseline (no CRF layer).
# Bi_LSTM positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tagset_size
# Note the last argument is the number of tags, not the label dictionary itself.
model_no_crf = Bi_LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), len(label_to_idx)).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used for the CRF models.
optimizer = optim.Adam(model_no_crf.parameters(), lr=0.01, weight_decay=1e-4)


In [None]:
from sklearn.metrics import accuracy_score

for epoch in range(epochs):
    # Per-epoch running sums; every batch adds its mean value weighted by its size,
    # so dividing by len(train_data) at the end yields per-sentence averages.
    epoch_loss_sum = 0.0
    epoch_acc_sum = 0

    for sentence, targets in train_loader:
        sentence, targets = sentence.to(device), targets.to(device)
        n_in_batch = sentence.size(0)

        model_no_crf.train()
        optimizer.zero_grad()
        tag_space = model_no_crf(sentence)

        # Flatten (batch, seq_len, tags) -> (batch * seq_len, tags) to score every token at once.
        flat_scores = tag_space.view(-1, tag_space.size(-1))
        flat_gold = targets.view(-1)
        loss = loss_function(flat_scores, flat_gold)
        loss.backward()
        optimizer.step()

        epoch_loss_sum += loss.item() * n_in_batch

        predicted = tag_space.argmax(dim=-1)
        # Note: The training accuracy here is calculated with "PAD", which will result in a relative higher accuracy.
        batch_acc = accuracy_score(predicted.view(-1).cpu().numpy(), flat_gold.cpu().numpy())
        epoch_acc_sum += batch_acc * n_in_batch

    n_sentences = len(train_data)
    print('Epoch: %d, training loss: %.4f, training accuracy: %.2f%%'%(epoch+1,epoch_loss_sum/n_sentences,100*epoch_acc_sum/n_sentences))

'''
Code of the training process is from lab 6.
'''

Epoch: 1, training loss: -0.5862, training accuracy: 52.88%
Epoch: 2, training loss: -5.3683, training accuracy: 86.68%
Epoch: 3, training loss: -11.0700, training accuracy: 89.70%
Epoch: 4, training loss: -16.4700, training accuracy: 89.61%
Epoch: 5, training loss: -21.8635, training accuracy: 89.61%
Epoch: 6, training loss: -27.3235, training accuracy: 89.79%
Epoch: 7, training loss: -32.8070, training accuracy: 90.23%
Epoch: 8, training loss: -38.3046, training accuracy: 90.55%
Epoch: 9, training loss: -43.8387, training accuracy: 91.18%
Epoch: 10, training loss: -49.3816, training accuracy: 92.33%


'\nCode of the training process is from lab 6.\n'

In [None]:
# Score the plain Bi-LSTM on the validation set, ignoring padded positions.
model_no_crf.eval()
sentence = torch.from_numpy(val_input_index_pad).to(device)
# Inference only: no_grad avoids building an autograd graph for the whole validation set.
with torch.no_grad():
    tag_space = model_no_crf(sentence)
predicted = torch.argmax(tag_space, -1)
predicted = predicted.cpu().numpy()

# cut off the PAD part
# Padded inputs were truncated to the padded width, so cap the real lengths at that
# width; otherwise a validation sentence longer than the padding width would leave
# predictions and gold tags with different lengths.
padded_width = predicted.shape[1]
test_len_list = [min(len(s), padded_width) for s in val_input_index]
actual_predicted_list = []
for i in range(predicted.shape[0]):
    actual_predicted_list.extend(predicted[i][:test_len_list[i]].tolist())

# get actual tag list
# Flatten in one linear pass (sum(list_of_lists, []) is quadratic) and apply the same cap.
actual_tags = [tag for tags in val_output_index for tag in tags[:padded_width]]

# accuracy_score takes (y_true, y_pred); the value is symmetric in its arguments.
# NOTE: the figure printed as "Test Accuracy" is computed on the validation split.
print('Test Accuracy: %.2f%%'%(accuracy_score(actual_tags, actual_predicted_list)*100))

'''
Code of the testing process is from lab 6.
'''

Test Accuracy: 55.23%


'\nCode of the testing process is from lab 6.\n'

In [None]:
# Hand the plain Bi-LSTM baseline trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_no_crf.pt' — confirm against its definition.
save(model_no_crf, 'model_no_crf.pt')

#### 4.5.2 Model With CRF (No Attention, Only Word Embedding)

In [None]:
# CRF counterpart of the plain Bi-LSTM baseline (section 4.5.2): same word-only input,
# one layer, no attention, so the two runs differ only in the CRF layer.
#
# Three types of attention are available:
# - ATTN_TYPE_DOT_PRODUCT
# - ATTN_TYPE_SCALE_DOT_PRODUCT
# - ATTN_TYPE_CONTENT_BASE
# NOTE: If you don't use attention, just initialize attn_type = ATTN_TYPE_NONE
ATTN_TYPE_NONE = None
ATTN_TYPE_DOT_PRODUCT = "Dot Product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"
ATTN_TYPE_CONTENT_BASE = "Content Base"

num_layers = 1
SEM_EMBEDDING_DIM = 50 # Without pos tag

HIDDEN_DIM = 50
epochs = 10
# Attention is switched off for this comparison.
attn_type = ATTN_TYPE_NONE
attn_num = 1

# No entity / dependency / POS feature embeddings: word embedding only.
embedding_config = [False, False, False] # [apply_ent, apply_dep, apply_pos]
apply_ent, apply_dep, apply_pos = embedding_config

# With every extra feature disabled the model input width is the word-embedding width.
EMBEDDING_DIM = SEM_EMBEDDING_DIM

# BiLSTM_CRF positional arguments, in order:
#   embedding_dim, hidden_dim, num_layers, vocab_size, tag_to_ix
model_comp = BiLSTM_CRF(EMBEDDING_DIM, HIDDEN_DIM, num_layers, len(word_to_ix), label_to_idx).to(device)
# Adam with lr=0.01 and weight_decay=1e-4, the same settings used for the plain Bi-LSTM baseline.
optimizer = optim.Adam(model_comp.parameters(), lr=0.01, weight_decay=1e-4)

# Train the model; the per-epoch train/validation loss and accuracy are printed below.
train_crf(model_comp, optimizer, train_input_index, train_output_index, val_input_index, val_output_index)

Epoch: 1, Training loss: 11706.90, train acc: 0.8597, val loss: 2904.21, val acc: 0.7689, time: 100.76s
Epoch: 2, Training loss: 4294.48, train acc: 0.9033, val loss: 2601.35, val acc: 0.7810, time: 102.84s
Epoch: 3, Training loss: 2680.14, train acc: 0.9147, val loss: 2727.44, val acc: 0.7774, time: 103.55s
Epoch: 4, Training loss: 1954.10, train acc: 0.9433, val loss: 2972.54, val acc: 0.7840, time: 104.62s
Epoch: 5, Training loss: 1622.60, train acc: 0.9564, val loss: 3309.45, val acc: 0.7789, time: 104.76s
Epoch: 6, Training loss: 1278.68, train acc: 0.9608, val loss: 3467.75, val acc: 0.7766, time: 104.44s
Epoch: 7, Training loss: 1177.16, train acc: 0.9660, val loss: 3745.75, val acc: 0.7749, time: 104.97s
Epoch: 8, Training loss: 1114.30, train acc: 0.9662, val loss: 3999.14, val acc: 0.7787, time: 104.94s
Epoch: 9, Training loss: 1018.63, train acc: 0.9640, val loss: 4300.25, val acc: 0.7670, time: 103.77s
Epoch: 10, Training loss: 1037.18, train acc: 0.9692, val loss: 4110.35,

In [None]:
# Hand the CRF comparison model trained above to the notebook's save() helper.
# save() is defined outside this section; presumably it writes the model to
# 'model_comp.pt' — confirm against its definition.
save(model_comp, 'model_comp.pt')