# Readme.txt

If you wish to run this .ipynb file, run the cells in which the following is written.
```
#**********************************RUN THIS CELL**********************************#
```

##Load the Datasets

In [0]:
#**********************************RUN THIS CELL**********************************#

# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#Training Data
id = '19E2v3QyOqUohMG65Qn5n_zlAhzJ0cvN4'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train.csv')

#Validation Data
id = '1BMX04M5J-6Pqsejyf1rp7AIZGJiLdl7a'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('val.csv')

#Testing Data
id = '1NrkdJJ00OwD8naPucpzFh_KnClBp0NZZ'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test.csv')

import pandas as pd
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

train_data = df_train['Sentence'].tolist()
train_labels = df_train['NER'].tolist()
val_data = df_val['Sentence'].tolist()
val_labels = df_val['NER'].tolist()
test_data = df_test['Sentence'].tolist()
test_labels = df_test['NER'].tolist()

print("Training set number:",len(train_data))
print("Training labels number:",len(train_labels))
print("Validation set number:",len(val_data))
print("Validation labels number:",len(val_labels))
print("Testing set number:",len(test_data))
print("Testing labels number:",len(test_labels))

Training set number: 3000
Training labels number: 3000
Validation set number: 700
Validation labels number: 700
Testing set number: 3684
Testing labels number: 3684


# Tokenization

In [0]:
#**********************************RUN THIS CELL**********************************#

#Tokenization
train_data_tokenized = [s.split() for s in train_data]
train_labels_tokenized = [s.split() for s in train_labels]
val_data_tokenized = [s.split() for s in val_data]
val_labels_tokenized = [s.split() for s in val_labels]
test_data_tokenized = [s.split() for s in test_data]

# Make Dictionaries

In [0]:
#**********************************RUN THIS CELL**********************************#

word_to_ix = {}
for sentence in train_data_tokenized + val_data_tokenized + test_data_tokenized:
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in train_labels_tokenized:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

tags_list = list(tag_to_ix.keys())

# Get the Indices

In [0]:
#**********************************RUN THIS CELL**********************************#

def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        index_list = []
        for w in sent:
            try:
                index_list.append(to_ix[w])
            except:
                index_list.append(0)
        input_index_list.append(index_list)
    return input_index_list

train_input_index =  to_index(train_data_tokenized, word_to_ix)
train_output_index = to_index(train_labels_tokenized, tag_to_ix)
val_input_index = to_index(val_data_tokenized, word_to_ix)
val_output_index = to_index(val_labels_tokenized, tag_to_ix)

#  Generate Features

## Generate Word Embeddings

In [0]:
#**********************************RUN THIS CELL**********************************#

import numpy as np

import gensim.downloader as api
word_emb_model = api.load("glove-twitter-50") 

EMBEDDING_DIM = 50

embedding_matrix = []     
for word in word_list:
    try:
        embedding_matrix.append(word_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  # This is added back by InteractiveShellApp.init_path()


(13972, 50)

## Character Embedding

In [0]:
#**********************************RUN THIS CELL**********************************#

# Fetching the maximum length of a word in the list from both the dataset
max_word_len_train = len(max(word_list, key=len))

# Adding an extra character to make all the words of the reviews of the same length as maximum length of the word
def add_padding(corpus, max_word_len):
    output = []
    for word in corpus:
        if len(word)>max_word_len:
            output.append(word[:max_word_len])
        else:
            for j in range(max_word_len-len(word)):
                word = word + "-"
            output.append(word)
    return output

wordlist_train_pad = add_padding(word_list, max_word_len_train)

# from itertools import chain
# u_c = set(chain.from_iterable(word_list))

#Assume that we have the following character instances
char_arr = ['"', '$', '%', '&', "'", '(', ')', '*',
            '+', ',', '-', '.', '/', '0', '1', '2',
            '3', '4', '5', '6', '7', '8', '9', ':',
            ';', '=', '?', '[', ']', '`', 'a', 'b',
            'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
            'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
            's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
            '@', '#']
            # Added few extra characters in the dictionary
            # to handle the Padded and Out of Vocabulary values

# Create a dictionary for above char_arr
char_dict = {n: i for i, n in enumerate(char_arr)}
# Get the dictionary length
charDict_len = len(char_dict)

# Get one-hot encoding for every word
def encode_words(seq_data):
  input_batch = []
    
  for seq in seq_data:
    input_data = [char_dict[n] for n in seq]
    input_batch.append(np.eye(charDict_len)[input_data])
  return input_batch

char_embeds = encode_words(wordlist_train_pad)
char_vector = np.array(char_embeds)
char_vector.shape

(13972, 60, 58)

## Concatenate the features

### Concatenate Word Embeddings + Character Embeddings

In [0]:
#**********************************RUN THIS CELL**********************************#

embedding_matrix = np.concatenate((char_vector[:,:,-1], embedding_matrix),axis = 1)
print(embedding_matrix.shape)

(13972, 110)


# Bi-LSTM CRF Model

## NER Class

In [0]:
#**********************************RUN THIS CELL**********************************#

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True, dropout = 0.2)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.out = nn.Linear(hidden_dim * 2, hidden_dim)

    def _cal_attention(self, lstm_out, method):
        attention_result = torch.zeros(lstm_out.size()[0], self.hidden_dim * 2, device=device)
        if method == 'ATTN_TYPE_DOT_PRODUCT':
            # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
            for i in range(lstm_out.size()[0]):
                hidden = lstm_out[i]
                attn_weights = F.softmax(torch.bmm(hidden.unsqueeze(0).unsqueeze(0), lstm_out.T.unsqueeze(0)), dim=-1)
                attn_output = torch.bmm(attn_weights, lstm_out.unsqueeze(0))
                concat_output = torch.cat((hidden.unsqueeze(0),attn_output[0]), 1)
                attention_result[i] = concat_output.squeeze(0)
        elif method == 'ATTN_TYPE_SCALE_DOT_PRODUCT':
            for i in range(lstm_out.size()[0]):
                hidden = lstm_out[i]
                attn_weights = F.softmax(1/np.sqrt(self.hidden_dim)*torch.bmm(hidden.unsqueeze(0).unsqueeze(0), lstm_out.T.unsqueeze(0)), dim=-1)
                attn_output = torch.bmm(attn_weights, lstm_out.unsqueeze(0))
                concat_output = torch.cat((hidden.unsqueeze(0),attn_output[0]), 1)
                attention_result[i] = concat_output.squeeze(0)
        
        attention_out = self.hidden2tag(self.out(attention_result))
        return attention_out

    def init_hidden(self):
        return (torch.randn(4, 1, self.hidden_dim // 2).to(device),
                torch.randn(4, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        (h_n,h_c) = self.hidden
        hidden_out =torch.cat((h_n[0,:,:],h_n[1,:,:]),1)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        return lstm_out, hidden_out

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        lstm_out, hidden = self._get_lstm_features(sentence)
        attention_feats = self._cal_attention(lstm_out, 'ATTN_TYPE_SCALE_DOT_PRODUCT')
        forward_score = self._forward_alg(attention_feats)
        gold_score = self._score_sentence(attention_feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats, hidden_out = self._get_lstm_features(sentence)

        attention_feats = self._cal_attention(lstm_feats, 'ATTN_TYPE_SCALE_DOT_PRODUCT')

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(attention_feats)
        return score, tag_seq

## Calculate Accuracy

In [0]:
#**********************************RUN THIS CELL**********************************#

import numpy as np
def cal_acc(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return ground_truth, predicted, accuracy

## Initialize Model

In [0]:
#**********************************RUN THIS CELL**********************************#

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50
EMBEDDING_DIM = 110
model3 = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model3.parameters(), lr=0.01, weight_decay=1e-4)

## Train the model

In [0]:
"""Each epoch will take about 1-2 minutes-- 133.62sec"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0
    c = 0
    model3.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        # print('t:',c,tags_index)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model3.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        # print('s:',sentence_in)
        c+=1
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model3.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model3.eval()
    _, _, train_acc = cal_acc(model3, train_input_index,train_output_index)
    time2 = datetime.datetime.now()
  
    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, (time2-time1).total_seconds()))


Epoch:1, Training loss: 14656.10, train acc: 0.9078, time: 161.93s
Epoch:2, Training loss: 7842.14, train acc: 0.9306, time: 162.75s
Epoch:3, Training loss: 5871.51, train acc: 0.9407, time: 162.81s
Epoch:4, Training loss: 4752.80, train acc: 0.9495, time: 163.47s
Epoch:5, Training loss: 4016.49, train acc: 0.9521, time: 162.90s
Epoch:6, Training loss: 3359.83, train acc: 0.9573, time: 163.62s
Epoch:7, Training loss: 2936.69, train acc: 0.9619, time: 162.88s
Epoch:8, Training loss: 2616.81, train acc: 0.9634, time: 164.29s
Epoch:9, Training loss: 2246.66, train acc: 0.9690, time: 163.88s
Epoch:10, Training loss: 2048.15, train acc: 0.9707, time: 163.43s
Epoch:11, Training loss: 1834.37, train acc: 0.9737, time: 163.12s
Epoch:12, Training loss: 1617.41, train acc: 0.9746, time: 163.75s
Epoch:13, Training loss: 1445.29, train acc: 0.9759, time: 163.77s
Epoch:14, Training loss: 1348.81, train acc: 0.9813, time: 162.15s
Epoch:15, Training loss: 1227.52, train acc: 0.9808, time: 163.16s
Epo

# Save the model

In [0]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Save the Model
torch.save(model3, '/content/gdrive/My Drive/Colab Notebooks/Assignment_2_Files/NER_Model3.pt')

  "type " + obj.__name__ + ". It won't be checked "


# Load the model

In [0]:
#**********************************RUN THIS CELL**********************************#

# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
id = '1W-RRrj-KRHYluPV68K2syaAk0lG9gEwq'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('NER_Model3.pt')

In [0]:
#**********************************RUN THIS CELL**********************************#

# Load the model
NERModel3 = torch.load('NER_Model3.pt')

# Evaluate the model
NERModel3.eval()

BiLSTM_CRF(
  (word_embeds): Embedding(13972, 110)
  (lstm): LSTM(110, 25, num_layers=2, dropout=0.2, bidirectional=True)
  (hidden2tag): Linear(in_features=50, out_features=7, bias=True)
  (embedding): Embedding(13972, 50)
  (out): Linear(in_features=100, out_features=50, bias=True)
)

# Testing

## Testing on Validation Data

In [0]:
#**********************************RUN THIS CELL**********************************#

y_true, y_pred, _ = cal_acc(NERModel3, val_input_index,val_output_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_true_decode,y_pred_decode,digits=4))


              precision    recall  f1-score   support

       I-LOC     0.8012    0.9236    0.8581       419
      I-MISC     0.8581    0.6791    0.7582       187
       I-ORG     0.7286    0.5088    0.5992       285
       I-PER     0.9762    0.8423    0.9043       875
           O     0.9630    0.9931    0.9778      5790

    accuracy                         0.9457      7556
   macro avg     0.8654    0.7894    0.8195      7556
weighted avg     0.9441    0.9457    0.9429      7556



## Prediction on Test data

In [0]:
#**********************************RUN THIS CELL**********************************#

test_input_index = to_index(test_data_tokenized, word_to_ix)

import numpy as np

def calAccuracy_test(model, input_index):
    predicted = []
    for i, idxs in enumerate(input_index):
        _, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    return predicted

y_pred_test = calAccuracy_test(NERModel3, test_input_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

test_output = decode_output(y_pred_test)

## Write predictions to file

In [0]:
#**********************************RUN THIS CELL**********************************#

predicted_file = pd.DataFrame(columns = ['Id','Predicted'])
predicted_file['Predicted'] = test_output
predicted_file['Id'] = np.arange(0, len(test_output))
predicted_file.to_csv('Model3.csv', index=False)