In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import random
from transformers import BertTokenizer, BertModel
import json
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
# Seed PyTorch's global random number generator (fixed seed of 1).
torch.manual_seed(1)

<torch._C.Generator at 0x231a62e8c30>

In [2]:
# Device handed to every model constructor in the cells below.
device = "cpu"

In [3]:
# Optional: uncomment to use CUDA when it is available (overrides the CPU default set above).
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)

In [4]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of keys into a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of keys (e.g. tokens or tag names).
        to_ix: Mapping from key to integer index; every element of ``seq``
            is looked up with ``to_ix[key]``.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` holding one index per element.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

#### Data Loading

In [5]:
# Load the train / test / validation splits.
# Each file is opened in a `with` block so its handle is closed right after parsing.
with open('../Dataset/NER_train.json', 'r') as train_file:
    train_data = json.load(train_file)
with open('../Dataset/NER_test.json', 'r') as test_file:
    test_data = json.load(test_file)
with open('../Dataset/NER_val.json', 'r') as val_file:
    val_data = json.load(val_file)

In [6]:
# Word -> index mapping used to encode sentences.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artefacts.
with open('../Utils/word_to_idx.pkl', 'rb') as word_to_idx_file:
    word_to_idx = pickle.load(word_to_idx_file)

In [7]:
# Tag -> index mapping used to encode label sequences.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artefacts.
with open('../Utils/tag_to_ix.pkl', 'rb') as tag_to_ix_file:
    tag_to_ix = pickle.load(tag_to_ix_file)

#### RNN model

In [8]:
class RNN_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.RNN`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored as ``self.vocab_size``; not otherwise used by this class.
        embedding_dim: Input feature size of the recurrent layer.
        hidden_dim: Hidden size of the recurrent layer and input size of the output layer.
        target_size: Number of scores produced per token.
        embedding_mat: Pretrained embedding weights, converted with ``torch.FloatTensor``
            and loaded through ``nn.Embedding.from_pretrained`` with its default arguments.
        start_tag: Stored on the instance; not read by ``forward``.
        end_tag: Stored on the instance; not read by ``forward``.
        tag_to_ix: Stored on the instance; not read by ``forward``.
        device: Device each sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()
        # Bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.tag_to_ix = tag_to_ix
        self.start_tag = start_tag
        self.end_tag = end_tag

        # Layers: embedding lookup, recurrent encoder, per-token tag projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.rnn = nn.RNN(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities of shape ``(len(sentence), target_size)``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # The sentence is fed to the recurrent layer as a batch of one.
        hidden_states, _ = self.rnn(embedded.view(n_tokens, 1, -1))
        tag_logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(tag_logits, dim=1)


#### LSTM Model

In [9]:
class LSTM_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.LSTM`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored as ``self.vocab_size``; not otherwise used by this class.
        embedding_dim: Input feature size of the recurrent layer.
        hidden_dim: Hidden size of the recurrent layer and input size of the output layer.
        target_size: Number of scores produced per token.
        embedding_mat: Pretrained embedding weights, converted with ``torch.FloatTensor``
            and loaded through ``nn.Embedding.from_pretrained`` with its default arguments.
        start_tag: Stored on the instance; not read by ``forward``.
        end_tag: Stored on the instance; not read by ``forward``.
        tag_to_ix: Stored on the instance; not read by ``forward``.
        device: Device each sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()
        # Bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.tag_to_ix = tag_to_ix
        self.start_tag = start_tag
        self.end_tag = end_tag

        # Layers: embedding lookup, recurrent encoder, per-token tag projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities of shape ``(len(sentence), target_size)``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # The sentence is fed to the recurrent layer as a batch of one.
        hidden_states, _ = self.lstm(embedded.view(n_tokens, 1, -1))
        tag_logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(tag_logits, dim=1)

#### GRU model

In [10]:
class GRU_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.GRU`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored as ``self.vocab_size``; not otherwise used by this class.
        embedding_dim: Input feature size of the recurrent layer.
        hidden_dim: Hidden size of the recurrent layer and input size of the output layer.
        target_size: Number of scores produced per token.
        embedding_mat: Pretrained embedding weights, converted with ``torch.FloatTensor``
            and loaded through ``nn.Embedding.from_pretrained`` with its default arguments.
        start_tag: Stored on the instance; not read by ``forward``.
        end_tag: Stored on the instance; not read by ``forward``.
        tag_to_ix: Stored on the instance; not read by ``forward``.
        device: Device each sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()
        # Bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.tag_to_ix = tag_to_ix
        self.start_tag = start_tag
        self.end_tag = end_tag

        # Layers: embedding lookup, recurrent encoder, per-token tag projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.gru = nn.GRU(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities of shape ``(len(sentence), target_size)``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # The sentence is fed to the recurrent layer as a batch of one.
        hidden_states, _ = self.gru(embedded.view(n_tokens, 1, -1))
        tag_logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(tag_logits, dim=1)

#### Embedding matrices

In [11]:
# Pretrained embedding matrices, one per embedding family.
# Each file is opened in a `with` block so its handle is closed right after unpickling.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artefacts.
with open('../Utils/legal_bert_embedding_mat.pkl', 'rb') as bert_file:
    bert_embedding_mat = pickle.load(bert_file)
with open('../Utils/word2vec_embedding_mat.pkl', 'rb') as word2vec_file:
    word2vec_embedding_mat = pickle.load(word2vec_file)
with open('../Utils/glove_embedding_mat.pkl', 'rb') as glove_file:
    glove_embedding_mat = pickle.load(glove_file)

#### Glove + RNN 

In [12]:
# Train the RNN tagger on the GloVe embedding matrix (embedding_dim=300, hidden_dim=256).
rnn_model = RNN_model(len(word_to_idx), 300, 256, len(tag_to_ix), glove_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

Epoch:  0


100%|██████████| 8019/8019 [01:02<00:00, 127.46it/s]
100%|██████████| 1416/1416 [00:04<00:00, 350.29it/s]


Train Loss: 0.6250451058513435, Val Loss: 0.527879299745253, Train F1: 0.4484349906418637, Val F1: 0.48772211086730954
Epoch:  1


100%|██████████| 8019/8019 [01:05<00:00, 121.87it/s]
100%|██████████| 1416/1416 [00:04<00:00, 300.46it/s]


Train Loss: 0.4817888299660303, Val Loss: 0.443545947785655, Train F1: 0.5393097769292842, Val F1: 0.5602474425929933
Epoch:  2


100%|██████████| 8019/8019 [01:06<00:00, 121.04it/s]
100%|██████████| 1416/1416 [00:03<00:00, 358.25it/s]


Train Loss: 0.42115130154066965, Val Loss: 0.41041219450846134, Train F1: 0.5775839950077749, Val F1: 0.586166676585172
Epoch:  3


100%|██████████| 8019/8019 [01:07<00:00, 119.24it/s]
100%|██████████| 1416/1416 [00:04<00:00, 348.62it/s]


Train Loss: 0.3888844142236549, Val Loss: 0.38867701145487843, Train F1: 0.5993996426168198, Val F1: 0.6019013329601748
Epoch:  4


100%|██████████| 8019/8019 [01:06<00:00, 120.09it/s]
100%|██████████| 1416/1416 [00:04<00:00, 339.43it/s]


Train Loss: 0.3649032818724936, Val Loss: 0.37283900779449003, Train F1: 0.6154556926752869, Val F1: 0.6137888276038073
Epoch:  5


100%|██████████| 8019/8019 [01:14<00:00, 107.01it/s]
100%|██████████| 1416/1416 [00:04<00:00, 303.23it/s]


Train Loss: 0.3463229972257451, Val Loss: 0.36021146821818384, Train F1: 0.6271056727718635, Val F1: 0.6206057768419515
Epoch:  6


100%|██████████| 8019/8019 [01:17<00:00, 104.14it/s]
100%|██████████| 1416/1416 [00:06<00:00, 216.45it/s]


Train Loss: 0.33133089158502893, Val Loss: 0.3508532248713809, Train F1: 0.6374238391178215, Val F1: 0.62629363167747
Epoch:  7


100%|██████████| 8019/8019 [01:22<00:00, 97.25it/s] 
100%|██████████| 1416/1416 [00:03<00:00, 381.00it/s]


Train Loss: 0.318555892042339, Val Loss: 0.3418227206814658, Train F1: 0.6462024960573187, Val F1: 0.6349170234572294
Epoch:  8


100%|██████████| 8019/8019 [00:59<00:00, 134.26it/s]
100%|██████████| 1416/1416 [00:04<00:00, 319.59it/s]


Train Loss: 0.30551658515300406, Val Loss: 0.33521978832420557, Train F1: 0.654333687529852, Val F1: 0.6400823370633799
Epoch:  9


100%|██████████| 8019/8019 [01:00<00:00, 132.47it/s]
100%|██████████| 1416/1416 [00:04<00:00, 336.19it/s]

Train Loss: 0.2947103038571048, Val Loss: 0.3307687073831228, Train F1: 0.6620384832793077, Val F1: 0.6431011427662744





In [13]:
# Persist the trained GloVe + RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Glove+RNN/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Glove+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Word2vec + RNN

In [14]:
# Train the RNN tagger on the word2vec embedding matrix (embedding_dim=300, hidden_dim=256).
rnn_model = RNN_model(len(word_to_idx), 300, 256, len(tag_to_ix), word2vec_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

Epoch:  0


100%|██████████| 8019/8019 [01:01<00:00, 131.22it/s]
100%|██████████| 1416/1416 [00:03<00:00, 376.71it/s]


Train Loss: 0.6568109598340905, Val Loss: 0.5123076539343265, Train F1: 0.45022232001638784, Val F1: 0.5075236501389049
Epoch:  1


100%|██████████| 8019/8019 [01:03<00:00, 127.22it/s]
100%|██████████| 1416/1416 [00:03<00:00, 393.91it/s]


Train Loss: 0.46343237259119485, Val Loss: 0.4099017054983349, Train F1: 0.5393963593588122, Val F1: 0.5650450992340578
Epoch:  2


100%|██████████| 8019/8019 [01:00<00:00, 132.63it/s]
100%|██████████| 1416/1416 [00:03<00:00, 379.85it/s]


Train Loss: 0.3935642132259288, Val Loss: 0.36856085436455155, Train F1: 0.5896951710526034, Val F1: 0.5976405353162305
Epoch:  3


100%|██████████| 8019/8019 [01:00<00:00, 133.05it/s]
100%|██████████| 1416/1416 [00:04<00:00, 351.28it/s]


Train Loss: 0.35709066869295225, Val Loss: 0.3457368135658299, Train F1: 0.6150465788912581, Val F1: 0.6155416658324357
Epoch:  4


100%|██████████| 8019/8019 [01:01<00:00, 129.91it/s]
100%|██████████| 1416/1416 [00:03<00:00, 378.96it/s]


Train Loss: 0.3315858886363667, Val Loss: 0.33031778971485054, Train F1: 0.6306646733857134, Val F1: 0.6285339054627583
Epoch:  5


100%|██████████| 8019/8019 [01:01<00:00, 131.31it/s]
100%|██████████| 1416/1416 [00:03<00:00, 379.35it/s]


Train Loss: 0.31367390837457154, Val Loss: 0.3201191327903758, Train F1: 0.6436502582115257, Val F1: 0.6374339891746351
Epoch:  6


100%|██████████| 8019/8019 [01:01<00:00, 129.47it/s]
100%|██████████| 1416/1416 [00:04<00:00, 352.66it/s]


Train Loss: 0.2988803275990276, Val Loss: 0.31305965234721667, Train F1: 0.6539981862033819, Val F1: 0.6425627022798542
Epoch:  7


100%|██████████| 8019/8019 [01:02<00:00, 128.24it/s]
100%|██████████| 1416/1416 [00:04<00:00, 323.03it/s]


Train Loss: 0.2866293002884059, Val Loss: 0.3094808140665271, Train F1: 0.6634050770304782, Val F1: 0.6430225970760293
Epoch:  8


100%|██████████| 8019/8019 [01:02<00:00, 129.14it/s]
100%|██████████| 1416/1416 [00:04<00:00, 333.15it/s]


Train Loss: 0.27520281374611794, Val Loss: 0.3058609219299124, Train F1: 0.6721823049135188, Val F1: 0.6471620169636737
Epoch:  9


100%|██████████| 8019/8019 [01:04<00:00, 125.12it/s]
100%|██████████| 1416/1416 [00:04<00:00, 345.87it/s]

Train Loss: 0.26516265273937567, Val Loss: 0.30503580675272624, Train F1: 0.6799706768963885, Val F1: 0.6527177738060994





In [15]:
# Persist the trained word2vec + RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Word2vec+RNN/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Word2vec+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Bert + RNN

In [16]:
# Train the RNN tagger on the BERT embedding matrix (embedding_dim=768, hidden_dim=512).
rnn_model = RNN_model(len(word_to_idx), 768, 512, len(tag_to_ix), bert_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

Epoch:  0


100%|██████████| 8019/8019 [01:22<00:00, 96.96it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 291.18it/s]


Train Loss: 0.7368409467210908, Val Loss: 0.6759775052331177, Train F1: 0.4277597931855873, Val F1: 0.4363632110328191
Epoch:  1


100%|██████████| 8019/8019 [01:22<00:00, 96.97it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 328.16it/s]


Train Loss: 0.6338107338778406, Val Loss: 0.6241690827859828, Train F1: 0.46883327355645327, Val F1: 0.46872462492834777
Epoch:  2


100%|██████████| 8019/8019 [01:18<00:00, 102.77it/s]
100%|██████████| 1416/1416 [00:04<00:00, 301.09it/s]


Train Loss: 0.5837268076949951, Val Loss: 0.5889268368666749, Train F1: 0.4958340108381039, Val F1: 0.4847640812893645
Epoch:  3


100%|██████████| 8019/8019 [01:22<00:00, 96.62it/s] 
100%|██████████| 1416/1416 [00:05<00:00, 278.44it/s]


Train Loss: 0.5520310315170757, Val Loss: 0.5676649358928583, Train F1: 0.5151703970972682, Val F1: 0.49522527562232593
Epoch:  4


100%|██████████| 8019/8019 [01:21<00:00, 97.81it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 291.54it/s]


Train Loss: 0.5288441904446184, Val Loss: 0.5486583332110218, Train F1: 0.5269894172271448, Val F1: 0.5062500949197646
Epoch:  5


100%|██████████| 8019/8019 [01:21<00:00, 98.78it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 319.83it/s]


Train Loss: 0.5102261729334732, Val Loss: 0.5340848006058148, Train F1: 0.5382163843493825, Val F1: 0.5194841667735018
Epoch:  6


100%|██████████| 8019/8019 [01:21<00:00, 98.73it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 315.50it/s]


Train Loss: 0.4927880446745838, Val Loss: 0.5229834659567029, Train F1: 0.5484856357378282, Val F1: 0.5284026581061378
Epoch:  7


100%|██████████| 8019/8019 [01:21<00:00, 98.76it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 322.58it/s]


Train Loss: 0.47838883303430557, Val Loss: 0.5122087614355598, Train F1: 0.5566179090157893, Val F1: 0.5356131004121726
Epoch:  8


100%|██████████| 8019/8019 [01:20<00:00, 99.94it/s] 
100%|██████████| 1416/1416 [00:04<00:00, 289.39it/s]


Train Loss: 0.4688579626056207, Val Loss: 0.5046447033633473, Train F1: 0.5635292667932726, Val F1: 0.5387408807515905
Epoch:  9


100%|██████████| 8019/8019 [01:18<00:00, 101.60it/s]
100%|██████████| 1416/1416 [00:04<00:00, 308.58it/s]

Train Loss: 0.45632064041517484, Val Loss: 0.49597533890651274, Train F1: 0.5704571205647169, Val F1: 0.5436559214550242





In [17]:
# Persist the trained BERT + RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Bert+RNN/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Bert+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Glove + GRU

In [18]:
# Train the GRU tagger on the GloVe embedding matrix (embedding_dim=300, hidden_dim=256).
gru_model = GRU_model(len(word_to_idx), 300, 256, len(tag_to_ix), glove_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(gru_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    gru_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        gru_model.zero_grad()
        tag_scores = gru_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        gru_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = gru_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

Epoch:  0


100%|██████████| 8019/8019 [02:11<00:00, 60.95it/s]
 13%|█▎        | 189/1416 [00:00<00:06, 199.86it/s]

In [None]:
# Persist the trained GloVe + GRU model and its per-epoch metric histories.
torch.save(gru_model, 'Non Trainable Embeddings/Glove+GRU/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Glove+GRU/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Word2vec + GRU

In [None]:
# Train the GRU tagger on the word2vec embedding matrix (embedding_dim=300, hidden_dim=256).
gru_model = GRU_model(len(word_to_idx), 300, 256, len(tag_to_ix), word2vec_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(gru_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    gru_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        gru_model.zero_grad()
        tag_scores = gru_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        gru_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = gru_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained word2vec + GRU model and its per-epoch metric histories.
torch.save(gru_model, 'Non Trainable Embeddings/Word2vec+GRU/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Word2vec+GRU/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Bert + GRU

In [None]:
# Train the GRU tagger on the BERT embedding matrix (hidden_dim=512).
# embedding_dim must equal the width of the vectors in bert_embedding_mat; the
# BERT + RNN cell trains on the same matrix with 768, so 768 is used here (was 728).
gru_model = GRU_model(len(word_to_idx), 768, 512, len(tag_to_ix), bert_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(gru_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    gru_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        gru_model.zero_grad()
        tag_scores = gru_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        gru_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = gru_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained BERT + GRU model and its per-epoch metric histories.
torch.save(gru_model, 'Non Trainable Embeddings/Bert+GRU/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Bert+GRU/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Glove + LSTM

In [None]:
# Train the LSTM tagger on the GloVe embedding matrix (embedding_dim=300, hidden_dim=256).
lstm_model = LSTM_model(len(word_to_idx), 300, 256, len(tag_to_ix), glove_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(lstm_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    lstm_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        lstm_model.zero_grad()
        tag_scores = lstm_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        lstm_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = lstm_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained GloVe + LSTM model and its per-epoch metric histories.
torch.save(lstm_model, 'Non Trainable Embeddings/Glove+LSTM/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Glove+LSTM/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Word2vec + LSTM

In [None]:
# Train the LSTM tagger on the word2vec embedding matrix (embedding_dim=300, hidden_dim=256).
lstm_model = LSTM_model(len(word_to_idx), 300, 256, len(tag_to_ix), word2vec_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(lstm_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    lstm_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        lstm_model.zero_grad()
        tag_scores = lstm_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        lstm_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = lstm_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained word2vec + LSTM model and its per-epoch metric histories.
torch.save(lstm_model, 'Non Trainable Embeddings/Word2vec+LSTM/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Word2vec+LSTM/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)

#### Bert + LSTM

In [None]:
# Train the LSTM tagger on the BERT embedding matrix (hidden_dim=512).
# embedding_dim must equal the width of the vectors in bert_embedding_mat; the
# BERT + RNN cell trains on the same matrix with 768, so 768 is used here (was 728).
lstm_model = LSTM_model(len(word_to_idx), 768, 512, len(tag_to_ix), bert_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()  # pairs with the log_softmax output of the model's forward
optimizer = optim.SGD(lstm_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories; the next cell pickles these four lists.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    lstm_model.train()
    for case in tqdm(train_data):
        # One sentence per optimisation step. Inputs are moved to `device` so the
        # loop keeps working when `device` is not the CPU.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        lstm_model.zero_grad()
        tag_scores = lstm_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # Macro-F1 is computed per sentence; sklearn needs host arrays, hence .cpu().
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    # Averages are taken over sentences, not tokens.
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    with torch.no_grad():
        lstm_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = lstm_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), torch.argmax(tag_scores, dim=1).detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained BERT + LSTM model and its per-epoch metric histories.
torch.save(lstm_model, 'Non Trainable Embeddings/Bert+LSTM/model.pt')
# One pickle per metric list; `with` guarantees each file is flushed and closed.
for metric_name, metric_history in (('train_loss', train_loss), ('val_loss', val_loss), ('train_f1', train_f1), ('val_f1', val_f1)):
    with open(f'Non Trainable Embeddings/Bert+LSTM/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_history, metric_file)