In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import random
from transformers import BertTokenizer, BertModel
import json
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
# Seed PyTorch's global RNG so that anything drawing from it (e.g. layer weight
# initialisation in the model cells) is repeatable across runs.
# NOTE(review): only PyTorch is seeded; `random` and NumPy are imported above but not seeded here.
torch.manual_seed(1)

<torch._C.Generator at 0x231a62e8c30>

In [2]:
# Device handed to every model constructor below. The training cells convert
# tensors with `.numpy()`, which only works on CPU tensors, so this stays "cpu".
device = "cpu"

In [3]:
# # Check whether CUDA is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)

In [4]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of keys into a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of hashable items (words or tag names).
        to_ix: Mapping from item to integer index.

    Returns:
        A ``torch.long`` tensor holding ``to_ix[item]`` for every item, in order.

    Raises:
        KeyError: If an item of ``seq`` is missing from ``to_ix``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

#### Data Loading

In [5]:
# Load the three NER splits. Each file is opened in a context manager so the
# handle is closed as soon as the JSON has been parsed (the previous
# `json.load(open(...))` form left the files open until garbage collection).
with open('../Dataset/NER_train.json', 'r') as train_file:
    train_data = json.load(train_file)
with open('../Dataset/NER_test.json', 'r') as test_file:
    test_data = json.load(test_file)
with open('../Dataset/NER_val.json', 'r') as val_file:
    val_data = json.load(val_file)

In [6]:
# Word -> index vocabulary used by prepare_sequence.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted artefacts.
with open('../Utils/word_to_idx.pkl', 'rb') as vocab_file:
    word_to_idx = pickle.load(vocab_file)

In [7]:
# Tag -> index mapping; the training cells read its 'START_TAG' and 'END_TAG' entries.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted artefacts.
with open('../Utils/tag_to_ix.pkl', 'rb') as tag_file:
    tag_to_ix = pickle.load(tag_file)

#### RNN model

In [8]:
class RNN_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.RNN`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored on the instance; not used to size any layer here.
        embedding_dim: Input feature size of the RNN.
        hidden_dim: Hidden state size of the RNN.
        target_size: Number of output tags (output size of the linear layer).
        embedding_mat: Pretrained weights, converted with ``torch.FloatTensor`` and
            passed to ``nn.Embedding.from_pretrained`` (called without ``freeze``,
            so PyTorch's default applies).
        start_tag: Stored on the instance; not used by ``forward``.
        end_tag: Stored on the instance; not used by ``forward``.
        tag_to_ix: Stored on the instance; not used by ``forward``.
        device: Device every sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()

        # Plain bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.start_tag = start_tag
        self.end_tag = end_tag
        self.tag_to_ix = tag_to_ix

        # Layers, created in the order embedding -> recurrent -> projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.rnn = nn.RNN(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per element of ``sentence``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (n_tokens, 1, features): the recurrent layer is fed a batch of one.
        hidden_states, _ = self.rnn(embedded.view(n_tokens, 1, -1))
        logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(logits, dim=1)


#### LSTM Model

In [9]:
class LSTM_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.LSTM`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored on the instance; not used to size any layer here.
        embedding_dim: Input feature size of the LSTM.
        hidden_dim: Hidden state size of the LSTM.
        target_size: Number of output tags (output size of the linear layer).
        embedding_mat: Pretrained weights, converted with ``torch.FloatTensor`` and
            passed to ``nn.Embedding.from_pretrained`` (called without ``freeze``,
            so PyTorch's default applies).
        start_tag: Stored on the instance; not used by ``forward``.
        end_tag: Stored on the instance; not used by ``forward``.
        tag_to_ix: Stored on the instance; not used by ``forward``.
        device: Device every sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()

        # Plain bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.start_tag = start_tag
        self.end_tag = end_tag
        self.tag_to_ix = tag_to_ix

        # Layers, created in the order embedding -> recurrent -> projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per element of ``sentence``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (n_tokens, 1, features): the recurrent layer is fed a batch of one.
        hidden_states, _ = self.lstm(embedded.view(n_tokens, 1, -1))
        logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(logits, dim=1)

#### GRU model

In [10]:
class GRU_model(nn.Module):
    """Token tagger: pretrained embeddings -> ``nn.GRU`` -> linear layer -> log-softmax.

    Args:
        vocab_size: Stored on the instance; not used to size any layer here.
        embedding_dim: Input feature size of the GRU.
        hidden_dim: Hidden state size of the GRU.
        target_size: Number of output tags (output size of the linear layer).
        embedding_mat: Pretrained weights, converted with ``torch.FloatTensor`` and
            passed to ``nn.Embedding.from_pretrained`` (called without ``freeze``,
            so PyTorch's default applies).
        start_tag: Stored on the instance; not used by ``forward``.
        end_tag: Stored on the instance; not used by ``forward``.
        tag_to_ix: Stored on the instance; not used by ``forward``.
        device: Device every sub-module is moved to.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, device='cpu'):
        super().__init__()

        # Plain bookkeeping attributes.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.start_tag = start_tag
        self.end_tag = end_tag
        self.tag_to_ix = tag_to_ix

        # Layers, created in the order embedding -> recurrent -> projection.
        pretrained_weights = torch.FloatTensor(embedding_mat)
        self.word_embeddings = nn.Embedding.from_pretrained(pretrained_weights).to(device)
        self.gru = nn.GRU(embedding_dim, hidden_dim).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per element of ``sentence``."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (n_tokens, 1, features): the recurrent layer is fed a batch of one.
        hidden_states, _ = self.gru(embedded.view(n_tokens, 1, -1))
        logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return nn.functional.log_softmax(logits, dim=1)

#### Pretrained embedding matrices

In [11]:
# Pretrained embedding matrices, one per embedding family. Each file is opened
# in a context manager so its handle is closed right after unpickling.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted artefacts.
with open('../Utils/legal_bert_embedding_mat.pkl', 'rb') as bert_file:
    bert_embedding_mat = pickle.load(bert_file)
with open('../Utils/word2vec_embedding_mat.pkl', 'rb') as word2vec_file:
    word2vec_embedding_mat = pickle.load(word2vec_file)
with open('../Utils/glove_embedding_mat.pkl', 'rb') as glove_file:
    glove_embedding_mat = pickle.load(glove_file)

#### GloVe + RNN

In [12]:
# Train an RNN tagger on the GloVe embedding matrix (embedding_dim=300,
# hidden_dim=256) with NLL loss and plain SGD, one sentence per update step.
rnn_model = RNN_model(len(word_to_idx), 300, 256, len(tag_to_ix), glove_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories (mean over samples); these lists are saved after training.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    # Running sums over the samples seen in this epoch.
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # Inputs are moved to `device` so they sit on the same device as the
        # model; on CPU this is a no-op.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # f1_score needs NumPy arrays, which can only be taken from CPU tensors.
        predicted_tags = torch.argmax(tag_scores, dim=1)
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    # Validation pass: no gradient tracking and no parameter updates.
    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            predicted_tags = torch.argmax(tag_scores, dim=1)
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

Epoch:  0


  0%|          | 0/8019 [00:00<?, ?it/s]

In [None]:
# Persist the trained Glove+RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Glove+RNN/model.pt')
for metric_name, metric_values in (
    ('train_loss', train_loss),
    ('val_loss', val_loss),
    ('train_f1', train_f1),
    ('val_f1', val_f1),
):
    # The context manager flushes and closes each file as soon as it is written;
    # the previous `pickle.dump(..., open(...))` form never closed the handles.
    with open(f'Non Trainable Embeddings/Glove+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_values, metric_file)

#### Word2vec + RNN

In [None]:
# Train an RNN tagger on the word2vec embedding matrix (embedding_dim=300,
# hidden_dim=256) with NLL loss and plain SGD, one sentence per update step.
rnn_model = RNN_model(len(word_to_idx), 300, 256, len(tag_to_ix), word2vec_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories (mean over samples); these lists are saved after training.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    # Running sums over the samples seen in this epoch.
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # Inputs are moved to `device` so they sit on the same device as the
        # model; on CPU this is a no-op.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # f1_score needs NumPy arrays, which can only be taken from CPU tensors.
        predicted_tags = torch.argmax(tag_scores, dim=1)
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    # Validation pass: no gradient tracking and no parameter updates.
    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            predicted_tags = torch.argmax(tag_scores, dim=1)
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

In [None]:
# Persist the trained Word2vec+RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Word2vec+RNN/model.pt')
for metric_name, metric_values in (
    ('train_loss', train_loss),
    ('val_loss', val_loss),
    ('train_f1', train_f1),
    ('val_f1', val_f1),
):
    # The context manager flushes and closes each file as soon as it is written;
    # the previous `pickle.dump(..., open(...))` form never closed the handles.
    with open(f'Non Trainable Embeddings/Word2vec+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_values, metric_file)

#### BERT + RNN

In [None]:
# Train an RNN tagger on the BERT embedding matrix (embedding_dim=768,
# hidden_dim=512) with NLL loss and plain SGD, one sentence per update step.
rnn_model = RNN_model(len(word_to_idx), 768, 512, len(tag_to_ix), bert_embedding_mat, tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

epochs = 10
# Per-epoch histories (mean over samples); these lists are saved after training.
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    print("Epoch: ", epoch)
    # Running sums over the samples seen in this epoch.
    train_loss_temp = 0
    val_loss_temp = 0
    train_f1_temp = 0
    val_f1_temp = 0
    rnn_model.train()
    for case in tqdm(train_data):
        # Inputs are moved to `device` so they sit on the same device as the
        # model; on CPU this is a no-op.
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx).to(device)
        targets = prepare_sequence(train_data[case]['labels'], tag_to_ix).to(device)
        rnn_model.zero_grad()
        tag_scores = rnn_model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        train_loss_temp += loss.item()
        # f1_score needs NumPy arrays, which can only be taken from CPU tensors.
        predicted_tags = torch.argmax(tag_scores, dim=1)
        train_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    train_loss.append(train_loss_temp/len(train_data))
    train_f1.append(train_f1_temp/len(train_data))

    # Validation pass: no gradient tracking and no parameter updates.
    with torch.no_grad():
        rnn_model.eval()
        for case in tqdm(val_data):
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx).to(device)
            targets = prepare_sequence(val_data[case]['labels'], tag_to_ix).to(device)
            tag_scores = rnn_model(sentence)
            loss = loss_function(tag_scores, targets)
            val_loss_temp += loss.item()
            predicted_tags = torch.argmax(tag_scores, dim=1)
            val_f1_temp += f1_score(targets.detach().cpu().numpy(), predicted_tags.detach().cpu().numpy(), average='macro')
    val_loss.append(val_loss_temp/len(val_data))
    val_f1.append(val_f1_temp/len(val_data))

    print(f'Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}, Train F1: {train_f1[-1]}, Val F1: {val_f1[-1]}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0


 10%|█         | 1/10 [00:50<07:37, 50.81s/it]

Train loss: 0.00014700164319947362, Val loss: 0.00012835307279601693
Train macro f1: 0.4298899923618197, Val macro f1: 0.44013995358029906
Epoch 1/10 done
Epoch 1


 20%|██        | 2/10 [01:42<06:51, 51.40s/it]

Train loss: 0.00011928575986530632, Val loss: 0.00011143186566187069
Train macro f1: 0.47232006862324005, Val macro f1: 0.4705441667847287
Epoch 2/10 done
Epoch 2


 30%|███       | 3/10 [02:37<06:12, 53.14s/it]

Train loss: 0.00010447297972859815, Val loss: 0.00010207809100393206
Train macro f1: 0.4986533180118992, Val macro f1: 0.4879303498648421
Epoch 3/10 done
Epoch 3


 40%|████      | 4/10 [03:35<05:30, 55.11s/it]

Train loss: 9.694076288724318e-05, Val loss: 9.402784053236246e-05
Train macro f1: 0.5181887238848357, Val macro f1: 0.4992261886920438
Epoch 4/10 done
Epoch 4


 50%|█████     | 5/10 [04:44<05:00, 60.04s/it]

Train loss: 9.272542229155079e-05, Val loss: 8.805939432932064e-05
Train macro f1: 0.5313980425154133, Val macro f1: 0.5103638777358255
Epoch 5/10 done
Epoch 5


 60%|██████    | 6/10 [05:45<04:01, 60.40s/it]

Train loss: 9.054591646417975e-05, Val loss: 8.332310972036794e-05
Train macro f1: 0.5428629820788246, Val macro f1: 0.5262532842109489
Epoch 6/10 done
Epoch 6


 70%|███████   | 7/10 [06:52<03:07, 62.37s/it]

Train loss: 8.965966117102653e-05, Val loss: 7.923923112684861e-05
Train macro f1: 0.5518149781784606, Val macro f1: 0.5350296434234294
Epoch 7/10 done
Epoch 7


 80%|████████  | 8/10 [08:00<02:08, 64.09s/it]

Train loss: 8.915618673199788e-05, Val loss: 7.565174018964171e-05
Train macro f1: 0.5593830546409031, Val macro f1: 0.5405909514671249
Epoch 8/10 done
Epoch 8


 90%|█████████ | 9/10 [09:01<01:03, 63.33s/it]

Train loss: 8.859747322276235e-05, Val loss: 7.305510371224955e-05
Train macro f1: 0.5662834766962865, Val macro f1: 0.5441634178303829
Epoch 9/10 done
Epoch 9


100%|██████████| 10/10 [09:55<00:00, 59.55s/it]

Train loss: 8.804789831629023e-05, Val loss: 7.051078137010336e-05
Train macro f1: 0.5725575789822337, Val macro f1: 0.5451702618390271
Epoch 10/10 done





In [None]:
# Persist the trained Bert+RNN model and its per-epoch metric histories.
torch.save(rnn_model, 'Non Trainable Embeddings/Bert+RNN/model.pt')
for metric_name, metric_values in (
    ('train_loss', train_loss),
    ('val_loss', val_loss),
    ('train_f1', train_f1),
    ('val_f1', val_f1),
):
    # The context manager flushes and closes each file as soon as it is written;
    # the previous `pickle.dump(..., open(...))` form never closed the handles.
    with open(f'Non Trainable Embeddings/Bert+RNN/{metric_name}.pkl', 'wb') as metric_file:
        pickle.dump(metric_values, metric_file)