In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import random
from transformers import BertTokenizer, BertModel
import json
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
torch.manual_seed(1)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x1777fa87810>

In [2]:
device = "cpu"

In [3]:
#check if cuda is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)

In [4]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

#### BiLSTM CRF model

In [5]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size, embedding_mat, start_tag, end_tag, tag_to_ix, batch_size=1, device='cpu'):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.target_size = target_size
        self.batch_size = batch_size
        self.device = device
        self.tag_to_ix = tag_to_ix
        self.start_tag = start_tag
        self.end_tag = end_tag

        self.embedding = nn.Embedding.from_pretrained(embedding_mat).to(device)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True).to(device)
        self.hidden2tag = nn.Linear(hidden_dim, target_size).to(device)

        self.transitions_to = nn.Parameter(torch.randn(target_size, target_size)).to(device)
        self.transitions_to.data[start_tag, :] = -10000
        self.transitions_to.data[:, end_tag] = -10000

        # self.transitions_from = nn.Parameter(torch.randn(target_size, target_size))
        # self.transitions_from.data[:, start_tag] = -10000
        # self.transitions_from.data[end_tag, :] = -10000

        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(self.device),
                torch.randn(2, 1, self.hidden_dim // 2).to(self.device))

    def get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.embedding(sentence).view(len(sentence), 1, -1)
        #convert embeds to torch float32
        embeds = embeds.float()
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats


    def _forward_algo(self, lstm_features):

        scores = torch.full((1, self.target_size), -10000.).to(self.device)
        scores[0][self.start_tag] = 0.

        forward_var = scores

        for feat in lstm_features:
            next_tag_var = self.transitions_to + feat.view(-1, 1).expand(-1, self.target_size) + forward_var.expand(self.target_size, -1)
            max_score = next_tag_var.max(dim=1).values.view(-1, 1)
            next_tag_var = next_tag_var - max_score
            forward_var = (max_score + torch.logsumexp(next_tag_var, dim=1).view(-1, 1)).view(1, -1)
            
        terminal_var = forward_var + (self.transitions_to[self.end_tag]).view(1, -1)
        alpha = terminal_var
        max_score = alpha.max()
        alpha = max_score + torch.logsumexp(alpha - max_score, dim=1)
        return alpha
    

    def _score_sentence(self, lstm_features, tags):
        score = torch.zeros(1).to(self.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix['START_TAG']], dtype=torch.long).to(self.device), tags]).to(self.device)
        for i, feat in enumerate(lstm_features):
            score += self.transitions_to[tags[i + 1], tags[i]] + feat[tags[i + 1]]
                
        score += self.transitions_to[self.tag_to_ix['END_TAG'], tags[-1]]
        return score
    
    def neg_log_likelihood(self, sentence, tags):
        lstm_feats = self.get_lstm_features(sentence)
        forward_score = self._forward_algo(lstm_feats)
        gold_score = self._score_sentence(lstm_feats, tags)
        return forward_score - gold_score
    

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.target_size), -10000.).to(self.device)
        init_vvars[0][self.start_tag] = 0

        forward_var = init_vvars
        for feat in feats:
            bptrs_t = [] 
            viterbivars_t = [] 

            next_tag_var = self.transitions_to + forward_var.expand(self.target_size, -1)
            best_tag_id = torch.argmax(next_tag_var, dim=1)
            bptrs_t = best_tag_id
            viterbivars_t = next_tag_var[range(len(best_tag_id)), best_tag_id].view(1, -1)
            
            forward_var = (viterbivars_t + feat).view(1, -1)
            backpointers.append(bptrs_t)

        
        terminal_var = forward_var + self.transitions_to[self.end_tag]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

       
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id.item())
        
        start = best_path.pop()
        best_path.reverse()
        return path_score, best_path
        
    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self.get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

        

In [6]:
train_data = json.load(open('../Dataset/BIO_Tagged/ATE_train.json', 'r'))
test_data = json.load(open('../Dataset/BIO_Tagged/ATE_test.json', 'r'))
val_data = json.load(open('../Dataset/BIO_Tagged/ATE_val.json', 'r'))

In [7]:
word_to_idx = pickle.load(open('../Utils/word_to_idx.pkl', 'rb'))

In [8]:
tag_to_ix = pickle.load(open('../Utils/tag_to_ix.pkl', 'rb'))

#### Embeddings

In [9]:
bert_embedding_mat = pickle.load(open('../Extracted Word Embeddings/bert_embedding_mat.pkl', 'rb'))
word2vec_embedding_mat = pickle.load(open('../Extracted Word Embeddings/word2vec_embedding_mat.pkl', 'rb'))
glove_embedding_mat = pickle.load(open('../Extracted Word Embeddings/glove_embedding_mat.pkl', 'rb'))

#### Glove

In [10]:
model = BiLSTM_CRF(len(word_to_idx), 300, 256, len(tag_to_ix), torch.tensor(glove_embedding_mat).to(device), tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device=device).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

EPOCHS = 10
train_loss = []
val_loss = []
train_macro_f1 = []
val_macro_f1 = []

for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    loss = 0
    preds = []
    actuals = []
    for i, case in tqdm(enumerate(train_data), desc=f'Epoch {epoch + 1}/{EPOCHS}'):
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx)
        tags = prepare_sequence(train_data[case]['labels'], tag_to_ix)
        sentence = sentence.to(device)
        tags = tags.to(device)
        model.train(True)
        model.zero_grad()
        loss = model.neg_log_likelihood(sentence, tags)
        loss.backward()
        optimizer.step()
        loss += loss.item()
        with torch.no_grad():
            pred = model(sentence)[1]
            tags = tags.cpu().numpy()
            preds.extend(pred)
            actuals.extend(tags)
            
    
    train_loss.append(loss/len(train_data))
    f1 = f1_score(actuals, preds, average='macro')
    train_macro_f1.append(f1)
    
    model.eval()
    with torch.no_grad():
        loss = 0
        preds = []
        actuals = []
        for case in val_data:
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx)
            tags = prepare_sequence(val_data[case]['labels'], tag_to_ix)
            setence = sentence.to(device)
            tags = tags.to(device)
            loss = model.neg_log_likelihood(sentence, tags)
            loss += loss.item()
            pred = model(sentence)[1]
            preds.extend(pred)
            actuals.extend(tags.cpu().numpy())
        f1 = f1_score(actuals, preds, average='macro')
        val_macro_f1.append(f1)
        val_loss.append(loss/len(val_data))

    print()
    print(f'Train loss: {train_loss[-1]}, Val loss: {val_loss[-1]}')
    print(f'Train macro f1: {train_macro_f1[-1]}, Val macro f1: {val_macro_f1[-1]}')
    print(f'Epoch {epoch + 1}/{EPOCHS} done')

Epoch 1/10: 906it [00:18, 50.12it/s]?, ?it/s]
Epoch:  10%|█         | 1/10 [00:20<03:00, 20.04s/it]


Train loss: tensor([0.0385], grad_fn=<DivBackward0>), Val loss: tensor([0.0918])
Train macro f1: 0.5852309878202453, Val macro f1: 0.6182679677638918
Epoch 1/10 done


Epoch 2/10: 906it [00:20, 44.56it/s]
Epoch:  20%|██        | 2/10 [00:42<02:51, 21.43s/it]


Train loss: tensor([0.0276], grad_fn=<DivBackward0>), Val loss: tensor([0.0876])
Train macro f1: 0.7230490325178848, Val macro f1: 0.6547734454636264
Epoch 2/10 done


Epoch 3/10: 906it [00:19, 47.34it/s]
Epoch:  30%|███       | 3/10 [01:03<02:30, 21.44s/it]


Train loss: tensor([0.0216], grad_fn=<DivBackward0>), Val loss: tensor([0.0800])
Train macro f1: 0.7759229077151747, Val macro f1: 0.6682863253266872
Epoch 3/10 done


Epoch 4/10: 906it [00:21, 42.03it/s]
Epoch:  40%|████      | 4/10 [01:27<02:14, 22.41s/it]


Train loss: tensor([0.0201], grad_fn=<DivBackward0>), Val loss: tensor([0.0770])
Train macro f1: 0.8162678025169504, Val macro f1: 0.6746646513224898
Epoch 4/10 done


Epoch 5/10: 906it [00:19, 46.13it/s]
Epoch:  50%|█████     | 5/10 [01:49<01:51, 22.31s/it]


Train loss: tensor([0.0199], grad_fn=<DivBackward0>), Val loss: tensor([0.0750])
Train macro f1: 0.8532994632656155, Val macro f1: 0.682313658747442
Epoch 5/10 done


Epoch 6/10: 906it [00:17, 52.44it/s]
Epoch:  60%|██████    | 6/10 [02:09<01:25, 21.29s/it]


Train loss: tensor([0.0206], grad_fn=<DivBackward0>), Val loss: tensor([0.0800])
Train macro f1: 0.8789091815378506, Val macro f1: 0.6967620917835396
Epoch 6/10 done


Epoch 7/10: 906it [00:18, 49.48it/s]
Epoch:  70%|███████   | 7/10 [02:29<01:03, 21.05s/it]


Train loss: tensor([0.0170], grad_fn=<DivBackward0>), Val loss: tensor([0.0781])
Train macro f1: 0.9053663863100798, Val macro f1: 0.6994330200242683
Epoch 7/10 done


Epoch 8/10: 906it [00:21, 41.40it/s]
Epoch:  80%|████████  | 8/10 [02:54<00:44, 22.15s/it]


Train loss: tensor([0.0157], grad_fn=<DivBackward0>), Val loss: tensor([0.0693])
Train macro f1: 0.921787383732157, Val macro f1: 0.7069719348908664
Epoch 8/10 done


Epoch 9/10: 906it [00:17, 52.58it/s]
Epoch:  90%|█████████ | 9/10 [03:13<00:21, 21.21s/it]


Train loss: tensor([0.0144], grad_fn=<DivBackward0>), Val loss: tensor([0.0682])
Train macro f1: 0.944019403672378, Val macro f1: 0.7009232818956521
Epoch 9/10 done


Epoch 10/10: 906it [00:14, 62.08it/s]
Epoch: 100%|██████████| 10/10 [03:29<00:00, 20.97s/it]


Train loss: tensor([0.0120], grad_fn=<DivBackward0>), Val loss: tensor([0.0802])
Train macro f1: 0.9580443721350206, Val macro f1: 0.705169079823721
Epoch 10/10 done





In [11]:
torch.save(model, 'Non Trainable Embeddings/Glove/bilstm_crf_glove.pt')
torch.save(model, '../../Deliverables/Task 2/Saved Models/t2_BiLSTM-CRF_Glove.pt')
pickle.dump(train_loss, open('Non Trainable Embeddings/Glove/train_loss.pkl', 'wb'))
pickle.dump(val_loss, open('Non Trainable Embeddings/Glove/val_loss.pkl', 'wb'))
pickle.dump(train_macro_f1, open('Non Trainable Embeddings/Glove/train_f1.pkl', 'wb'))
pickle.dump(val_macro_f1, open('Non Trainable Embeddings/Glove/val_f1.pkl', 'wb'))

#### Word2vec

In [12]:
model = BiLSTM_CRF(len(word_to_idx), 300, 256, len(tag_to_ix), torch.tensor(word2vec_embedding_mat).to(device), tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device=device).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

EPOCHS = 10
train_loss = []
val_loss = []
train_macro_f1 = []
val_macro_f1 = []

for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    loss = 0
    preds = []
    actuals = []
    for i, case in tqdm(enumerate(train_data), desc=f'Epoch {epoch + 1}/{EPOCHS}'):
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx)
        tags = prepare_sequence(train_data[case]['labels'], tag_to_ix)
        sentence = sentence.to(device)
        tags = tags.to(device)
        model.train(True)
        model.zero_grad()
        loss = model.neg_log_likelihood(sentence, tags)
        loss.backward()
        optimizer.step()
        loss += loss.item()
        with torch.no_grad():
            pred = model(sentence)[1]
            tags = tags.cpu().numpy()
            preds.extend(pred)
            actuals.extend(tags)
            
    
    train_loss.append(loss/len(train_data))
    f1 = f1_score(actuals, preds, average='macro')
    train_macro_f1.append(f1)
    
    model.eval()
    with torch.no_grad():
        loss = 0
        preds = []
        actuals = []
        for case in val_data:
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx)
            tags = prepare_sequence(val_data[case]['labels'], tag_to_ix)
            setence = sentence.to(device)
            tags = tags.to(device)
            loss = model.neg_log_likelihood(sentence, tags)
            loss += loss.item()
            pred = model(sentence)[1]
            preds.extend(pred)
            actuals.extend(tags.cpu().numpy())
        f1 = f1_score(actuals, preds, average='macro')
        val_macro_f1.append(f1)
        val_loss.append(loss/len(val_data))

    print()
    print(f'Train loss: {train_loss[-1]}, Val loss: {val_loss[-1]}')
    print(f'Train macro f1: {train_macro_f1[-1]}, Val macro f1: {val_macro_f1[-1]}')
    print(f'Epoch {epoch + 1}/{EPOCHS} done')

Epoch 1/10: 906it [00:19, 46.98it/s]?, ?it/s]
Epoch:  10%|█         | 1/10 [00:21<03:17, 21.93s/it]


Train loss: tensor([0.0210], grad_fn=<DivBackward0>), Val loss: tensor([0.0605])
Train macro f1: 0.5203273169524459, Val macro f1: 0.5736410029425777
Epoch 1/10 done


Epoch 2/10: 906it [00:23, 38.07it/s]
Epoch:  20%|██        | 2/10 [00:48<03:15, 24.44s/it]


Train loss: tensor([0.0222], grad_fn=<DivBackward0>), Val loss: tensor([0.0647])
Train macro f1: 0.6793496883916541, Val macro f1: 0.6151321886188014
Epoch 2/10 done


Epoch 3/10: 906it [00:21, 43.02it/s]
Epoch:  30%|███       | 3/10 [01:11<02:46, 23.82s/it]


Train loss: tensor([0.0148], grad_fn=<DivBackward0>), Val loss: tensor([0.0617])
Train macro f1: 0.724411736747783, Val macro f1: 0.6465788757594672
Epoch 3/10 done


Epoch 4/10: 906it [00:18, 48.39it/s]
Epoch:  40%|████      | 4/10 [01:32<02:16, 22.68s/it]


Train loss: tensor([0.0120], grad_fn=<DivBackward0>), Val loss: tensor([0.0627])
Train macro f1: 0.7520209903440499, Val macro f1: 0.6515613063407181
Epoch 4/10 done


Epoch 5/10: 906it [00:21, 42.19it/s]
Epoch:  50%|█████     | 5/10 [01:55<01:54, 22.97s/it]


Train loss: tensor([0.0119], grad_fn=<DivBackward0>), Val loss: tensor([0.0505])
Train macro f1: 0.7820123246122502, Val macro f1: 0.658597051539823
Epoch 5/10 done


Epoch 6/10: 906it [00:19, 45.90it/s]
Epoch:  60%|██████    | 6/10 [02:18<01:31, 22.80s/it]


Train loss: tensor([0.0123], grad_fn=<DivBackward0>), Val loss: tensor([0.0455])
Train macro f1: 0.8012826064959034, Val macro f1: 0.6735026418132467
Epoch 6/10 done


Epoch 7/10: 906it [00:21, 42.54it/s]
Epoch:  70%|███████   | 7/10 [02:41<01:08, 22.95s/it]


Train loss: tensor([0.0091], grad_fn=<DivBackward0>), Val loss: tensor([0.0566])
Train macro f1: 0.8248381875020677, Val macro f1: 0.6923173259595696
Epoch 7/10 done


Epoch 8/10: 906it [00:17, 51.90it/s]
Epoch:  80%|████████  | 8/10 [03:00<00:43, 21.82s/it]


Train loss: tensor([0.0106], grad_fn=<DivBackward0>), Val loss: tensor([0.0498])
Train macro f1: 0.8411830039519916, Val macro f1: 0.695157936260669
Epoch 8/10 done


Epoch 9/10: 906it [00:17, 50.35it/s]
Epoch:  90%|█████████ | 9/10 [03:20<00:21, 21.18s/it]


Train loss: tensor([0.0072], grad_fn=<DivBackward0>), Val loss: tensor([0.0374])
Train macro f1: 0.85879022597361, Val macro f1: 0.6884430540408024
Epoch 9/10 done


Epoch 10/10: 906it [00:19, 47.19it/s]
Epoch: 100%|██████████| 10/10 [03:42<00:00, 22.23s/it]


Train loss: tensor([0.0077], grad_fn=<DivBackward0>), Val loss: tensor([0.0261])
Train macro f1: 0.8783896031651987, Val macro f1: 0.7002072662349249
Epoch 10/10 done





In [13]:
torch.save(model, 'Non Trainable Embeddings/Word2vec/bilstm_crf_word2vec.pt')
torch.save(model, '../../Deliverables/Task 2/Saved Models/t2_BiLSTM-CRF_Word2Vec.pt')
pickle.dump(train_loss, open('Non Trainable Embeddings/Word2vec/train_loss.pkl', 'wb'))
pickle.dump(val_loss, open('Non Trainable Embeddings/Word2vec/val_loss.pkl', 'wb'))
pickle.dump(train_macro_f1, open('Non Trainable Embeddings/Word2vec/train_f1.pkl', 'wb'))
pickle.dump(val_macro_f1, open('Non Trainable Embeddings/Word2vec/val_f1.pkl', 'wb'))

#### Bert

In [14]:
model = BiLSTM_CRF(len(word_to_idx), 768, 256, len(tag_to_ix), torch.tensor(bert_embedding_mat).to(device), tag_to_ix['START_TAG'], tag_to_ix['END_TAG'], tag_to_ix, device=device).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

EPOCHS = 10
train_loss = []
val_loss = []
train_macro_f1 = []
val_macro_f1 = []

for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    loss = 0
    preds = []
    actuals = []
    for i, case in tqdm(enumerate(train_data), desc=f'Epoch {epoch + 1}/{EPOCHS}'):
        sentence = prepare_sequence(train_data[case]['text'].split(' '), word_to_idx)
        tags = prepare_sequence(train_data[case]['labels'], tag_to_ix)
        sentence = sentence.to(device)
        tags = tags.to(device)
        model.train(True)
        model.zero_grad()
        loss = model.neg_log_likelihood(sentence, tags)
        loss.backward()
        optimizer.step()
        loss += loss.item()
        with torch.no_grad():
            pred = model(sentence)[1]
            tags = tags.cpu().numpy()
            preds.extend(pred)
            actuals.extend(tags)
            
    
    train_loss.append(loss/len(train_data))
    f1 = f1_score(actuals, preds, average='macro')
    train_macro_f1.append(f1)
    
    model.eval()
    with torch.no_grad():
        loss = 0
        preds = []
        actuals = []
        for case in val_data:
            sentence = prepare_sequence(val_data[case]['text'].split(' '), word_to_idx)
            tags = prepare_sequence(val_data[case]['labels'], tag_to_ix)
            setence = sentence.to(device)
            tags = tags.to(device)
            loss = model.neg_log_likelihood(sentence, tags)
            loss += loss.item()
            pred = model(sentence)[1]
            preds.extend(pred)
            actuals.extend(tags.cpu().numpy())
        f1 = f1_score(actuals, preds, average='macro')
        val_macro_f1.append(f1)
        val_loss.append(loss/len(val_data))

    print()
    print(f'Train loss: {train_loss[-1]}, Val loss: {val_loss[-1]}')
    print(f'Train macro f1: {train_macro_f1[-1]}, Val macro f1: {val_macro_f1[-1]}')
    print(f'Epoch {epoch + 1}/{EPOCHS} done')

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10: 906it [00:22, 39.90it/s]
Epoch:  10%|█         | 1/10 [00:25<03:47, 25.30s/it]


Train loss: tensor([0.0303], grad_fn=<DivBackward0>), Val loss: tensor([0.1004])
Train macro f1: 0.6888840511837563, Val macro f1: 0.5558927104039523
Epoch 1/10 done


Epoch 2/10: 906it [00:24, 37.16it/s]
Epoch:  20%|██        | 2/10 [00:51<03:28, 26.08s/it]


Train loss: tensor([0.0256], grad_fn=<DivBackward0>), Val loss: tensor([0.0879])
Train macro f1: 0.815413580041756, Val macro f1: 0.6183597149149228
Epoch 2/10 done


Epoch 3/10: 906it [00:22, 40.37it/s]
Epoch:  30%|███       | 3/10 [01:16<02:58, 25.54s/it]


Train loss: tensor([0.0193], grad_fn=<DivBackward0>), Val loss: tensor([0.0845])
Train macro f1: 0.8703079067892516, Val macro f1: 0.6673087602404365
Epoch 3/10 done


Epoch 4/10: 906it [00:23, 38.60it/s]
Epoch:  40%|████      | 4/10 [01:43<02:35, 25.88s/it]


Train loss: tensor([0.0262], grad_fn=<DivBackward0>), Val loss: tensor([0.0988])
Train macro f1: 0.9139947294839187, Val macro f1: 0.6442807070878558
Epoch 4/10 done


Epoch 5/10: 906it [00:25, 34.92it/s]
Epoch:  50%|█████     | 5/10 [02:11<02:14, 26.90s/it]


Train loss: tensor([0.0113], grad_fn=<DivBackward0>), Val loss: tensor([0.1138])
Train macro f1: 0.9392121212604226, Val macro f1: 0.6807049524541381
Epoch 5/10 done


Epoch 6/10: 906it [00:23, 38.58it/s]
Epoch:  60%|██████    | 6/10 [02:38<01:46, 26.62s/it]


Train loss: tensor([0.0073], grad_fn=<DivBackward0>), Val loss: tensor([0.1041])
Train macro f1: 0.9614018713060144, Val macro f1: 0.694253601247821
Epoch 6/10 done


Epoch 7/10: 906it [00:26, 34.08it/s]
Epoch:  70%|███████   | 7/10 [03:07<01:22, 27.45s/it]


Train loss: tensor([0.0253], grad_fn=<DivBackward0>), Val loss: tensor([0.1805])
Train macro f1: 0.9683600444707685, Val macro f1: 0.5867644046334438
Epoch 7/10 done


Epoch 8/10: 906it [00:22, 40.63it/s]
Epoch:  80%|████████  | 8/10 [03:31<00:52, 26.49s/it]


Train loss: tensor([0.0084], grad_fn=<DivBackward0>), Val loss: tensor([0.1077])
Train macro f1: 0.9753117449602394, Val macro f1: 0.6816529814498008
Epoch 8/10 done


Epoch 9/10: 906it [00:22, 40.32it/s]
Epoch:  90%|█████████ | 9/10 [03:56<00:25, 25.97s/it]


Train loss: tensor([0.0024], grad_fn=<DivBackward0>), Val loss: tensor([0.1043])
Train macro f1: 0.9826682837742978, Val macro f1: 0.6988303766422309
Epoch 9/10 done


Epoch 10/10: 906it [00:25, 36.22it/s]
Epoch: 100%|██████████| 10/10 [04:23<00:00, 26.39s/it]


Train loss: tensor([0.0018], grad_fn=<DivBackward0>), Val loss: tensor([0.0973])
Train macro f1: 0.9833451851151787, Val macro f1: 0.6999220581920932
Epoch 10/10 done





In [15]:
torch.save(model, 'Non Trainable Embeddings/Bert/bilstm_crf_bert.pt')
torch.save(model, '../../Deliverables/Task 2/Saved Models/t2_BiLSTM-CRF_Bert.pt')
pickle.dump(train_loss, open('Non Trainable Embeddings/Bert/train_loss.pkl', 'wb'))
pickle.dump(val_loss, open('Non Trainable Embeddings/Bert/val_loss.pkl', 'wb'))
pickle.dump(train_macro_f1, open('Non Trainable Embeddings/Bert/train_f1.pkl', 'wb'))
pickle.dump(val_macro_f1, open('Non Trainable Embeddings/Bert/val_f1.pkl', 'wb'))