# Evaluation of our BERT-based fine-tuned model

In [31]:
import torch

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [32]:
# Checkpoint name passed to both AutoTokenizer and AutoModel.
BASE_MODEL_NAME = "bert-base-uncased"
# Path of the saved state dict that is loaded into the classifier.
SAVED_MODEL_NAME = "models/bert-finetuned-30-epochs.sd"

# NOTE(review): RANDOM_STATE and FINE_TUNING are not referenced by any visible
# cell -- confirm they are still needed in an evaluation-only notebook.
RANDOM_STATE = 42
FINE_TUNING = True

# Batch sizes for the data loaders (both are 8).
TRAIN_BATCH_SIZE = VAL_BATCH_SIZE = 8


In [33]:
from transformers import logging, AutoTokenizer, AutoModel

# Instantiate the tokenizer for BASE_MODEL_NAME -- the same checkpoint name the
# encoder is loaded from.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

In [34]:
import transformers

# Load the pretrained BERT encoder for BASE_MODEL_NAME. BertClassification
# wraps this module-level instance as its `dbert` sub-module.
dbert_pt = AutoModel.from_pretrained(BASE_MODEL_NAME)

In [35]:
from torch import nn
# Default size of the classification head.
num_classes = 2


class BertClassification(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Sub-module names (`dbert`, `linear`, `dropout`) are unchanged, so the
    state-dict keys stay the same.

    Args:
        encoder: Module to wrap as `self.dbert`. It must expose
            `config.hidden_size` and return an output that can be indexed with
            "last_hidden_state". Defaults to the module-level `dbert_pt`.
        n_classes: Number of output scores per example. Defaults to the
            module-level `num_classes`.
    """

    def __init__(self, encoder=None, n_classes=None):
        super().__init__()
        # Fall back to the notebook globals so `BertClassification()` behaves as before.
        if encoder is None:
            encoder = dbert_pt
        if n_classes is None:
            n_classes = num_classes
        self.dbert = encoder
        # Keep the attribute in sync with the actual head size instead of a hard-coded 2.
        self.num_classes = n_classes
        self.linear = nn.Linear(encoder.config.hidden_size, n_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, attention_mask=None):
        """Return the raw (un-normalised) class scores for a batch.

        Args:
            x: Token ids passed to the encoder as `input_ids`
                (assumed to be (batch, sequence) -- TODO confirm against callers).
            attention_mask: Optional mask forwarded to the encoder. When
                omitted the encoder is called exactly as before, with
                `input_ids` only.

        Returns:
            Output of the linear head applied to the hidden state at
            position 0 along dim 1.
        """
        if attention_mask is None:
            encoded = self.dbert(input_ids=x)
        else:
            encoded = self.dbert(input_ids=x, attention_mask=attention_mask)
        # Hidden state at index 0 of dim 1 (presumably the [CLS] position) is the sentence summary.
        pooled = encoded["last_hidden_state"][:, 0, :]
        pooled = self.dropout(pooled)
        return self.linear(pooled)

**Load saved model parameters**

In [36]:
print("Loading model from: ", SAVED_MODEL_NAME)
model_pt = BertClassification().to(device)
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written from a GPU session still loads when this notebook runs on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (or pass weights_only=True where the installed PyTorch supports it).
model_pt.load_state_dict(torch.load(SAVED_MODEL_NAME, map_location=device))
# Switch to eval mode so dropout is disabled; as the last expression of the
# cell this also echoes the module repr.
model_pt.eval()

Loading model from:  models/bert-finetuned-30-epochs.sd


BertClassification(
  (dbert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [37]:
# Convert data to torch dataset
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset that pairs `X[idx]` with `y[idx]`."""

    def __init__(self, X, y):
        """
        Args:
            X, y as Torch tensors (anything supporting `len()` and integer
            indexing works); both must have the same length.

        Raises:
            ValueError: if `X` and `y` differ in length.
        """
        # A mismatch would otherwise be silent: __len__ follows y, so extra
        # rows of X are never served and missing rows fail late with IndexError.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        # Attribute names are kept for backward compatibility, even though the
        # notebook uses this class for test data.
        self.X_train = X
        self.y_train = y

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at `idx`."""
        return self.X_train[idx], self.y_train[idx]

In [38]:
# Evaluate the model

from datetime import datetime
from tqdm import tqdm

# Cross-entropy loss for the classifier's raw class scores.
criterion = torch.nn.CrossEntropyLoss()
# NOTE(review): presumably left over from the training notebook -- no optimizer
# is used anywhere in the visible evaluation cells.
# optimizer = torch.optim.AdamW(model_pt.parameters(), lr=1e-5, weight_decay=0.01)


def evaluate_model(model_pt, dataloader):
    """Print confusion-matrix counts and accuracy / precision / recall / F1.

    Label 1 is the positive class; every other label value counts as negative.
    A prediction is the argmax of the model output over dim 1.

    The model is switched to eval mode and run without gradient tracking for
    the duration of the call; its previous train/eval mode is restored
    afterwards. Batches are moved to the device that holds the model's
    parameters.

    Args:
        model_pt: `torch.nn.Module` that maps a batch `X` to per-class scores.
        dataloader: Iterable yielding `(X, y)` tensor batches.

    Returns:
        None. All results are printed.
    """
    # Follow the model's own device rather than a notebook global, so the
    # inputs always land where the weights are.
    first_param = next(model_pt.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    TP = TN = FP = FN = 0
    correct = 0
    total = 0

    was_training = model_pt.training
    model_pt.eval()  # dropout must be off, otherwise the metrics are noisy
    try:
        # No autograd graph is needed for evaluation; this saves memory and time.
        with torch.no_grad():
            for X, y in dataloader:
                X = X.to(model_device)
                y = y.to(model_device)
                prediction_index = model_pt(X).argmax(dim=1)

                actual_pos = y == 1
                predicted_pos = prediction_index == 1
                predicted_neg = prediction_index == 0

                TP += int((actual_pos & predicted_pos).sum())
                FN += int((actual_pos & ~predicted_pos).sum())
                TN += int((~actual_pos & predicted_neg).sum())
                FP += int((~actual_pos & ~predicted_neg).sum())

                correct += int((prediction_index == y).sum())
                total += y.numel()
    finally:
        model_pt.train(was_training)

    # Each ratio falls back to 0.0 when its denominator is empty (no examples,
    # no predicted positives, no actual positives) instead of raising
    # ZeroDivisionError.
    valid_accuracy = correct / total if total else 0.0

    print('TP:', TP)
    print('FN:', FN)
    print('FP:', FP)
    print('TN:', TN)
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print("accuracy: {:10.4f}".format(valid_accuracy))
    print('precision: {:10.4f}'.format(precision))
    print('recall: {:10.4f}'.format(recall))
    print('F1: {:10.4f}'.format(F1))
    # Same four numbers on one comma-separated line, convenient for copy/paste.
    print('{:10.4f},{:10.4f},{:10.4f},{:10.4f}'.format(valid_accuracy, precision, recall, F1))

**1. Evaluate on SUBJ Dataset**

In [39]:
import pandas as pd
import numpy as np

# Read the SUBJ test CSV; the bare `df.shape` shows (rows, columns) as the cell output.
df = pd.read_csv('data/SUBJ/test.csv')
df.shape

(1000, 2)

In [40]:
# Pull the label and text columns out of the frame as plain Python lists.
labels = df['label'].to_list()
texts = df['text'].to_list()

# Tokenize every text to exactly 256 ids (padded or truncated); only the
# input ids are kept.
texts = tokenizer(texts, padding='max_length', max_length=256, truncation=True, return_tensors='pt')["input_ids"]
# Build the int64 label tensor directly instead of going through a float tensor.
labels = torch.tensor(labels, dtype=torch.long)

dataset = MyDataset(texts, labels)
# This is an evaluation loader, so it takes the validation batch size.
test_dataloader = DataLoader(dataset, batch_size=VAL_BATCH_SIZE)

evaluate_model(model_pt, test_dataloader)

TP: 485
FN: 15
FP: 12
TN: 488
accuracy:     0.9730
precision:     0.9759
recall:     0.9700
F1:     0.9729
    0.9730,    0.9759,    0.9700,    0.9729


**2. Evaluate on tasksource**

In [41]:
# Read the tasksource test CSV; its columns are renamed and its labels encoded
# in the next cell.
df = pd.read_csv('data/tasksource/test.csv')
# df.head()

In [42]:
# Numeric class id for each label string expected in the tasksource frame.
label2id = {"SUBJ": 1, "OBJ": 0}

# Drop the final column, then rename "Sentence"/"Label" to "text"/"label".
# NOTE(review): `df` is overwritten, so re-running this cell strips one more
# column each time -- reload the CSV before re-running.
df = df.iloc[:, :-1].rename(
    columns={"Sentence": "text", "Label": "label"},
)


def encode_labels(example):
    """Replace `example["label"]` with its id from `label2id` and return `example`.

    The mapping is updated in place. A label that is not a key of `label2id`
    raises KeyError.
    """
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Turn the label strings into their numeric ids; an unknown label raises KeyError.
# NOTE(review): not idempotent -- a second run looks up the already-encoded ints
# in label2id and fails.
df["label"] = df["label"].map(lambda label_name: label2id[label_name])
# df.head()

In [43]:
# Pull the label and text columns out of the frame as plain Python lists.
labels = df['label'].to_list()
texts = df['text'].to_list()

# Tokenize every text to exactly 256 ids (padded or truncated); only the
# input ids are kept.
texts = tokenizer(texts, padding='max_length', max_length=256, truncation=True, return_tensors='pt')["input_ids"]
# Build the int64 label tensor directly instead of going through a float tensor.
labels = torch.tensor(labels, dtype=torch.long)

dataset = MyDataset(texts, labels)
# This is an evaluation loader, so it takes the validation batch size.
test_dataloader = DataLoader(dataset, batch_size=VAL_BATCH_SIZE)

evaluate_model(model_pt, test_dataloader)

TP: 45
FN: 68
FP: 17
TN: 89
accuracy:     0.6119
precision:     0.7258
recall:     0.3982
F1:     0.5143
    0.6119,    0.7258,    0.3982,    0.5143


**3. Evaluate on Bard dataset**

In [44]:
import pandas as pd
import numpy as np

# Read the Bard CSV; the bare `df.shape` shows (rows, columns) as the cell output.
df = pd.read_csv('data/bard.csv')
df.shape

(100, 2)

In [45]:
# Pull the label and text columns out of the frame as plain Python lists.
labels = df['label'].to_list()
texts = df['text'].to_list()

# Tokenize every text to exactly 256 ids (padded or truncated); only the
# input ids are kept.
texts = tokenizer(texts, padding='max_length', max_length=256, truncation=True, return_tensors='pt')["input_ids"]
# Build the int64 label tensor directly instead of going through a float tensor.
labels = torch.tensor(labels, dtype=torch.long)

dataset = MyDataset(texts, labels)
# This is an evaluation loader, so it takes the validation batch size.
test_dataloader = DataLoader(dataset, batch_size=VAL_BATCH_SIZE)

evaluate_model(model_pt, test_dataloader)

TP: 33
FN: 17
FP: 0
TN: 50
accuracy:     0.8300
precision:     1.0000
recall:     0.6600
F1:     0.7952
    0.8300,    1.0000,    0.6600,    0.7952


**4. Evaluate on the AI21 dataset (`ai21.csv`)**

In [46]:
import pandas as pd
import numpy as np

# Read the AI21 CSV; the bare `df.shape` shows (rows, columns) as the cell output.
df = pd.read_csv('data/ai21.csv')
df.shape

(100, 2)

In [47]:
# Pull the label and text columns out of the frame as plain Python lists.
labels = df['label'].to_list()
texts = df['text'].to_list()

# Tokenize every text to exactly 256 ids (padded or truncated); only the
# input ids are kept.
texts = tokenizer(texts, padding='max_length', max_length=256, truncation=True, return_tensors='pt')["input_ids"]
# Build the int64 label tensor directly instead of going through a float tensor.
labels = torch.tensor(labels, dtype=torch.long)

dataset = MyDataset(texts, labels)
# This is an evaluation loader, so it takes the validation batch size.
test_dataloader = DataLoader(dataset, batch_size=VAL_BATCH_SIZE)

evaluate_model(model_pt, test_dataloader)

TP: 22
FN: 28
FP: 0
TN: 50
accuracy:     0.7200
precision:     1.0000
recall:     0.4400
F1:     0.6111
    0.7200,    1.0000,    0.4400,    0.6111
