# Classify text with BERT

# Setup


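Using the AdamW optimizer from PyTorch (`torch.optim.AdamW`); the AdamW helper from [tensorflow/models](https://github.com/tensorflow/models) is only referenced by a commented-out import.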

In [1]:
# First install all necessary packages

!pip install torch
!pip install transformers
!pip install -U torchtext

Looking in indexes: https://pypi.python.org/simple/
Collecting torch
  Using cached torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl (750.6 MB)
Installing collected packages: torch
Successfully installed torch-1.11.0
Looking in indexes: https://pypi.python.org/simple/
Collecting transformers
  Using cached transformers-4.19.2-py3-none-any.whl (4.2 MB)
Collecting filelock
  Using cached filelock-3.7.0-py3-none-any.whl (10 kB)
Collecting regex!=2019.12.17
  Using cached regex-2022.4.24-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (764 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
Collecting numpy>=1.17
  Using cached numpy-1.22.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Installing collected packages: tokenizers, regex, numpy, filelock, huggingface-hub, tr

In [2]:
!pip install pandas
!pip install matplotlib
!pip install sklearn
!pip install official

Looking in indexes: https://pypi.python.org/simple/
Collecting pandas
  Using cached pandas-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
Collecting pytz>=2020.1
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.2 pytz-2022.1
Looking in indexes: https://pypi.python.org/simple/
Collecting matplotlib
  Using cached matplotlib-3.5.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.33.3-py3-none-any.whl (930 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.4.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)
Collecting pillow>=6.2.0
  Using cached Pillow-9.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib
Succ

In [3]:
import os
import shutil

from utils import load_data

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import metrics, model_selection, preprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
# from official.nlp import optimization  # to create AdamW optimizer


In [4]:
import sys
# Make the project folder importable by inserting it at position 1 of the module search path.
# NOTE(review): `from utils import load_data` runs in the earlier import cell, i.e. before this
# path is added — on a fresh kernel that import presumably only works if `utils` is already
# reachable; confirm the intended cell order.
# NOTE(review): this path contains "workbench-shared-folder" twice, unlike DATA_PATH used for
# loading the data — verify that the repetition is intentional.
sys.path.insert(1, '/home/jovyan/workbench-shared-folder/workbench-shared-folder/canary-project/Paula_internship/')

### Load the dataset


In [5]:
# DATA_PATH = "../data/"
DATA_PATH = "/home/jovyan/workbench-shared-folder/canary-project/Paula_internship/data/"

# Read the (train, test, validation) splits of each of the three sets: A, B and EX.
train_A, test_A, val_A = load_data(
    DATA_PATH, version="A", suffix="_kw", reduce_memory=True
)
train_B, test_B, val_B = load_data(
    DATA_PATH, version="B", suffix="_kw", reduce_memory=True
)
train_EX, test_EX, val_EX = load_data(
    DATA_PATH, version="EX", suffix="_kw", reduce_memory=True
)

Mem. usage decreased to  4.83 Mb (83.6% reduction)
Mem. usage decreased to  2.42 Mb (83.6% reduction)
Mem. usage decreased to  0.81 Mb (83.6% reduction)
Set A with suffix '_kw' was loaded successfully.
Mem. usage decreased to  5.18 Mb (83.6% reduction)
Mem. usage decreased to  2.59 Mb (83.6% reduction)
Mem. usage decreased to  0.86 Mb (83.6% reduction)
Set B with suffix '_kw' was loaded successfully.




  train = pd.read_csv(f"{path}set_{version}_train{suffix}.csv", engine='python', error_bad_lines=False)
Skipping line 157634: unexpected end of data


  test = pd.read_csv(f"{path}set_{version}_test{suffix}.csv", engine='python', error_bad_lines=False)


  val = pd.read_csv(f"{path}set_{version}_val{suffix}.csv", engine='python', error_bad_lines=False)


Mem. usage decreased to 10.82 Mb (83.6% reduction)
Mem. usage decreased to  5.42 Mb (83.6% reduction)
Mem. usage decreased to  1.81 Mb (83.6% reduction)
Set EX with suffix '_kw' was loaded successfully.


In [6]:
# Concatenate the 3 different sets (A, B, EX) split by split.
# `ignore_index=True` gives every row a unique label; without it the row labels of
# A, B and EX are carried over unchanged and repeat in the combined frame, which makes
# any later label-based lookup ambiguous.

train = pd.concat([train_A, train_B, train_EX], ignore_index=True)
test = pd.concat([test_A, test_B, test_EX], ignore_index=True)
val = pd.concat([val_A, val_B, val_EX], ignore_index=True)

# BERT Classifier using PyTorch

### Define useful classes

In [7]:
class Dataset(torch.utils.data.Dataset):
    '''
    Map-style dataset pairing raw texts with their label vectors.

    Texts are tokenized lazily, one item at a time, when they are indexed.
    The class deliberately subclasses ``torch.utils.data.Dataset`` by its fully
    qualified name: this notebook also imports a ``Dataset`` from torch, which
    this class definition shadows.

    Parameters
    ----------
    texts : sequence of str
        Input documents, indexable by integer position.
    labels : sequence
        One label vector per text, indexable by integer position.
    tokenizer : callable
        Called as ``tokenizer(text, None, add_special_tokens=..., max_length=...,
        padding=..., truncation=...)``; must return a mapping with the keys
        ``"input_ids"`` and ``"attention_mask"``.
    max_len : int
        Value forwarded to the tokenizer as ``max_length``.
    truncate : bool
        Value forwarded to the tokenizer as ``truncation``.
    '''

    def __init__(self, texts, labels, tokenizer, max_len, truncate):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.truncation = truncate

    def __len__(self):
        '''Number of samples.'''
        return len(self.texts)

    def __getitem__(self, index):
        '''
        Tokenize the text at `index` and return a dict of long tensors with the
        keys "ids" (token ids), "mask" (attention mask) and "labels".
        '''
        encoded = self.tokenizer(
            self.texts[index],
            None,  # no second sequence
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=self.truncation,
        )

        return {
            "ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(self.labels[index], dtype=torch.long),
        }

In [8]:
from transformers import BertModel, BertTokenizer
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the pretrained encoder
model = BertModel.from_pretrained(model_name)
input_text = "Here is some text to encode"
# tokenizer -> token ids
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
# input_ids: [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102]
input_ids = torch.tensor([input_ids])

with torch.no_grad():
    # Run the forward pass once and reuse its output (it used to be computed twice).
    gg = model(input_ids)
    last_hidden_states = gg[0]
# The last_hidden_states are a tensor of shape (batch_size, sequence_length, hidden_size)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


For the example, the text “Here is some text to encode”:
- The text gets tokenized into 9 tokens (the input_ids) - actually 7 but 2 special tokens are added, namely [CLS] at the start and [SEP] at the end. So the sequence length is 9.
- The batch size is 1, as we only forward a single sentence through the model.
- And the hidden_size of a BERT-base-sized model is 768.

**Hence, the last hidden states have shape (1, 9, 768).**


We can then get the last hidden state vector of each token, e.g. if you want to get it for the first token, you would have to type last_hidden_states[:,0,:].

In [9]:
gg[0][:,0,0]

tensor([-0.0549])

In [10]:
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [11]:
# Encode the example sentence with padding="max_length" and max_length=200.
tok = tokenizer(
    input_text,
    None,
    add_special_tokens=True,
    max_length=200,
    padding="max_length",
    truncation=True,
)
idsss = tok['input_ids']

In [9]:
class Classifier(nn.Module):
    '''
    Linear classification head on top of the pooled output of a BERT-style encoder.

    Parameters
    ----------
    n_train_steps : int
        Stored on the instance as ``n_train_steps``; not used by the forward pass.
    n_classes : int
        Number of output logits.
    do_prob : float
        Dropout probability applied to the pooled encoder output.
    bert_model : nn.Module
        Encoder called as ``bert_model(ids, attention_mask=mask)``; its result
        must be indexable with the key ``"pooler_output"``.
    '''
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        # Take the width of the head from the encoder when it exposes
        # `config.hidden_size`; 768 (the previously hard-coded value) is the fallback.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.out = nn.Linear(hidden_size, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        '''Return the raw logits (no sigmoid applied) for a batch of token ids.'''
        pooled = self.bert(ids, attention_mask=mask)["pooler_output"]
        return self.out(self.dropout(pooled))

In [22]:
class LSTM_Classifier(nn.Module):
    '''
    BERT-style encoder followed by a bidirectional LSTM and a linear head.

    The LSTM runs over the per-token hidden states of the encoder
    (``"last_hidden_state"``). The forward-direction state at the last real
    token and the backward-direction state at the first token are concatenated,
    passed through dropout and projected to ``n_classes`` logits.

    Parameters
    ----------
    n_train_steps : int
        Stored on the instance as ``n_train_steps``; not used by the forward pass.
    n_classes : int
        Number of output logits.
    do_prob : float
        Dropout probability applied before the linear head.
    bert_model : nn.Module
        Encoder called as ``bert_model(ids, attention_mask=mask)``; its result
        must be indexable with the key ``"last_hidden_state"``.
    dimension : int
        Hidden size of each LSTM direction.
    '''
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model, dimension=128):
        super(LSTM_Classifier, self).__init__()
        self.bert = bert_model
        self.dimension = dimension
        self.n_train_steps = n_train_steps
        # The LSTM consumes encoder hidden states, so its input size is the encoder
        # width (it used to be 200, which is the token count, not the feature size).
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.lstm = nn.LSTM(input_size=hidden_size,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=do_prob)

        self.fc = nn.Linear(2*dimension, n_classes)

    def forward(self, ids, mask, text_len=200):
        '''
        Return the raw logits of shape (batch, n_classes).

        `text_len` is an upper bound (int or per-sample tensor) on the number of
        tokens taken into account; the real length of every sample is read from
        `mask`. No sigmoid is applied: the loss (`BCEWithLogitsLoss`) and the
        evaluation code apply it themselves.
        '''
        # Per-token embeddings; the pooled output has no sequence axis to recur over.
        token_emb = self.bert(ids, attention_mask=mask)["last_hidden_state"]
        batch_size, seq_len = token_emb.shape[0], token_emb.shape[1]

        # Number of real (non-padding) tokens per sample, capped by `text_len`.
        cap = torch.as_tensor(text_len, device=mask.device).long()
        lengths = torch.minimum(mask.sum(dim=1).long(), cap).clamp(min=1, max=seq_len)

        # Packing makes the LSTM skip padding, so the backward direction starts
        # at the last real token instead of at the padding.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            token_emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=seq_len)

        rows = torch.arange(batch_size, device=output.device)
        last_token = lengths.to(output.device) - 1
        out_forward = output[rows, last_token, :self.dimension]  # forward state at last real token
        out_reverse = output[:, 0, self.dimension:]              # backward state at first ([CLS]) token
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        return self.fc(text_fea)
        

In [11]:
# # Define which BERT we are using:

# # SPECTER: Document-level Representation Learning using Citation-informed Transformers
# tokenizer = transformers.AutoTokenizer.from_pretrained("allenai/specter", do_lower_case=True)
# bert_model = transformers.AutoModel.from_pretrained("allenai/specter")

# # SCIBERT: BERT model trained on scientific text.
# tokenizer = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased', do_lower_case=True)
# bert_model = transformers.AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')


# # SQUEEZE BERT: 
# tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)
# bert_model = transformers.SqueezeBertModel.from_pretrained("squeezebert/squeezebert-uncased")

In [12]:
# FREEZE BERT PARAMS OR FINE-TUNE?
# SciBERT tokenizer and encoder; every encoder weight is excluded from gradient updates.
scibert_checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(scibert_checkpoint, do_lower_case=True)
bert_model = transformers.AutoModel.from_pretrained(scibert_checkpoint)

for param in bert_model.parameters():
    param.requires_grad_(False)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Number of label columns: everything from the 4th column of `train` onwards.
n_labels = len(train.columns[3:])

def build_dataset(tokenizer_max_len, truncate):
    '''
    Wrap the global `train` and `val` frames in `Dataset` objects.

    The `input` column provides the texts and the columns from position 3
    onwards provide the labels. Returns `(train_dataset, valid_dataset)`.
    '''
    def _wrap(frame):
        # Same tokenizer settings for both splits
        return Dataset(
            frame.input.tolist(),
            frame.iloc[:, 3:].values.tolist(),
            tokenizer,
            tokenizer_max_len,
            truncate,
        )

    return _wrap(train), _wrap(val)

def build_dataloader(train_dataset, valid_dataset, batch_size):
    '''
    Create the torch dataloaders.

    The training loader reshuffles its samples; the validation loader keeps
    the dataset order so evaluation is deterministic and its outputs line up
    with the rows of the validation set.

    Returns `(train_data_loader, valid_data_loader)`.
    '''
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    # No shuffling for evaluation: the order of the samples has no benefit there.
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=1)

    return train_data_loader, valid_data_loader

def build_bucketiterator(train_dataset, valid_dataset, batch_size):
    '''
    Create length-sorted batch iterators for the training and validation sets.

    NOTE(review): this function cannot run as written and its only call in the
    notebook is commented out:
      * `BucketIterator` is not imported anywhere in the visible notebook;
      * `device` is neither a parameter nor defined at module level in the visible cells;
      * the validation iterator is built with `DataLoader` but receives the
        `BucketIterator`-style keyword arguments (`sort_key`, `device`, `sort`,
        `sort_within_batch`) — presumably a copy/paste slip, confirm.
    '''
    # Sort key: number of token ids of a sample (accessed as attribute `ids`)
    train_data_loader = BucketIterator(train_dataset, batch_size=batch_size, sort_key=lambda x: len(x.ids),
                            device=device, sort=True, sort_within_batch=True)
    # NOTE(review): `DataLoader` here looks like it should be `BucketIterator` — verify
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, sort_key=lambda x: len(x.ids),
                            device=device, sort=True, sort_within_batch=True)

    return train_data_loader, valid_data_loader

def ret_model(n_train_steps, do_prob):
    '''
    Build the network to train: an `LSTM_Classifier` on top of the global
    `bert_model`, with one output per label column (`n_labels`).
    '''
    return LSTM_Classifier(n_train_steps, n_labels, do_prob, bert_model=bert_model)


In [17]:
# train_dataset, valid_dataset = build_dataset(config['tokenizer_max_len'], config['truncate'])
# train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config['batch_size'])
# train_bucket, valid_bucket = build_bucketiterator(train_dataset, valid_dataset, config['batch_size'])

In [None]:
# train_data_loader

In [14]:
def ret_optimizer(model, learning_rate=None):
    '''
    Build an AdamW optimizer with two parameter groups.

    Adapted from Abhishek Thakur's Tez library example:
    https://github.com/abhishekkrthakur/tez/blob/main/examples/text_classification/binary.py

    Parameters
    ----------
    model : nn.Module
        Model whose named parameters are optimized.
    learning_rate : float, optional
        Learning rate; when omitted, the global `config['learning_rate']` is used.

    Returns
    -------
    torch.optim.AdamW
        Group 0 uses weight decay 0.001; group 1 (biases and LayerNorm
        parameters) uses no weight decay.
    '''
    if learning_rate is None:
        learning_rate = config['learning_rate']

    # Name fragments of parameters that must not be decayed. "LayerNorm.weight" is
    # listed explicitly: the previous list only contained "bias" and "LayerNorm.bias"
    # (already matched by "bias"), so LayerNorm scale factors were still decayed.
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if any(fragment in name for fragment in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)

    optimizer_parameters = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": not_decayed, "weight_decay": 0.0},
    ]
    return torch.optim.AdamW(optimizer_parameters, lr=learning_rate)

def ret_scheduler(optimizer, num_train_steps):
    '''
    Linear learning-rate schedule without warm-up spanning `num_train_steps` steps.
    '''
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

def loss_function(outputs, labels, loss='BCE'):
    '''
    Compute the training/evaluation loss from raw model outputs.

    Returns None when `labels` is None. `loss='SigF1'` selects a
    differentiable macro-F1 cost; 'BCE' and any other value select binary
    cross-entropy with logits.
    '''
    if labels is None:
        return None

    if loss != 'SigF1':
        # 'BCE' and every unrecognised name share the same criterion
        criterion = nn.BCEWithLogitsLoss()
        return criterion(outputs, labels.float())

    # Hyper-parameters of the smoothing sigmoid
    slope = torch.tensor(-1)
    offset = torch.tensor(0)

    probs = torch.sigmoid(outputs)
    soft_pred = 1 / (1 + torch.exp(slope * (probs + offset)))

    # Soft confusion counts per label (summed over the batch axis)
    true_pos = torch.sum(soft_pred * labels, dim=0)
    false_pos = torch.sum(soft_pred * (1 - labels), dim=0)
    false_neg = torch.sum((1 - soft_pred) * labels, dim=0)

    soft_f1 = 2 * true_pos / (2 * true_pos + false_neg + false_pos + 1e-16)
    return torch.mean(1 - soft_f1)

def log_metrics(preds, labels):
    '''
    Micro-averaged ROC AUC over all (sample, label) pairs.

    `preds` and `labels` are lists of tensors; each list is stacked, moved to
    the CPU and flattened before the ROC curve is computed.
    Returns `{"auc_micro": <area under the curve>}`.
    '''
    scores = torch.stack(preds).cpu().detach().numpy()
    truth = torch.stack(labels).cpu().detach().numpy()

    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(truth.ravel(), scores.ravel())

    return {"auc_micro": metrics.auc(false_pos_rate, true_pos_rate)}

### Define the training and evaluation functions

In [15]:
from tqdm.notebook import tqdm

def train_fn(data_loader, model, loss_fn, optimizer, device, scheduler):
    '''
        Run one training epoch and return the sum of the batch losses.

        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''
    model.train()
    running_loss = 0.0

    for batch in tqdm(data_loader, total=len(data_loader)):
        # Move the batch to the target device
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["labels"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        batch_loss = loss_function(model(ids=ids, mask=mask), targets, loss_fn)
        batch_loss.backward()
        running_loss += batch_loss.item()

        # The optimizer and the scheduler both advance once per batch
        optimizer.step()
        scheduler.step()

    return running_loss
    

def eval_fn(data_loader, model, device):
    '''
        Evaluate the model without gradient tracking.

        Returns `(summed loss, list of sigmoid outputs, list of targets)`; both
        lists contain one tensor per sample.

        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''
    model.eval()
    running_loss = 0.0
    all_targets, all_outputs = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["labels"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask)
            # Default loss of loss_function (binary cross-entropy)
            running_loss += loss_function(logits, targets).item()

            all_targets.extend(targets)
            all_outputs.extend(torch.sigmoid(logits))

    return running_loss, all_outputs, all_targets

## Train the network

In [16]:
def trainer(config):
    '''
    Train the model returned by `ret_model` and checkpoint it after every epoch.

    Reads from `config`: 'tokenizer_max_len', 'truncate', 'batch_size',
    'dropout', 'epochs' and 'loss'.

    After each epoch the current weights are written to
    "./models/model_current_LSTM.pt"; whenever the average validation loss
    improves they are also written to "./models/model_best_LSTM.pt".
    '''
    train_dataset, valid_dataset = build_dataset(config['tokenizer_max_len'], config['truncate'])
    train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config['batch_size'])
    print("Length of Train Dataloader: ", len(train_data_loader))
    print("Length of Valid Dataloader: ", len(valid_data_loader))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_epochs = config['epochs']
    loss_fn = config['loss']

    # The scheduler advances once per batch, so its horizon is batches-per-epoch
    # times the configured number of epochs (it used to assume exactly 10 epochs).
    n_train_steps = len(train_data_loader) * n_epochs

    model = ret_model(n_train_steps, config['dropout'])
    optimizer = ret_optimizer(model)
    scheduler = ret_scheduler(optimizer, n_train_steps)
    model.to(device)
    model = nn.DataParallel(model)

    # torch.save does not create missing directories
    os.makedirs("./models", exist_ok=True)

    best_val_loss = float('inf')
    for epoch in tqdm(range(n_epochs)):
        print('Train EPOCH: ', epoch+1)
        train_loss = train_fn(train_data_loader, model, loss_fn, optimizer, device, scheduler)
        eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)

        auc_score = log_metrics(preds, labels)["auc_micro"]
        avg_train_loss = train_loss / len(train_data_loader)
        avg_val_loss = eval_loss / len(valid_data_loader)

        print("Average Train loss: ", avg_train_loss)
        print("Average Valid loss: ", avg_val_loss)
        print("Valid micro-AUC: ", auc_score)
        torch.save(model.state_dict(), "./models/model_current_LSTM.pt")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "./models/model_best_LSTM.pt")
            print("Model saved as current val_loss is: ", best_val_loss)

Set some configuration parameters (to be fine-tuned)

In [17]:
# Hyper-parameters shared by the training and inference cells.
config = {
    'learning_rate': 1e-3,      # learning rate of the AdamW optimizer (ret_optimizer)
    'batch_size': 64,           # batch size of the train/validation dataloaders
    'epochs': 10,               # number of training epochs (trainer)
    'dropout': 0.3,             # dropout probability passed to the model (ret_model)
    'tokenizer_max_len': 200,   # `max_length` given to the tokenizer (Dataset)
    'truncate': True,           # `truncation` flag given to the tokenizer (Dataset)
    'loss': 'BCE',              # loss name understood by loss_function
#     'loss': 'SigF1'
}

In [None]:
# Train the model:

trainer(config) 

Length of Train Dataloader:  4743
Length of Valid Dataloader:  792


  0%|          | 0/10 [00:00<?, ?it/s]

Train EPOCH:  1


  0%|          | 0/4743 [00:00<?, ?it/s]

## Define some functions for the inference

In [None]:
def load_model(model_name):
    '''
    Load a saved model and return it in evaluation mode.

    The architecture is rebuilt with `ret_model`, wrapped in `nn.DataParallel`
    (the checkpoints are saved from a wrapped model) and filled with the
    weights stored in `Models_PATH + model_name`.

    Parameters
    ----------
    model_name : str
        File name of the checkpoint inside the models folder.
    '''
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Same value as before, computed from the training frame directly: no datasets,
    # dataloaders, optimizer or scheduler are needed just to restore weights.
    n_train_steps = int(len(train) / config['batch_size'] * 10)

    model = ret_model(n_train_steps, config['dropout'])
    model.to(device)
    model = nn.DataParallel(model)

    Models_PATH = "/home/jovyan/workbench-shared-folder/canary-project/Paula_internship/models/"
    model.load_state_dict(torch.load(Models_PATH + model_name, map_location=device))

    # Inference must run with dropout disabled
    model.eval()

    return model

In [None]:
def inference_batches(test, model):
    '''
    Predict outputs for the inference phase.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with an `input` text column; the columns from position 3 onwards
        are used as labels.
    model : nn.Module
        Model called as `model(ids=..., mask=...)` and returning logits.

    Returns
    -------
    (test_outputs, test_targets) : two lists of numpy arrays, one entry per row
        of `test` and in the same order as its rows.
    '''
    test_targets = []
    test_outputs = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    test_dataset = Dataset(test.input.tolist(), test.iloc[:, 3:].values.tolist(), tokenizer, config['tokenizer_max_len'], config['truncate'])
    # No shuffling: predictions stay aligned with the rows of `test`.
    data_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=2)

    # Disable dropout so that predictions are deterministic
    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            labels = batch["labels"].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            test_targets.extend(labels.cpu().numpy())
            test_outputs.extend(torch.sigmoid(outputs).cpu().numpy())

    return test_outputs, test_targets

In [None]:
def remove_duplicates(test_df=None, train_df=None, val_df=None):
    '''
    Remove from the test set every row whose `pui` also occurs in the
    training or validation set.

    Parameters
    ----------
    test_df, train_df, val_df : pandas.DataFrame, optional
        Frames with a `pui` column. Each one defaults to the corresponding
        global frame (`test`, `train`, `val`) when omitted.

    Returns
    -------
    pandas.DataFrame
        The rows of the test frame whose `pui` is in neither other frame.
    '''
    test_df = test if test_df is None else test_df
    train_df = train if train_df is None else train_df
    val_df = val if val_df is None else val_df

    # A single union avoids counting an identifier twice when it is present in
    # both the training and the validation set. The previous row-count assertion
    # failed in that case, and also when a `pui` occurred on several test rows.
    seen_puis = set(train_df.pui) | set(val_df.pui)

    return test_df[~test_df['pui'].isin(seen_puis)]

In [None]:
def get_metrics(preds, labels):
    '''
    Compute per-class and averaged precision, recall and F1 (in percent).

    Parameters
    ----------
    preds : list of sequences
        Predicted probabilities, one row per sample and one column per class;
        they are rounded to 0/1 before counting.
    labels : list of sequences
        Ground-truth 0/1 labels with the same layout.

    Returns
    -------
    dict with per-class lists ('Precision', 'Recall', 'F1 score',
    'Weighted F1 score'), the class weights ('weights', a pandas Series with
    the share of positive labels per class) and the rounded averages.
    'Average weighted F1 score' is the support-weighted mean of the F1 scores.
    '''
    # Convert the lists to dataframes
    lab_df = pd.DataFrame(labels)
    pred_df = pd.DataFrame(preds).round(0).astype(int)

    # Calculate tp/fp/fn/tn per class:
    tp = (pred_df + lab_df).eq(2).sum()
    fp = (pred_df - lab_df).eq(1).sum()
    fn = (pred_df - lab_df).eq(-1).sum()
    tn = (pred_df + lab_df).eq(0).sum()

    classes = range(len(tp))

    # Calculate precision and recall:
    prec = [tp[i] / (tp[i] + fp[i]) * 100.0 if tp[i] + fp[i] != 0 else 0.0 for i in classes]
    rec = [tp[i] / (tp[i] + fn[i]) * 100.0 if tp[i] + fn[i] != 0 else 0.0 for i in classes]

    # Calculate F1 score (tp > 0 guarantees a non-zero denominator):
    f1_score = [2 * prec[i] * rec[i] / (prec[i] + rec[i]) if tp[i] > 0 else 0.0 for i in classes]

    # Weighted F1 score: each class contributes proportionally to its number of positives
    weight = lab_df.sum() / sum(lab_df.sum())
    f1_wght = [weight[i] * f1_score[i] if tp[i] > 0 else 0.0 for i in classes]

    # Macro averages:
    prec_avg = sum(prec) / len(prec)
    rec_avg = sum(rec) / len(rec)
    f1_avg = sum(f1_score) / len(f1_score)
    # The weights already sum to 1, so the weighted average is the plain sum of the
    # weighted scores (dividing by the number of classes again would shrink it).
    f1wgt_avg = sum(f1_wght)

    # np.round also accepts plain Python floats, which occur when every class
    # takes the 0.0 fallback (a float has no `.round` method).
    return {
        'Precision': prec,
        'Recall': rec,
        'F1 score': f1_score,
        'Weighted F1 score': f1_wght,
        'weights': weight,
        'Average precision': np.round(prec_avg, 2),
        'Average recall': np.round(rec_avg, 2),
        'Average F1 score': np.round(f1_avg, 2),
        'Average weighted F1 score': np.round(f1wgt_avg, 2),
    }

# all_metrics = get_metrics(preds, labels)

## Inference:

In [None]:
# Predict outputs:
#   1. drop the test rows whose `pui` also occurs in the train/validation sets,
#   2. restore the saved weights,
#   3. run the model over the cleaned test set,
#   4. compute the evaluation metrics.
# NOTE(review): ret_model() builds an LSTM_Classifier, so this checkpoint must have been
# saved from that architecture for load_state_dict to succeed — confirm the file name.
test_clean = remove_duplicates()
model = load_model('model_best_freezeF1.pt')
preds, labels = inference_batches(test_clean, model)
all_metrics = get_metrics(preds, labels)

In [None]:
all_metrics = get_metrics(preds, labels)

# Show only the aggregated figures.
# The loop variable must not be called `val`: that name holds the validation
# DataFrame used by build_dataset() and remove_duplicates().
for metr, metr_value in all_metrics.items():
    if 'Average' in metr:
        print(metr, metr_value)

In [None]:
# One row per class with its precision, recall, F1 and class weight (in percent).
metrics_rows = [
    all_metrics['Precision'],
    all_metrics['Recall'],
    all_metrics['F1 score'],
    all_metrics['weights']*100,
]
row_names = {0: "Precision", 1: "Recall", 2: "F1", 3: "Weights"}
metrics_df = pd.DataFrame(metrics_rows).rename(index=row_names).T
metrics_df.to_csv('outputs/metrics_FrozenSci_F1.csv', sep=';')
metrics_df