# Fine-tuning process

In [1]:
import torch
from torch import cuda


# Run on the GPU when PyTorch reports a usable CUDA device; fall back to the
# CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Bare expression so the notebook displays the selected device.
device

'cuda'

### Load training set, validation set and models

In [2]:
import kaggle
import os
import zipfile


# Authenticate the Kaggle client before issuing any download request.
kaggle.api.authenticate()

# Directory that holds the downloaded splits; announce it only when it is
# newly created.
data_path = "./datasets/"
if not os.path.exists(data_path):
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")

# Download the training and validation files of the dataset.
dataset_slug = 'marcelhiltner/pubmed-human-veterinary-medicine-classification'
for split_file in ("train.json", "valid.json"):
    kaggle.api.dataset_download_file(dataset_slug, file_name=split_file, path=data_path)

# The code expects each download to be stored as "<file>.zip": unpack every
# archive into the data directory and delete the archive afterwards.
zip_paths = [f"{data_path}{archive_name}" for archive_name in ("valid.json.zip", "train.json.zip")]
for archive_path in zip_paths:
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(data_path)
    os.remove(archive_path)

# Show what ended up in the data directory.
print(os.listdir(data_path))

['train.json', 'valid.json']


In [None]:
import pandas as pd


# Load both splits into DataFrames and report their shapes.
# A failure is printed and then re-raised: the following cells need
# `train_set` and `val_set`, so swallowing the error would only postpone the
# failure to a less informative NameError further down the notebook.
try:
    train_set = pd.read_json(f"{data_path}train.json", orient="records")
    print("Data loaded successfully: train.json")
    print(f"Shape: {train_set.shape}")
    val_set = pd.read_json(f"{data_path}valid.json", orient="records")
    print("Data loaded successfully: valid.json")
    print(f"Shape: {val_set.shape}")
except Exception as e:
    print(f"An error occurred: {e}")
    raise

### Fine-tune models

In [None]:
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

from Source_code.z_utils.BERTClassifier import BERTClassifier
from Source_code.z_utils.RoBERTaClassifier import RoBERTaClassifier
from Source_code.z_utils.DeBERTaClassifier import DeBERTaClassifier
from Source_code.z_utils.BlueBERTClassifier import BlueBERTClassifier
from Source_code.z_utils.XLNetClassifier import XLNetClassifier
from Source_code.z_utils.data_preparing import get_dataloader
from Source_code.z_utils.Dataset import Dataset
from Source_code.z_utils.loss_fn import loss_fn
from Source_code.z_utils.train import train_model
from Source_code.z_utils.plot import plot
from Source_code.z_utils.global_constants import *


# Classifiers to fine-tune, one per pretrained checkpoint. Each wrapper is used
# below through its `.checkpoint` (the name passed here) and `.tokenizer`.
models = [
    BERTClassifier("bert-base-uncased"),
    RoBERTaClassifier("roberta-base"),
    DeBERTaClassifier("microsoft/deberta-base"),
    BlueBERTClassifier("bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16"),
    XLNetClassifier("xlnet-large-cased"),
]

# One entry per trained model; consumed by the plotting cell.
model_histories = []

# Create the output directory once up front instead of re-checking it on
# every loop iteration.
os.makedirs(PATH_SAVED_METRICS, exist_ok=True)

for model in models:
    print("=" * 30)
    print(f'Model {model.checkpoint} started.')
    print("=" * 30)

    model = model.to(device)

    # Each model brings its own tokenizer, so the dataloaders are rebuilt per model.
    train_dataloader = get_dataloader(train_set.title_abstract, train_set.labels, model.tokenizer, TRAIN_BATCH_SIZE, MAX_LEN)
    val_dataloader = get_dataloader(val_set.title_abstract, val_set.labels, model.tokenizer, VAL_BATCH_SIZE, MAX_LEN)

    torch.cuda.empty_cache()

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # The linear schedule spans every optimisation step of every epoch, without warm-up.
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    # Fine-tune the model and collect its training/validation history.
    metrics, losses, sliding_accuracies, info_best_model, run_times = train_model(model, train_dataloader, val_dataloader, TRAIN_BATCH_SIZE, loss_fn, optimizer, device, scheduler, EPOCHS)

    # Checkpoint names such as "microsoft/deberta-base" contain a slash; keep
    # only the part after the first slash so it can be used in a file name.
    model_tag = model.checkpoint.split('/', 1)[-1]

    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_json(f"{PATH_SAVED_METRICS}metrics_{model_tag}.json")
    # Losses are stored as tensors; convert them to plain floats before saving.
    train_losses_df = pd.DataFrame([{"loss": elem["loss"].detach().item(), "epoch": elem["epoch"]} for elem in losses])
    train_losses_df.to_json(f"{PATH_SAVED_METRICS}training_losses_{model_tag}.json")
    train_accs_df = pd.DataFrame(sliding_accuracies)
    train_accs_df.to_json(f"{PATH_SAVED_METRICS}training_accuracies_{model_tag}.json")
    with open(f"{PATH_SAVED_METRICS}info_best_model_{model_tag}.txt", "w") as text_file:
        text_file.write(info_best_model)
    run_times_df = pd.DataFrame(run_times)
    run_times_df.to_json(f"{PATH_SAVED_METRICS}training_times_{model_tag}.json")

    model_histories.append({"model": model.checkpoint, "metrics": metrics_df, "train_losses": train_losses_df, "train_accs": train_accs_df, "info_best_model": info_best_model})

    # The `models` list keeps every trained model alive, so without this step
    # each one would stay on the GPU while the next (larger) model is loaded.
    # Move the finished model back to the CPU and release the cached blocks.
    model.to("cpu")
    torch.cuda.empty_cache()

### Plot accuracy and loss history of training and validation

In [None]:
from itertools import chain

def _epoch_boundaries(n_points):
    """Return the x positions that split `n_points` samples into EPOCHS equal parts."""
    return [i * n_points / EPOCHS for i in range(1, EPOCHS)]


# Draw four figures per trained model from the histories collected during training.
for model_hist in model_histories:
    # Use the name stored with this history. The leftover `model` variable of
    # the training loop always points at the last trained model and would
    # label every figure with the same checkpoint.
    model_name = model_hist["model"]

    print("=" * 30)
    print(f'Model {model_name}')
    print("=" * 30)

    metrics_df = model_hist["metrics"]
    train_losses_df = model_hist["train_losses"]
    train_accs_df = model_hist["train_accs"]

    # training and validation accuracy history
    data_list = [metrics_df["train_acc"], metrics_df["val_acc"]]
    label_list = ["Training accuracy", "Validation accuracy"]
    all_accs = pd.concat(data_list)
    plot(data_list, label_list,
         "Training and validation accuracy history",
         "Accuracy",
         "Evaluation number",
         vlines=_epoch_boundaries(len(metrics_df["train_acc"])),
         ylim=[all_accs.min(), all_accs.max()],
         model_name=model_name
        )

    # training and validation loss history
    data_list = [metrics_df["train_loss"], metrics_df["val_loss"]]
    label_list = ["Training loss", "Validation loss"]
    all_losses = pd.concat(data_list)
    plot(data_list, label_list,
         "Training and validation loss history",
         "Loss",
         "Evaluation number",
         vlines=_epoch_boundaries(len(metrics_df["train_loss"])),
         ylim=[all_losses.min(), all_losses.max()],
         model_name=model_name
        )

    # training loss history
    plot([train_losses_df["loss"]],
         ["Training loss"],
         "Training loss of all epochs",
         "Loss",
         "Number of training samples",
         vlines=_epoch_boundaries(len(train_losses_df["loss"])),
         ylim=[train_losses_df["loss"].min(), train_losses_df["loss"].max()],
         model_name=model_name
        )

    # training accuracy history; the epoch markers are derived from the
    # plotted accuracy series itself so they line up with its x axis
    plot([train_accs_df["accuracy"]],
         ["Training accuracy"],
         "Training accuracy per batch of all epochs",
         "Accuracy",
         "Number of training samples",
         vlines=_epoch_boundaries(len(train_accs_df["accuracy"])),
         ylim=[train_accs_df["accuracy"].min(), train_accs_df["accuracy"].max()],
         model_name=model_name
        )