In [None]:
#Libraries
import os
import torch
from torch.utils.data import DataLoader
import json

from transformers import AutoModel
from models.ModelRetriever import get_full_classification_model, get_classification_head_model, get_adapters_model, get_lora_model
from evaluation.model_evaluator import ModelEvaluator
from hf_utils import load_model_from_hf

from datasets import load_from_disk
from sklearn.metrics import roc_auc_score
from copy import deepcopy

import shutil
from transformers import AutoConfig
from huggingface_hub import login, HfApi, snapshot_download, upload_folder

In [None]:
# Globals
# Use the GPU when torch reports one as available, otherwise fall back to CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 32
NUM_LABELS = 2
LABEL_NAMES = ['negative', 'positive']

# Every variant exists in three '_set_<size>' flavours, so the names are built
# as '<variant>_set_<size>' (variant-major order) instead of listed by hand.
MODEL_NAMES = [
    f'{variant}_set_{subset}'
    for variant in (
        'full_fine_tuning',
        'head_fine_tuning',
        'adapters_inner_dim_48',
        'adapters_inner_dim_96',
        'lora_r_32_alpha_64',
        'lora_r_64_alpha_32',
    )
    for subset in ('small', 'medium', 'full')
]

# Input dataset location and the folder that receives the metrics JSON files.
DATASET_PATH = './datasets/test_dataset'
SAVE_PATH = './evaluation/evaluation_metrics'

### Load test dataset

This dataset consists of 25,000 test instances, split equally between positive and negative examples, which are used to evaluate model performance.

In [None]:
#Load test
# Read the dataset stored at DATASET_PATH and expose only the model inputs and
# the label column as torch tensors.
test_ds = load_from_disk(DATASET_PATH)
test_ds.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)
# shuffle=False keeps the batch order identical on every run.
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Pretrained BERT backbone, loaded once in float32; the evaluation cells below
# deep-copy it rather than loading it again.
base_model = AutoModel.from_pretrained(
    "bert-base-uncased",
    dtype=torch.float32,
)

### Evaluate base model

This model serves as the baseline against which all fine-tuned models are measured.

In [None]:
# Baseline: the pretrained backbone wrapped for classification and evaluated
# without loading any fine-tuned weights.

# Create the output folder before the (slow) evaluation so the run cannot fail
# at the final save step just because the directory is missing.
os.makedirs(SAVE_PATH, exist_ok=True)

# NOTE(review): get_full_classification_model presumably attaches a freshly
# initialised classification head; seeding torch first keeps the baseline
# metrics repeatable between runs — confirm against models.ModelRetriever.
torch.manual_seed(42)

# Wrap a deep copy so that base_model itself is not the object handed to
# get_full_classification_model.
base_classification_model = get_full_classification_model(deepcopy(base_model))

evaluator = ModelEvaluator(base_classification_model, device=DEVICE)
evaluator.evaluate(test_loader, num_labels=NUM_LABELS, label_names=LABEL_NAMES)
filename = os.path.join(SAVE_PATH, 'base_model.json')
evaluator.save_results(filename)

### Evaluate fine-tuned models

Because fine-tuned models are created by altering the structure of the base model, the way they are saved on the HF platform corresponds to that altered structure together with its weights. That is why it is necessary to first recreate the exact structure of the desired model and only then set its weights.

In [None]:
# Create the output folder up front so a long evaluation run cannot fail at
# the save step because the directory is missing.
os.makedirs(SAVE_PATH, exist_ok=True)

for model_name in MODEL_NAMES:
    # Build each variant on its own deep copy of base_model.
    copy_model = deepcopy(base_model)
    parts = model_name.split('_')

    # The name prefix selects which structure to recreate before the weights
    # are loaded; hyper-parameters are encoded in the name itself.
    if model_name.startswith('full'):
        adjusted_model = get_full_classification_model(copy_model)
    elif model_name.startswith('head'):
        adjusted_model = get_classification_head_model(copy_model)
    elif model_name.startswith('adapters'):
        # 'adapters_inner_dim_<N>_set_<size>' -> N is the 4th token.
        inner_dim = int(parts[3])
        adjusted_model = get_adapters_model(copy_model, inner_dim)
    elif model_name.startswith('lora'):
        # 'lora_r_<R>_alpha_<A>_set_<size>' -> R is the 3rd token, A the 5th.
        r = int(parts[2])
        alpha = int(parts[4])
        adjusted_model = get_lora_model(copy_model, r, alpha)
    else:
        # Fail loudly instead of silently treating an unknown name as LoRA.
        raise ValueError(
            f"Unrecognised model name {model_name!r}: expected a name starting "
            "with 'full', 'head', 'adapters' or 'lora'"
        )

    loaded_model = load_model_from_hf(model_name, adjusted_model)
    evaluator = ModelEvaluator(loaded_model, device=DEVICE)
    evaluator.evaluate(test_loader, num_labels=NUM_LABELS, label_names=LABEL_NAMES)
    filename = os.path.join(SAVE_PATH, f'{model_name}.json')
    evaluator.save_results(filename)