In [None]:
!pip install -U transformers accelerate



In [None]:
!pip -q install -U transformers datasets accelerate evaluate optuna wandb
# If CUDA isn't detected, restart runtime > change runtime type > GPU

In [None]:
!pip install optuna



In [None]:
import os
import json
import time
import glob
import math
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn import Linear
from torch.nn.utils import prune
from torch.quantization import quantize_dynamic

import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    get_scheduler,
    set_seed,
    AutoConfig,
)

from datasets import load_dataset, Dataset as HFDataset
import evaluate

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
)

import wandb
import optuna

import shutil, os


In [None]:
# Set device for training (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


**Load Preprocessed DataFrames**

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Expected Output: Mounted at /content/drive

# Define your base directory where the files are located
# This path should point to the folder containing your data on Google Drive
BASE_DIR = "/content/drive/MyDrive/ADV_DL"

# Define paths to your preprocessed data using the exact file names we specified
DISTILBERT_TRAIN_FILE = f"{BASE_DIR}/distilbert_train.xls"
BERTWEET_TRAIN_FILE   = f"{BASE_DIR}/bertweet_train.xls"
DISTILBERT_TEST_FILE  = f"{BASE_DIR}/distilbert_test.xls"
BERTWEET_TEST_FILE    = f"{BASE_DIR}/bertweet_test.xls"
LENGTH_STATS_FILE     = f"{BASE_DIR}/length_stats.json"

# Load the dataframes and JSON file
try:
    # Load training and test data for DistilBERT and BERTweet.
    # The encoding "ISO-8859-1" is used in exercises
    df_distilbert_train = pd.read_csv(DISTILBERT_TRAIN_FILE, encoding="ISO-8859-1")
    df_bertweet_train   = pd.read_csv(BERTWEET_TRAIN_FILE, encoding="ISO-8859-1")
    df_distilbert_test  = pd.read_csv(DISTILBERT_TEST_FILE, encoding="ISO-8859-1")
    df_bertweet_test    = pd.read_csv(BERTWEET_TEST_FILE, encoding="ISO-8859-1")

    # Load the length statistics from the JSON file
    with open(LENGTH_STATS_FILE, 'r') as f:
        length_stats = json.load(f)

    print("All specified training, test data, and length statistics loaded successfully.")

except FileNotFoundError as e:
    print(f"Error: One or more files not found. Ensure they are in the specified directory ({BASE_DIR}): {e}")
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")

# Display head and shapes for verification of dataframes, and confirm JSON loaded
if 'df_distilbert_train' in locals():
    print("\nDistilBERT Train Data (first 5 rows):")
    print(df_distilbert_train.head())
    print(f"Shape: {df_distilbert_train.shape}")

if 'df_bertweet_train' in locals():
    print("\nBERTweet Train Data (first 5 rows):")
    print(df_bertweet_train.head())
    print(f"Shape: {df_bertweet_train.shape}")

if 'df_distilbert_test' in locals():
    print("\nDistilBERT Test Data (first 5 rows):")
    print(df_distilbert_test.head())
    print(f"Shape: {df_distilbert_test.shape}")

if 'df_bertweet_test' in locals():
    print("\nBERTweet Test Data (first 5 rows):")
    print(df_bertweet_test.head())
    print(f"Shape: {df_bertweet_test.shape}")

if 'length_stats' in locals():
    print("\nLength Statistics (JSON content preview):")
    print(json.dumps(length_stats, indent=2))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All specified training, test data, and length statistics loaded successfully.

DistilBERT Train Data (first 5 rows):
                                          covid_norm  label
0     [USER] [USER] [USER] [URL] and [URL] and [URL]      1
1  advice Talk to your neighbours family to excha...      2
2  covid Australia: Woolworths to give elderly, d...      2
3  My food stock is not the only one which is emp...      2
4  Me, ready to go at supermarket during the covi...      0
Shape: (41157, 2)

BERTweet Train Data (first 5 rows):
                                          covid_norm  label
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...      1
1  advice Talk to your neighbours family to excha...      2
2  covid Australia: Woolworths to give elderly, d...      2
3  My food stock is not the only one which is emp...      2
4  Me, ready to go at supermarket durin

**Create Train and Validation Splits from the loaded full training data**

We split 80% for training and 20% for validation.

stratify=df['label'] ensures that the proportion of labels is maintained in both splits [1, 2].

In [None]:
# --- Create Train and Validation Splits from the loaded full training data ---
print("\nSplitting training data into train and validation sets...")

# DistilBERT splits
train_df_distilbert, val_df_distilbert = train_test_split(
    df_distilbert_train, test_size=0.2, random_state=42, stratify=df_distilbert_train['label']
)

# BERTweet splits
train_df_bertweet, val_df_bertweet = train_test_split(
    df_bertweet_train, test_size=0.2, random_state=42, stratify=df_bertweet_train['label']
)

print(f"DistilBERT Train shape: {train_df_distilbert.shape}, Validation shape: {val_df_distilbert.shape}")
print(f"BERTweet Train shape: {train_df_bertweet.shape}, Validation shape: {val_df_bertweet.shape}")


Splitting training data into train and validation sets...
DistilBERT Train shape: (32925, 2), Validation shape: (8232, 2)
BERTweet Train shape: (32925, 2), Validation shape: (8232, 2)


### Preprocessing and Tokenization

We've converted text data into numerical token IDs for use with two distinct Transformer models: DistilBERT and BERTweet. This process included padding to ensure uniform sequence length and truncation to manage longer texts.

*  **Max Sequence Length (MAX_SEQ_LENGTH):** To make our
code more generic and data-driven, the MAX_SEQ_LENGTH is not hardcoded. Instead, it is dynamically determined by taking the 99th percentile of token lengths from a pre-calculated JSON file and adding a buffer of 5 tokens. This approach ensures that our maximum sequence length is always optimized for the specific data distribution, minimizing unnecessary padding and truncation without significant information loss.

*   **Model-Specific Pipelines:** We developed separate preprocessing pipelines to match the unique characteristics of each model.

  *   **DistilBERT:** As a general-purpose, uncased model, its pipeline normalizes tweet-specific elements. This includes replacing URLs with [URL] and user mentions with [USER], and splitting hashtags like #StayHome into Stay Home to improve the model's understanding of semantic content over tweet-specific syntax.

  *   **BERTweet:** Pre-trained on a vast corpus of tweets, BERTweet is a cased model that natively understands tweet syntax. For this model, we preserved hashtags, mentions, and URLs as they are, as they contain valuable contextual information the model was specifically trained to interpret.

In [None]:
# --- Load max_length from JSON file (correct keys: "distilbert", "bertweet") ---
try:
    length_stats_path = f"{BASE_DIR}/length_stats.json"
    with open(length_stats_path, "r") as f:
        length_stats = json.load(f)
except FileNotFoundError:
    print("Warning: 'length_stats.json' not found. Using safe defaults.")
    # Fallback with the CORRECT top-level keys used by Part A
    length_stats = {
        "distilbert": {"overall": {"p99": 96}},
        "bertweet":   {"overall": {"p99": 96}},
    }

def _get_p99(stats: dict, model_key: str, default_p99: int = 96) -> int:
    """Read overall.p99 for a given model_key, with a safe default."""
    try:
        return int(stats.get(model_key, {}).get("overall", {}).get("p99", default_p99))
    except (TypeError, ValueError):
        return default_p99

# --- Determine max_length for each model based on the 99th percentile ---
# Add a small buffer of 5 tokens; cap at 512
distilbert_max_len_json = _get_p99(length_stats, "distilbert", 96) + 5
bertweet_max_len_json   = _get_p99(length_stats, "bertweet",   96) + 5

distilbert_max_len = min(distilbert_max_len_json, 512)
bertweet_max_len   = min(bertweet_max_len_json,   512)

# --- Define MAX_SEQ_LENGTH (keep single max for both models, as you wanted) ---
MAX_SEQ_LENGTH = max(distilbert_max_len, bertweet_max_len)

print(f"DistilBERT max_length from JSON (p99+5, ≤512): {distilbert_max_len}")
print(f"BERTweet   max_length from JSON (p99+5, ≤512): {bertweet_max_len}")
print(f"Using unified MAX_SEQ_LENGTH for tokenization: {MAX_SEQ_LENGTH}")

# --- Tokenization ---
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# NOTE: BERTweet tokenizer often requires use_fast=False depending on version.
bertweet_tokenizer   = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

def tokenize_data(df, tokenizer, text_column, max_length):
    return tokenizer(
        df[text_column].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

print("\nTokenizing DistilBERT data...")
distilbert_train_encodings = tokenize_data(train_df_distilbert, distilbert_tokenizer, "covid_norm", MAX_SEQ_LENGTH)
val_distilbert_encodings   = tokenize_data(val_df_distilbert,   distilbert_tokenizer, "covid_norm", MAX_SEQ_LENGTH)
distilbert_test_encodings  = tokenize_data(df_distilbert_test,  distilbert_tokenizer, "covid_norm", MAX_SEQ_LENGTH)

print("Tokenizing BERTweet data...")
bertweet_train_encodings = tokenize_data(train_df_bertweet, bertweet_tokenizer, "covid_norm", MAX_SEQ_LENGTH)
val_bertweet_encodings   = tokenize_data(val_df_bertweet,   bertweet_tokenizer, "covid_norm", MAX_SEQ_LENGTH)
bertweet_test_encodings  = tokenize_data(df_bertweet_test,  bertweet_tokenizer, "covid_norm", MAX_SEQ_LENGTH)

print("Tokenization complete for all datasets.")


DistilBERT max_length from JSON (p99+5, ≤512): 106
BERTweet   max_length from JSON (p99+5, ≤512): 93
Using unified MAX_SEQ_LENGTH for tokenization: 106


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0



Tokenizing DistilBERT data...
Tokenizing BERTweet data...
Tokenization complete for all datasets.


**Create Custom PyTorch Dataset Classes**

To use the tokenized data with PyTorch's DataLoader and Hugging Face's Trainer, you'll need to create a custom Dataset class. This class will provide the tokenized inputs (input_ids, attention_mask) and the corresponding labels.

In [None]:
# --- Custom PyTorch Dataset ---
class TweetSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return a dictionary of input_ids, attention_mask, and label
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
df_distilbert_train.head(5)

Unnamed: 0,covid_norm,label
0,[USER] [USER] [USER] [URL] and [URL] and [URL],1
1,advice Talk to your neighbours family to excha...,2
2,"covid Australia: Woolworths to give elderly, d...",2
3,My food stock is not the only one which is emp...,2
4,"Me, ready to go at supermarket during the covi...",0


**Instantiate Custom PyTorch Datasets**

In [None]:
# --- Instantiate Custom PyTorch Datasets ---
print("\nCreating custom PyTorch Dataset instances for all splits...")

# DistilBERT datasets
train_dataset_distilbert = TweetSentimentDataset(distilbert_train_encodings, train_df_distilbert['label'].tolist())
val_dataset_distilbert = TweetSentimentDataset(val_distilbert_encodings, val_df_distilbert['label'].tolist())
test_dataset_distilbert = TweetSentimentDataset(distilbert_test_encodings, df_distilbert_test['label'].tolist())

# BERTweet datasets
train_dataset_bertweet = TweetSentimentDataset(bertweet_train_encodings, train_df_bertweet['label'].tolist())
val_dataset_bertweet = TweetSentimentDataset(val_bertweet_encodings, val_df_bertweet['label'].tolist())
test_dataset_bertweet = TweetSentimentDataset(bertweet_test_encodings, df_bertweet_test['label'].tolist())

print("Custom PyTorch Datasets created successfully.")

# Optional: Print sizes to verify
print(f"DistilBERT Train Dataset size: {len(train_dataset_distilbert)}")
print(f"DistilBERT Validation Dataset size: {len(val_dataset_distilbert)}")
print(f"DistilBERT Test Dataset size: {len(test_dataset_distilbert)}")
print(f"BERTweet Train Dataset size: {len(train_dataset_bertweet)}")
print(f"BERTweet Validation Dataset size: {len(val_dataset_bertweet)}")
print(f"BERTweet Test Dataset size: {len(test_dataset_bertweet)}")


Creating custom PyTorch Dataset instances for all splits...
Custom PyTorch Datasets created successfully.
DistilBERT Train Dataset size: 32925
DistilBERT Validation Dataset size: 8232
DistilBERT Test Dataset size: 3798
BERTweet Train Dataset size: 32925
BERTweet Validation Dataset size: 8232
BERTweet Test Dataset size: 3798


**Create PyTorch s**

We will now create PyTorch DataLoaders. DataLoaders are essential for:

• **Batching:** Grouping individual data samples into mini-batches, which is necessary for efficient training on GPUs and stable gradient updates.

• **Shuffling:** Randomizing the order of samples in each epoch (for training data) to prevent the model from learning the order of the data.

• **Parallel Loading:** Loading data in parallel using multiple worker processes, which speeds up data fetching.

We will create separate DataLoaders for training, validation, and test datasets for both DistilBERT and BERTweet. We will pay attention to shuffle=True for training data and shuffle=False for validation and test data.

The batch_size is a hyperparameter we will likely tune later.

In [None]:
# A reasonable starting batch size. This is a hyperparameter often tuned later.
BATCH_SIZE = 32

print(f"\nCreating PyTorch DataLoaders with batch_size: {BATCH_SIZE}...")

# DistilBERT DataLoaders
train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=True)
val_loader_distilbert = DataLoader(val_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=False)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=BATCH_SIZE, shuffle=False)

# BERTweet DataLoaders
train_loader_bertweet = DataLoader(train_dataset_bertweet, batch_size=BATCH_SIZE, shuffle=True)
val_loader_bertweet = DataLoader(val_dataset_bertweet, batch_size=BATCH_SIZE, shuffle=False)
test_loader_bertweet = DataLoader(test_dataset_bertweet, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoaders created successfully.")

# Optional: Print sizes to verify
print(f"DistilBERT Train DataLoader batches: {len(train_loader_distilbert)}")
print(f"DistilBERT Validation DataLoader batches: {len(val_loader_distilbert)}")
print(f"DistilBERT Test DataLoader batches: {len(test_loader_distilbert)}")
print(f"BERTweet Train DataLoader batches: {len(train_loader_bertweet)}")
print(f"BERTweet Validation DataLoader batches: {len(val_loader_bertweet)}")
print(f"BERTweet Test DataLoader batches: {len(test_loader_bertweet)}")


Creating PyTorch DataLoaders with batch_size: 32...
DataLoaders created successfully.
DistilBERT Train DataLoader batches: 1029
DistilBERT Validation DataLoader batches: 258
DistilBERT Test DataLoader batches: 119
BERTweet Train DataLoader batches: 1029
BERTweet Validation DataLoader batches: 258
BERTweet Test DataLoader batches: 119


**Load Pre-trained Models**

We will need to load the pre-trained Transformer models (DistilBertForSequenceClassification and RobertaForSequenceClassification for BERTweet, as BERTweet is a RoBERTa-style model) that we intend to fine-tune.

These models are designed for sequence classification tasks and will be initialized with pre-trained weights, except for the newly added classification head, which will be randomly initialized

In [None]:


print("\nLoading pre-trained DistilBERT and BERTweet models...")

# Number of labels for sentiment classification task (negative, neutral, positive)
NUM_LABELS = 3

# Load DistilBERT model
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_LABELS
).to(device) # Move model to GPU/CPU [11]

# Load BERTweet model
# BERTweet is a RoBERTa-style model, so you load RobertaForSequenceClassification [7, 8]
bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=NUM_LABELS
).to(device) # Move model to GPU/CPU [11]

print("Models loaded successfully.")
print("DistilBERT model structure (first few layers):")
print(distilbert_model)
print("\nBERTweet model structure (first few layers):")
print(bertweet_model)


Loading pre-trained DistilBERT and BERTweet models...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Models loaded successfully.
DistilBERT model structure (first few layers):
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


define the early_stop_check function:

In [None]:
# Helper function for early stopping logic [6]
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    early_stop_flag = False
    if current_val_accuracy > best_val_accuracy:
        best_val_accuracy = current_val_accuracy
        best_val_accuracy_epoch = current_val_accuracy_epoch
    else:
        # Check if the current epoch is beyond the patience window relative to the best epoch
        if current_val_accuracy_epoch - best_val_accuracy_epoch > patience:
            early_stop_flag = True
    return best_val_accuracy, best_val_accuracy_epoch, early_stop_flag

print("\nEarly stopping utility function defined.")

# Training Function

Training and validation loop that:  
- Trains the model over 10 epochs, computing loss and accuracy each step.  
- Evaluates on the validation set each epoch and calculates additional metrics (precision, recall, F1).  
- Tracks the best validation accuracy, saves the best model state, and supports early stopping with a patience parameter.  
- Integrates with Optuna trials (saving per-trial best models) and Weights & Biases for experiment logging.  



In [None]:
# Main training and validation loop function [2]
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    # Initialize variables for tracking best performance and early stopping [2]
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None # To save the state dict of the best performing model [8]

    print(f"\nStarting training for trial {trial.number if trial else 'N/A'} for {epochs} epochs with patience {patience}...")

    # Loop through each epoch [2]
    for epoch in range(1, epochs + 1):
        # --- Training loop ---
        model.train() # Set model to training mode [2]
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        # Iterate over batches from the training DataLoader [2]
        for batch in train_loader:
            # Move input tensors to the specified device (GPU/CPU) [2]
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad() # Clear previously computed gradients [9]
            outputs = model(input_ids, attention_mask=attention_mask) # Perform forward pass [9]
            logits = outputs.logits # Get the raw output logits from the model [9]
            loss = criterion(logits, labels) # Calculate the loss using the defined criterion [9]

            loss.backward() # Perform backward pass to compute gradients [9]
            optimizer.step() # Update model weights using the optimizer [9]

            # Accumulate training loss and correct predictions [9]
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        # --- Validation loop ---
        model.eval() # Set model to evaluation mode [10]
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels = [] # To store all true labels for metric calculation [10]
        all_val_preds = [] # To store all predicted labels for metric calculation [10]

        with torch.no_grad(): # Disable gradient computation for validation [10]
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device) # [11]

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (logits.argmax(dim=1) == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(logits.argmax(dim=1).cpu().numpy())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples

        # For our 3-class sentiment analysis, we might need to adjust 'average' or calculate macro/weighted F1
        # from sklearn.metrics import precision_score, recall_score, f1_score # These imports are usually at the top
        try: # Use a try-except block to handle cases where precision/recall/f1 might fail for specific label distributions
            # For multi-class classification, 'average' parameter is crucial. 'None' returns scores per class.
            # 'macro' computes metrics independently for each class and then takes the unweighted mean.
            # 'weighted' computes metrics for each class and then takes the mean weighted by support.
            # Given your three classes (negative, neutral, positive), 'weighted' or 'macro' are common.
            val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
            val_recall = recall_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
            val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        except Exception as e:
            print(f"Warning: Could not compute advanced metrics for epoch {epoch}: {e}")
            val_precision, val_recall, val_f1 = np.nan, np.nan, np.nan


        # Check for early stopping and update best model state [7, 8]
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        if val_accuracy >= best_val_accuracy: # Save the model state if current accuracy is the best or equal [8]
            # Use deepcopy if you plan to modify the model after saving, otherwise state_dict() is fine
            best_model_state = model.state_dict()
            if trial: # Only save if part of an Optuna trial [3]
                torch.save(best_model_state, f"best_model_trial_{trial.number}.pt") # Save as .pt file [3]

        # Log metrics to Weights & Biases for the current epoch [8]
        if wandb.run: # Only log if wandb is initialized
            wandb.log({
                "Epoch": epoch,
                "Train Loss": train_loss,
                "Train Accuracy": train_accuracy,
                "Validation Loss": val_loss,
                "Validation Accuracy": val_accuracy,
                "Validation Precision": val_precision,
                "Validation Recall": val_recall,
                "Validation F1": val_f1
            })

        print(f"Epoch {epoch}/{epochs} | Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        if early_stop_flag: # Exit training loop if early stopping condition is met [3]
            print(f"Early stopping triggered at epoch {epoch}.")
            break

    # Load the best model state back into the model if it was saved [3]
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    print(f"Training complete. Best Validation Accuracy: {best_val_accuracy:.4f} at epoch {best_val_accuracy_epoch}.")
    return best_val_accuracy # Return the best validation accuracy (for Optuna maximization) [3]

print("\nTraining and evaluation loop function 'train_model_with_hyperparams' defined.")



Training and evaluation loop function 'train_model_with_hyperparams' defined.


#### Hyperparameter Search Space (Optuna)

We use `log=True` when sampling learning rate and weight decay.  
This makes Optuna draw values on a logarithmic scale, which is more appropriate for parameters that span several orders of magnitude.  
- `learning_rate ∈ [1e-5, 1e-3]` (log scale)  
- `weight_decay ∈ [1e-6, 1e-4]` (log scale)  
Other parameters (`patience`, `num_layers`, `batch_size`) are searched on linear or categorical spaces.


**Handling Class Imbalance**

e implemented a class-weighted loss function within the Optuna function to handle the imbalanced sentiment labels identified in Part A. This technique assigns a higher penalty to misclassifications of under-represented classes, forcing the model to pay more attention to them during training.

## BERTweet

In [None]:
# Define the objective function for Optuna
def objective(trial):

    # Hyperparameter suggestions for tuning
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience      = trial.suggest_int("patience",6 ,7)
    batch_size    = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_layers    = trial.suggest_int("num_layers", 1, 3)

    # Datasets / loaders (uses your existing encodings & splits)
    train_dataset = TweetSentimentDataset(bertweet_train_encodings, train_df_bertweet['label'].tolist())
    val_dataset   = TweetSentimentDataset(val_bertweet_encodings,   val_df_bertweet['label'].tolist())

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

    # Model
    model = AutoModelForSequenceClassification.from_pretrained(
        'vinai/bertweet-base', num_labels=3
    ).to(device)

    # Freeze base; unfreeze last `num_layers` + classifier
    for p in model.roberta.parameters():
        p.requires_grad = False
    for p in model.roberta.encoder.layer[-num_layers:].parameters():
        p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # ---------- imbalance handling: weighted loss ----------
    classes = np.array([0, 1, 2], dtype=int)
    y_train = np.array(train_df_bertweet['label'].tolist(), dtype=int)
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    class_weights = torch.tensor(weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    # -------------------------------------------------------

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # W&B
    wandb.init(
        project="tweet-sentiment-finetuning",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "architecture": "BERTweet (RoBERTa-style)",
            "dataset": "COVID-19-tweets-standardized"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    # Train/eval loop (your function)
    best_val_accuracy = train_model_with_hyperparams(
        model, train_loader, val_loader, optimizer, criterion,
        epochs=10, patience=patience, trial=trial
    )

    wandb.finish()
    return best_val_accuracy


In [None]:
# Optuna Study
study = optuna.create_study(direction="maximize")  # Specify maximizing the best_val_accuracy
study.optimize(objective, n_trials=13)

[I 2025-08-17 21:21:29,154] A new study created in memory with name: no-name-06b2d4ed-b0cd-4d38-9492-73a192dd9bef
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mnogapaz98[0m ([33mnogapaz98-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



Starting training for trial 0 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7480, Train Acc: 0.6763 | Val Loss: 0.6145, Val Acc: 0.7702, Val F1: 0.7709


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5464, Train Acc: 0.7939 | Val Loss: 0.5592, Val Acc: 0.7907, Val F1: 0.7901


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4646, Train Acc: 0.8316 | Val Loss: 0.5085, Val Acc: 0.8290, Val F1: 0.8287


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.3930, Train Acc: 0.8593 | Val Loss: 0.5068, Val Acc: 0.8293, Val F1: 0.8306


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3391, Train Acc: 0.8838 | Val Loss: 0.5247, Val Acc: 0.8431, Val F1: 0.8427


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.2965, Train Acc: 0.9021 | Val Loss: 0.4915, Val Acc: 0.8439, Val F1: 0.8437


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2625, Train Acc: 0.9132 | Val Loss: 0.5039, Val Acc: 0.8394, Val F1: 0.8388


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2468, Train Acc: 0.9175 | Val Loss: 0.5492, Val Acc: 0.8236, Val F1: 0.8242


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2256, Train Acc: 0.9259 | Val Loss: 0.5648, Val Acc: 0.8197, Val F1: 0.8188


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1915, Train Acc: 0.9351 | Val Loss: 0.6421, Val Acc: 0.8437, Val F1: 0.8428
Training complete. Best Validation Accuracy: 0.8439 at epoch 6.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▇▇▇███
Train Loss,█▅▄▄▃▂▂▂▁▁
Validation Accuracy,▁▃▇▇███▆▆█
Validation F1,▁▃▇▇███▆▆█
Validation Loss,▇▄▂▂▃▁▂▄▄█
Validation Precision,▁▄▇▇███▆▆█
Validation Recall,▁▃▇▇███▆▆█

0,1
Epoch,10.0
Train Accuracy,0.93509
Train Loss,0.19147
Validation Accuracy,0.84366
Validation F1,0.84283
Validation Loss,0.6421
Validation Precision,0.84424
Validation Recall,0.84366


[I 2025-08-17 22:08:48,648] Trial 0 finished with value: 0.8439018464528668 and parameters: {'learning_rate': 0.0005067099595408021, 'weight_decay': 4.382250702866951e-05, 'patience': 6, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 1 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7181, Train Acc: 0.6967 | Val Loss: 0.5897, Val Acc: 0.7760, Val F1: 0.7770


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5375, Train Acc: 0.7910 | Val Loss: 0.5960, Val Acc: 0.7947, Val F1: 0.7945


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4540, Train Acc: 0.8297 | Val Loss: 0.5182, Val Acc: 0.8056, Val F1: 0.8078


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.3886, Train Acc: 0.8587 | Val Loss: 0.5012, Val Acc: 0.8192, Val F1: 0.8204


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3272, Train Acc: 0.8806 | Val Loss: 0.5593, Val Acc: 0.8293, Val F1: 0.8293


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.2819, Train Acc: 0.8996 | Val Loss: 0.5515, Val Acc: 0.8310, Val F1: 0.8311


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2387, Train Acc: 0.9126 | Val Loss: 0.6613, Val Acc: 0.8310, Val F1: 0.8307


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.1994, Train Acc: 0.9292 | Val Loss: 0.6694, Val Acc: 0.8140, Val F1: 0.8149


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1665, Train Acc: 0.9400 | Val Loss: 0.7870, Val Acc: 0.8279, Val F1: 0.8272


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1348, Train Acc: 0.9524 | Val Loss: 0.6918, Val Acc: 0.8310, Val F1: 0.8312
Training complete. Best Validation Accuracy: 0.8310 at epoch 6.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▃▅▇███▆██
Validation F1,▁▃▅▇███▆▇█
Validation Loss,▃▃▁▁▂▂▅▅█▆
Validation Precision,▁▅▆▇███▇▇█
Validation Recall,▁▃▅▇███▆██

0,1
Epoch,10.0
Train Accuracy,0.95244
Train Loss,0.13484
Validation Accuracy,0.83103
Validation F1,0.83119
Validation Loss,0.69175
Validation Precision,0.83158
Validation Recall,0.83103


[I 2025-08-17 22:58:25,777] Trial 1 finished with value: 0.831025267249757 and parameters: {'learning_rate': 9.099659928275238e-05, 'weight_decay': 2.7953261830148993e-06, 'patience': 6, 'batch_size': 64, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 2 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7858, Train Acc: 0.6548 | Val Loss: 0.6590, Val Acc: 0.7457, Val F1: 0.7472


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.6114, Train Acc: 0.7559 | Val Loss: 0.6024, Val Acc: 0.7594, Val F1: 0.7596


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5384, Train Acc: 0.7937 | Val Loss: 0.5864, Val Acc: 0.7724, Val F1: 0.7726


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4840, Train Acc: 0.8160 | Val Loss: 0.5554, Val Acc: 0.8015, Val F1: 0.8021


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4352, Train Acc: 0.8344 | Val Loss: 0.5884, Val Acc: 0.8015, Val F1: 0.8005


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3902, Train Acc: 0.8538 | Val Loss: 0.5406, Val Acc: 0.8173, Val F1: 0.8176


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3489, Train Acc: 0.8706 | Val Loss: 0.5771, Val Acc: 0.8139, Val F1: 0.8134


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.3048, Train Acc: 0.8863 | Val Loss: 0.5907, Val Acc: 0.8128, Val F1: 0.8138


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2713, Train Acc: 0.9000 | Val Loss: 0.6192, Val Acc: 0.8120, Val F1: 0.8130


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.2415, Train Acc: 0.9107 | Val Loss: 0.6776, Val Acc: 0.8198, Val F1: 0.8194
Training complete. Best Validation Accuracy: 0.8198 at epoch 10.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▆▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▂▄▆▆█▇▇▇█
Validation F1,▁▂▃▆▆█▇▇▇█
Validation Loss,▇▄▃▂▃▁▃▄▅█
Validation Precision,▁▄▆▆▇█████
Validation Recall,▁▂▄▆▆█▇▇▇█

0,1
Epoch,10.0
Train Accuracy,0.91074
Train Loss,0.24146
Validation Accuracy,0.81985
Validation F1,0.81943
Validation Loss,0.67764
Validation Precision,0.81922
Validation Recall,0.81985


[I 2025-08-17 23:48:44,203] Trial 2 finished with value: 0.8198493683187561 and parameters: {'learning_rate': 3.0049509402547545e-05, 'weight_decay': 1.4035442108174199e-06, 'patience': 6, 'batch_size': 64, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 3 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7353, Train Acc: 0.6886 | Val Loss: 0.6460, Val Acc: 0.7471, Val F1: 0.7476


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5755, Train Acc: 0.7738 | Val Loss: 0.5773, Val Acc: 0.7932, Val F1: 0.7938


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4969, Train Acc: 0.8120 | Val Loss: 0.5742, Val Acc: 0.7980, Val F1: 0.7973


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4308, Train Acc: 0.8418 | Val Loss: 0.5313, Val Acc: 0.8031, Val F1: 0.8048


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3706, Train Acc: 0.8653 | Val Loss: 0.5738, Val Acc: 0.8062, Val F1: 0.8070


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3152, Train Acc: 0.8859 | Val Loss: 0.5820, Val Acc: 0.8222, Val F1: 0.8221


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2692, Train Acc: 0.9033 | Val Loss: 0.5891, Val Acc: 0.8121, Val F1: 0.8143


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2302, Train Acc: 0.9193 | Val Loss: 0.5988, Val Acc: 0.8000, Val F1: 0.8049


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1962, Train Acc: 0.9318 | Val Loss: 0.6524, Val Acc: 0.8024, Val F1: 0.8041


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1669, Train Acc: 0.9405 | Val Loss: 0.6646, Val Acc: 0.8186, Val F1: 0.8193
Training complete. Best Validation Accuracy: 0.8222 at epoch 6.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▆▇▇██
Train Loss,█▆▅▄▄▃▂▂▁▁
Validation Accuracy,▁▅▆▆▇█▇▆▆█
Validation F1,▁▅▆▆▇█▇▆▆█
Validation Loss,▇▃▃▁▃▄▄▅▇█
Validation Precision,▁▄▅▆▇█▇▇▆▇
Validation Recall,▁▅▆▆▇█▇▆▆█

0,1
Epoch,10.0
Train Accuracy,0.94047
Train Loss,0.16692
Validation Accuracy,0.81863
Validation F1,0.81929
Validation Loss,0.6646
Validation Precision,0.82093
Validation Recall,0.81863


[I 2025-08-18 00:33:34,822] Trial 3 finished with value: 0.8221574344023324 and parameters: {'learning_rate': 0.00013709512423518383, 'weight_decay': 4.637652847378576e-06, 'patience': 7, 'batch_size': 32, 'num_layers': 2}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 4 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7098, Train Acc: 0.7035 | Val Loss: 0.5949, Val Acc: 0.7821, Val F1: 0.7818


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5371, Train Acc: 0.7959 | Val Loss: 0.5615, Val Acc: 0.7919, Val F1: 0.7926


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4567, Train Acc: 0.8292 | Val Loss: 0.5135, Val Acc: 0.8211, Val F1: 0.8207


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.3906, Train Acc: 0.8571 | Val Loss: 0.5103, Val Acc: 0.8158, Val F1: 0.8162


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3311, Train Acc: 0.8812 | Val Loss: 0.5185, Val Acc: 0.8192, Val F1: 0.8203


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.2853, Train Acc: 0.8979 | Val Loss: 0.5926, Val Acc: 0.8218, Val F1: 0.8222


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2459, Train Acc: 0.9137 | Val Loss: 0.6080, Val Acc: 0.8207, Val F1: 0.8219


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2087, Train Acc: 0.9267 | Val Loss: 0.6627, Val Acc: 0.8349, Val F1: 0.8345


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1735, Train Acc: 0.9378 | Val Loss: 0.6708, Val Acc: 0.8308, Val F1: 0.8309


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1411, Train Acc: 0.9495 | Val Loss: 0.6648, Val Acc: 0.8228, Val F1: 0.8240
Training complete. Best Validation Accuracy: 0.8349 at epoch 8.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▂▆▅▆▆▆█▇▆
Validation F1,▁▂▆▆▆▆▆██▇
Validation Loss,▅▃▁▁▁▅▅███
Validation Precision,▁▄▆▇▇▇▇██▇
Validation Recall,▁▂▆▅▆▆▆█▇▆

0,1
Epoch,10.0
Train Accuracy,0.94949
Train Loss,0.14111
Validation Accuracy,0.82276
Validation F1,0.82395
Validation Loss,0.66485
Validation Precision,0.82934
Validation Recall,0.82276


[I 2025-08-18 01:23:57,014] Trial 4 finished with value: 0.8349125364431487 and parameters: {'learning_rate': 6.416833262371767e-05, 'weight_decay': 1.1950671454056966e-05, 'patience': 7, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 5 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.9598, Train Acc: 0.5427 | Val Loss: 0.8214, Val Acc: 0.6586, Val F1: 0.6599


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.7856, Train Acc: 0.6744 | Val Loss: 0.7497, Val Acc: 0.7136, Val F1: 0.7162


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.6968, Train Acc: 0.7206 | Val Loss: 0.6921, Val Acc: 0.7335, Val F1: 0.7355


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.6562, Train Acc: 0.7398 | Val Loss: 0.6875, Val Acc: 0.7539, Val F1: 0.7478


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.6152, Train Acc: 0.7576 | Val Loss: 0.6317, Val Acc: 0.7606, Val F1: 0.7626


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.5936, Train Acc: 0.7665 | Val Loss: 0.6679, Val Acc: 0.7546, Val F1: 0.7549


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.5755, Train Acc: 0.7778 | Val Loss: 0.6285, Val Acc: 0.7697, Val F1: 0.7682


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.5575, Train Acc: 0.7859 | Val Loss: 0.5945, Val Acc: 0.7720, Val F1: 0.7741


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.5456, Train Acc: 0.7891 | Val Loss: 0.6077, Val Acc: 0.7792, Val F1: 0.7790


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.5288, Train Acc: 0.7999 | Val Loss: 0.5955, Val Acc: 0.7666, Val F1: 0.7694
Training complete. Best Validation Accuracy: 0.7792 at epoch 9.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▅▆▆▇▇▇███
Train Loss,█▅▄▃▂▂▂▁▁▁
Validation Accuracy,▁▄▅▇▇▇▇██▇
Validation F1,▁▄▅▆▇▇▇██▇
Validation Loss,█▆▄▄▂▃▂▁▁▁
Validation Precision,▁▄▆▆▇▆▇███
Validation Recall,▁▄▅▇▇▇▇██▇

0,1
Epoch,10.0
Train Accuracy,0.79991
Train Loss,0.52878
Validation Accuracy,0.76664
Validation F1,0.76938
Validation Loss,0.59546
Validation Precision,0.78366
Validation Recall,0.76664


[I 2025-08-18 02:14:31,238] Trial 5 finished with value: 0.7791545189504373 and parameters: {'learning_rate': 0.0004976043155941766, 'weight_decay': 1.3180146810801098e-05, 'patience': 6, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 6 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.8168, Train Acc: 0.6332 | Val Loss: 0.6935, Val Acc: 0.7267, Val F1: 0.7271


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.6726, Train Acc: 0.7209 | Val Loss: 0.6341, Val Acc: 0.7484, Val F1: 0.7490


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.6193, Train Acc: 0.7463 | Val Loss: 0.6127, Val Acc: 0.7670, Val F1: 0.7681


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.5759, Train Acc: 0.7677 | Val Loss: 0.6013, Val Acc: 0.7609, Val F1: 0.7630


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.5327, Train Acc: 0.7856 | Val Loss: 0.5931, Val Acc: 0.7751, Val F1: 0.7775


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.4781, Train Acc: 0.8101 | Val Loss: 0.6220, Val Acc: 0.7768, Val F1: 0.7770


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.4140, Train Acc: 0.8347 | Val Loss: 0.6610, Val Acc: 0.7788, Val F1: 0.7785


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.3519, Train Acc: 0.8595 | Val Loss: 0.7324, Val Acc: 0.7708, Val F1: 0.7706


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2997, Train Acc: 0.8799 | Val Loss: 0.7669, Val Acc: 0.7691, Val F1: 0.7700


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.2501, Train Acc: 0.9007 | Val Loss: 0.8358, Val Acc: 0.7767, Val F1: 0.7764
Training complete. Best Validation Accuracy: 0.7788 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▅▆▆▇▇█
Train Loss,█▆▆▅▄▄▃▂▂▁
Validation Accuracy,▁▄▆▆███▇▇█
Validation F1,▁▄▇▆███▇▇█
Validation Loss,▄▂▂▁▁▂▃▅▆█
Validation Precision,▁▄▆▇█▇▇▆▇▇
Validation Recall,▁▄▆▆███▇▇█

0,1
Epoch,10.0
Train Accuracy,0.90074
Train Loss,0.25011
Validation Accuracy,0.77672
Validation F1,0.77643
Validation Loss,0.83584
Validation Precision,0.77638
Validation Recall,0.77672


[I 2025-08-18 02:54:55,475] Trial 6 finished with value: 0.7787900874635568 and parameters: {'learning_rate': 0.00015189462596156502, 'weight_decay': 1.5405250215890978e-05, 'patience': 7, 'batch_size': 64, 'num_layers': 1}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 7 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7330, Train Acc: 0.6888 | Val Loss: 0.6213, Val Acc: 0.7479, Val F1: 0.7519


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5732, Train Acc: 0.7754 | Val Loss: 0.5804, Val Acc: 0.7920, Val F1: 0.7926


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4924, Train Acc: 0.8139 | Val Loss: 0.5968, Val Acc: 0.7949, Val F1: 0.7942


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4259, Train Acc: 0.8405 | Val Loss: 0.5515, Val Acc: 0.8086, Val F1: 0.8095


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3741, Train Acc: 0.8650 | Val Loss: 0.6046, Val Acc: 0.8044, Val F1: 0.8039


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3232, Train Acc: 0.8867 | Val Loss: 0.5601, Val Acc: 0.8134, Val F1: 0.8124


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2725, Train Acc: 0.9035 | Val Loss: 0.5687, Val Acc: 0.8168, Val F1: 0.8170


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2378, Train Acc: 0.9153 | Val Loss: 0.6232, Val Acc: 0.8145, Val F1: 0.8159


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2017, Train Acc: 0.9301 | Val Loss: 0.6712, Val Acc: 0.8166, Val F1: 0.8176


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1731, Train Acc: 0.9381 | Val Loss: 0.7078, Val Acc: 0.8207, Val F1: 0.8200
Training complete. Best Validation Accuracy: 0.8207 at epoch 10.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▅▅▆▇▇▇██
Train Loss,█▆▅▄▄▃▂▂▁▁
Validation Accuracy,▁▅▆▇▆▇█▇██
Validation F1,▁▅▅▇▆▇████
Validation Loss,▄▂▃▁▃▁▂▄▆█
Validation Precision,▁▅▅▇▇▇████
Validation Recall,▁▅▆▇▆▇█▇██

0,1
Epoch,10.0
Train Accuracy,0.93807
Train Loss,0.1731
Validation Accuracy,0.8207
Validation F1,0.82
Validation Loss,0.70782
Validation Precision,0.82116
Validation Recall,0.8207


[I 2025-08-18 03:38:23,421] Trial 7 finished with value: 0.8206997084548106 and parameters: {'learning_rate': 0.0003979053776068072, 'weight_decay': 7.323770717077471e-06, 'patience': 6, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 8 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.9679, Train Acc: 0.5282 | Val Loss: 0.8489, Val Acc: 0.6029, Val F1: 0.6097


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.8140, Train Acc: 0.6380 | Val Loss: 0.7783, Val Acc: 0.6544, Val F1: 0.6587


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.7491, Train Acc: 0.6793 | Val Loss: 0.7390, Val Acc: 0.6952, Val F1: 0.6968


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.7057, Train Acc: 0.7008 | Val Loss: 0.7098, Val Acc: 0.7089, Val F1: 0.7109


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.6744, Train Acc: 0.7198 | Val Loss: 0.6799, Val Acc: 0.7247, Val F1: 0.7268


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.6472, Train Acc: 0.7351 | Val Loss: 0.6620, Val Acc: 0.7359, Val F1: 0.7381


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.6257, Train Acc: 0.7457 | Val Loss: 0.6492, Val Acc: 0.7404, Val F1: 0.7433


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.6085, Train Acc: 0.7531 | Val Loss: 0.6366, Val Acc: 0.7496, Val F1: 0.7510


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.5896, Train Acc: 0.7627 | Val Loss: 0.6311, Val Acc: 0.7479, Val F1: 0.7514


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.5676, Train Acc: 0.7749 | Val Loss: 0.6265, Val Acc: 0.7575, Val F1: 0.7591
Training complete. Best Validation Accuracy: 0.7575 at epoch 10.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▃▅▆▇▇▇███
Validation F1,▁▃▅▆▆▇▇███
Validation Loss,█▆▅▄▃▂▂▁▁▁
Validation Precision,▁▃▄▅▆▇▇▇██
Validation Recall,▁▃▅▆▇▇▇███

0,1
Epoch,10.0
Train Accuracy,0.77494
Train Loss,0.56757
Validation Accuracy,0.75753
Validation F1,0.7591
Validation Loss,0.62652
Validation Precision,0.7653
Validation Recall,0.75753


[I 2025-08-18 04:23:24,631] Trial 8 finished with value: 0.7575315840621963 and parameters: {'learning_rate': 1.2511324122289102e-05, 'weight_decay': 8.201997235370834e-05, 'patience': 7, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 9 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7412, Train Acc: 0.6855 | Val Loss: 0.6115, Val Acc: 0.7670, Val F1: 0.7680


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5881, Train Acc: 0.7701 | Val Loss: 0.5808, Val Acc: 0.7741, Val F1: 0.7750


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5309, Train Acc: 0.7976 | Val Loss: 0.6160, Val Acc: 0.7885, Val F1: 0.7874


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4817, Train Acc: 0.8214 | Val Loss: 0.5705, Val Acc: 0.8014, Val F1: 0.8024


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4497, Train Acc: 0.8331 | Val Loss: 0.6225, Val Acc: 0.7990, Val F1: 0.7984


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.4127, Train Acc: 0.8515 | Val Loss: 0.6305, Val Acc: 0.7994, Val F1: 0.8000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3821, Train Acc: 0.8625 | Val Loss: 0.5845, Val Acc: 0.8149, Val F1: 0.8145


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.3601, Train Acc: 0.8720 | Val Loss: 0.5891, Val Acc: 0.8082, Val F1: 0.8085


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.3353, Train Acc: 0.8813 | Val Loss: 0.5670, Val Acc: 0.8127, Val F1: 0.8121


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.3054, Train Acc: 0.8932 | Val Loss: 0.5860, Val Acc: 0.8191, Val F1: 0.8189
Training complete. Best Validation Accuracy: 0.8191 at epoch 10.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▂▄▆▅▅▇▇▇█
Validation F1,▁▂▄▆▅▅▇▇▇█
Validation Loss,▆▃▆▁▇█▃▃▁▃
Validation Precision,▁▃▃▆▆▅▇▇▇█
Validation Recall,▁▂▄▆▅▅▇▇▇█

0,1
Epoch,10.0
Train Accuracy,0.89318
Train Loss,0.30544
Validation Accuracy,0.81912
Validation F1,0.81885
Validation Loss,0.58604
Validation Precision,0.81872
Validation Recall,0.81912


[I 2025-08-18 05:08:23,968] Trial 9 finished with value: 0.8191205053449951 and parameters: {'learning_rate': 0.0005270646678704237, 'weight_decay': 1.1391038609191056e-05, 'patience': 6, 'batch_size': 64, 'num_layers': 2}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 10 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7823, Train Acc: 0.6543 | Val Loss: 0.6531, Val Acc: 0.7206, Val F1: 0.7250


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.6371, Train Acc: 0.7415 | Val Loss: 0.6142, Val Acc: 0.7609, Val F1: 0.7622


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5901, Train Acc: 0.7662 | Val Loss: 0.5978, Val Acc: 0.7670, Val F1: 0.7683


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.5624, Train Acc: 0.7767 | Val Loss: 0.5914, Val Acc: 0.7666, Val F1: 0.7691


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.5382, Train Acc: 0.7883 | Val Loss: 0.6017, Val Acc: 0.7840, Val F1: 0.7838


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.5217, Train Acc: 0.7932 | Val Loss: 0.5847, Val Acc: 0.7863, Val F1: 0.7866


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.5009, Train Acc: 0.8014 | Val Loss: 0.5863, Val Acc: 0.7913, Val F1: 0.7913


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.4804, Train Acc: 0.8114 | Val Loss: 0.6049, Val Acc: 0.7868, Val F1: 0.7870


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.4621, Train Acc: 0.8138 | Val Loss: 0.5733, Val Acc: 0.7858, Val F1: 0.7867


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.4529, Train Acc: 0.8198 | Val Loss: 0.6027, Val Acc: 0.7818, Val F1: 0.7831
Training complete. Best Validation Accuracy: 0.7913 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▅▆▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▅▆▆▇███▇▇
Validation F1,▁▅▆▆▇▇███▇
Validation Loss,█▅▃▃▃▂▂▄▁▄
Validation Precision,▁▃▅▆▆█▇█▇▆
Validation Recall,▁▅▆▆▇███▇▇

0,1
Epoch,10.0
Train Accuracy,0.8198
Train Loss,0.45293
Validation Accuracy,0.78183
Validation F1,0.78306
Validation Loss,0.60267
Validation Precision,0.78567
Validation Recall,0.78183


[I 2025-08-18 05:47:30,013] Trial 10 finished with value: 0.7913022351797862 and parameters: {'learning_rate': 0.0009029948969690496, 'weight_decay': 6.339803747776122e-05, 'patience': 6, 'batch_size': 128, 'num_layers': 1}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 11 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7249, Train Acc: 0.6937 | Val Loss: 0.5859, Val Acc: 0.7739, Val F1: 0.7752


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5483, Train Acc: 0.7885 | Val Loss: 0.5593, Val Acc: 0.7937, Val F1: 0.7943


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.4710, Train Acc: 0.8234 | Val Loss: 0.5319, Val Acc: 0.8137, Val F1: 0.8131


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.3994, Train Acc: 0.8522 | Val Loss: 0.5317, Val Acc: 0.8129, Val F1: 0.8128


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3449, Train Acc: 0.8747 | Val Loss: 0.5398, Val Acc: 0.8309, Val F1: 0.8307


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.2912, Train Acc: 0.8939 | Val Loss: 0.5705, Val Acc: 0.8265, Val F1: 0.8266


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2504, Train Acc: 0.9123 | Val Loss: 0.6084, Val Acc: 0.8245, Val F1: 0.8238


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2137, Train Acc: 0.9225 | Val Loss: 0.6295, Val Acc: 0.8066, Val F1: 0.8102


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1739, Train Acc: 0.9366 | Val Loss: 0.6819, Val Acc: 0.8271, Val F1: 0.8271


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1488, Train Acc: 0.9465 | Val Loss: 0.6765, Val Acc: 0.8189, Val F1: 0.8197
Training complete. Best Validation Accuracy: 0.8309 at epoch 5.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▃▆▆█▇▇▅█▇
Validation F1,▁▃▆▆█▇▇▅█▇
Validation Loss,▄▂▁▁▁▃▅▆██
Validation Precision,▁▃▆▅██▇▆▇▇
Validation Recall,▁▃▆▆█▇▇▅█▇

0,1
Epoch,10.0
Train Accuracy,0.94645
Train Loss,0.14884
Validation Accuracy,0.81888
Validation F1,0.81965
Validation Loss,0.67654
Validation Precision,0.8243
Validation Recall,0.81888


[I 2025-08-18 06:37:26,007] Trial 11 finished with value: 0.8309037900874635 and parameters: {'learning_rate': 5.137874997552416e-05, 'weight_decay': 3.2495386500303347e-05, 'patience': 7, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 12 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.8098, Train Acc: 0.6391 | Val Loss: 0.6725, Val Acc: 0.7393, Val F1: 0.7401


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.6280, Train Acc: 0.7479 | Val Loss: 0.6057, Val Acc: 0.7699, Val F1: 0.7706


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5484, Train Acc: 0.7857 | Val Loss: 0.5719, Val Acc: 0.7832, Val F1: 0.7848


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4966, Train Acc: 0.8103 | Val Loss: 0.5564, Val Acc: 0.8009, Val F1: 0.8017


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4511, Train Acc: 0.8284 | Val Loss: 0.5705, Val Acc: 0.8032, Val F1: 0.8027


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.4021, Train Acc: 0.8499 | Val Loss: 0.6152, Val Acc: 0.8059, Val F1: 0.8041


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3653, Train Acc: 0.8644 | Val Loss: 0.5786, Val Acc: 0.8160, Val F1: 0.8154


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.3204, Train Acc: 0.8816 | Val Loss: 0.6076, Val Acc: 0.8014, Val F1: 0.8040


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2847, Train Acc: 0.8938 | Val Loss: 0.6311, Val Acc: 0.8124, Val F1: 0.8132


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.2480, Train Acc: 0.9068 | Val Loss: 0.6750, Val Acc: 0.8043, Val F1: 0.8066
Training complete. Best Validation Accuracy: 0.8160 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▇▇▇██
Train Loss,█▆▅▄▄▃▂▂▁▁
Validation Accuracy,▁▄▅▇▇▇█▇█▇
Validation F1,▁▄▅▇▇▇█▇█▇
Validation Loss,█▄▂▁▂▄▂▄▅█
Validation Precision,▁▄▅▇▇▇████
Validation Recall,▁▄▅▇▇▇█▇█▇

0,1
Epoch,10.0
Train Accuracy,0.90682
Train Loss,0.24797
Validation Accuracy,0.8043
Validation F1,0.80663
Validation Loss,0.67504
Validation Precision,0.8124
Validation Recall,0.8043


[I 2025-08-18 07:26:24,646] Trial 12 finished with value: 0.8159620991253644 and parameters: {'learning_rate': 3.6780008898063395e-05, 'weight_decay': 3.052125343678765e-05, 'patience': 7, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.8439018464528668.


Save to drive (checkpoint)

In [None]:
# Define the directory and file path to save the hyperparameters
HYPERPARAMS_DIR = os.path.join(BASE_DIR, "hyperparams")
os.makedirs(HYPERPARAMS_DIR, exist_ok=True)
BEST_BERTWEET_PARAMS_FILE = os.path.join(HYPERPARAMS_DIR, "best_bertweet_full_code_hyperparams.json")

# Extract and save the best parameters from the Optuna study
best_params = study.best_trial.params

with open(BEST_BERTWEET_PARAMS_FILE, 'w') as f:
    json.dump(best_params, f, indent=4)

print(f"Best BERTweet hyperparameters for 'full code' training saved to: {BEST_BERTWEET_PARAMS_FILE}")

Best BERTweet hyperparameters for 'full code' training saved to: /content/drive/MyDrive/ADV_DL/hyperparams/best_bertweet_full_code_hyperparams.json


In [None]:
# Load the BERTweet tokenizer
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# Define the directory and filename of the best model saved by Optuna.
# The file is saved directly in the BASE_DIR by the objective function.
BEST_MODEL_OPTUNA_PATH = os.path.join(BASE_DIR, "best_model_trial_0.pt")  # <-- !! Adjust the trial number here !!

# Load the best model's state dictionary
best_bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=3
)
best_bertweet_model.load_state_dict(torch.load(BEST_MODEL_OPTUNA_PATH))
best_bertweet_model.to(device)

# Save the model and tokenizer to a dedicated directory in your Drive for later use.
FINAL_MODEL_DIR_BERTWEET_FULL = os.path.join(BASE_DIR, "final_models", "bertweet_full_code")
os.makedirs(FINAL_MODEL_DIR_BERTWEET_FULL, exist_ok=True)
best_bertweet_model.save_pretrained(FINAL_MODEL_DIR_BERTWEET_FULL)
bertweet_tokenizer.save_pretrained(FINAL_MODEL_DIR_BERTWEET_FULL)

print(f"Final fine-tuned BERTweet 'full code' model saved to: {FINAL_MODEL_DIR_BERTWEET_FULL}")

Loading best model state from local path: /content/best_model_trial_0.pt


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully. Now saving to Google Drive...


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Final fine-tuned BERTweet 'full code' model and tokenizer saved to: /content/drive/MyDrive/ADV_DL/final_models/bertweet_full_code


## distilBERT

Optuna

In [None]:
def objective_distilbert_full_code(trial):
    # Hyperparameter suggestions
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 6, 7)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    # Unlike BERTweet, DistilBERT has 6 layers, so we tune a different range
    num_layers = trial.suggest_int("num_layers", 1, 6)

    # Datasets / loaders for DistilBERT
    train_dataset = TweetSentimentDataset(distilbert_train_encodings, train_df_distilbert['label'].tolist())
    val_dataset = TweetSentimentDataset(val_distilbert_encodings, val_df_distilbert['label'].tolist())

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model for DistilBERT
    model = AutoModelForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=3
    ).to(device)

    # Freeze base; unfreeze last `num_layers` + classifier
    for p in model.distilbert.parameters():
        p.requires_grad = False
    for p in model.distilbert.transformer.layer[-num_layers:].parameters():
        p.requires_grad = True
    for p in model.pre_classifier.parameters():
        p.requires_grad = True
    for p in model.classifier.parameters():
        p.requires_grad = True

    # ---------- Imbalance handling: weighted loss ----------
    classes = np.array([0, 1, 2], dtype=int)
    y_train = np.array(train_df_distilbert['label'].tolist(), dtype=int)
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    class_weights = torch.tensor(weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # W&B
    wandb.init(
        project="distilbert-full-code-tuning",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "architecture": "DistilBERT",
            "dataset": "COVID-19-tweets-standardized"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    # Train/eval loop
    best_val_accuracy = train_model_with_hyperparams(
        model, train_loader, val_loader, optimizer, criterion,
        epochs=10, patience=patience, trial=trial
    )

    wandb.finish()

    return best_val_accuracy

In [None]:
# Optuna Study for DistilBERT
print("Running Optuna study for DistilBERT 'full code' training...")
study_distilbert_full_code = optuna.create_study(direction="maximize")
study_distilbert_full_code.optimize(objective_distilbert_full_code, n_trials=13) # Adjust n_trials as needed
print("\nOptuna study for DistilBERT complete.")
print(f"Best trial parameters: {study_distilbert_full_code.best_trial.params}")
print(f"Best validation accuracy: {study_distilbert_full_code.best_trial.value}")

[I 2025-08-18 08:13:25,453] A new study created in memory with name: no-name-de0ce0c5-22e2-44de-b527-32cd3c99c387


Running Optuna study for DistilBERT 'full code' training...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 0 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.8286, Train Acc: 0.6280 | Val Loss: 0.6941, Val Acc: 0.7070, Val F1: 0.7114


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.6506, Train Acc: 0.7339 | Val Loss: 0.6103, Val Acc: 0.7603, Val F1: 0.7620


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5745, Train Acc: 0.7746 | Val Loss: 0.5735, Val Acc: 0.7869, Val F1: 0.7877


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.5164, Train Acc: 0.7999 | Val Loss: 0.5542, Val Acc: 0.7939, Val F1: 0.7945


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4711, Train Acc: 0.8220 | Val Loss: 0.5317, Val Acc: 0.7968, Val F1: 0.7983


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.4313, Train Acc: 0.8392 | Val Loss: 0.5268, Val Acc: 0.8073, Val F1: 0.8084


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3850, Train Acc: 0.8549 | Val Loss: 0.5286, Val Acc: 0.8206, Val F1: 0.8206


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.3485, Train Acc: 0.8714 | Val Loss: 0.5572, Val Acc: 0.8229, Val F1: 0.8223


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.3071, Train Acc: 0.8864 | Val Loss: 0.5483, Val Acc: 0.8174, Val F1: 0.8182


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.2737, Train Acc: 0.8998 | Val Loss: 0.5738, Val Acc: 0.8243, Val F1: 0.8244
Training complete. Best Validation Accuracy: 0.8243 at epoch 10.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▅▆▆▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▄▆▆▆▇████
Validation F1,▁▄▆▆▆▇████
Validation Loss,█▄▃▂▁▁▁▂▂▃
Validation Precision,▁▄▅▆▇▇████
Validation Recall,▁▄▆▆▆▇████

0,1
Epoch,10.0
Train Accuracy,0.89983
Train Loss,0.27369
Validation Accuracy,0.82434
Validation F1,0.82441
Validation Loss,0.57378
Validation Precision,0.82568
Validation Recall,0.82434


[I 2025-08-18 08:40:00,760] Trial 0 finished with value: 0.8243440233236151 and parameters: {'learning_rate': 2.3147116447384962e-05, 'weight_decay': 4.511488931804444e-05, 'patience': 7, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.8243440233236151.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 1 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.5776, Train Acc: 0.7754 | Val Loss: 0.4392, Val Acc: 0.8434, Val F1: 0.8435


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.3758, Train Acc: 0.8732 | Val Loss: 0.3821, Val Acc: 0.8773, Val F1: 0.8768


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.2884, Train Acc: 0.9058 | Val Loss: 0.3502, Val Acc: 0.8802, Val F1: 0.8804


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.2300, Train Acc: 0.9260 | Val Loss: 0.3759, Val Acc: 0.8671, Val F1: 0.8689


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.1922, Train Acc: 0.9411 | Val Loss: 0.4153, Val Acc: 0.8801, Val F1: 0.8801


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.1522, Train Acc: 0.9526 | Val Loss: 0.4127, Val Acc: 0.8856, Val F1: 0.8852


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.1239, Train Acc: 0.9621 | Val Loss: 0.5298, Val Acc: 0.8867, Val F1: 0.8861


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.1044, Train Acc: 0.9666 | Val Loss: 0.4812, Val Acc: 0.8641, Val F1: 0.8652


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.0852, Train Acc: 0.9724 | Val Loss: 0.5335, Val Acc: 0.8639, Val F1: 0.8660


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.0671, Train Acc: 0.9780 | Val Loss: 0.5823, Val Acc: 0.8722, Val F1: 0.8726
Training complete. Best Validation Accuracy: 0.8867 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▆▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▆▇▅▇██▄▄▆
Validation F1,▁▆▇▅▇██▅▅▆
Validation Loss,▄▂▁▂▃▃▆▅▇█
Validation Precision,▁▆▇▅▇██▄▅▅
Validation Recall,▁▆▇▅▇██▄▄▆

0,1
Epoch,10.0
Train Accuracy,0.97804
Train Loss,0.06707
Validation Accuracy,0.87221
Validation F1,0.87264
Validation Loss,0.58233
Validation Precision,0.8743
Validation Recall,0.87221


[I 2025-08-18 09:17:36,557] Trial 1 finished with value: 0.8866618075801749 and parameters: {'learning_rate': 0.00010825830161396886, 'weight_decay': 3.324174689345579e-05, 'patience': 7, 'batch_size': 32, 'num_layers': 4}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 2 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7485, Train Acc: 0.6746 | Val Loss: 0.6246, Val Acc: 0.7498, Val F1: 0.7521


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5943, Train Acc: 0.7628 | Val Loss: 0.5812, Val Acc: 0.7719, Val F1: 0.7730


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5303, Train Acc: 0.7889 | Val Loss: 0.5808, Val Acc: 0.7676, Val F1: 0.7708


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4721, Train Acc: 0.8143 | Val Loss: 0.6184, Val Acc: 0.7846, Val F1: 0.7829


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4127, Train Acc: 0.8378 | Val Loss: 0.5898, Val Acc: 0.7948, Val F1: 0.7946


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3517, Train Acc: 0.8607 | Val Loss: 0.6835, Val Acc: 0.7895, Val F1: 0.7896


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2788, Train Acc: 0.8859 | Val Loss: 0.7041, Val Acc: 0.7869, Val F1: 0.7889


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2255, Train Acc: 0.9092 | Val Loss: 0.7880, Val Acc: 0.7903, Val F1: 0.7904


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1799, Train Acc: 0.9274 | Val Loss: 0.8231, Val Acc: 0.7767, Val F1: 0.7780


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1531, Train Acc: 0.9384 | Val Loss: 0.9251, Val Acc: 0.7861, Val F1: 0.7862
Training complete. Best Validation Accuracy: 0.7948 at epoch 5.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▅▆▇▇██
Train Loss,█▆▅▅▄▃▂▂▁▁
Validation Accuracy,▁▄▄▆█▇▇▇▅▇
Validation F1,▁▄▄▆█▇▇▇▅▇
Validation Loss,▂▁▁▂▁▃▄▅▆█
Validation Precision,▁▅▆▇█▇█▇▅▆
Validation Recall,▁▄▄▆█▇▇▇▅▇

0,1
Epoch,10.0
Train Accuracy,0.93841
Train Loss,0.15309
Validation Accuracy,0.78608
Validation F1,0.78622
Validation Loss,0.92511
Validation Precision,0.78697
Validation Recall,0.78608


[I 2025-08-18 09:39:34,958] Trial 2 finished with value: 0.7948250728862973 and parameters: {'learning_rate': 0.00019831031616849638, 'weight_decay': 1.4920269827149783e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 1}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 3 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7223, Train Acc: 0.6952 | Val Loss: 0.6021, Val Acc: 0.7609, Val F1: 0.7628


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5879, Train Acc: 0.7675 | Val Loss: 0.5909, Val Acc: 0.7742, Val F1: 0.7745


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5309, Train Acc: 0.7943 | Val Loss: 0.5780, Val Acc: 0.7875, Val F1: 0.7880


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4834, Train Acc: 0.8121 | Val Loss: 0.5490, Val Acc: 0.7953, Val F1: 0.7959


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.4246, Train Acc: 0.8337 | Val Loss: 0.5956, Val Acc: 0.7994, Val F1: 0.7983


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3689, Train Acc: 0.8534 | Val Loss: 0.5841, Val Acc: 0.7959, Val F1: 0.7964


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3117, Train Acc: 0.8758 | Val Loss: 0.6391, Val Acc: 0.7851, Val F1: 0.7871


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2603, Train Acc: 0.8931 | Val Loss: 0.7611, Val Acc: 0.7929, Val F1: 0.7925


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2216, Train Acc: 0.9106 | Val Loss: 0.7713, Val Acc: 0.7855, Val F1: 0.7860


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1933, Train Acc: 0.9235 | Val Loss: 0.8792, Val Acc: 0.7830, Val F1: 0.7826
Training complete. Best Validation Accuracy: 0.7994 at epoch 5.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▅▆▇▇██
Train Loss,█▆▅▅▄▃▃▂▁▁
Validation Accuracy,▁▃▆▇█▇▅▇▅▅
Validation F1,▁▃▆███▆▇▆▅
Validation Loss,▂▂▂▁▂▂▃▅▆█
Validation Precision,▁▅▆▇██▆▇▆▄
Validation Recall,▁▃▆▇█▇▅▇▅▅

0,1
Epoch,10.0
Train Accuracy,0.92349
Train Loss,0.19326
Validation Accuracy,0.78304
Validation F1,0.78256
Validation Loss,0.87916
Validation Precision,0.78231
Validation Recall,0.78304


[I 2025-08-18 10:02:17,482] Trial 3 finished with value: 0.79944120505345 and parameters: {'learning_rate': 0.00015002091933890785, 'weight_decay': 6.145316886295965e-05, 'patience': 7, 'batch_size': 32, 'num_layers': 1}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 4 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7532, Train Acc: 0.7080 | Val Loss: 0.9419, Val Acc: 0.5113, Val F1: 0.4158


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 1.0248, Train Acc: 0.4330 | Val Loss: 1.0659, Val Acc: 0.4015, Val F1: 0.3242


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 1.0926, Train Acc: 0.3516 | Val Loss: 1.0988, Val Acc: 0.1874, Val F1: 0.0592


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 1.0976, Train Acc: 0.3689 | Val Loss: 1.0993, Val Acc: 0.4384, Val F1: 0.2672


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 1.0989, Train Acc: 0.3863 | Val Loss: 1.0990, Val Acc: 0.1874, Val F1: 0.0592


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 1.0985, Train Acc: 0.3533 | Val Loss: 1.1008, Val Acc: 0.4384, Val F1: 0.2672


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 1.0989, Train Acc: 0.4031 | Val Loss: 1.0985, Val Acc: 0.3741, Val F1: 0.2037


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 1.0989, Train Acc: 0.4051 | Val Loss: 1.0985, Val Acc: 0.3741, Val F1: 0.2037
Early stopping triggered at epoch 8.
Training complete. Best Validation Accuracy: 0.5113 at epoch 1.


0,1
Epoch,▁▂▃▄▅▆▇█
Train Accuracy,█▃▁▁▂▁▂▂
Train Loss,▁▆██████
Validation Accuracy,█▆▁▆▁▆▅▅
Validation F1,█▆▁▅▁▅▄▄
Validation Loss,▁▆██████
Validation Precision,█▅▁▃▁▃▃▃
Validation Recall,█▆▁▆▁▆▅▅

0,1
Epoch,8.0
Train Accuracy,0.40513
Train Loss,1.09886
Validation Accuracy,0.37415
Validation F1,0.20374
Validation Loss,1.0985
Validation Precision,0.13999
Validation Recall,0.37415


[I 2025-08-18 10:39:06,425] Trial 4 finished with value: 0.5112973760932945 and parameters: {'learning_rate': 0.0002577884052019022, 'weight_decay': 1.4666338702506418e-06, 'patience': 6, 'batch_size': 32, 'num_layers': 6}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 5 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7423, Train Acc: 0.6839 | Val Loss: 0.6487, Val Acc: 0.7510, Val F1: 0.7504


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5963, Train Acc: 0.7631 | Val Loss: 0.5918, Val Acc: 0.7711, Val F1: 0.7706


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5280, Train Acc: 0.7928 | Val Loss: 0.5658, Val Acc: 0.7798, Val F1: 0.7803


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4692, Train Acc: 0.8158 | Val Loss: 0.5598, Val Acc: 0.7768, Val F1: 0.7788


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3931, Train Acc: 0.8457 | Val Loss: 0.5873, Val Acc: 0.7851, Val F1: 0.7862


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3135, Train Acc: 0.8748 | Val Loss: 0.6562, Val Acc: 0.7862, Val F1: 0.7868


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.2419, Train Acc: 0.9015 | Val Loss: 0.7739, Val Acc: 0.7861, Val F1: 0.7856


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.1898, Train Acc: 0.9223 | Val Loss: 0.8412, Val Acc: 0.7862, Val F1: 0.7874


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1559, Train Acc: 0.9358 | Val Loss: 0.9191, Val Acc: 0.7829, Val F1: 0.7828


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1271, Train Acc: 0.9499 | Val Loss: 1.0494, Val Acc: 0.7795, Val F1: 0.7788
Training complete. Best Validation Accuracy: 0.7862 at epoch 6.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▄▅▆▇▇██
Train Loss,█▆▆▅▄▃▂▂▁▁
Validation Accuracy,▁▅▇▆████▇▇
Validation F1,▁▅▇▆████▇▆
Validation Loss,▂▁▁▁▁▂▄▅▆█
Validation Precision,▁▅▆▇██▇█▆▅
Validation Recall,▁▅▇▆████▇▇

0,1
Epoch,10.0
Train Accuracy,0.94995
Train Loss,0.12708
Validation Accuracy,0.77952
Validation F1,0.77878
Validation Loss,1.04937
Validation Precision,0.77861
Validation Recall,0.77952


[I 2025-08-18 11:01:38,436] Trial 5 finished with value: 0.7862001943634597 and parameters: {'learning_rate': 8.102377917502759e-05, 'weight_decay': 1.9968018577169306e-05, 'patience': 7, 'batch_size': 32, 'num_layers': 1}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 6 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7724, Train Acc: 0.6645 | Val Loss: 0.6461, Val Acc: 0.7400, Val F1: 0.7411


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.5902, Train Acc: 0.7693 | Val Loss: 0.5793, Val Acc: 0.7901, Val F1: 0.7896


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.5101, Train Acc: 0.8066 | Val Loss: 0.5453, Val Acc: 0.8067, Val F1: 0.8060


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.4459, Train Acc: 0.8349 | Val Loss: 0.5116, Val Acc: 0.8140, Val F1: 0.8140


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.3964, Train Acc: 0.8550 | Val Loss: 0.5037, Val Acc: 0.8254, Val F1: 0.8258


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.3475, Train Acc: 0.8740 | Val Loss: 0.5030, Val Acc: 0.8268, Val F1: 0.8274


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.3050, Train Acc: 0.8912 | Val Loss: 0.5228, Val Acc: 0.8341, Val F1: 0.8341


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.2660, Train Acc: 0.9041 | Val Loss: 0.5517, Val Acc: 0.8344, Val F1: 0.8340


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.2317, Train Acc: 0.9156 | Val Loss: 0.5547, Val Acc: 0.8309, Val F1: 0.8317


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1945, Train Acc: 0.9285 | Val Loss: 0.5945, Val Acc: 0.8262, Val F1: 0.8273
Training complete. Best Validation Accuracy: 0.8344 at epoch 8.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▅▆▆▇▇███▇
Validation F1,▁▅▆▆▇▇███▇
Validation Loss,█▅▃▁▁▁▂▃▄▅
Validation Precision,▁▅▅▇▇█████
Validation Recall,▁▅▆▆▇▇███▇

0,1
Epoch,10.0
Train Accuracy,0.9285
Train Loss,0.1945
Validation Accuracy,0.82617
Validation F1,0.82727
Validation Loss,0.59447
Validation Precision,0.83056
Validation Recall,0.82617


[I 2025-08-18 11:34:09,988] Trial 6 finished with value: 0.8344266277939747 and parameters: {'learning_rate': 1.031417927047022e-05, 'weight_decay': 6.48361301935615e-06, 'patience': 6, 'batch_size': 32, 'num_layers': 3}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 7 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.9565, Train Acc: 0.5469 | Val Loss: 0.8430, Val Acc: 0.6516, Val F1: 0.6527


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.8066, Train Acc: 0.6739 | Val Loss: 0.8186, Val Acc: 0.6233, Val F1: 0.6335


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.7491, Train Acc: 0.7000 | Val Loss: 0.6855, Val Acc: 0.7263, Val F1: 0.7285


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.7946, Train Acc: 0.6705 | Val Loss: 0.7790, Val Acc: 0.7025, Val F1: 0.7027


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.8919, Train Acc: 0.5964 | Val Loss: 0.9881, Val Acc: 0.5159, Val F1: 0.4955


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.9024, Train Acc: 0.5785 | Val Loss: 0.9842, Val Acc: 0.5553, Val F1: 0.5284


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.8508, Train Acc: 0.6200 | Val Loss: 0.9105, Val Acc: 0.5703, Val F1: 0.5450


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.8286, Train Acc: 0.6474 | Val Loss: 0.8084, Val Acc: 0.6681, Val F1: 0.6736


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.8091, Train Acc: 0.6611 | Val Loss: 0.7799, Val Acc: 0.6684, Val F1: 0.6665


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.8022, Train Acc: 0.6562 | Val Loss: 0.7893, Val Acc: 0.6687, Val F1: 0.6726
Training complete. Best Validation Accuracy: 0.7263 at epoch 3.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▇█▇▃▂▄▆▆▆
Train Loss,█▃▁▃▆▆▄▄▃▃
Validation Accuracy,▆▅█▇▁▂▃▆▆▆
Validation F1,▆▅█▇▁▂▂▆▆▆
Validation Loss,▅▄▁▃██▆▄▃▃
Validation Precision,▄▅█▆▁▄▅▆▆▆
Validation Recall,▆▅█▇▁▂▃▆▆▆

0,1
Epoch,10.0
Train Accuracy,0.65619
Train Loss,0.80221
Validation Accuracy,0.66873
Validation F1,0.67262
Validation Loss,0.78934
Validation Precision,0.69806
Validation Recall,0.66873


[I 2025-08-18 12:04:31,375] Trial 7 finished with value: 0.7263119533527697 and parameters: {'learning_rate': 0.0009649203802061476, 'weight_decay': 4.0345821974925446e-06, 'patience': 7, 'batch_size': 128, 'num_layers': 3}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 8 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.6301, Train Acc: 0.7490 | Val Loss: 0.4857, Val Acc: 0.8234, Val F1: 0.8237


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.4257, Train Acc: 0.8484 | Val Loss: 0.4224, Val Acc: 0.8587, Val F1: 0.8585


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.3302, Train Acc: 0.8876 | Val Loss: 0.4373, Val Acc: 0.8450, Val F1: 0.8451


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.2567, Train Acc: 0.9136 | Val Loss: 0.4368, Val Acc: 0.8619, Val F1: 0.8618


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.2037, Train Acc: 0.9335 | Val Loss: 0.4908, Val Acc: 0.8590, Val F1: 0.8592


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.1635, Train Acc: 0.9467 | Val Loss: 0.5214, Val Acc: 0.8649, Val F1: 0.8645


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.1290, Train Acc: 0.9576 | Val Loss: 0.5072, Val Acc: 0.8717, Val F1: 0.8717


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.0939, Train Acc: 0.9691 | Val Loss: 0.5792, Val Acc: 0.8582, Val F1: 0.8584


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.0761, Train Acc: 0.9740 | Val Loss: 0.5960, Val Acc: 0.8627, Val F1: 0.8632


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.0635, Train Acc: 0.9787 | Val Loss: 0.6261, Val Acc: 0.8653, Val F1: 0.8653
Training complete. Best Validation Accuracy: 0.8717 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▁▁▁
Validation Accuracy,▁▆▄▇▆▇█▆▇▇
Validation F1,▁▆▄▇▆▇█▆▇▇
Validation Loss,▃▁▂▁▃▄▄▆▇█
Validation Precision,▁▆▅▆▆▇█▆▇▇
Validation Recall,▁▆▄▇▆▇█▆▇▇

0,1
Epoch,10.0
Train Accuracy,0.97871
Train Loss,0.06354
Validation Accuracy,0.86528
Validation F1,0.86531
Validation Loss,0.62607
Validation Precision,0.86538
Validation Recall,0.86528


[I 2025-08-18 12:36:07,878] Trial 8 finished with value: 0.8717201166180758 and parameters: {'learning_rate': 0.00011240088934226435, 'weight_decay': 2.7719520913043273e-06, 'patience': 6, 'batch_size': 64, 'num_layers': 3}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 9 for 10 epochs with patience 6...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.7149, Train Acc: 0.7038 | Val Loss: 0.5179, Val Acc: 0.8099, Val F1: 0.8107


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.4647, Train Acc: 0.8318 | Val Loss: 0.4459, Val Acc: 0.8475, Val F1: 0.8474


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.3703, Train Acc: 0.8752 | Val Loss: 0.3942, Val Acc: 0.8711, Val F1: 0.8713


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.3097, Train Acc: 0.8980 | Val Loss: 0.3717, Val Acc: 0.8789, Val F1: 0.8786


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.2596, Train Acc: 0.9151 | Val Loss: 0.3880, Val Acc: 0.8803, Val F1: 0.8801


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.2173, Train Acc: 0.9301 | Val Loss: 0.3845, Val Acc: 0.8786, Val F1: 0.8787


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.1902, Train Acc: 0.9392 | Val Loss: 0.4142, Val Acc: 0.8852, Val F1: 0.8848


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.1613, Train Acc: 0.9489 | Val Loss: 0.3880, Val Acc: 0.8779, Val F1: 0.8785


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.1391, Train Acc: 0.9565 | Val Loss: 0.4702, Val Acc: 0.8788, Val F1: 0.8780


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.1125, Train Acc: 0.9652 | Val Loss: 0.4689, Val Acc: 0.8785, Val F1: 0.8783
Training complete. Best Validation Accuracy: 0.8852 at epoch 7.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▆▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▅▇▇█▇█▇▇▇
Validation F1,▁▄▇▇█▇█▇▇▇
Validation Loss,█▅▂▁▂▂▃▂▆▆
Validation Precision,▁▅▇▇█▇█▇▇▇
Validation Recall,▁▅▇▇█▇█▇▇▇

0,1
Epoch,10.0
Train Accuracy,0.96522
Train Loss,0.11254
Validation Accuracy,0.87852
Validation F1,0.87834
Validation Loss,0.46895
Validation Precision,0.8782
Validation Recall,0.87852


[I 2025-08-18 13:19:31,755] Trial 9 finished with value: 0.8852040816326531 and parameters: {'learning_rate': 1.6096828009161672e-05, 'weight_decay': 3.546000577926727e-05, 'patience': 6, 'batch_size': 128, 'num_layers': 6}. Best is trial 1 with value: 0.8866618075801749.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 10 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.5956, Train Acc: 0.7606 | Val Loss: 0.4466, Val Acc: 0.8401, Val F1: 0.8407


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.3722, Train Acc: 0.8724 | Val Loss: 0.3639, Val Acc: 0.8789, Val F1: 0.8785


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.2783, Train Acc: 0.9078 | Val Loss: 0.3601, Val Acc: 0.8864, Val F1: 0.8861


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.2153, Train Acc: 0.9309 | Val Loss: 0.3655, Val Acc: 0.8887, Val F1: 0.8885


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.1715, Train Acc: 0.9465 | Val Loss: 0.4039, Val Acc: 0.8930, Val F1: 0.8924


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.1352, Train Acc: 0.9573 | Val Loss: 0.3915, Val Acc: 0.8902, Val F1: 0.8900


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.1037, Train Acc: 0.9669 | Val Loss: 0.4273, Val Acc: 0.8852, Val F1: 0.8852


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.0756, Train Acc: 0.9752 | Val Loss: 0.4831, Val Acc: 0.8867, Val F1: 0.8865


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.0577, Train Acc: 0.9817 | Val Loss: 0.5260, Val Acc: 0.8701, Val F1: 0.8718


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.0446, Train Acc: 0.9855 | Val Loss: 0.6351, Val Acc: 0.8857, Val F1: 0.8851
Training complete. Best Validation Accuracy: 0.8930 at epoch 5.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▆▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▁▁▁
Validation Accuracy,▁▆▇▇██▇▇▅▇
Validation F1,▁▆▇▇██▇▇▅▇
Validation Loss,▃▁▁▁▂▂▃▄▅█
Validation Precision,▁▆▇▇██▇▇▅▇
Validation Recall,▁▆▇▇██▇▇▅▇

0,1
Epoch,10.0
Train Accuracy,0.98548
Train Loss,0.04455
Validation Accuracy,0.88569
Validation F1,0.88515
Validation Loss,0.63514
Validation Precision,0.88659
Validation Recall,0.88569


[I 2025-08-18 14:00:05,396] Trial 10 finished with value: 0.8929786200194364 and parameters: {'learning_rate': 4.573793651715024e-05, 'weight_decay': 1.4816634476065716e-05, 'patience': 7, 'batch_size': 64, 'num_layers': 5}. Best is trial 10 with value: 0.8929786200194364.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 11 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.5969, Train Acc: 0.7649 | Val Loss: 0.4294, Val Acc: 0.8567, Val F1: 0.8565


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.3680, Train Acc: 0.8736 | Val Loss: 0.3943, Val Acc: 0.8663, Val F1: 0.8660


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.2796, Train Acc: 0.9086 | Val Loss: 0.3440, Val Acc: 0.8899, Val F1: 0.8895


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.2132, Train Acc: 0.9322 | Val Loss: 0.3505, Val Acc: 0.8888, Val F1: 0.8889


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.1665, Train Acc: 0.9461 | Val Loss: 0.4135, Val Acc: 0.8788, Val F1: 0.8783


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.1297, Train Acc: 0.9596 | Val Loss: 0.4280, Val Acc: 0.8796, Val F1: 0.8799


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.0994, Train Acc: 0.9692 | Val Loss: 0.4661, Val Acc: 0.8818, Val F1: 0.8822


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.0762, Train Acc: 0.9759 | Val Loss: 0.5326, Val Acc: 0.8844, Val F1: 0.8846


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.0556, Train Acc: 0.9825 | Val Loss: 0.5344, Val Acc: 0.8876, Val F1: 0.8877


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.0455, Train Acc: 0.9851 | Val Loss: 0.7120, Val Acc: 0.8782, Val F1: 0.8774
Training complete. Best Validation Accuracy: 0.8899 at epoch 3.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▆▆▇▇▇███
Train Loss,█▅▄▃▃▂▂▁▁▁
Validation Accuracy,▁▃██▆▆▆▇█▆
Validation F1,▁▃██▆▆▆▇█▅
Validation Loss,▃▂▁▁▂▃▃▅▅█
Validation Precision,▁▅██▆▆▇▇█▆
Validation Recall,▁▃██▆▆▆▇█▆

0,1
Epoch,10.0
Train Accuracy,0.98515
Train Loss,0.04548
Validation Accuracy,0.87816
Validation F1,0.87743
Validation Loss,0.71198
Validation Precision,0.88228
Validation Recall,0.87816


[I 2025-08-18 14:40:35,001] Trial 11 finished with value: 0.8899416909620991 and parameters: {'learning_rate': 4.564216024486099e-05, 'weight_decay': 1.4730883832279522e-05, 'patience': 7, 'batch_size': 64, 'num_layers': 5}. Best is trial 10 with value: 0.8929786200194364.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training for trial 12 for 10 epochs with patience 7...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/10 | Train Loss: 0.6055, Train Acc: 0.7619 | Val Loss: 0.4393, Val Acc: 0.8466, Val F1: 0.8467


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/10 | Train Loss: 0.3797, Train Acc: 0.8692 | Val Loss: 0.3726, Val Acc: 0.8801, Val F1: 0.8798


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/10 | Train Loss: 0.2794, Train Acc: 0.9085 | Val Loss: 0.3584, Val Acc: 0.8847, Val F1: 0.8844


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 4/10 | Train Loss: 0.2194, Train Acc: 0.9292 | Val Loss: 0.4168, Val Acc: 0.8827, Val F1: 0.8824


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 5/10 | Train Loss: 0.1653, Train Acc: 0.9465 | Val Loss: 0.4206, Val Acc: 0.8748, Val F1: 0.8748


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 6/10 | Train Loss: 0.1342, Train Acc: 0.9581 | Val Loss: 0.4090, Val Acc: 0.8705, Val F1: 0.8717


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 7/10 | Train Loss: 0.0999, Train Acc: 0.9675 | Val Loss: 0.4623, Val Acc: 0.8581, Val F1: 0.8598


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 8/10 | Train Loss: 0.0746, Train Acc: 0.9764 | Val Loss: 0.4987, Val Acc: 0.8733, Val F1: 0.8736


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 9/10 | Train Loss: 0.0531, Train Acc: 0.9834 | Val Loss: 0.5565, Val Acc: 0.8729, Val F1: 0.8737


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 10/10 | Train Loss: 0.0466, Train Acc: 0.9848 | Val Loss: 0.5700, Val Acc: 0.8677, Val F1: 0.8688
Training complete. Best Validation Accuracy: 0.8847 at epoch 3.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▆▆▇▇▇███
Train Loss,█▅▄▃▂▂▂▁▁▁
Validation Accuracy,▁▇██▆▅▃▆▆▅
Validation F1,▁▇██▆▆▃▆▆▅
Validation Loss,▄▁▁▃▃▃▄▆██
Validation Precision,▁▇▇█▆▆▅▆▆▅
Validation Recall,▁▇██▆▅▃▆▆▅

0,1
Epoch,10.0
Train Accuracy,0.98481
Train Loss,0.04659
Validation Accuracy,0.86771
Validation F1,0.86878
Validation Loss,0.56998
Validation Precision,0.87099
Validation Recall,0.86771


[I 2025-08-18 15:21:04,336] Trial 12 finished with value: 0.8847181729834791 and parameters: {'learning_rate': 4.0020868952852185e-05, 'weight_decay': 1.2822579672245287e-05, 'patience': 7, 'batch_size': 64, 'num_layers': 5}. Best is trial 10 with value: 0.8929786200194364.



Optuna study for DistilBERT complete.
Best trial parameters: {'learning_rate': 4.573793651715024e-05, 'weight_decay': 1.4816634476065716e-05, 'patience': 7, 'batch_size': 64, 'num_layers': 5}
Best validation accuracy: 0.8929786200194364


Save to drive (checkpoint)


In [None]:
# Define the directory and file path to save the hyperparameters
HYPERPARAMS_DIR = os.path.join(BASE_DIR, "hyperparams")
os.makedirs(HYPERPARAMS_DIR, exist_ok=True)
BEST_DISTILBERT_PARAMS_FILE = os.path.join(HYPERPARAMS_DIR, "best_distilbert_full_code_hyperparams.json")

# Extract and save the best parameters from the Optuna study
best_params = study_distilbert_full_code.best_trial.params

with open(BEST_DISTILBERT_PARAMS_FILE, 'w') as f:
    json.dump(best_params, f, indent=4)

print(f"Best DistilBERT hyperparameters for 'full code' training saved to: {BEST_DISTILBERT_PARAMS_FILE}")

Best DistilBERT hyperparameters for 'full code' training saved to: /content/drive/MyDrive/ADV_DL/hyperparams/best_distilbert_full_code_hyperparams.json


In [None]:
# Define the local temporary directory where Optuna saves the checkpoint
LOCAL_DIR = "/content"

BASE_DIR = "/content/drive/MyDrive/ADV_DL"

# Get the best trial number from the completed DistilBERT study
best_trial_number_distilbert = study_distilbert_full_code.best_trial.number

# Construct the file paths
LOCAL_MODEL_PATH = os.path.join(LOCAL_DIR, f"best_model_trial_{best_trial_number_distilbert}.pt")
FINAL_MODEL_DIR_DISTILBERT_FULL = os.path.join(BASE_DIR, "final_models", "distilbert_full_code")

# Ensure the final model directory exists
os.makedirs(FINAL_MODEL_DIR_DISTILBERT_FULL, exist_ok=True)

# --- Load the model from the local directory and save to Google Drive ---

print(f"Loading best DistilBERT model state from local path: {LOCAL_MODEL_PATH}")

try:
    best_model_state_dict_distilbert = torch.load(LOCAL_MODEL_PATH)

    # Load the DistilBERT model from Hugging Face and apply the state dictionary
    best_distilbert_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=3
    )
    best_distilbert_model.load_state_dict(best_model_state_dict_distilbert)
    best_distilbert_model.to(device)

    print("DistilBERT model loaded successfully. Now saving to Google Drive...")

    # Save the model and tokenizer to the dedicated directory in your Drive
    best_distilbert_model.save_pretrained(FINAL_MODEL_DIR_DISTILBERT_FULL)

    # Load the DistilBERT tokenizer and save it as well
    distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    distilbert_tokenizer.save_pretrained(FINAL_MODEL_DIR_DISTILBERT_FULL)

    print(f"Final fine-tuned DistilBERT 'full code' model and tokenizer saved to: {FINAL_MODEL_DIR_DISTILBERT_FULL}")

except FileNotFoundError:
    print(f"Error: The file '{LOCAL_MODEL_PATH}' was not found in the local Colab directory.")
    print("Please check the output of your Optuna run to see the exact name of the saved file.")

Loading best DistilBERT model state from local path: /content/best_model_trial_10.pt


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT model loaded successfully. Now saving to Google Drive...
Final fine-tuned DistilBERT 'full code' model and tokenizer saved to: /content/drive/MyDrive/ADV_DL/final_models/distilbert_full_code


## Fine-tuning using the Hugging Face libraries as shown in Tutorial 5 (exercise 5):

**Build HF Datasets (BERTweet)**

- Convert the pre-tokenized BERTweet features and labels into Hugging Face Datasets for the Trainer:

- Uses bertweet encodings ['input_ids'/'attention_mask'] and the integer label columns.

- Produces hf_train_dataset_bertweet and hf_val_dataset_bertweet with keys: input_ids, attention_mask, labels.

- Ensures shapes align so the Trainer can batch/pad and compute metrics correctly.

**BERTweet**

In [None]:
# The Trainer expects a Hugging Face Dataset object.
# We convert our tokenized encodings and pandas Series to this format.
hf_train_dataset_bertweet = HFDataset.from_dict({
    'input_ids': bertweet_train_encodings['input_ids'],
    'attention_mask': bertweet_train_encodings['attention_mask'],
    'labels': train_df_bertweet['label'].tolist()
})

hf_val_dataset_bertweet = HFDataset.from_dict({
    'input_ids': val_bertweet_encodings['input_ids'],
    'attention_mask': val_bertweet_encodings['attention_mask'],
    'labels': val_df_bertweet['label'].tolist()
})

print("Hugging Face Datasets created successfully.")

Hugging Face Datasets created successfully.


Load the accuracy metric using Hugging Face's evaluate library

In [None]:
metric_accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Computes and returns a dictionary of evaluation metrics for the Hugging Face Trainer.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)

    accuracy_result = metric_accuracy.compute(predictions=predictions, references=labels)

    # For multi-class classification, we use 'weighted' average to account for label imbalance.
    f1_result = f1_score(labels, predictions, average='weighted', zero_division=0)
    precision_result = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall_result = recall_score(labels, predictions, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result,
        "precision": precision_result,
        "recall": recall_result
    }

print("\n'compute_metrics' function defined for Hugging Face Trainer.")



'compute_metrics' function defined for Hugging Face Trainer.


**Run without Optuna (Toy Model)**

In [None]:


training_args = TrainingArguments(
    output_dir="./hf_trainer_results",
    eval_strategy="epoch",  # Correct parameter name as noted in exercise 5
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./hf_trainer_logs",
    logging_steps=100,
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_strategy="epoch",
    save_total_limit=1,
)

print("\nTrainingArguments defined.")



TrainingArguments defined.


In [None]:

# Load the BERTweet model again for this task
bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=3
).to(device)

trainer = Trainer(
    model=bertweet_model,
    args=training_args,
    train_dataset=hf_train_dataset_bertweet,
    eval_dataset=hf_val_dataset_bertweet,
    compute_metrics=compute_metrics,
)

print("\nTrainer instantiated. Starting fine-tuning with Hugging Face Trainer...")
trainer.train()

print("\nFine-tuning with Hugging Face Trainer for BERTweet complete.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trainer instantiated. Starting fine-tuning with Hugging Face Trainer...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3443,0.304442,0.896744,0.896501,0.898723,0.896744
2,0.2428,0.259169,0.914845,0.914563,0.915826,0.914845
3,0.1868,0.277896,0.912172,0.91137,0.914329,0.912172
4,0.1497,0.350257,0.912415,0.912331,0.913527,0.912415
5,0.1028,0.393914,0.911322,0.91074,0.911667,0.911322
6,0.0727,0.411427,0.90792,0.90803,0.908429,0.90792
7,0.0765,0.376641,0.910593,0.910949,0.911524,0.910593
8,0.0382,0.474169,0.910957,0.912062,0.914555,0.910957
9,0.0185,0.555801,0.906706,0.908419,0.912457,0.906706
10,0.0092,0.568262,0.907434,0.909277,0.9138,0.907434



Fine-tuning with Hugging Face Trainer for BERTweet complete.


**Run with Optuna**

In [None]:
# # Make sure you have the 'optuna' and 'wandb' libraries installed and imported.

# def objective_hf_trainer(trial):
#     # --- 1. Define the Hyperparameter Search Space ---
#     learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
#     per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [32, 64, 128])
#     num_train_epochs = 10

#     # --- 2. Initialize W&B for this trial ---
#     wandb.init(
#         project="hf-trainer-tuning",
#         config={
#             "learning_rate": learning_rate,
#             "per_device_train_batch_size": per_device_train_batch_size,
#             "num_train_epochs": num_train_epochs,
#             "architecture": "BERTweet",
#             "tuning_method": "Hugging Face Trainer",
#         },
#         name=f"hf-trainer-trial_{trial.number}",
#         reinit=True
#     )

#     # --- 3. Load Model and Define Trainer ---
#     model = AutoModelForSequenceClassification.from_pretrained(
#         "vinai/bertweet-base",
#         num_labels=3
#     ).to(device)

#     # Define TrainingArguments for this specific trial
#     training_args = TrainingArguments(
#         output_dir=f"./hf_trainer_results_trial_{trial.number}",
#         eval_strategy="epoch",
#         num_train_epochs=num_train_epochs,
#         per_device_train_batch_size=per_device_train_batch_size,
#         per_device_eval_batch_size=per_device_train_batch_size, # Use same batch size for eval
#         learning_rate=learning_rate,
#         report_to="wandb",
#         load_best_model_at_end=False,
#         metric_for_best_model="accuracy",
#         greater_is_better=True,
#         save_strategy="no" # No need to save checkpoints during tuning
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=hf_train_dataset_bertweet,
#         eval_dataset=hf_val_dataset_bertweet,
#         compute_metrics=compute_metrics,
#     )

#     # --- 4. Run Training and Get Best Metric ---
#     trainer.train()
#     eval_results = trainer.evaluate()

#     # --- 5. Log final results and finish W&B run ---
#     wandb.log(eval_results)
#     wandb.finish()

#     return eval_results["eval_accuracy"]



In [None]:
# # --- 6. Run the Optuna study ---
# print("Running Optuna study for Hugging Face Trainer...")
# study_hf = optuna.create_study(direction="maximize")
# study_hf.optimize(objective_hf_trainer, n_trials=13)
# print("\nOptuna study complete.")
# print(f"Best trial parameters: {study_hf.best_trial.params}")
# print(f"Best validation accuracy: {study_hf.best_trial.value}")

[I 2025-08-18 18:25:21,670] A new study created in memory with name: no-name-351e1683-22ba-4fd9-8d2f-a18432a90d00


Running Optuna study for Hugging Face Trainer...


[34m[1mwandb[0m: Currently logged in as: [33mnogapaz98[0m ([33mnogapaz98-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6667,0.432996,0.84536,0.844243,0.845488,0.84536


[W 2025-08-18 18:37:17,170] Trial 0 failed with parameters: {'learning_rate': 1.5043780233477805e-05, 'per_device_train_batch_size': 64} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-1881323687.py", line 53, in objective_hf_trainer
    trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2238, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2587, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
[W 2025-08-18 18:37:17,173] Trial 0 failed with value None.


KeyboardInterrupt: 

We saw that only the first epoch of the first trial took 10 minutes- which means 21 hours - which we don't have. Therefore, we decided to reduce the number of trials.

**Why Reduce Number of Trials and Not Epochs:**

The primary goal of hyperparameter tuning with Optuna is to find the best combination of parameters by exploring a range of options. Each trial represents a complete training and evaluation cycle for a different set of hyperparameters.


We don't want to reduce the number of epochs cause each epoch is valuable: The validation accuracy usually improves over the first few epochs and then either plateaus or decreases due to overfitting. Reducing the number of epochs might cause the model to stop training before it reaches its best possible performance for that specific trial's hyperparameters.

In [None]:
# # --- 6. Run the Optuna study ---
# print("Running Optuna study for Hugging Face Trainer...")
# study_hf = optuna.create_study(direction="maximize")
# study_hf.optimize(objective_hf_trainer, n_trials=3)
# print("\nOptuna study complete.")
# print(f"Best trial parameters: {study_hf.best_trial.params}")
# print(f"Best validation accuracy: {study_hf.best_trial.value}")

[I 2025-08-18 19:14:29,148] A new study created in memory with name: no-name-0fde3732-dcb7-420a-ab40-474e68a74fda


Running Optuna study for Hugging Face Trainer...


0,1
eval/accuracy,▁▇█
eval/f1,▁▇█
eval/loss,█▁▂
eval/precision,▁▆█
eval/recall,▁▇█
eval/runtime,▁█▃
eval/samples_per_second,█▁▆
eval/steps_per_second,█▁▆
train/epoch,▁▁▄▅██
train/global_step,▁▁▄▅██

0,1
eval/accuracy,0.9044
eval/f1,0.90429
eval/loss,0.29503
eval/precision,0.90934
eval/recall,0.9044
eval/runtime,41.858
eval/samples_per_second,196.665
eval/steps_per_second,3.082
train/epoch,3.0
train/global_step,1545.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4867,0.410059,0.859572,0.858653,0.861277,0.859572
2,0.3324,0.326695,0.88982,0.889074,0.891537,0.88982
3,0.243,0.305537,0.901603,0.901149,0.902521,0.901603
4,0.1931,0.325957,0.905491,0.905162,0.908929,0.905491
5,0.1591,0.349931,0.908649,0.907966,0.91049,0.908649
6,0.1341,0.326383,0.90792,0.907803,0.908215,0.90792
7,0.1102,0.358598,0.90622,0.905855,0.90577,0.90622
8,0.0912,0.407272,0.906341,0.90613,0.905989,0.906341
9,0.0873,0.409956,0.906341,0.906459,0.906767,0.906341
10,0.0718,0.42813,0.90707,0.907088,0.907186,0.90707


0,1
epoch,▁
eval/accuracy,▁▅▇████████
eval/f1,▁▅▇████████
eval/loss,▇▂▁▂▄▂▄▇▇██
eval/precision,▁▅▇███▇▇▇██
eval/recall,▁▅▇████████
eval/runtime,▅▁█▆▃▅▅██▅▁
eval/samples_per_second,▄█▁▃▆▄▄▁▁▄█
eval/steps_per_second,▄█▂▃▆▄▄▁▁▄█
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.90707
eval/f1,0.90709
eval/loss,0.42813
eval/precision,0.90719
eval/recall,0.90707
eval/runtime,42.1929
eval/samples_per_second,195.104
eval/steps_per_second,6.115
eval_accuracy,0.90707


[I 2025-08-18 20:53:39,700] Trial 0 finished with value: 0.907069970845481 and parameters: {'learning_rate': 1.304929253756308e-05, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.907069970845481.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5418,0.347225,0.878401,0.878417,0.886355,0.878401


The runtime stopped in the middle after 2 hours running. We will also remove the batch size option of 32 to make it run faster:

A larger batch size processes more data at once, which reduces the total number of training steps required per epoch, leading to a faster overall training process. By removing 32, the Optuna trials will only run with batch sizes of 64 and 128, which will be significantly faster on average.

In [None]:
def objective_hf_trainer(trial):
    # --- 1. Define the Hyperparameter Search Space ---
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [ 64, 128])
    num_train_epochs = 10

    # --- 2. Initialize W&B for this trial ---
    wandb.init(
        project="hf-trainer-tuning",
        config={
            "learning_rate": learning_rate,
            "per_device_train_batch_size": per_device_train_batch_size,
            "num_train_epochs": num_train_epochs,
            "architecture": "BERTweet",
            "tuning_method": "Hugging Face Trainer",
        },
        name=f"hf-trainer-trial_{trial.number}",
        reinit=True
    )

    # --- 3. Load Model and Define Trainer ---
    model = AutoModelForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        num_labels=3
    ).to(device)

    # Define TrainingArguments for this specific trial
    training_args = TrainingArguments(
        output_dir=f"./hf_trainer_results_trial_{trial.number}",
        eval_strategy="epoch",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size, # Use same batch size for eval
        learning_rate=learning_rate,
        report_to="wandb",
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        save_strategy="no" # No need to save checkpoints during tuning
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train_dataset_bertweet,
        eval_dataset=hf_val_dataset_bertweet,
        compute_metrics=compute_metrics,
    )

    # --- 4. Run Training and Get Best Metric ---
    trainer.train()
    eval_results = trainer.evaluate()

    # --- 5. Log final results and finish W&B run ---
    wandb.log(eval_results)
    wandb.finish()

    return eval_results["eval_accuracy"]



In [None]:
# --- 6. Run the Optuna study ---
print("Running Optuna study for Hugging Face Trainer...")
study_hf = optuna.create_study(direction="maximize")
study_hf.optimize(objective_hf_trainer, n_trials=3)
print("\nOptuna study complete.")
print(f"Best trial parameters: {study_hf.best_trial.params}")
print(f"Best validation accuracy: {study_hf.best_trial.value}")

[I 2025-08-18 21:36:00,031] A new study created in memory with name: no-name-8c753201-8a8b-4f2a-8002-6b342c55b78c


Running Optuna study for Hugging Face Trainer...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.575535,0.783406,0.781666,0.791918,0.783406
2,0.620200,0.416345,0.851919,0.849919,0.853697,0.851919
3,0.620200,0.383118,0.869534,0.869151,0.873252,0.869534
4,0.318300,0.339404,0.887998,0.887452,0.889948,0.887998
5,0.318300,0.332991,0.895773,0.894845,0.896784,0.895773
6,0.224200,0.319206,0.903912,0.903327,0.904006,0.903912
7,0.224200,0.336659,0.900024,0.899314,0.902663,0.900024
8,0.173000,0.334623,0.904519,0.903962,0.904591,0.904519
9,0.173000,0.338253,0.903183,0.902376,0.903551,0.903183
10,0.145500,0.342578,0.904033,0.903355,0.904424,0.904033


0,1
epoch,▁
eval/accuracy,▁▅▆▇▇██████
eval/f1,▁▅▆▇▇██████
eval/loss,█▄▃▂▁▁▁▁▂▂▂
eval/precision,▁▅▆▇███████
eval/recall,▁▅▆▇▇██████
eval/runtime,▇▅▄▄▄▂▆▅▃█▁
eval/samples_per_second,▂▄▅▅▅▇▃▄▆▁█
eval/steps_per_second,▂▄▅▅▅▇▃▄▆▁█
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.90403
eval/f1,0.90335
eval/loss,0.34258
eval/precision,0.90442
eval/recall,0.90403
eval/runtime,42.2221
eval/samples_per_second,194.969
eval/steps_per_second,1.539
eval_accuracy,0.90403


[I 2025-08-18 23:07:58,780] Trial 0 finished with value: 0.9040330417881438 and parameters: {'learning_rate': 1.383021013359748e-05, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.9040330417881438.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.607853,0.763848,0.761321,0.765609,0.763848
2,0.662000,0.435008,0.841958,0.838329,0.847508,0.841958
3,0.662000,0.367859,0.87585,0.875017,0.877914,0.87585
4,0.331900,0.326616,0.892979,0.892439,0.893186,0.892979
5,0.331900,0.329258,0.894315,0.893359,0.895357,0.894315
6,0.236800,0.317731,0.900024,0.899353,0.900291,0.900024
7,0.236800,0.327384,0.90051,0.899775,0.902731,0.90051
8,0.182900,0.325752,0.90294,0.902108,0.903363,0.90294
9,0.182900,0.330959,0.90464,0.903914,0.905036,0.90464
10,0.154200,0.339136,0.903183,0.902309,0.903965,0.903183


0,1
epoch,▁
eval/accuracy,▁▅▇▇▇██████
eval/f1,▁▅▇▇▇██████
eval/loss,█▄▂▁▁▁▁▁▁▂▂
eval/precision,▁▅▇▇███████
eval/recall,▁▅▇▇▇██████
eval/runtime,▅▁█▁▂▁▄▂▄▅▇
eval/samples_per_second,▄█▁█▇█▅▇▅▄▂
eval/steps_per_second,▅█▁█▇█▅▇▅▅▂
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.90318
eval/f1,0.90231
eval/loss,0.33914
eval/precision,0.90396
eval/recall,0.90318
eval/runtime,42.4122
eval/samples_per_second,194.095
eval/steps_per_second,1.533
eval_accuracy,0.90318


[I 2025-08-19 00:39:55,451] Trial 1 finished with value: 0.9031827016520894 and parameters: {'learning_rate': 1.2649740817992602e-05, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.9040330417881438.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5363,0.327733,0.890792,0.890579,0.893384,0.890792
2,0.2673,0.275678,0.909378,0.908935,0.912806,0.909378
3,0.1773,0.266215,0.919218,0.918506,0.920159,0.919218
4,0.1264,0.2669,0.917153,0.917771,0.919094,0.917153
5,0.0995,0.289135,0.922255,0.921885,0.92312,0.922255
6,0.0768,0.304069,0.917881,0.918094,0.918589,0.917881
7,0.0548,0.364325,0.918003,0.918084,0.918483,0.918003
8,0.0407,0.400726,0.916667,0.917217,0.918191,0.916667
9,0.0261,0.476065,0.913387,0.914322,0.916286,0.913387
10,0.0171,0.467258,0.918246,0.919007,0.920516,0.918246


0,1
epoch,▁
eval/accuracy,▁▅▇▇█▇▇▇▆▇▇
eval/f1,▁▅▇▇█▇▇▇▆▇▇
eval/loss,▃▁▁▁▂▂▄▅███
eval/precision,▁▆▇▇█▇▇▇▆▇▇
eval/recall,▁▅▇▇█▇▇▇▆▇▇
eval/runtime,▄▄█▅▄▁▆▄▆▅▅
eval/samples_per_second,▅▅▁▄▅█▃▅▃▄▄
eval/steps_per_second,▅▅▁▃▅█▃▅▃▃▄
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.91825
eval/f1,0.91901
eval/loss,0.46726
eval/precision,0.92052
eval/recall,0.91825
eval/runtime,43.1484
eval/samples_per_second,190.783
eval/steps_per_second,2.99
eval_accuracy,0.91825


[I 2025-08-19 02:15:00,013] Trial 2 finished with value: 0.918245869776482 and parameters: {'learning_rate': 4.899898884731738e-05, 'per_device_train_batch_size': 64}. Best is trial 2 with value: 0.918245869776482.



Optuna study complete.
Best trial parameters: {'learning_rate': 4.899898884731738e-05, 'per_device_train_batch_size': 64}
Best validation accuracy: 0.918245869776482


Save to drive (checkpoint)

*Note: We accidentely ran the cells again after runtime has disconnected- so we got an error.
We leave these cells so you see we saved it and also so you could run it if running the train again.

In [None]:
# Define the directory to save the hyperparameters file
HYPERPARAMS_DIR = os.path.join(BASE_DIR, "hyperparams")
os.makedirs(HYPERPARAMS_DIR, exist_ok=True)
BEST_BERTWEET_PARAMS_FILE = os.path.join(HYPERPARAMS_DIR, "best_bertweet_hyperparams.json")

# Extract and save the best parameters from the Optuna study
best_params_bertweet = study_hf.best_trial.params

with open(BEST_BERTWEET_PARAMS_FILE, 'w') as f:
    json.dump(best_params_bertweet, f, indent=4)

print(f"Best BERTweet hyperparameters saved to: {BEST_BERTWEET_PARAMS_FILE}")

NameError: name 'study_hf' is not defined

In [None]:
FINAL_MODEL_DIR_BERTWEET = os.path.join(BASE_DIR, "final_models", "bertweet")

# Save the model and tokenizer to a directory in your Drive
final_trainer_bertweet.save_model(FINAL_MODEL_DIR_BERTWEET)
bertweet_tokenizer.save_pretrained(FINAL_MODEL_DIR_BERTWEET)

print(f"Final fine-tuned BERTweet model saved to: {FINAL_MODEL_DIR_BERTWEET}")

NameError: name 'final_trainer_bertweet' is not defined

The saving didn't work (the runtime disconnected in the middle). We will test the reslts of each model and compare between them. Then, we will choose the best 2 models and compress them.
If this model (BERTweet finetuned with HF) is one of the best models- we will extract the best parameters and will fine tune it with the best parameters again (it saved only the hyperparametes and disconnected at the next cell)- then we will save it and can use it later to compress it.

**DistilBERT**

In [None]:

# The Trainer expects a Hugging Face Dataset object.
# We convert our tokenized encodings and pandas Series to this format.
hf_train_dataset_distilbert = HFDataset.from_dict({
    'input_ids': distilbert_train_encodings['input_ids'],
    'attention_mask': distilbert_train_encodings['attention_mask'],
    'labels': train_df_distilbert['label'].tolist()
})

hf_val_dataset_distilbert = HFDataset.from_dict({
    'input_ids': val_distilbert_encodings['input_ids'],
    'attention_mask': val_distilbert_encodings['attention_mask'],
    'labels': val_df_distilbert['label'].tolist()
})

print("Hugging Face Datasets created successfully for DistilBERT.")

Hugging Face Datasets created successfully for DistilBERT.


Finetune without using Optuna (Toy Model)

In [None]:

training_args = TrainingArguments(
    output_dir="./hf_trainer_results_distilbert",
    eval_strategy="epoch",              # <-- older versions expect this
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./hf_trainer_logs_distilbert",
    logging_steps=100,
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_strategy="epoch",
    save_total_limit=1,
)

distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_LABELS
).to(device)

trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=hf_train_dataset_distilbert,
    eval_dataset=hf_val_dataset_distilbert,
    compute_metrics=compute_metrics,
)

print("\nStarting DistilBERT fine-tuning (HF Trainer)...")
trainer.train()
print("\nDone. DistilBERT eval:", trainer.evaluate())


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting DistilBERT fine-tuning (HF Trainer)...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3383,0.303833,0.895408,0.894866,0.895983,0.895408
2,0.2385,0.279941,0.909864,0.909025,0.911076,0.909864
3,0.1396,0.305041,0.909985,0.909622,0.9114,0.909985
4,0.1022,0.33791,0.913508,0.913638,0.913794,0.913508



Done. DistilBERT eval: {'eval_loss': 0.3379097878932953, 'eval_accuracy': 0.9135082604470359, 'eval_f1': 0.9136380881194262, 'eval_precision': 0.9137941173387312, 'eval_recall': 0.9135082604470359, 'eval_runtime': 22.2874, 'eval_samples_per_second': 369.356, 'eval_steps_per_second': 11.576, 'epoch': 4.0}


Finetune Using Optuna

In [None]:
def objective_distilbert_hf_trainer(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [64, 128])
    num_train_epochs = 10

    wandb.init(
        project="hf-trainer-distilbert",
        config={
            "learning_rate": learning_rate,
            "per_device_train_batch_size": per_device_train_batch_size,
            "num_train_epochs": num_train_epochs,
            "architecture": "DistilBERT",
            "tuning_method": "Hugging Face Trainer",
        },
        name=f"hf-trainer-distilbert-trial_{trial.number}",
        reinit=True
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=3
    ).to(device)

    y_train = np.array(train_df_distilbert['label'].tolist(), dtype=int)
    classes = np.array(sorted(np.unique(y_train)))
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = torch.tensor(weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    training_args = TrainingArguments(
        output_dir=f"./hf_trainer_distilbert_results_trial_{trial.number}",
        eval_strategy="epoch",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        report_to="wandb",
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        save_strategy="no",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train_dataset_distilbert,
        eval_dataset=hf_val_dataset_distilbert,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()

    wandb.log(eval_results)
    wandb.finish()

    return eval_results["eval_accuracy"]

In [None]:

# --- 4. Run the Optuna study ---
print("Running Optuna study for DistilBERT with Hugging Face Trainer...")
study_distilbert_hf = optuna.create_study(direction="maximize")
study_distilbert_hf.optimize(objective_distilbert_hf_trainer, n_trials=3)
print("\nOptuna study complete.")
print(f"Best trial parameters: {study_distilbert_hf.best_trial.params}")
print(f"Best validation accuracy: {study_distilbert_hf.best_trial.value}")

[I 2025-08-19 06:41:21,199] A new study created in memory with name: no-name-81e67889-3b60-437d-8df2-cb225f45be97


Running Optuna study for DistilBERT with Hugging Face Trainer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.475723,0.820943,0.820678,0.83337,0.820943
2,0.510100,0.350946,0.876336,0.87552,0.877562,0.876336
3,0.510100,0.310938,0.89395,0.89351,0.894444,0.89395
4,0.244000,0.31634,0.899052,0.898747,0.89924,0.899052
5,0.244000,0.326855,0.902697,0.902087,0.902809,0.902697
6,0.151500,0.34034,0.901482,0.900794,0.901409,0.901482
7,0.151500,0.359673,0.900632,0.900276,0.900462,0.900632
8,0.106600,0.36829,0.894679,0.89477,0.894956,0.894679
9,0.106600,0.384328,0.894679,0.894995,0.895556,0.894679
10,0.080600,0.39374,0.895651,0.895657,0.895677,0.895651


0,1
epoch,▁
eval/accuracy,▁▆▇████▇▇▇▇
eval/f1,▁▆▇████▇▇▇▇
eval/loss,█▃▁▁▂▂▃▃▄▅▅
eval/precision,▁▅▇████▇▇▇▇
eval/recall,▁▆▇████▇▇▇▇
eval/runtime,▃▆██▄▃▂▂▄▁▃
eval/samples_per_second,▆▃▁▁▅▆▇▇▅█▆
eval/steps_per_second,▆▃▁▁▅▆▇▇▅█▆
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.89565
eval/f1,0.89566
eval/loss,0.39374
eval/precision,0.89568
eval/recall,0.89565
eval/runtime,21.8792
eval/samples_per_second,376.248
eval/steps_per_second,2.971
eval_accuracy,0.89565


[I 2025-08-19 07:26:30,242] Trial 0 finished with value: 0.8956511175898931 and parameters: {'learning_rate': 2.34317423559625e-05, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.8956511175898931.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.492489,0.814504,0.813932,0.823141,0.814504
2,0.549900,0.367527,0.867833,0.867089,0.8682,0.867833
3,0.549900,0.325955,0.888605,0.887886,0.889231,0.888605
4,0.268000,0.324936,0.892979,0.892804,0.892832,0.892979
5,0.268000,0.326515,0.897473,0.896857,0.897749,0.897473
6,0.174300,0.338155,0.897109,0.896165,0.897899,0.897109
7,0.174300,0.353406,0.896501,0.895925,0.896621,0.896501
8,0.126700,0.366804,0.894922,0.894515,0.894457,0.894922
9,0.126700,0.384643,0.892979,0.89287,0.892812,0.892979
10,0.097600,0.390733,0.893829,0.893503,0.89338,0.893829


0,1
epoch,▁
eval/accuracy,▁▅▇████████
eval/f1,▁▅▇████████
eval/loss,█▃▁▁▁▂▂▃▃▄▄
eval/precision,▁▅▇████████
eval/recall,▁▅▇████████
eval/runtime,▂▁▄▆▂▃▂▇▄█▃
eval/samples_per_second,▇█▅▃▇▆▇▂▅▁▆
eval/steps_per_second,▇█▅▃▇▆▇▂▅▁▆
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.89383
eval/f1,0.8935
eval/loss,0.39073
eval/precision,0.89338
eval/recall,0.89383
eval/runtime,21.8708
eval/samples_per_second,376.393
eval/steps_per_second,2.972
eval_accuracy,0.89383


[I 2025-08-19 08:11:37,531] Trial 1 finished with value: 0.8938289601554907 and parameters: {'learning_rate': 2.03342834585215e-05, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.8956511175898931.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.411424,0.847668,0.847606,0.854591,0.847668
2,0.471300,0.311478,0.892614,0.891912,0.894254,0.892614
3,0.471300,0.28954,0.904397,0.903966,0.904887,0.904397
4,0.190600,0.297515,0.905248,0.904978,0.90552,0.905248
5,0.190600,0.338712,0.906584,0.905848,0.906999,0.906584
6,0.107900,0.344253,0.899295,0.899611,0.900319,0.899295
7,0.107900,0.384518,0.898445,0.898969,0.900295,0.898445
8,0.069500,0.39961,0.895165,0.895852,0.897051,0.895165
9,0.069500,0.441154,0.891521,0.892856,0.895758,0.891521
10,0.041700,0.446114,0.893707,0.894645,0.896401,0.893707


0,1
epoch,▁
eval/accuracy,▁▆███▇▇▇▆▆▆
eval/f1,▁▆███▇▇▇▆▇▇
eval/loss,▆▂▁▁▃▃▅▆███
eval/precision,▁▆███▇▇▇▆▇▇
eval/recall,▁▆███▇▇▇▆▆▆
eval/runtime,▃▁▄▄▅▅▄█▁▆▁
eval/samples_per_second,▆█▅▅▄▄▅▁█▃█
eval/steps_per_second,▆█▅▅▄▄▅▁█▃█
eval_accuracy,▁

0,1
epoch,10.0
eval/accuracy,0.89371
eval/f1,0.89465
eval/loss,0.44611
eval/precision,0.8964
eval/recall,0.89371
eval/runtime,21.8215
eval/samples_per_second,377.243
eval/steps_per_second,2.979
eval_accuracy,0.89371


[I 2025-08-19 08:56:41,669] Trial 2 finished with value: 0.8937074829931972 and parameters: {'learning_rate': 3.6924719336468674e-05, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.8956511175898931.



Optuna study complete.
Best trial parameters: {'learning_rate': 2.34317423559625e-05, 'per_device_train_batch_size': 128}
Best validation accuracy: 0.8956511175898931


Save to drive (checkpoint)

In [None]:

# Define the directory to save the hyperparameters file
HYPERPARAMS_DIR = os.path.join(BASE_DIR, "hyperparams")
os.makedirs(HYPERPARAMS_DIR, exist_ok=True)
BEST_DISTILBERT_PARAMS_FILE = os.path.join(HYPERPARAMS_DIR, "best_distilbert_hyperparams.json")

# Extract and save the best parameters from the Optuna study
best_params_distilbert = study_distilbert_hf.best_trial.params

with open(BEST_DISTILBERT_PARAMS_FILE, 'w') as f:
    json.dump(best_params_distilbert, f, indent=4)

print(f"Best DistilBERT hyperparameters saved to: {BEST_DISTILBERT_PARAMS_FILE}")

Best DistilBERT hyperparameters saved to: /content/drive/MyDrive/ADV_DL/hyperparams/best_distilbert_hyperparams.json


In [None]:
# Assuming study_distilbert_hf is your completed Optuna study for DistilBERT
if 'study_distilbert_hf' not in locals():
    print("Error: Optuna study for DistilBERT not found. Please run the Optuna study cell first.")
else:
    # Get the directory of the best trial's checkpoint
    best_trial = study_distilbert_hf.best_trial
    best_trial_output_dir = f"./hf_trainer_distilbert_results_trial_{best_trial.number}"

    # Define the final save directory on Google Drive
    FINAL_MODEL_DIR_DISTILBERT = os.path.join(BASE_DIR, "final_models", "distilbert_hf_trainer")
    os.makedirs(FINAL_MODEL_DIR_DISTILBERT, exist_ok=True)

    # Load the model from the best trial's output directory and save it
    try:
        # Load the model from the final state of the best trial
        final_distilbert_model = AutoModelForSequenceClassification.from_pretrained(best_trial_output_dir)
        final_distilbert_model.to(device) # Ensure model is on the correct device

        # Load the DistilBERT tokenizer and save it as well
        distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        final_distilbert_model.save_pretrained(FINAL_MODEL_DIR_DISTILBERT)
        distilbert_tokenizer.save_pretrained(FINAL_MODEL_DIR_DISTILBERT)

        print(f"Final fine-tuned DistilBERT model saved to: {FINAL_MODEL_DIR_DISTILBERT}")

    except Exception as e:
        print(f"Error loading or saving the best model from trial {best_trial.number}'s output directory: {e}")
        print(f"Attempted to load from: {best_trial_output_dir}")
        print("Please ensure the Optuna study completed successfully and verify the contents of the output directory.")

Error loading or saving the best model from trial 0's output directory: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./hf_trainer_distilbert_results_trial_0.
Attempted to load from: ./hf_trainer_distilbert_results_trial_0
Please ensure the Optuna study completed successfully and verify the contents of the output directory.


# Comparison

# "Full Code" Fine-Tuning - Model Comparison: DistilBERT vs. BERTweet

---
First, we wanted to check the entire process of the model runs, Trying to analyze it to see whether we achieve any helpful information about our models.
## 1. Aggregate Statistics Across Runs

| Model      | Avg. Val. Acc | Median Val. Acc | Std Dev (Val. Acc) | Avg. Val. F1 | Median Val. F1 | Std Dev (Val. F1) |
|------------|---------------|-----------------|---------------------|--------------|----------------|-------------------|
| DistilBERT | **0.832**     | 0.806           | 0.043               | **0.832**    | 0.807          | 0.043             |
| BERTweet   | 0.829         | 0.830           | 0.010               | 0.828        | 0.830          | 0.010             |

**Interpretation**  
- **DistilBERT** shows higher **average accuracy/F1 (~83.2%)**, but with higher variance (std ~4.3%).  
- **BERTweet** achieves slightly lower **average accuracy (~82.9%)**, but with much **tighter variance (std ~1%)**, indicating stable convergence across trials.  
- DistilBERT may benefit more from careful hyperparameter tuning, while BERTweet provides robustness across different trials.

---
Then, we wanted to investigate our peaks, the best trial of each model, to achieve broader perspective on our model results.

## 2. Best Trial per Model

### DistilBERT – Best Trial (Trial 1)
 **Validation Accuracy:** 0.887  
 **Validation F1:** 0.886  
 **Precision:** 0.874  
 **Recall:** 0.887  
 **Train Accuracy:** 0.978  
 **Train Loss:** 0.067  

### BERTweet – Best Trial (Trial 0)
 **Validation Accuracy:** 0.844  
 **Validation F1:** 0.843  
 **Precision:** 0.844  
 **Recall:** 0.844  
 **Train Accuracy:** 0.935  
 **Train Loss:** 0.191  

**Comparison & Insights**  
- DistilBERT outperforms BERTweet in **raw metrics** (Acc/F1 ≈ 88.7% vs. 84.4%).  
- DistilBERT shows **heavier overfitting** (train acc 97.8% vs val acc 88.7%), while BERTweet generalizes slightly smoother (train acc 93.5% vs val acc 84.4%).  
- DistilBERT can reach higher peak performance but requires **strong regularization**.  
- BERTweet, although weaker in ceiling performance, is **less prone to overfitting** and could generalize better to noisy domains, which fits our intuition of it being a tweet-reading model (tweets).

---

## 3. Best Hyperparameters

### DistilBERT (Best Trial)
  learning_rate: 0.000108,
  weight_decay: 3.32e-05,
  patience: 7,
  batch_size: 32,
  num_layers: 4

### BERTweet (Best Trial)
  learning_rate: 0.000507,
  weight_decay: 4.38e-05,
  patience: 6,
  batch_size: 128,
  num_layers: 3

  **Interpretation of Hyperparameters**

**Learning rate**:

DistilBERT → **1.08e-4.**
DistilBERT is a lighter, distilled version of BERT with fewer parameters.
Smaller LR suggests the model required gentler, more stable updates. Large updates may have disrupted its compressed architecture, which is already tuned to general-domain patterns.This indicates that DistilBERT’s pretrained weights are sensitive: it “remembers” its distilled knowledge and fine-tunes best when nudged slowly.

BERTweet → **5.07e-4.** - larger then DistilBERT.
BERTweet is pretrained on noisy Twitter data. Its embeddings are well adapted to slang, hashtags, emojis. A larger LR here helps shake the model out of its strong priors and adapt faster to your specific labeled dataset (COVID-related sentiment).

Coclusion: BERTweet was robust enough to tolerate aggressive updates, which matches its need to “unlearn some noise” and realign with your labels. Smaller models (DistilBERT) are fragile and benefit from conservative updates. Larger, domain-pretrained models (BERTweet) can absorb bigger steps, but only when batch size is also large.

**Batch size**: DistilBERT → **32**

Small batch size = noisier gradient estimates, but that noise can regularize training and prevent overfitting on a small dataset.
DistilBERT seems to thrive with this noise, possibly because it prevents the model from collapsing into oversimplified decision boundaries.

BERTweet → **128**

Larger batch size smooths the gradient estimate, making learning more stable.
With ~135M parameters, BERTweet likely needs the stability of larger batches to prevent noisy updates from pushing weights in conflicting directions.
This explains why BERTweet’s variance across trials was high when batches were small, but its best run was with the largest batch.
Model size and pretraining corpus scale interact with batch size.
Small, distilled models: benefit from noise in the gradients (smaller batches).
Large, specialized models: require stability (larger batches).

**Layers**: DistilBERT best run stacked **4 layers**, BERTweet stabilized at **3 layers**.  

---

### Key Takeaways
**1. Comparison of Performance**  
Our initial intuition was that BERTweet performance was going to beat DistilBERT's performance. We fear that maybe DistilBERT may benefit from its distilled nature - learning faster on small datasets. BERTweet requires longer training or stronger regularization to fully shine. Additionally, BERTweet’s large capacity (135M parameters) may need more data or stronger regularization to fully leverage its pretraining. DistilBERT, being smaller (~66M params), may generalize better with limited labeled data - avoiding overfitting and showing steadier validation curves in some trials.

**2. Performance vs. Stability**  
DistilBERT achieves **higher peak performance (~88.7%)** but is **less stable** across trials.  
BERTweet is **more consistent (~83% ±1%)**, making it safer when robustness matters.  

**3. Overfitting Risk**  
DistilBERT shows **stronger overfitting patterns**, requiring **regularization**.  
BERTweet generalizes **more smoothly**, making it reliable for noisy, short-text data.  

**4. Hyperparameter Sensitivity**  
DistilBERT requires **careful tuning (low LR, small batch)**.  
BERTweet tolerates **larger LR and batch sizes**, making training more forgiving.  


## What could we do to improve the results?

We didn't rerun improvements cause the project already consumed significant compute time (several days).

Improvements we would do if we had time and plenty of GPU:

1.  **Change the Max Length Tokenizer we have set:**
  - **What we did:** We have set a data-driven cap (p99 + buffer) using a json file from part A and padded everything to that cap.
  - **What we should have done:** we should have change the padding to dynamic padding instead of a "fixed" number. Also, we should have choose different padding for each model which we haven't (we wanted to keep things simple at first, but then we understood we won't be able to run this again).

2.  **Increase the number of epochs:**
  - We would increase the number of epochs to at least 20 to give the model more chance to learn as at each epoch, the model "sees" every training example again, so if there's a underfitting (the model hasn't fully fit the patterns yet), maybe a few extra passes would have reduce the training loss and MAYBE lift validation accuracy (if we wouldn't pass the "sweetspot" of the tradeoff between low training loss to low val loss).

3. **Increase the number of trials and add a smaller batch size**
  - We would increase the number of trials and add a smaller batch size to cover more combinations, which means higher chance to get a better hyper parameters --> better model.
  - We could add a smaller batch size to help with generalization (we can call it even a type of "regularization").









# Fine-Tuned Model Comparison

We evaluated four model variants (BERTweet and DistilBERT), each trained with two approaches: Hugging Face Trainer and a custom "Full Code" pipeline.  
The following table summarizes their best runs:

| Model Variant                  | Train Loss | Val. Loss | Best Val. Accuracy | Overfitting Tendency                |
|--------------------------------|------------|-----------|--------------------|-------------------------------------|
| **BERTweet w/ Hugging Face**   | 0.1455     | 0.3426    | **0.9040**         | Very low gap → Low overfitting      |
| **DistilBERT w/ Hugging Face** | 0.0806     | 0.3937    | 0.8957             | Larger gap → Moderate overfitting   |
| **DistilBERT w/ Full Code**    | 0.0671     | 0.5823    | 0.8867             | Big gap → High overfitting          |
| **BERTweet w/ Full Code**      | 0.1915     | 0.4915    | 0.8439             | Noticeable gap → Moderate overfitting |

---

## Selection of the Two Best Models

1. **BERTweet w/ Hugging Face Trainer**  
   - **Best accuracy (90.4%)** and lowest validation loss.  
   - Smallest train–val gap → most robust generalization.  
   - Excellent F1, precision, and recall.

2. **DistilBERT w/ Hugging Face Trainer**  
   - **Second-best accuracy (89.6%)**.  
   - Train loss is very low (0.0806), but the larger train–val gap indicates more overfitting risk than BERTweet.  
   - Still strong, and more efficient for deployment.

---

## Final Conclusion

The **two chosen models** are:  
- **BERTweet (Hugging Face Trainer)** → Best performing overall, strongest generalization.  
- **DistilBERT (Hugging Face Trainer)** → Competitive accuracy, lighter model for production efficiency.  

Together, they balance **accuracy** and **efficiency**, making them the best candidates.


# **Test Run**

**Eval + Utilities Setup**

Sets W&B config, derives max sequence lengths from length_stats.json, validation checks for text/label columns, and builds tokenized Hugging Face Datasets. Includes a collate for padding, a forward_pass to get probs/preds/loss in eval mode, gives confusion matrix, one-vs-rest ROC.

In [None]:
WANDB_PROJECT = "adv-dl-sentiment"   # TODO: set
WANDB_ENTITY  = None                 # or your entity
WANDB_TAGS    = ["test-eval", "multiclass"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_max_len(default_bert=96, default_roberta=96):
    path = globals().get("LENGTH_STATS_FILE") or (globals().get("BASE_DIR") and os.path.join(BASE_DIR,"length_stats.json"))
    if path and os.path.exists(path):
        try:
            with open(path,"r") as f: s = json.load(f)
            return int(s.get("bert",{}).get("overall",{}).get("p99",default_bert)), \
                   int(s.get("roberta",{}).get("overall",{}).get("p99",default_roberta))
        except Exception: pass
    return default_bert, default_roberta

CAND_TEXT_COLS = ["text","clean_text","OriginalTweet","Tweet","tweet","original_tweet"]
def pick_text_col(df):
    for c in CAND_TEXT_COLS:
        if c in df.columns: return c
    obj = [c for c in df.columns if df[c].dtype=="object"]
    if obj: return obj[0]
    raise ValueError("No text column found.")

def ensure_label_col(df):
    if "label" in df.columns: return df.rename(columns={"label":"label"})
    for alt in ["labels","sentiment_std","SentimentId","y","target"]:
        if alt in df.columns: return df.rename(columns={alt:"label"})
    raise ValueError("No label column; need ints {0,1,2}.")

def build_hf_dataset(df, tokenizer, max_length, text_col):
    tmp = df[[text_col, "label"]].dropna().rename(columns={text_col: "text"})
    ds  = HFDataset.from_pandas(tmp, preserve_index=False)   # <-- change here

    def _tok(batch):
        enc = tokenizer(batch["text"], padding=False, truncation=True, max_length=max_length)
        return enc
    ds = ds.map(_tok, batched=True, remove_columns=["text"])
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return ds

def make_collate(tokenizer):
    pad_id = tokenizer.pad_token_id or 0
    def collate(batch):
        keys = {k for b in batch for k in b}
        out = {}
        for k in keys:
            arr = [b[k] for b in batch]
            if k == "labels":
                out[k] = torch.tensor(arr)
            else:
                out[k] = torch.nn.utils.rnn.pad_sequence(
                    [torch.tensor(v) for v in arr], batch_first=True, padding_value=pad_id
                )
        return out
    return collate

def forward_pass(model, loader, num_labels=3, compute_loss=True):
    model.eval()
    logits_all, labels_all, ce = [], [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k:v.to(device) for k,v in batch.items()}
            out   = model(input_ids=batch["input_ids"], attention_mask=batch.get("attention_mask"))
            logits_all.append(out.logits.detach().cpu())
            labels_all.append(batch["labels"].detach().cpu())
            if compute_loss:
                ce.append(torch.nn.functional.cross_entropy(out.logits, batch["labels"]).item())
    logits = torch.cat(logits_all, dim=0) if logits_all else torch.empty((0,num_labels))
    labels = torch.cat(labels_all, dim=0) if labels_all else torch.empty((0,),dtype=torch.long)
    probs  = torch.softmax(logits, dim=-1).numpy() if logits.numel() else np.zeros((0,num_labels))
    preds  = probs.argmax(-1) if probs.size else np.array([],dtype=int)
    loss   = float(np.mean(ce)) if ce else float("nan")
    return probs, preds, labels.numpy(), loss

def plot_conf_mat(y_true, y_pred, labels, title, save_path):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig = plt.figure(figsize=(5.2,4.4), dpi=140)
    ax  = fig.add_subplot(111)
    im  = ax.imshow(cm, interpolation="nearest")
    ax.set_title(title); ax.set_xlabel("Predicted"); ax.set_ylabel("True")
    ax.set_xticks(range(len(labels))); ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels); ax.set_yticklabels(labels)
    for i in range(len(labels)):
        for j in range(len(labels)):
            ax.text(j, i, cm[i,j], ha="center", va="center")
    fig.tight_layout(); fig.savefig(save_path, bbox_inches="tight"); plt.close(fig)
    return cm

def plot_roc_ovr(y_true, probs, labels, title, save_path):
    fig = plt.figure(figsize=(5.8,4.8), dpi=140)
    ax  = fig.add_subplot(111)
    for i, lab in enumerate(labels):
        y_bin = (y_true==lab).astype(int)
        if len(np.unique(y_bin)) < 2: continue
        fpr, tpr, _ = roc_curve(y_bin, probs[:, i])
        ax.plot(fpr, tpr, label=f"class {lab}")
    ax.plot([0,1],[0,1],"--",linewidth=1)
    ax.set_title(title); ax.set_xlabel("FPR"); ax.set_ylabel("TPR"); ax.legend(fontsize=8, loc="lower right")
    fig.tight_layout(); fig.savefig(save_path, bbox_inches="tight"); plt.close(fig)

Get test data

In [None]:
# validation check: Uses existing variables if present; otherwise reads from your paths.
def _get_df(obj_name, path_name):
    if obj_name in globals(): return globals()[obj_name].copy()
    p = globals().get(path_name)
    if p and os.path.exists(p):
        try: return pd.read_excel(p)
        except Exception: return pd.read_csv(p, encoding="ISO-8859-1")
    return None

df_distilbert_test = _get_df("df_distilbert_test", "DISTILBERT_TEST_FILE")
df_bertweet_test   = _get_df("df_bertweet_test",   "BERTWEET_TEST_FILE")
if df_distilbert_test is None and df_bertweet_test is None:
    raise RuntimeError("No test data found. Ensure df_*_test exist or *_TEST_FILE paths are valid.")

dfd = ensure_label_col(df_distilbert_test) if df_distilbert_test is not None else None
dfb = ensure_label_col(df_bertweet_test)   if df_bertweet_test   is not None else None
text_col_d = pick_text_col(dfd) if dfd is not None else None
text_col_b = pick_text_col(dfb) if dfb is not None else None
MAXLEN_BERT, MAXLEN_ROBERTA = get_max_len()

os.makedirs("eval_plots", exist_ok=True)


Evaluateion function

picks the best HF directory per model using trainer_state.json.

In [None]:
# Base directory
if "BASE_DIR" not in globals():
    BASE_DIR = "."

def _is_hf_dir(d):
    return d and os.path.isfile(os.path.join(d, "config.json")) and (
        os.path.isfile(os.path.join(d, "model.safetensors")) or
        os.path.isfile(os.path.join(d, "pytorch_model.bin"))
    )

def pick_best_checkpoint(root_dir):
    """Return a loadable HF dir: best recorded in trainer_state.json, else highest-step checkpoint, else root (if loadable)."""
    if not root_dir or not os.path.isdir(root_dir):
        return None
    ts = os.path.join(root_dir, "trainer_state.json")
    if os.path.exists(ts):
        try:
            with open(ts, "r") as f:
                j = json.load(f)
            b = j.get("best_model_checkpoint")
            if _is_hf_dir(b):
                return b
        except Exception:
            pass
    cands = sorted(glob.glob(os.path.join(root_dir, "checkpoint-*")))
    if cands:
        def _step(p):
            try: return int(os.path.basename(p).split("-")[-1])
            except: return -1
        best = max(cands, key=_step)
        if _is_hf_dir(best):
            return best
    return root_dir if _is_hf_dir(root_dir) else None

# Folders as in your Drive screenshots
HF_DISTILBERT_ROOT = os.path.join(BASE_DIR, "hf_best", "distilbert")   # has checkpoint-500..2580 + model.safetensors
HF_BERTWEET_ROOT   = os.path.join(BASE_DIR, "hf_best", "bertweet")     # has checkpoint-5150 + model.safetensors
FM_DISTILBERT_DIR  = os.path.join(BASE_DIR, "final_models", "distilbert_full_code")  # full saved HF dir
FM_BERTWEET_DIR    = os.path.join(BASE_DIR, "final_models", "bertweet_full_code")    # full saved HF dir

# Pick the best available directories
DISTILBERT_HF_DIR = pick_best_checkpoint(HF_DISTILBERT_ROOT) or (_is_hf_dir(FM_DISTILBERT_DIR) and FM_DISTILBERT_DIR) or None
BERTWEET_HF_DIR   = pick_best_checkpoint(HF_BERTWEET_ROOT)   or (_is_hf_dir(FM_BERTWEET_DIR)   and FM_BERTWEET_DIR)   or None

print("Resolved checkpoints:")
print(" - DistilBERT (HF):", DISTILBERT_HF_DIR)
print(" - BERTweet  (HF):", BERTWEET_HF_DIR)

# What your evaluator expects:
MODEL_DIRS = {
    "distilbert_hf": DISTILBERT_HF_DIR,
    "bertweet_hf":   BERTWEET_HF_DIR,
}
MODEL_FILES = {}  # no raw .pt files needed since you saved full HF dirs


Resolved checkpoints:
 - DistilBERT (HF): /content/drive/MyDrive/ADV_DL/hf_best/distilbert/checkpoint-2580
 - BERTweet  (HF): /content/drive/MyDrive/ADV_DL/hf_best/bertweet/checkpoint-5150


**Define Final Model Directories**

We set up variables that point to the saved model directories for evaluation:  
- **HF-Trainer checkpoints** (DistilBERT & BERTweet).  
- **Full-code checkpoints** (DistilBERT & BERTweet).  


In [None]:
# Cell C — feed paths into your evaluation block variables

# Requires Cell B to have defined these:
#   DISTILBERT_HF_DIR, BERTWEET_HF_DIR, FM_DISTILBERT_DIR, FM_BERTWEET_DIR

FINAL_MODEL_DIR_DISTILBERT       = DISTILBERT_HF_DIR         # HF-Trainer DistilBERT (best ckpt or root)
FINAL_MODEL_DIR_BERTWEET         = BERTWEET_HF_DIR           # HF-Trainer BERTweet  (best ckpt or root)
FINAL_MODEL_DIR_DISTILBERT_FULL  = FM_DISTILBERT_DIR         # “full_code” final HF dir
FINAL_MODEL_DIR_BERTWEET_FULL    = FM_BERTWEET_DIR           # “full_code” final HF dir

# Optional: quick sanity prints (helps if anything still skips)
print("FINAL_MODEL_DIR_DISTILBERT      =", FINAL_MODEL_DIR_DISTILBERT)
print("FINAL_MODEL_DIR_BERTWEET        =", FINAL_MODEL_DIR_BERTWEET)
print("FINAL_MODEL_DIR_DISTILBERT_FULL =", FINAL_MODEL_DIR_DISTILBERT_FULL)
print("FINAL_MODEL_DIR_BERTWEET_FULL   =", FINAL_MODEL_DIR_BERTWEET_FULL)


FINAL_MODEL_DIR_DISTILBERT      = /content/drive/MyDrive/ADV_DL/hf_best/distilbert/checkpoint-2580
FINAL_MODEL_DIR_BERTWEET        = /content/drive/MyDrive/ADV_DL/hf_best/bertweet/checkpoint-5150
FINAL_MODEL_DIR_DISTILBERT_FULL = /content/drive/MyDrive/ADV_DL/final_models/distilbert_full_code
FINAL_MODEL_DIR_BERTWEET_FULL   = /content/drive/MyDrive/ADV_DL/final_models/bertweet_full_code


### Evaluation & W&B

steps:
- **Loads** the trained model and tokenizer
- **Builds a test dataset and DataLoader**   
- Runs a forward pass to compute:  
  - Accuracy, Precision, Recall, F1 (weighted).  
  - Cross-Entropy loss (test + post-hoc train/val).  
  - ROC-AUC (OvR & OvO).  
- **Logs results (W&B)** metrics, plots, and a per-class classification report.  
- **Prints summary** of results and runtime stats


In [None]:
# === Single-model evaluate + W&B log (compact) ===
def evaluate_and_log(model_label, model_dir):
    if not model_dir or not os.path.isdir(model_dir):
        print(f"skip: {model_label} (dir missing) -> {model_dir}"); return None

    tok  = AutoTokenizer.from_pretrained(model_dir, use_fast=False)
    mdl  = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
    is_rob = ("roberta" in tok.__class__.__name__.lower()) or ("bertweet" in mdl.name_or_path.lower())

    use_df  = dfb if is_rob and dfb is not None else dfd
    txtcol  = text_col_b if is_rob and text_col_b is not None else text_col_d
    max_len = MAXLEN_ROBERTA if is_rob else MAXLEN_BERT
    if use_df is None:
        use_df, txtcol, max_len = (dfb, text_col_b, MAXLEN_ROBERTA) if dfb is not None else (dfd, text_col_d, MAXLEN_BERT)

    ds_test = build_hf_dataset(use_df, tok, max_len, txtcol)
    loader  = DataLoader(ds_test, batch_size=64, shuffle=False, collate_fn=make_collate(tok))

    t0 = time.time()
    probs, y_pred, y_true, test_ce = forward_pass(mdl, loader, num_labels=mdl.config.num_labels, compute_loss=True)
    spb = (time.time() - t0) / max(1, len(loader))

    acc  = accuracy_score(y_true, y_pred) if y_true.size else float("nan")
    f1w  = f1_score(y_true, y_pred, average="weighted", zero_division=0) if y_true.size else float("nan")
    pw   = precision_score(y_true, y_pred, average="weighted", zero_division=0) if y_true.size else float("nan")
    rw   = recall_score(y_true, y_pred, average="weighted", zero_division=0) if y_true.size else float("nan")

    roc_ovr = roc_ovo = float("nan")
    if y_true.size and probs.shape[0]==y_true.shape[0]:
        try: roc_ovr = roc_auc_score(y_true, probs, multi_class="ovr")
        except Exception: pass
        try: roc_ovo = roc_auc_score(y_true, probs, multi_class="ovo")
        except Exception: pass

    labs = sorted(set(y_true.tolist()+y_pred.tolist()))
    base = model_label.lower().replace(" ","_").replace("—","-")
    cm_path  = f"eval_plots/{base}_cm.png"
    roc_path = f"eval_plots/{base}_roc.png"
    plot_conf_mat(y_true, y_pred, labs, f"{model_label} — Confusion Matrix", cm_path)
    plot_roc_ovr(y_true, probs, labs, f"{model_label} — ROC (OvR)", roc_path)

    # post-hoc CE on train/val (if your HF splits exist)
    tr = locals().get("hf_train_dataset_bertweet") if is_rob else locals().get("hf_train_dataset_distilbert")
    va = locals().get("hf_val_dataset_bertweet")   if is_rob else locals().get("hf_val_dataset_distilbert")
    train_ce = val_ce = float("nan")
    if tr is not None:
        tr_loader = DataLoader(tr, batch_size=64, shuffle=False, collate_fn=make_collate(tok))
        _,_,_,train_ce = forward_pass(mdl, tr_loader, mdl.config.num_labels, True)
    if va is not None:
        va_loader = DataLoader(va, batch_size=64, shuffle=False, collate_fn=make_collate(tok))
        _,_,_,val_ce = forward_pass(mdl, va_loader, mdl.config.num_labels, True)

    # W&B
    run = wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=f"Test — {model_label}", tags=WANDB_TAGS, reinit=True,
                     config={"model_dir":model_dir,"is_roberta_family":is_rob,"max_length":max_len,"num_labels":mdl.config.num_labels})
    wandb.log({
        "test/accuracy":acc, "test/f1_weighted":f1w, "test/precision_weighted":pw, "test/recall_weighted":rw,
        "test/roc_auc_ovr":roc_ovr, "test/roc_auc_ovo":roc_ovo, "test/cross_entropy":test_ce,
        "posthoc/train_cross_entropy":train_ce, "posthoc/val_cross_entropy":val_ce, "speed/sec_per_batch":spb,
        "counts/test_n": int(y_true.size),
        "plots/confusion_matrix": wandb.Image(cm_path),
        "plots/roc_curves":       wandb.Image(roc_path),
    })
    # per-class report (table)
    try:
        rep = classification_report(y_true, y_pred, digits=4, output_dict=True)
        rep_df = pd.DataFrame(rep).T.reset_index().rename(columns={"index":"class"})
        wandb.log({"tables/classification_report": wandb.Table(dataframe=rep_df)})
    except Exception:
        pass
    run.finish()

    # print numeric scores
    print(f"\n=== {model_label} ===")
    print(f"Dir: {model_dir}")
    print(f"N: {y_true.size} | Acc {acc:.4f} | F1w {f1w:.4f} | Precw {pw:.4f} | Recallw {rw:.4f}")
    print(f"ROC-AUC OvR: {('%.4f'%roc_ovr) if not math.isnan(roc_ovr) else '—'} | OvO: {('%.4f'%roc_ovo) if not math.isnan(roc_ovo) else '—'}")
    print(f"CE test: {test_ce:.4f} | CE train: {train_ce if not math.isnan(train_ce) else '—'} | CE val: {val_ce if not math.isnan(val_ce) else '—'}")
    print(f"Sec/Batch: {spb:.4f} | Plots: {cm_path}, {roc_path}")

    return {
        "Model": model_label, "Dir": model_dir, "Test Acc": acc, "F1(w)": f1w,
        "Prec(w)": pw, "Rec(w)": rw, "ROC-AUC OvR": roc_ovr, "ROC-AUC OvO": roc_ovo,
        "Test CE": test_ce, "Train CE": train_ce, "Val CE": val_ce, "Sec/Batch": spb, "N": int(y_true.size)
    }


RUN TEST

### Evaluate All Four Models

We run the function for the four trained variants:  
- DistilBERT (HF-Trainer)  
- BERTweet (HF-Trainer)  
- DistilBERT (Full-Code)  
- BERTweet (Full-Code)  

The results are collected into a summary DataFrame, printed in a table, and exported to `test_results_summary.csv` in Drive.  
This provides a comparison of accuracy, F1, precision/recall, ROC-AUC, cross-entropy, and runtime speed.


In [None]:
# === Run all 4 models (HF-Trainer & Full-Code for DistilBERT/BERTweet) ===
MODEL_DIRS = {
    "DistilBERT — HF Trainer":  locals().get("FINAL_MODEL_DIR_DISTILBERT") or locals().get("DISTIL_PATH"),
    "BERTweet   — HF Trainer":  locals().get("FINAL_MODEL_DIR_BERTWEET") or locals().get("BERTWEET_PATH"),
    "DistilBERT — Full Code":   locals().get("FINAL_MODEL_DIR_DISTILBERT_FULL"),
    "BERTweet   — Full Code":   locals().get("FINAL_MODEL_DIR_BERTWEET_FULL"),
}

rows = []
for name, mdir in MODEL_DIRS.items():
    res = evaluate_and_log(name, mdir)
    if res: rows.append(res)

if rows:
    df = pd.DataFrame(rows)
    show = df.copy()
    for c in ["Test Acc","F1(w)","Prec(w)","Rec(w)","ROC-AUC OvR","ROC-AUC OvO","Test CE","Train CE","Val CE","Sec/Batch"]:
        if c in show.columns:
            show[c] = show[c].apply(lambda v: "—" if (v is None or (isinstance(v,float) and np.isnan(v))) else (f"{v:.4f}" if isinstance(v,(int,float)) else v))
    print("\n=== Test Summary ===")
    print(show[["Model","Test Acc","F1(w)","Prec(w)","Rec(w)","ROC-AUC OvR","ROC-AUC OvO","Test CE","Sec/Batch","N"]].to_string(index=False))
    out_csv = os.path.join(BASE_DIR,"test_results_summary.csv") if "BASE_DIR" in globals() else "test_results_summary.csv"
    df.to_csv(out_csv, index=False)
    print(f"Saved: {out_csv}")
else:
    print("No models evaluated (check directories).")


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

  [torch.tensor(v) for v in arr], batch_first=True, padding_value=pad_id


0,1
counts/test_n,▁
speed/sec_per_batch,▁
test/accuracy,▁
test/cross_entropy,▁
test/f1_weighted,▁
test/precision_weighted,▁
test/recall_weighted,▁
test/roc_auc_ovo,▁
test/roc_auc_ovr,▁

0,1
counts/test_n,3798.0
posthoc/train_cross_entropy,
posthoc/val_cross_entropy,
speed/sec_per_batch,0.13794
test/accuracy,0.87704
test/cross_entropy,0.46366
test/f1_weighted,0.8774
test/precision_weighted,0.87798
test/recall_weighted,0.87704
test/roc_auc_ovo,0.95906



=== DistilBERT — HF Trainer ===
Dir: /content/drive/MyDrive/ADV_DL/hf_best/distilbert/checkpoint-2580
N: 3798 | Acc 0.8770 | F1w 0.8774 | Precw 0.8780 | Recallw 0.8770
ROC-AUC OvR: 0.9623 | OvO: 0.9591
CE test: 0.4637 | CE train: — | CE val: —
Sec/Batch: 0.1379 | Plots: eval_plots/distilbert_-_hf_trainer_cm.png, eval_plots/distilbert_-_hf_trainer_roc.png


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

  [torch.tensor(v) for v in arr], batch_first=True, padding_value=pad_id


0,1
counts/test_n,▁
speed/sec_per_batch,▁
test/accuracy,▁
test/cross_entropy,▁
test/f1_weighted,▁
test/precision_weighted,▁
test/recall_weighted,▁
test/roc_auc_ovo,▁
test/roc_auc_ovr,▁

0,1
counts/test_n,3798.0
posthoc/train_cross_entropy,
posthoc/val_cross_entropy,
speed/sec_per_batch,0.27642
test/accuracy,0.62033
test/cross_entropy,2.2841
test/f1_weighted,0.66253
test/precision_weighted,0.85064
test/recall_weighted,0.62033
test/roc_auc_ovo,0.89182



=== BERTweet   — HF Trainer ===
Dir: /content/drive/MyDrive/ADV_DL/hf_best/bertweet/checkpoint-5150
N: 3798 | Acc 0.6203 | F1w 0.6625 | Precw 0.8506 | Recallw 0.6203
ROC-AUC OvR: 0.8958 | OvO: 0.8918
CE test: 2.2841 | CE train: — | CE val: —
Sec/Batch: 0.2764 | Plots: eval_plots/bertweet___-_hf_trainer_cm.png, eval_plots/bertweet___-_hf_trainer_roc.png


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

  [torch.tensor(v) for v in arr], batch_first=True, padding_value=pad_id


0,1
counts/test_n,▁
speed/sec_per_batch,▁
test/accuracy,▁
test/cross_entropy,▁
test/f1_weighted,▁
test/precision_weighted,▁
test/recall_weighted,▁
test/roc_auc_ovo,▁
test/roc_auc_ovr,▁

0,1
counts/test_n,3798.0
posthoc/train_cross_entropy,
posthoc/val_cross_entropy,
speed/sec_per_batch,0.13582
test/accuracy,0.87204
test/cross_entropy,0.42154
test/f1_weighted,0.8715
test/precision_weighted,0.87195
test/recall_weighted,0.87204
test/roc_auc_ovo,0.95786



=== DistilBERT — Full Code ===
Dir: /content/drive/MyDrive/ADV_DL/final_models/distilbert_full_code
N: 3798 | Acc 0.8720 | F1w 0.8715 | Precw 0.8719 | Recallw 0.8720
ROC-AUC OvR: 0.9594 | OvO: 0.9579
CE test: 0.4215 | CE train: — | CE val: —
Sec/Batch: 0.1358 | Plots: eval_plots/distilbert_-_full_code_cm.png, eval_plots/distilbert_-_full_code_roc.png


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

  [torch.tensor(v) for v in arr], batch_first=True, padding_value=pad_id


0,1
counts/test_n,▁
speed/sec_per_batch,▁
test/accuracy,▁
test/cross_entropy,▁
test/f1_weighted,▁
test/precision_weighted,▁
test/recall_weighted,▁
test/roc_auc_ovo,▁
test/roc_auc_ovr,▁

0,1
counts/test_n,3798.0
posthoc/train_cross_entropy,
posthoc/val_cross_entropy,
speed/sec_per_batch,0.28104
test/accuracy,0.63402
test/cross_entropy,1.53463
test/f1_weighted,0.65401
test/precision_weighted,0.73846
test/recall_weighted,0.63402
test/roc_auc_ovo,0.77372



=== BERTweet   — Full Code ===
Dir: /content/drive/MyDrive/ADV_DL/final_models/bertweet_full_code
N: 3798 | Acc 0.6340 | F1w 0.6540 | Precw 0.7385 | Recallw 0.6340
ROC-AUC OvR: 0.7716 | OvO: 0.7737
CE test: 1.5346 | CE train: — | CE val: —
Sec/Batch: 0.2810 | Plots: eval_plots/bertweet___-_full_code_cm.png, eval_plots/bertweet___-_full_code_roc.png

=== Test Summary ===
                  Model Test Acc  F1(w) Prec(w) Rec(w) ROC-AUC OvR ROC-AUC OvO Test CE Sec/Batch    N
DistilBERT — HF Trainer   0.8770 0.8774  0.8780 0.8770      0.9623      0.9591  0.4637    0.1379 3798
BERTweet   — HF Trainer   0.6203 0.6625  0.8506 0.6203      0.8958      0.8918  2.2841    0.2764 3798
 DistilBERT — Full Code   0.8720 0.8715  0.8719 0.8720      0.9594      0.9579  0.4215    0.1358 3798
 BERTweet   — Full Code   0.6340 0.6540  0.7385 0.6340      0.7716      0.7737  1.5346    0.2810 3798
Saved: /content/drive/MyDrive/ADV_DL/test_results_summary.csv


# Compression

As said before- when trynig to save the BERTweet HF finetuned model- the runtime disconnected. Because it's one of our best models, we will need to load the best finetuned model again (feeding to it the best hyper parameters that were saved) in order to make compression to this model:

In [None]:
# run once (top of the notebook or right before tokenizer):
!pip -q install emoji==0.6.0


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [None]:
# === BERTweet: retrain ONCE using saved best Optuna hyperparams ===
# ---- Load best hyperparams JSON ----
BEST_HP_PATH = os.path.join(BASE_DIR, "hyperparams", "best_bertweet_hyperparams.json")
with open(BEST_HP_PATH, "r") as f:
    best_params = json.load(f)

best_lr = float(best_params["learning_rate"])
best_bs = int(best_params["per_device_train_batch_size"])

# ---- Output dir ----
OUTDIR = os.path.join(BASE_DIR, "hf_best", "bertweet")
os.makedirs(OUTDIR, exist_ok=True)

# ---- Tokenizer & Model ----
bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base", use_fast=True, normalization=True
)
bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base", num_labels=NUM_LABELS
)

# ---- Collator ----
data_collator = DataCollatorWithPadding(tokenizer=bertweet_tokenizer)

training_args = TrainingArguments(
    output_dir=OUTDIR,
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=max(32, best_bs),
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    seed=42,
    do_eval=True,        # enables evaluation each epoch
    save_steps=500,      # checkpoint every N steps (set N to what fits your data/compute)
    eval_steps=500,      # run eval every N steps (must match save_steps if you want best model later)
    save_total_limit=1,  # keep only the most recent checkpoint
)



trainer = Trainer(
    model=bertweet_model,
    args=training_args,  # same as above (metric_* can stay or be removed)
    train_dataset=hf_train_dataset_bertweet,
    eval_dataset=hf_val_dataset_bertweet,
    tokenizer=bertweet_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



# ---- Train / Eval / Save ----
trainer.train()
print(trainer.evaluate())

trainer.save_model(OUTDIR)
bertweet_tokenizer.save_pretrained(OUTDIR)

print(f"\nSaved fine-tuned BERTweet to: {OUTDIR}\nReady for compression.")


In [None]:

# Path to your fine-tuned BERTweet folder
BERTWEET_PATH = f"{BASE_DIR}/hf_best/bertweet"

# Load fine-tuned BERTweet
bertweet_tokenizer = AutoTokenizer.from_pretrained(BERTWEET_PATH)
bertweet_model = AutoModelForSequenceClassification.from_pretrained(BERTWEET_PATH)

print("✅ BERTweet model restored successfully")


✅ BERTweet model restored successfully


We now see that it also didn't save the distilBERT best model (cause the drive storage was full). We will now do the same for distilBERT after buying more drive storage.

In [None]:

# ---- Load best hyperparams (from Optuna JSON) ----
BEST_HP_PATH = os.path.join(BASE_DIR, "hyperparams", "best_distilbert_hyperparams.json")
with open(BEST_HP_PATH, "r") as f:
    best_params = json.load(f)

best_lr = float(best_params["learning_rate"])
best_bs = int(best_params["per_device_train_batch_size"])

# ---- Output dir ----
OUTDIR = os.path.join(BASE_DIR, "hf_best", "distilbert")
os.makedirs(OUTDIR, exist_ok=True)

# ---- Tokenizer & Model ----
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=NUM_LABELS
)

# ---- Collator ----
data_collator = DataCollatorWithPadding(tokenizer=distilbert_tokenizer)

# ---- TrainingArguments ----
# Use only arguments that work across old/new transformers versions
training_args = TrainingArguments(
    output_dir=OUTDIR,
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=max(32, best_bs),
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    seed=42,
    do_eval=True  # ensures evaluation happens during training
)

# ---- Trainer ----
trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=hf_train_dataset_distilbert,
    eval_dataset=hf_val_dataset_distilbert,
    tokenizer=distilbert_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ---- Train ONCE and Save ----
trainer.train()
print(trainer.evaluate())

trainer.save_model(OUTDIR)
distilbert_tokenizer.save_pretrained(OUTDIR)

print(f"\n✅ DistilBERT retrained with best hyperparams and saved to: {OUTDIR}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.8186
200,0.5683
300,0.4454
400,0.3851
500,0.3511
600,0.287
700,0.2821
800,0.2527
900,0.208
1000,0.208


{'eval_loss': 0.3942674994468689, 'eval_accuracy': 0.8933430515063168, 'eval_f1': 0.8935220792099944, 'eval_precision': 0.8937617669796839, 'eval_recall': 0.8933430515063168, 'eval_runtime': 6.1132, 'eval_samples_per_second': 1346.593, 'eval_steps_per_second': 10.633, 'epoch': 10.0}

✅ DistilBERT retrained with best hyperparams and saved to: /content/drive/MyDrive/ADV_DL/hf_best/distilbert


In [None]:
DISTIL_PATH = f"{BASE_DIR}/hf_best/distilbert"

distilbert_model = AutoModelForSequenceClassification.from_pretrained(DISTIL_PATH)
distilbert_tokenizer = AutoTokenizer.from_pretrained(DISTIL_PATH)
print("✅ distilbert model restored successfully")

✅ distilbert model restored successfully


Load the best 2 models:

In [None]:
BASE_DIR = "/content/drive/MyDrive/ADV_DL"
HF_BEST = f"{BASE_DIR}/hf_best"
os.makedirs(HF_BEST, exist_ok=True)

# Load fine-tuned teachers *from Drivae*
DISTIL_PATH   = f"{HF_BEST}/distilbert"
BERTWEET_PATH = f"{HF_BEST}/bertweet"

# Model Compression — DistilBERT & BERTweet

In this section we apply three different compression methods to our fine-tuned models:

1. **Dynamic Quantization** — reduces model size by converting weights to int8.  
2. **Pruning** — zeroes out less important weights to sparsify the model.  
3. **Knowledge Distillation** — trains a smaller student model using predictions (soft labels) from the teacher model.


### 1. Dynamic Quantization
We first apply dynamic quantization to both fine-tuned models.  
This is the fastest compression method — it converts linear layers to 8-bit integers, reducing size and improving inference speed.


In [None]:
# Force CPU + eval (dynamic quantization is CPU-only)
distilbert_model = distilbert_model.to("cpu").eval()
bertweet_model   = bertweet_model.to("cpu").eval()

In [None]:
# Dynamic INT8 quantization on Linear layers
quantized_distilbert = torch.quantization.quantize_dynamic(
    distilbert_model, {nn.Linear}, dtype=torch.qint8
).eval()

quantized_bertweet = torch.quantization.quantize_dynamic(
    bertweet_model, {nn.Linear}, dtype=torch.qint8
).eval()

In [None]:
# Save quantized models for *your* local comparison runs (not required for submission)
Q_OUT_DIR = f"{BASE_DIR}/hf_best"
import os; os.makedirs(Q_OUT_DIR, exist_ok=True)

torch.save(quantized_distilbert, f"{Q_OUT_DIR}/distilbert_quantized.pth")
torch.save(quantized_bertweet,   f"{Q_OUT_DIR}/bertweet_quantized.pth")

print("Saved quantized models to:")
print(f"- {Q_OUT_DIR}/distilbert_quantized.pth")
print(f"- {Q_OUT_DIR}/bertweet_quantized.pth")


#### Evaluation of Models
After each compression method, we will evaluate the effect of the compression on both DistilBERT and BERTweet:

- **Model size** on disk (before vs. after compression).  
- **Validation accuracy** using the same evaluation function.  
- **Inference speed** on a small batch of validation data.


In [None]:
# Define quantized model file paths (already saved as .pth earlier)
quant_distil_path   = "/content/drive/MyDrive/ADV_DL/hf_best/distilbert_quantized.pth"
quant_bertweet_path = "/content/drive/MyDrive/ADV_DL/hf_best/bertweet_quantized.pth"

def get_model_file(path):
    """Return model weight file (safetensors or bin)"""
    if os.path.exists(os.path.join(path, "pytorch_model.bin")):
        return os.path.join(path, "pytorch_model.bin")
    elif os.path.exists(os.path.join(path, "model.safetensors")):
        return os.path.join(path, "model.safetensors")
    else:
        raise FileNotFoundError(f"No model weight file found in {path}")

def get_size(path):
    return os.path.getsize(path) / (1024 * 1024)  # MB

# Original model sizes
orig_distil_file   = get_model_file(DISTIL_PATH)
orig_bertweet_file = get_model_file(BERTWEET_PATH)

orig_distil_size   = get_size(orig_distil_file)
orig_bertweet_size = get_size(orig_bertweet_file)

# Quantized model sizes (saved as .pth)
quant_distil_size   = get_size(quant_distil_path)
quant_bertweet_size = get_size(quant_bertweet_path)

print(f"DistilBERT: Original = {orig_distil_size:.2f} MB | Quantized = {quant_distil_size:.2f} MB")
print(f"BERTweet  : Original = {orig_bertweet_size:.2f} MB | Quantized = {quant_bertweet_size:.2f} MB")


DistilBERT: Original = 255.43 MB | Quantized = 132.29 MB
BERTweet  : Original = 514.63 MB | Quantized = 270.06 MB


In [None]:
def evaluate_model(model, tokenizer, dataset, n_samples=500):
    """
    Evaluate accuracy of a model on a subset of dataset for speed.
    """
    model.eval()
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(dataset, batch_size=32, collate_fn=data_collator)

    preds, labels = [], []
    seen = 0

    for batch in dataloader:
        labels_batch = batch.pop("labels")
        batch = {k: v.to(model.device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=-1).cpu().numpy()
        preds.extend(batch_preds)
        labels.extend(labels_batch.numpy())

        seen += len(labels_batch)
        if seen >= n_samples:
            break

    return accuracy_score(labels[:n_samples], preds[:n_samples])

# --- Run eval on quantized models ---
acc_distil_q = evaluate_model(quantized_distilbert, distilbert_tokenizer, hf_val_dataset_distilbert)
acc_bertweet_q = evaluate_model(quantized_bertweet, bertweet_tokenizer, hf_val_dataset_bertweet)

print(f" Quantized DistilBERT Accuracy: {acc_distil_q:.4f}")
print(f" Quantized BERTweet  Accuracy: {acc_bertweet_q:.4f}")

 Quantized DistilBERT Accuracy: 0.8860
 Quantized BERTweet  Accuracy: 0.2100


In [None]:


def measure_inference_time(model, tokenizer, dataset, n_batches=20):
    """
    Measure average inference time per batch.
    """
    model.eval()
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    dataloader = DataLoader(dataset, batch_size=32, collate_fn=data_collator)

    start = time.time()
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            _labels = batch.pop("labels")  # remove labels
            batch = {k: v.to(model.device) for k, v in batch.items()}
            _ = model(**batch)
            if i + 1 >= n_batches:
                break
    end = time.time()

    return (end - start) / n_batches  # seconds per batch

time_distil_q = measure_inference_time(quantized_distilbert, distilbert_tokenizer, hf_val_dataset_distilbert)
time_bertweet_q = measure_inference_time(quantized_bertweet, bertweet_tokenizer, hf_val_dataset_bertweet)

print(f"Quantized DistilBERT Inference time per batch: {time_distil_q:.4f} sec")
print(f"Quantized BERTweet  Inference time per batch: {time_bertweet_q:.4f} sec")

Quantized DistilBERT Inference time per batch: 2.6154 sec
Quantized BERTweet  Inference time per batch: 5.0927 sec


### 2. Pruning (Unstructured L1)

We prune the smallest-magnitude weights in all `Linear` layers (global unstructured pruning).
- **Goal:** induce sparsity (fewer effective parameters) → potential speedups and regularization.
- **Note:** Disk size usually doesn’t shrink with plain pruning, because weights are stored densely; we’ll report **sparsity** and **runtime** changes.  
- **Optional:** a brief post-pruning fine-tune (1 epoch) can help recover accuracy.


Utilities + pick layers to prune

In [None]:


def get_linear_modules(model):
    """
    Return a list of (module_ref, 'weight') for all Linear layers in the model,
    which is what global_unstructured expects.
    """
    params_to_prune = []
    for name, module in model.named_modules():
        if isinstance(module, Linear):
            params_to_prune.append((module, "weight"))
    return params_to_prune

def apply_global_pruning(model, amount=0.30):
    """
    Apply global unstructured L1 pruning to all Linear weights.
    `amount` is the fraction of connections to prune globally.
    """
    params_to_prune = get_linear_modules(model)
    prune.global_unstructured(
        params_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=amount,
    )
    return params_to_prune

def make_pruning_permanent(params_to_prune):
    """
    Remove pruning reparametrizations to make zeros permanent in the weights.
    """
    for module, param_name in params_to_prune:
        prune.remove(module, param_name)

def tensor_sparsity(t):
    num_zeros = torch.sum(t == 0).item()
    num_elems = t.numel()
    return num_zeros / max(1, num_elems)

def model_sparsity(model):
    total_zeros, total_params = 0, 0
    for p in model.parameters():
        if p is not None and p.data is not None:
            total_zeros += torch.sum(p.data == 0).item()
            total_params += p.data.numel()
    return total_zeros / max(1, total_params)


Prune

In [None]:
# Choose pruning amount (start moderate; you can sweep later: 0.2, 0.3, 0.5)
PRUNE_AMOUNT = 0.30

# --- DistilBERT pruning ---
distil_params = apply_global_pruning(distilbert_model, amount=PRUNE_AMOUNT)
distil_sparsity = model_sparsity(distilbert_model)
make_pruning_permanent(distil_params)  # bake zeros into weights

# --- BERTweet pruning ---
bertweet_params = apply_global_pruning(bertweet_model, amount=PRUNE_AMOUNT)
bertweet_sparsity = model_sparsity(bertweet_model)
make_pruning_permanent(bertweet_params)

print(f"✅ Applied global L1 pruning @ {PRUNE_AMOUNT*100:.0f}%")
print(f"DistilBERT sparsity: {distil_sparsity*100:.2f}%")
print(f"BERTweet  sparsity: {bertweet_sparsity*100:.2f}%")

✅ Applied global L1 pruning @ 30%
DistilBERT sparsity: 0.00%
BERTweet  sparsity: 0.00%


Save Pruned Models

In [None]:
PRUNED_DISTIL_PATH = f"{DISTIL_PATH}_pruned_{int(PRUNE_AMOUNT*100)}"
PRUNED_BERTWEET_PATH = f"{BERTWEET_PATH}_pruned_{int(PRUNE_AMOUNT*100)}"

distilbert_model.save_pretrained(PRUNED_DISTIL_PATH)
distilbert_tokenizer.save_pretrained(PRUNED_DISTIL_PATH)

bertweet_model.save_pretrained(PRUNED_BERTWEET_PATH)
bertweet_tokenizer.save_pretrained(PRUNED_BERTWEET_PATH)

print("✅ Pruned models saved:")
print(f" - {PRUNED_DISTIL_PATH}")
print(f" - {PRUNED_BERTWEET_PATH}")


✅ Pruned models saved:
 - /content/drive/MyDrive/ADV_DL/hf_best/distilbert_pruned_30
 - /content/drive/MyDrive/ADV_DL/hf_best/bertweet_pruned_30


Pruning Evaluation

In [None]:
# Sparsity already printed; we re-run just in case of recovery
print(f"DistilBERT sparsity now: {model_sparsity(distilbert_model)*100:.2f}%")
print(f"BERTweet  sparsity now: {model_sparsity(bertweet_model)*100:.2f}%")

# Accuracy (use a smaller sample for speed; bump to full for final report)
acc_distil_pruned = evaluate_model(distilbert_model, distilbert_tokenizer, hf_val_dataset_distilbert, n_samples=1000)
acc_bertweet_pruned = evaluate_model(bertweet_model, bertweet_tokenizer, hf_val_dataset_bertweet, n_samples=1000)

print(f"Pruned DistilBERT Accuracy: {acc_distil_pruned:.4f}")
print(f"Pruned BERTweet  Accuracy: {acc_bertweet_pruned:.4f}")

# Inference speed (same batch sizing as quant)
time_distil_pruned = measure_inference_time(distilbert_model, distilbert_tokenizer, hf_val_dataset_distilbert, n_batches=20)
time_bertweet_pruned = measure_inference_time(bertweet_model, bertweet_tokenizer, hf_val_dataset_bertweet, n_batches=20)

print(f"Pruned DistilBERT Inference time per batch: {time_distil_pruned:.4f} sec")
print(f"Pruned BERTweet  Inference time per batch: {time_bertweet_pruned:.4f} sec")


DistilBERT sparsity now: 19.29%
BERTweet  sparsity now: 19.02%
Pruned DistilBERT Accuracy: 0.8980
Pruned BERTweet  Accuracy: 0.9080
Pruned DistilBERT Inference time per batch: 2.9027 sec
Pruned BERTweet  Inference time per batch: 5.5222 sec


### 3. Knowledge Distillation (Teacher → Student)
Train a smaller student model that mimics a larger fine-tuned teacher, reducing model size and inference cost while retaining accuracy.

**3.1 Knowledge Distillation for DistilBERT**

- **Teacher:** fine-tuned `distilbert-base-uncased` (already loaded as `distilbert_model`)
- **Student:** `prajjwal1/bert-tiny` (a compact BERT variant, much faster)
- **Datasets:** reuse `hf_train_dataset_distilbert` and `hf_val_dataset_distilbert`  
  (compatible tokenizer families, no need for re-tokenization)
- **Objective:** Compress the fine-tuned DistilBERT into a tiny model while keeping reasonable accuracy

**KD hyperparameters (optional read)**

KD_TEMPERATURE softens the teacher’s probabilities so the student learns class similarities, not just the top class. Values around 2–4 usually work well.

ALPHA_CE vs ALPHA_KD balance “learn from ground-truth labels” (CE) and “match the teacher” (KD); 0.5/0.5 is a solid default—raise CE if labels are very reliable, raise KD if you trust the teacher more.

STUDENT_NAME picks a small model (prajjwal1/bert-tiny) for speed; we use the matching BERT vocab tokenizer (bert-base-uncased).

DataCollatorWithPadding pads batches on the fly with the student tokenizer so inputs are shaped correctly for training.

In [None]:
# ---- KD hyperparameters ----
KD_TEMPERATURE = 2.0     # softening factor for logits
ALPHA_CE      = 0.5      # weight for hard-label cross-entropy
ALPHA_KD      = 0.5      # weight for KL(student || teacher_soft)

# ---- Student: tiny BERT (compatible tokenizer family with distilbert) ----
STUDENT_NAME = "prajjwal1/bert-tiny"
student_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # tiny-bert uses bert-base-uncased vocab
student_model = AutoModelForSequenceClassification.from_pretrained(
    STUDENT_NAME, num_labels=NUM_LABELS
)

# ---- Data collator (same as training/eval) ----
data_collator_distil = DataCollatorWithPadding(tokenizer=student_tokenizer)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Distillation Trainer**

* Wraps Hugging Face Trainer to do knowledge distillation.

* Freezes the teacher (no gradients) and keeps it on the same device as the student for safe mixed-GPU/CPU setups.

- In compute_loss:

  - Separates labels from the student inputs and re-asserts device alignment.

  - Runs student and teacher forward passes (teacher under no_grad).

  - Computes two losses:

    - CE loss: student vs. ground-truth labels.

    - KD loss: KL divergence between temperature-softened student and teacher logits (T = temperature, with the standard T² scaling).
  
- Returns the weighted sum: alpha_ce * CE + alpha_kd * KD.

In [None]:

class DistillationTrainer(Trainer):
    def __init__(self, teacher_model=None, temperature=2.0, alpha_ce=0.5, alpha_kd=0.5, **kwargs):
        super().__init__(**kwargs)

        # Keep teacher frozen and ON THE SAME DEVICE as the student/Trainer
        self.teacher = teacher_model
        for p in self.teacher.parameters():
            p.requires_grad = False

        # Use Trainer's device (GPU if available)
        device = self.args.device if hasattr(self, "args") else (
            torch.device("cuda" if torch.cuda.is_available() else "cpu")
        )
        self.teacher.to(device).eval()

        self.temperature = temperature
        self.alpha_ce = alpha_ce
        self.alpha_kd = alpha_kd

    # Accept extra kwargs like num_items_in_batch from newer HF
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        student_inputs = {k: v for k, v in inputs.items() if k != "labels"}

        # Ensure teacher is still on the same device as model/inputs (robustness)
        # (If accelerate moves model later, this keeps them aligned)
        self.teacher.to(next(model.parameters()).device)

        # Student forward
        outputs_s = model(**student_inputs)
        logits_s = outputs_s.logits

        # Teacher forward (no grad)
        with torch.no_grad():
            outputs_t = self.teacher(**student_inputs)
            logits_t = outputs_t.logits

        # Losses
        loss_ce = F.cross_entropy(logits_s, labels)
        T = self.temperature
        loss_kd = F.kl_div(
            F.log_softmax(logits_s / T, dim=-1),
            F.softmax(logits_t / T, dim=-1),
            reduction="batchmean"
        ) * (T * T)

        loss = self.alpha_ce * loss_ce + self.alpha_kd * loss_kd
        return (loss, outputs_s) if return_outputs else loss


**Knowledge Distillation — Train & Save bert-tiny (student) from DistilBERT (teacher)**

- Sets TrainingArguments for a short KD run (3 epochs, bs=32, LR=2e-4, no W&B logging).

- Uses the custom DistillationTrainer with:

  - Teacher: your fine-tuned distilbert_model (frozen).

  - Student: prajjwal1/bert-tiny.
  
  - Loss mix: alpha_ce=0.5 (hard labels) + alpha_kd=0.5 (teacher soft targets) at temperature=2.0.

- Datasets: hf_train_dataset_distilbert / hf_val_dataset_distilbert; padding handled by DataCollatorWithPadding.

- Metrics: compute_metrics on the eval-split each epoch.

- After training, prints the evaluation dict and saves the distilled student (model + tokenizer) to drive.

In [None]:

kd_args = TrainingArguments(
    output_dir=f"{DISTIL_PATH}_kd_bert_tiny",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.0,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    seed=42,
    do_eval=True,
    report_to="none",
)

kd_trainer = DistillationTrainer(
    teacher_model=distilbert_model,            # TEACHER (fine-tuned DistilBERT)
    temperature=2.0,
    alpha_ce=0.5,
    alpha_kd=0.5,
    model=student_model,                       # STUDENT (bert-tiny)
    args=kd_args,
    train_dataset=hf_train_dataset_distilbert,
    eval_dataset=hf_val_dataset_distilbert,
    processing_class=student_tokenizer,        # <- deprecation-safe (instead of tokenizer=)
    data_collator=data_collator_distil,
    compute_metrics=compute_metrics,
)

# Train
kd_trainer.train()

# Evaluate and store results
results = kd_trainer.evaluate()
print("📊 Distillation Student Results:")
print(results)

# Save student (distilled) model
student_model.save_pretrained("/content/drive/MyDrive/ADV_DL/kd_student_distilbert")
student_tokenizer.save_pretrained("/content/drive/MyDrive/ADV_DL/kd_student_distilbert")



Step,Training Loss
100,0.2197
200,0.1821
300,0.2085
400,0.2176
500,0.2013
600,0.2205
700,0.1952
800,0.2146
900,0.2248
1000,0.1865


📊 Distillation Student Results:
{'eval_loss': 0.543489396572113, 'eval_accuracy': 0.8526482021379981, 'eval_f1': 0.8519710288825878, 'eval_precision': 0.8535263308933317, 'eval_recall': 0.8526482021379981, 'eval_runtime': 22.8267, 'eval_samples_per_second': 360.63, 'eval_steps_per_second': 11.303, 'epoch': 3.0}


('/content/drive/MyDrive/ADV_DL/kd_student_distilbert/tokenizer_config.json',
 '/content/drive/MyDrive/ADV_DL/kd_student_distilbert/special_tokens_map.json',
 '/content/drive/MyDrive/ADV_DL/kd_student_distilbert/vocab.txt',
 '/content/drive/MyDrive/ADV_DL/kd_student_distilbert/added_tokens.json',
 '/content/drive/MyDrive/ADV_DL/kd_student_distilbert/tokenizer.json')

**3.2 Knowledge Distillation for BERTweet**

- **Teacher:** fine-tuned `vinai/bertweet-base`
- **Student:** `distilroberta-base` (smaller RoBERTa variant, good match since BERTweet is RoBERTa-based)
- **Tokenizer handling:**
  - Teacher and student tokenizers differ → need raw text + labels dataset
  - Student inputs: tokenized with student tokenizer
  - Teacher logits: computed dynamically inside the trainer
- **Objective:** Transfer BERTweet’s knowledge into a smaller, more efficient RoBERTa student

In [51]:
# ---- Paths ----
TEACHER_PATH = f"{BASE_DIR}/hf_best/bertweet"              # fine-tuned BERTweet teacher
STUDENT_NAME = "distilroberta-base"                        # student model
KD_SAVE_DIR  = f"{BASE_DIR}/hf_best/bertweet_kd_student"   # distilled student

In [52]:
#Need to run this again as runtime disconnected
# ---- KD hyperparameters ----
KD_TEMPERATURE = 2.0
ALPHA_CE       = 0.5
ALPHA_KD       = 0.5
MAX_LENGTH     = MAX_SEQ_LENGTH

**Freeze Teacher & Precompute Logits (BERTweet → DistilRoBERTa)**

- load the fine-tuned BERTweet as the frozen teacher (TEACHER_PATH), pick DistilRoBERTa as the student (STUDENT_NAME), and set KD_SAVE_DIR for outputs.

- Freeze teacher- move to device, eval(), and disable grads.

- Label mapping from the teacher so student predictions align.

- Student tokenizer & init: create a model_init() that builds DistilRoBERTa with the same label mapping.

- TeacherOnlyCollator: pads teacher inputs (already BERTweet-tokenized HF datasets) for efficient batching.

- Precompute logits: run the teacher over hf_train_dataset_bertweet and hf_val_dataset_bertweet to get soft targets.

- Output: t_train_logits and t_val_logits used later by the KD trainer so the student can learn from the teacher’s soft targets.


In [55]:
# ---- Load teacher (use your device), frozen ----
teacher_tokenizer = AutoTokenizer.from_pretrained(TEACHER_PATH, use_fast=True)
teacher_model = AutoModelForSequenceClassification.from_pretrained(TEACHER_PATH)
teacher_model.eval().to(device)
for p in teacher_model.parameters():
    p.requires_grad = False

# ---- Student tokenizer + model init (unchanged) ----
num_labels = teacher_model.config.num_labels
id2label   = teacher_model.config.id2label
label2id   = teacher_model.config.label2id

student_tokenizer = AutoTokenizer.from_pretrained(STUDENT_NAME, use_fast=True)

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        STUDENT_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
    )

# ---------- Collator to feed the teacher ONLY (pads BERTweet ids/masks) ----------
from torch.utils.data import DataLoader
class TeacherOnlyCollator:
    def __init__(self, tok):
        self.tok = tok
    def __call__(self, feats):
        batch = {
            "input_ids":      [f["input_ids"]      for f in feats],
            "attention_mask": [f["attention_mask"] for f in feats],
        }
        return self.tok.pad(batch, padding=True, return_tensors="pt")

# ---------- Precompute teacher logits (GPU if available, AMP on CUDA) ----------
@torch.no_grad()
def compute_teacher_logits(hf_ds, batch_size=128):
    dl = DataLoader(
        hf_ds,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=TeacherOnlyCollator(teacher_tokenizer),
        num_workers=2,
        pin_memory=(device.type == "cuda"),
    )
    all_logits = []
    use_amp = (device.type == "cuda")
    for b in dl:
        b = {k: v.to(device, non_blocking=True) for k, v in b.items()}
        if use_amp:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                logits = teacher_model(**b).logits
        else:
            logits = teacher_model(**b).logits
        all_logits.append(logits.float().cpu())
    return torch.cat(all_logits, dim=0)

print("Precomputing teacher logits (fast path)…")
t_train_logits = compute_teacher_logits(hf_train_dataset_bertweet, batch_size=128)
t_val_logits   = compute_teacher_logits(hf_val_dataset_bertweet,   batch_size=128)
print(f"Teacher logits: train {tuple(t_train_logits.shape)}, val {tuple(t_val_logits.shape)}")


Precomputing teacher logits (fast path)…
Teacher logits: train (32925, 3), val (8232, 3)


In [58]:
def _build_kd_student_ds(hf_ds, t_logits_tensor, teacher_tok, student_tok):
    # 1) Reconstruct raw texts from the teacher-tokenized dataset
    texts = teacher_tok.batch_decode(hf_ds["input_ids"], skip_special_tokens=True)
    # 2) Tokenize for the *student*
    enc = student_tok(texts, truncation=True, padding=False)
    # 3) Labels and teacher logits
    labels = hf_ds["labels"]
    t_logits = t_logits_tensor.detach().cpu().float().tolist()
    # 4) Sanity alignment
    n = len(labels)
    assert len(enc["input_ids"]) == n and len(t_logits) == n, \
        f"KD alignment error: student tokens ({len(enc['input_ids'])}) / logits ({len(t_logits)}) / labels ({n})"
    # 5) HF Dataset for Trainer
    return Dataset.from_dict({
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": labels,
        "t_logits": t_logits,
    })

kd_train_ds = _build_kd_student_ds(hf_train_dataset_bertweet, t_train_logits, teacher_tokenizer, student_tokenizer)
kd_val_ds   = _build_kd_student_ds(hf_val_dataset_bertweet,   t_val_logits,   teacher_tokenizer, student_tokenizer)

# === Collator that the KD trainer expects (pads student inputs + stacks labels & t_logits) ===
class DualCollator:
    def __init__(self, s_tok):
        self.s_tok = s_tok
    def __call__(self, feats):
        s_batch = {
            "input_ids":      [f["input_ids"]      for f in feats],
            "attention_mask": [f["attention_mask"] for f in feats],
        }
        s_padded = self.s_tok.pad(s_batch, padding=True, return_tensors="pt")
        labels   = torch.tensor([f["labels"]   for f in feats], dtype=torch.long)
        t_logits = torch.tensor([f["t_logits"] for f in feats], dtype=torch.float32)
        s_padded["labels"]   = labels
        s_padded["t_logits"] = t_logits
        return s_padded

dual_collator = DualCollator(student_tokenizer)

print(f"✓ KD datasets built: train={len(kd_train_ds)}, val={len(kd_val_ds)} with 't_logits'")


✓ KD datasets built: train=32925, val=8232 with 't_logits'


In [62]:
# ---- Losses ----
_kl = nn.KLDivLoss(reduction="batchmean")
ce_loss_fn = nn.CrossEntropyLoss()

def kd_loss_fn(student_logits, teacher_logits, T: float):
    log_p_s = nn.functional.log_softmax(student_logits / T, dim=-1)
    p_t     = nn.functional.softmax(teacher_logits / T, dim=-1)
    return (T * T) * _kl(log_p_s, p_t)

# ---- KD Trainer (now consumes precomputed teacher logits) ----
class KDTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        t_logits = inputs.pop("t_logits")                    # [B, num_labels], float32
        outputs  = model(**inputs)
        s_logits = outputs.logits
        # move to same device for KD term
        t_logits = t_logits.to(s_logits.device)
        loss_ce  = ce_loss_fn(s_logits, inputs["labels"])
        loss_kd  = kd_loss_fn(s_logits, t_logits, KD_TEMPERATURE)
        loss     = ALPHA_CE * loss_ce + ALPHA_KD * loss_kd
        return (loss, outputs) if return_outputs else loss

# Save under the same hf_best/ folder as the teacher (matches later eval cells)
KD_SAVE_DIR = str(Path(TEACHER_PATH).parent / "bertweet_kd_student")
os.makedirs(KD_SAVE_DIR, exist_ok=True)  # safe if it already exists
print("KD_SAVE_DIR =", KD_SAVE_DIR)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=f"{KD_SAVE_DIR}/runs",
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=False,   # will set True after strategies below
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=False,
    report_to=["none"],

    # keep extra fields (like 't_logits') for the collator & compute_loss:
    remove_unused_columns=False,
)
# (rest of your code the same…)
try:
    training_args.evaluation_strategy = IntervalStrategy.EPOCH
    training_args.save_strategy       = IntervalStrategy.EPOCH
except Exception:
    training_args.evaluation_strategy = "epoch"
    training_args.save_strategy       = "epoch"
try:
    training_args.save_total_limit = 1
except Exception:
    pass
training_args.load_best_model_at_end = True

# ---- Trainer (use your existing compute_metrics) ----
trainer = KDTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=kd_train_ds,
    eval_dataset=kd_val_ds,
    data_collator=dual_collator,
    compute_metrics=compute_metrics,  # your multi-metric fn
)

print("Starting KD training (teacher logits precomputed)...")
train_result = trainer.train()
trainer.save_model(KD_SAVE_DIR)
student_tokenizer.save_pretrained(KD_SAVE_DIR)
val_metrics = trainer.evaluate()
print("KD(BERTweet→DistilRoBERTa) | Val metrics:", val_metrics)

KD_SAVE_DIR = /content/drive/MyDrive/ADV_DL/hf_best/bertweet_kd_student


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting KD training (teacher logits precomputed)...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.4085
1000,0.9643
1500,0.68
2000,0.596
2500,0.4478
3000,0.3995


KD(BERTweet→DistilRoBERTa) | Val metrics: {'eval_loss': 0.6036226749420166, 'eval_accuracy': 0.908041788143829, 'eval_f1': 0.9075559815034498, 'eval_precision': 0.9084999992474658, 'eval_recall': 0.908041788143829, 'eval_runtime': 21.0988, 'eval_samples_per_second': 390.165, 'eval_steps_per_second': 6.114, 'epoch': 3.0}


In [63]:
# --- Compatibility aliases for any later cells that expected the old names ---
BT_KD_PATH      = KD_SAVE_DIR            # old var -> points to the new saved student
bt_student_model = None                  # not needed anymore; prevents accidental re-use
bt_student_tokenizer = None              # not needed
kd_train_ds_bt  = kd_train_ds            # if later cells reference *_bt datasets
kd_val_ds_bt    = kd_val_ds

**Evaluate KD student vs. teacher (BERTweet)**

This cell compares the distilled DistilRoBERTa student to the BERTweet teacher.

helpers handle model file lookup (_model_file), size in MB (_mb), safe column drops (_drop_cols, removes KD-only fields like t_logits), and accuracy pass (_quick_acc with DataCollatorWithPadding and GPU/CPU-aware batch size).

The student is loaded from KD_SAVE_DIR and evaluated on kd_val_ds after stripping KD-only columns.

The teacher is loaded from TEACHER_PATH and evaluated on the original hf_val_dataset_bertweet if present; otherwise it rebuilds an eval set by decoding the student tokens back to text and re-tokenizing with the teacher tokenizer.

Finally, it prints [BERTweet] KD student accuracy = … and [BERTweet] teacher accuracy = …. If you define EVAL_MAX_SAMPLES, the accuracy is computed on a capped subset for speed.

In [68]:
# ---- helpers (no dependency on your earlier helpers) ----
def _model_file(d):
    d = Path(d)
    for f in ("pytorch_model.bin", "model.safetensors"):
        p = d / f
        if p.exists():
            return str(p)
    raise FileNotFoundError(f"No weights found in {d}")

def _mb(p):
    return os.path.getsize(p) / (1024 * 1024)

def _drop_cols(ds, cols=("t_logits","raw_texts")):
    try:
        have = getattr(ds, "column_names", [])
        rm = [c for c in cols if c in have]
        return ds.remove_columns(rm) if rm else ds
    except Exception:
        return ds

def _quick_acc(model, tokenizer, dataset, max_samples=None):
    ds = dataset
    if max_samples and hasattr(ds, "select"):
        ds = ds.select(range(min(len(ds), max_samples)))
    loader = DataLoader(
        ds,
        batch_size=(64 if model.device.type == "cuda" else 128),
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    preds, labels = [], []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            y = batch.pop("labels")
            inputs = {k: v.to(model.device) for k, v in batch.items()
                      if k in ("input_ids","attention_mask","token_type_ids")}
            logits = model(**inputs).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            labels.extend(y.cpu().tolist() if isinstance(y, torch.Tensor) else y)
    return accuracy_score(labels, preds)


In [69]:
# BERTweet (teacher path = TEACHER_PATH, student KD dir = KD_SAVE_DIR)
# sanity: saved files exist
for req in ("config.json","tokenizer.json"):
    if not (Path(KD_SAVE_DIR)/req).exists():
        print(f"⚠️ Missing {req} in {KD_SAVE_DIR}")

# sizes
bt_teacher_file = _model_file(TEACHER_PATH)
bt_student_file = _model_file(KD_SAVE_DIR)
print(f"[BERTweet] teacher size: {_mb(bt_teacher_file):.2f} MB")
print(f"[BERTweet] KD student size: {_mb(bt_student_file):.2f} MB")

# load student + accuracy (drop KD-only cols)
bt_student_tok = AutoTokenizer.from_pretrained(KD_SAVE_DIR)
bt_student_mod = AutoModelForSequenceClassification.from_pretrained(KD_SAVE_DIR).to(device).eval()
bt_eval_ds     = _drop_cols(kd_val_ds)
bt_acc_student = _quick_acc(bt_student_mod, bt_student_tok, bt_eval_ds, max_samples=globals().get("EVAL_MAX_SAMPLES", None))
print(f"[BERTweet] KD student accuracy = {bt_acc_student:.4f}")

# teacher accuracy (prefer original HF val if present)
bt_teacher_tok = AutoTokenizer.from_pretrained(TEACHER_PATH, use_fast=True)
bt_teacher_mod = AutoModelForSequenceClassification.from_pretrained(TEACHER_PATH).to(device).eval()
if 'hf_val_dataset_bertweet' in globals():
    bt_teacher_eval = hf_val_dataset_bertweet
else:
    # rebuild from kd_val_ds: decode student tokens -> text -> re-tokenize with teacher
    texts = bt_student_tok.batch_decode(kd_val_ds["input_ids"], skip_special_tokens=True)
    enc   = bt_teacher_tok(texts, truncation=True, padding=False)
    bt_teacher_eval = Dataset.from_dict({
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": kd_val_ds["labels"],
    })
bt_teacher_eval = _drop_cols(bt_teacher_eval)
bt_acc_teacher  = _quick_acc(bt_teacher_mod, bt_teacher_tok, bt_teacher_eval, max_samples=globals().get("EVAL_MAX_SAMPLES", None))
print(f"[BERTweet] teacher accuracy    = {bt_acc_teacher:.4f}")


[BERTweet] teacher size: 514.63 MB
[BERTweet] KD student size: 313.28 MB
[BERTweet] KD student accuracy = 0.9080
[BERTweet] teacher accuracy    = 0.9164


Load from drive the compressed model

DistilBERT Evaluation and Loading

In [72]:

HF_BEST = Path(TEACHER_PATH).parent

DISTIL_KD_DIR_SRC = Path("/content/drive/MyDrive/ADV_DL/kd_student_distilbert")

# Robust check for weights at source
if not DISTIL_KD_DIR_SRC.exists() or not any((DISTIL_KD_DIR_SRC / f).exists() for f in ("pytorch_model.bin","model.safetensors")):
    raise FileNotFoundError(
        "DistilBERT KD student not found at /content/drive/MyDrive/ADV_DL/kd_student_distilbert. "
        "Make sure the earlier KD save cell finished successfully."
    )

# Destination expected by compression/eval cells (they scan hf_best for *_kd_* dirs)
DISTIL_KD_DIR = HF_BEST / "distilbert_kd_bert_tiny"
DISTIL_KD_DIR.mkdir(parents=True, exist_ok=True)

# Copy weights/config only if dest is missing them
if not any((DISTIL_KD_DIR / f).exists() for f in ("pytorch_model.bin","model.safetensors")):
    for fname in [
        "config.json", "pytorch_model.bin", "model.safetensors",
        "tokenizer.json", "tokenizer_config.json", "vocab.txt",
        "merges.txt", "special_tokens_map.json"
    ]:
        src = DISTIL_KD_DIR_SRC / fname
        if src.exists():
            shutil.copy2(src, DISTIL_KD_DIR / fname)

print("DistilBERT KD dir set to:", DISTIL_KD_DIR)


✓ DistilBERT KD dir set to: /content/drive/MyDrive/ADV_DL/hf_best/distilbert_kd_bert_tiny


In [76]:
def _resolve_distil_eval_split(teacher_tok):
    # 1) Preferred: HF splits for DistilBERT
    if 'hf_val_dataset_distilbert' in globals():
        return _drop_cols(hf_val_dataset_distilbert)
    if 'hf_test_dataset_distilbert' in globals():
        return _drop_cols(hf_test_dataset_distilbert)

    # 2) Last resort: build from kd_val_ds by decoding its tokens -> raw text -> re-tokenize for DistilBERT
    if 'kd_val_ds' in globals():
        try:
            dec_tok = globals().get('student_tok', None)
            if dec_tok is None:
                from transformers import AutoTokenizer as _AT
                dec_tok = _AT.from_pretrained(KD_SAVE_DIR)   # BERTweet KD student tokenizer
            if hasattr(kd_val_ds, "column_names") and ("input_ids" in kd_val_ds.column_names):
                texts = dec_tok.batch_decode(kd_val_ds["input_ids"], skip_special_tokens=True)
                enc   = teacher_tok(texts, truncation=True, padding=False)
                return Dataset.from_dict({
                    "input_ids": enc["input_ids"],
                    "attention_mask": enc["attention_mask"],
                    "labels": kd_val_ds["labels"],
                })
        except Exception:
            pass

    # Nothing available
    return None

# define teacher tokenizer BEFORE calling resolver
db_teacher_tok = AutoTokenizer.from_pretrained(str(DISTIL_TEACHER_DIR), use_fast=True)
db_eval_ds = _resolve_distil_eval_split(db_teacher_tok)

if db_eval_ds is None:
    print("DistilBERT eval dataset not found; skipping accuracy computation for DistilBERT.")
else:
    # Student accuracy
    db_student_tok = AutoTokenizer.from_pretrained(str(DISTIL_KD_DIR))
    db_student_mod = AutoModelForSequenceClassification.from_pretrained(str(DISTIL_KD_DIR)).to(device).eval()
    db_acc_student = _quick_acc(
        db_student_mod, db_student_tok, db_eval_ds,
        max_samples=globals().get("EVAL_MAX_SAMPLES", None)
    )
    print(f"[DistilBERT] KD student accuracy = {db_acc_student:.4f}")

    # Teacher accuracy (same split)
    db_teacher_mod = AutoModelForSequenceClassification.from_pretrained(str(DISTIL_TEACHER_DIR)).to(device).eval()
    db_acc_teacher = _quick_acc(
        db_teacher_mod, db_teacher_tok, db_eval_ds,
        max_samples=globals().get("EVAL_MAX_SAMPLES", None)
    )
    print(f"[DistilBERT] teacher accuracy    = {db_acc_teacher:.4f}")


[DistilBERT] KD student accuracy = 0.8507
[DistilBERT] teacher accuracy    = 0.8946


# Final Compression Results & Comparison

Below we compare **DistilBERT** and **BERTweet** before and after three compression methods:
- **Quantization** (dynamic, Linear-only, PyTorch CPU)
- **Pruning** (global unstructured L1, 30%)
- **Knowledge Distillation (KD)** (teacher → small student)

note: dynamic quantization is CPU-only.

---

## DistilBERT

| Method               | Size (MB) | Val Acc | Sec/Batch |
|----------------------|----------:|--------:|----------:|
| Baseline (teacher)   | **255.43** | **0.8980** | **0.0813** |
| Quantized (dynamic)  | 132.29 | 0.8700 | 2.5604 |
| Pruned (30%)         | 255.43 | 0.8980 | 2.9027 |
| KD (bert-tiny)       | —      | 0.8526 | 0.0615 |

**Observations (DistilBERT):**
- **KD** gives the fastest inference with a modest accuracy drop (0.853 vs 0.898).
- **Pruning (30%)** kept accuracy essentially unchanged but didn’t speed up.
- **Quantization** reduced size by ~48% but ran slower here because it’s CPU only.

---

## BERTweet

| Method                      | Size (MB) | Val Acc | Sec/Batch |
|----------------------------|----------:|--------:|----------:|
| Baseline (teacher)         | **514.63** | **0.9080** | **0.1592** |
| Quantized (dynamic)        | 270.05 | 0.2080 | 5.0174 |
| Pruned (30%)               | **514.63** | **0.9080** | **5.5222** |
| KD (distilroberta student) | 313.28 | 0.6430 | 0.0615 |

**Observations (BERTweet):**
- **KD student** is much faster with an accuracy drop (0.643 vs 0.908).
- **Quantization** accuracy collapsed (0.208), we are not sure why. We actually expected this to be the highest as it only makes the weights digits a bit shorter.
- **Pruning** preserved accuracy but didn’t speed up.




# **TEST RUN** - Compressed Models

This section evaluates **all compressed models** (Quantized, Pruned, KD students) **from Drive** and produces a summary CSV for the paper.

Evaluates compressed models saved in `hf_best`:
- Quantized: `distilbert_quantized.pt`, `bertweet_quantized.pt`
- Pruned:    `distilbert_pruned_*`, `bertweet_pruned_*`
- KD:        `distilbert_kd_*`, `bertweet_kd_*`


In [None]:
BASE_DIR = "/content/drive/MyDrive/ADV_DL"

In [None]:
from pathlib import Path
import torch

# Expect these to exist already in the notebook:
assert 'BASE_DIR' in globals(), "BASE_DIR is not defined."
assert 'HF_BEST_DIR' in globals(), "HF_BEST_DIR is not defined."
assert 'device' in globals(), "device is not defined (torch device)."

BASE_DIR = Path(BASE_DIR)
HF_BEST_DIR = Path(HF_BEST_DIR)

print("BASE_DIR   :", BASE_DIR)
print("HF_BEST_DIR:", HF_BEST_DIR)
print("device     :", device)


BASE_DIR   : /content/drive/MyDrive/ADV_DL
HF_BEST_DIR: /content/drive/MyDrive/ADV_DL/hf_best
device     : cuda


### Tokenizers (load from Drive)


In [None]:


if 'distilbert_tokenizer' not in globals():
    distilbert_tokenizer = AutoTokenizer.from_pretrained(HF_BEST_DIR / "distilbert")
if 'bertweet_tokenizer' not in globals():
    bertweet_tokenizer   = AutoTokenizer.from_pretrained(HF_BEST_DIR / "bertweet")

print("✓ Tokenizers ready (DistilBERT & BERTweet).")


✓ Tokenizers ready (DistilBERT & BERTweet).


### Evaluation datasets (reuse val/test; one per family)

In [None]:
# Expect either val or test per family to be loaded earlier in the notebook
assert ('hf_val_dataset_distilbert' in globals()) or ('hf_test_dataset_distilbert' in globals()), \
       "Missing DistilBERT eval dataset (val or test)."
assert ('hf_val_dataset_bertweet' in globals()) or ('hf_test_dataset_bertweet' in globals()), \
       "Missing BERTweet eval dataset (val or test)."

ds_distil = globals().get('hf_val_dataset_distilbert') or globals().get('hf_test_dataset_distilbert')
ds_bertw  = globals().get('hf_val_dataset_bertweet')   or globals().get('hf_test_dataset_bertweet')

print("Datasets ready (DistilBERT & BERTweet).")

Datasets ready (DistilBERT & BERTweet).


### Weights & Biases
Log accuracy and sec/batch per artifact.


In [None]:
use_wandb = 'wandb' in globals()
if use_wandb:
    run = wandb.init(
        project = globals().get('WANDB_PROJECT', 'adv-dl-sentiment'),
        job_type= "compression-test",
        group   = "compression-eval",
        reinit  = True
    )
    print("✓ W&B logging enabled.")
else:
    print("W&B not active — skipping logging.")

0,1
bertweet/kd/accuracy,▁
bertweet/pruned/accuracy,▁
bertweet/quantized/accuracy,▁
distilbert/pruned/accuracy,▁
distilbert/quantized/accuracy,▁█
distilbert/quantized/sec_per_batch,▁

0,1
bertweet/kd/accuracy,0.646
bertweet/pruned/accuracy,0.91333
bertweet/quantized/accuracy,0.216
distilbert/pruned/accuracy,0.89867
distilbert/quantized/accuracy,0.894
distilbert/quantized/sec_per_batch,2.40332


✓ W&B logging enabled.


### Helpers (reuse `evaluate_model` / `measure_inference_time`)

In [None]:

def _model_device(model):
    try:
        return next(model.parameters()).device
    except StopIteration:
        return torch.device('cpu')

def _subset_dataset(ds, max_samples):
    if max_samples is None:
        return ds
    try:
        n = len(ds)
        if n > max_samples:
            # HuggingFace Dataset supports .select
            return ds.select(range(max_samples))
    except Exception:
        pass
    return ds

def _acc(model, tokenizer, dataset):
    # cap evaluation size to speed up
    MAX_SAMPLES = globals().get('EVAL_MAX_SAMPLES', 1500)  # adjust if you like
    ds = _subset_dataset(dataset, MAX_SAMPLES)

    # Prefer your notebook's evaluate_model; fall back if that one doesn't support subsets
    if 'evaluate_model' in globals():
        try:
            # if your evaluate_model can take n_samples, use it
            return evaluate_model(model, tokenizer, ds, n_samples=len(ds))
        except Exception:
            pass

    # Minimal fallback (no latency measurement)
    import numpy as np
    from sklearn.metrics import accuracy_score
    dev = _model_device(model)
    # higher batch for CPU to speed dynamic quant; moderate for GPU
    bs = 128 if dev.type == 'cpu' else 64
    model.eval()
    loader = DataLoader(ds, batch_size=bs, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
    preds, labels = [], []
    with torch.no_grad():
        for batch in loader:
            y = batch.pop("labels").numpy()
            batch = {k: v.to(dev) for k, v in batch.items()}
            logits = model(**batch).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            labels.extend(y)
    from sklearn.metrics import accuracy_score
    return accuracy_score(labels, preds)

# Quantized loader: keep on CPU; deprecation warning is fine for this run
def _load_quantized_teacher(teacher_dir, state_file):
    base = AutoModelForSequenceClassification.from_pretrained(teacher_dir)  # CPU
    qmod = torch.quantization.quantize_dynamic(base, {torch.nn.Linear}, dtype=torch.qint8)
    state = torch.load(state_file, map_location='cpu')
    qmod.load_state_dict(state)
    qmod.eval()
    return qmod


### Evaluate: Quantized models


In [None]:
results = []  # collected across all methods

distil_q_file = HF_BEST_DIR / "distilbert_quantized.pt"
bertw_q_file  = HF_BEST_DIR / "bertweet_quantized.pt"

if distil_q_file.exists():
    model = _load_quantized_teacher(HF_BEST_DIR / "distilbert", distil_q_file)
    acc = _acc(model, distilbert_tokenizer, ds_distil)
    results.append({"family": "distilbert", "method": "quantized", "artifact": str(distil_q_file), "accuracy": acc})
    if 'wandb' in globals(): wandb.log({"distilbert/quantized/accuracy": acc})
else:
    print("DistilBERT quantized file not found in hf_best.")

if bertw_q_file.exists():
    model = _load_quantized_teacher(HF_BEST_DIR / "bertweet", bertw_q_file)
    acc = _acc(model, bertweet_tokenizer, ds_bertw)
    results.append({"family": "bertweet", "method": "quantized", "artifact": str(bertw_q_file), "accuracy": acc})
    if 'wandb' in globals(): wandb.log({"bertweet/quantized/accuracy": acc})
else:
    print("BERTweet quantized file not found in hf_best.")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qmod = torch.quantization.quantize_dynamic(base, {torch.nn.Linear}, dtype=torch.qint8)
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrat

### Evaluate — Pruned

In [None]:
pruned_dirs = [p for p in HF_BEST_DIR.iterdir() if p.is_dir() and "_pruned_" in p.name.lower()]
print(f"Found {len(pruned_dirs)} pruned dirs:", [p.name for p in pruned_dirs])

for d in sorted(pruned_dirs):
    name = d.name.lower()
    fam  = "distilbert" if "distil" in name else "bertweet"
    tok  = distilbert_tokenizer if fam == "distilbert" else bertweet_tokenizer
    ds   = ds_distil if fam == "distilbert" else ds_bertw

    # Try GPU first; if it throws (device-side assert), fall back to CPU
    try:
        model = AutoModelForSequenceClassification.from_pretrained(d).to(device).eval()
    except Exception as e:
        print(f"GPU load failed for {d.name} ({type(e).__name__}). Falling back to CPU.")
        model = AutoModelForSequenceClassification.from_pretrained(d).to("cpu").eval()

    acc = _acc(model, tok, ds)
    print(f"[{fam}][{name}] accuracy={acc:.4f}")
    results.append({"family": fam, "method": f"pruned:{name}", "artifact": str(d), "accuracy": acc})
    if 'wandb' in globals(): wandb.log({f"{fam}/pruned/accuracy": acc})


Found 2 pruned dirs: ['distilbert_pruned_30', 'bertweet_pruned_30']
⚠️ GPU load failed for bertweet_pruned_30 (AcceleratorError). Falling back to CPU.
[bertweet][bertweet_pruned_30] accuracy=0.9133
⚠️ GPU load failed for distilbert_pruned_30 (AcceleratorError). Falling back to CPU.
[distilbert][distilbert_pruned_30] accuracy=0.8987


### Evaluate — KD students

In [None]:

kd_dirs = [p for p in HF_BEST_DIR.iterdir() if p.is_dir() and "_kd_" in p.name.lower()]
print(f"Found {len(kd_dirs)} KD dirs:", [p.name for p in kd_dirs])

def _pick_texts_and_labels(ds_src, max_n):
    # subset for speed
    ds_small = ds_src.select(range(min(len(ds_src), max_n))) if hasattr(ds_src, "select") else ds_src[:max_n]
    # prefer an existing raw text column
    text_key = None
    if hasattr(ds_small, "column_names"):
        for cand in ("text","Tweet","tweet","content","raw_text","orig_text"):
            if cand in ds_small.column_names:
                text_key = cand
                break
    if text_key is not None:
        texts  = ds_small[text_key]
        labels = ds_small["labels"]
    else:
        # decode from teacher tokens if no raw text present
        # (teacher tokenizer chosen by family outside)
        texts, labels = None, ds_small["labels"]
    return ds_small, texts, labels

def _find_weight_file(search_dirs):
    patterns = ["pytorch_model.bin", "model.safetensors", "*.bin", "*.pt", "*.pth"]
    for sd in search_dirs:
        sd = Path(sd)
        for pat in patterns:
            for fp in sd.glob(pat):
                return fp
    return None

for d in sorted(kd_dirs):
    name = d.name.lower()
    fam  = "distilbert" if "distil" in name else "bertweet"
    ds_teacher = ds_distil if fam == "distilbert" else ds_bertw
    max_n = globals().get('EVAL_MAX_SAMPLES', 1500)

    # 1) texts/labels
    ds_small, texts, labels = _pick_texts_and_labels(ds_teacher, max_n)

    # 2) student tokenizer (prefer folder; fallback by family/name)
    try:
        student_tok = AutoTokenizer.from_pretrained(str(d))
    except Exception:
        if "tiny" in name or fam == "distilbert":
            # TinyBERT-style student
            try:
                student_tok = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
            except Exception:
                student_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        else:
            # BERTweet-style student
            student_tok = AutoTokenizer.from_pretrained("vinai/bertweet-base")

    # If we didn't have raw texts, decode from TEACHER tokenizer
    if texts is None:
        teacher_tok = distilbert_tokenizer if fam == "distilbert" else bertweet_tokenizer
        ids_list = ds_small["input_ids"]
        texts = teacher_tok.batch_decode(ids_list, skip_special_tokens=True)

    # 3) student model
    student = None
    try:
        # normal case: weights exist in the KD dir
        student = AutoModelForSequenceClassification.from_pretrained(str(d))
    except OSError:
        # try to find weights nearby (inside dir or hf_best root)
        maybe_w = _find_weight_file([d, HF_BEST_DIR])
        if maybe_w is not None:
            # build the architecture from config in folder, else sensible base
            try:
                cfg = AutoConfig.from_pretrained(str(d))
            except Exception:
                base_ckpt = "prajjwal1/bert-tiny" if ("tiny" in name or fam == "distilbert") else "vinai/bertweet-base"
                cfg = AutoConfig.from_pretrained(base_ckpt, num_labels=3)
            student = AutoModelForSequenceClassification.from_config(cfg)
            state = torch.load(maybe_w, map_location="cpu")
            # state could be either a plain state_dict or under "state_dict"
            try:
                student.load_state_dict(state, strict=False)
            except Exception:
                if isinstance(state, dict) and "state_dict" in state:
                    student.load_state_dict(state["state_dict"], strict=False)
        else:
            print(f"⚠️ Skipping {d.name}: no model weights found in {d} or {HF_BEST_DIR}.")
            continue

    # keep KD students on CPU to avoid CUDA asserts
    student = student.to("cpu").eval()

    # 4) retokenize with the student tokenizer and evaluate (accuracy only)
    enc = student_tok(texts, truncation=True, padding=False)
    student_eval_ds = [{"input_ids": enc["input_ids"][i],
                        "attention_mask": enc["attention_mask"][i],
                        "labels": labels[i]} for i in range(len(labels))]

    acc = _acc(student, student_tok, student_eval_ds)
    print(f"[{fam}][{name}] accuracy={acc:.4f}")
    results.append({"family": fam, "method": f"kd:{name}", "artifact": str(d), "accuracy": acc})
    if 'wandb' in globals(): wandb.log({f"{fam}/kd/accuracy": acc})


Found 2 KD dirs: ['distilbert_kd_bert_tiny', 'bertweet_kd_student']
[bertweet][bertweet_kd_student] accuracy=0.6460


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[distilbert][distilbert_kd_bert_tiny] accuracy=0.1953


## Save summary to Drive


In [None]:
METRICS_DIR = BASE_DIR / "metrics"
METRICS_DIR.mkdir(exist_ok=True)

df = pd.DataFrame(results)
if not df.empty:
    df = df.sort_values(["family", "method"]).reset_index(drop=True)
    display(df)
    out_path = METRICS_DIR / "compression_test_summary.csv"
    df.to_csv(out_path, index=False, encoding="utf-8")
    print(f"✅ Saved summary: {out_path}")
    if 'wandb' in globals():
        wandb.log({"compression_test_summary": wandb.Table(dataframe=df)})
        run.finish()
else:
    print("No compressed artifacts were evaluated; summary not created.")


Unnamed: 0,family,method,artifact,accuracy
0,bertweet,kd:bertweet_kd_student,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.646
1,bertweet,kd:bertweet_kd_student,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.646
2,bertweet,kd:bertweet_kd_student,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.646
3,bertweet,kd:bertweet_kd_student,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.646
4,bertweet,pruned:bertweet_pruned_30,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.913333
5,bertweet,quantized,/content/drive/MyDrive/ADV_DL/hf_best/bertweet...,0.216
6,distilbert,kd:distilbert_kd_bert_tiny,/content/drive/MyDrive/ADV_DL/hf_best/distilbe...,0.195333
7,distilbert,pruned:distilbert_pruned_30,/content/drive/MyDrive/ADV_DL/hf_best/distilbe...,0.898667
8,distilbert,quantized,/content/drive/MyDrive/ADV_DL/hf_best/distilbe...,0.894


✅ Saved summary: /content/drive/MyDrive/ADV_DL/metrics/compression_test_summary.csv


0,1
bertweet/kd/accuracy,▁▁▁▁
bertweet/pruned/accuracy,▁
bertweet/quantized/accuracy,▁
distilbert/kd/accuracy,▁
distilbert/pruned/accuracy,▁
distilbert/quantized/accuracy,▁

0,1
bertweet/kd/accuracy,0.646
bertweet/pruned/accuracy,0.91333
bertweet/quantized/accuracy,0.216
distilbert/kd/accuracy,0.19533
distilbert/pruned/accuracy,0.89867
distilbert/quantized/accuracy,0.894


## Results Analysis
- **Pruning (30%)** — Best overall:
  - **BERTweet:** **0.9133** (≈ teacher, even slightly higher → regularization effect).
  - **DistilBERT:** **0.8987** (≈ teacher, no meaningful drop).
- **Quantization (INT8, dynamic)**:
  - **DistilBERT:** **0.8940** (≈ teacher) when evaluated on **CPU** → good.
  - **BERTweet:** **0.2160** → likely a **bad artifact** (mismatch between saved quantized state and the fine-tuned checkpoint). Pruning working well on the same model rules out data/labels.
- **Knowledge Distillation (KD)**:
  - **BERTweet → student:** **0.6460** (large drop vs. teacher).
  - **DistilBERT → TinyBERT:** **0.1953** (very low) and earlier evidence of **missing/unsaved student weights** in the KD folder.

## What could we do to improve
1. **BERTweet Quantization Fix**
   - Re-export from the **fine-tuned `hf_best/bertweet/`** checkpoint; evaluate **on CPU** immediately; then save that exact INT8 artifact. Re-run the test cell (should be ≈ teacher like DistilBERT).

2. **KD Pipeline Hardening**
   - Ensure **student weights are saved** (`save_pretrained` or `trainer.save_model`) to the KD dir you evaluate.
   - **Evaluate with the student tokenizer** (already implemented in your eval cell).
   - Use a **stronger student**:
     - For **BERTweet (RoBERTa-style)**: try `distilroberta-base`.
     - For **DistilBERT (BERT-family)**: prefer `prajjwal1/bert-mini` over `bert-tiny`.
   - KD hyperparams to try: **T=3–4**, **α (teacher loss)=0.7–0.9**, smaller **LR (1e-5–3e-5)**, **more epochs** (2–3× teacher).

3. **Pruning Enhancement (optional)**
   - Try **40–50%** sparsity with a **1–2 epoch post-prune finetune** (low LR) for larger size gains while keeping accuracy.

4. **Hygiene (submission clarity)**
   - Deduplicate CSV rows before saving.  
   - Keep INT8 eval **CPU-only** and document `EVAL_MAX_SAMPLES` cap used for speed.
