In [1]:
import os
import kagglehub
import pandas as pd
from copy import deepcopy
import numpy as np
import optuna
import wandb
from tqdm.notebook import tqdm
import gc
import random
import time
import html
import re
import wandb
from IPython.display import display

# PyTorch and Sklearn
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

# Hugging Face
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset as HFDataset

# Model Compression
from torch.nn.utils import prune
from torch.quantization import quantize_dynamic



In [2]:
# --- Global Settings ---
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Login to W&B (you might be prompted for an API key)
wandb.login(key="120e017ed2eaa1fa329d9b080c6b901366a51acb")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/matan/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmoti-matan[0m ([33mmoti-matan-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
def clean_tweet_text(text: str) -> str:
    # Decode HTML entities
    text = html.unescape(text)
    # Replace URLs with <URL> token
    text = re.sub(r'http\S+|www\S+', '<URL>', text)
    # Replace mentions with <USER> token: none of the usernames are relevant for context and were probably anonimized.
    text = re.sub(r'@\w+', '<USER>', text)
    # Optionally remove hashtag symbol but keep the word ( #covid -> covid )
    text = re.sub(r'#(\w+)', r'\1', text)
    # Normalize excessive repeated characters (e.g., sooooo → soo), also double or tripple spaces are a waste of tokens.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    # Remove leading/trailing whitespace and normalize internal spaces
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text


In [6]:
print("Downloading dataset...")
data_path = kagglehub.dataset_download("datatattle/covid-19-nlp-text-classification")

train_df_full = pd.read_csv(os.path.join(data_path, "Corona_NLP_train.csv"), encoding="latin1")
test_df = pd.read_csv(os.path.join(data_path, "Corona_NLP_test.csv"), encoding="latin1")

train_df_full['clean_text'] = train_df_full['OriginalTweet'].apply(clean_tweet_text)
test_df['clean_text'] = test_df['OriginalTweet'].apply(clean_tweet_text)

train_df, val_df = train_test_split(train_df_full, test_size=0.2, random_state=42, stratify=train_df_full['Sentiment'])

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Downloading dataset...


Train set size: 32925
Validation set size: 8232
Test set size: 3798


In [7]:
train_df['clean_text'].head()

3730     <USER> To everyone hoarding rice who until now...
35121    If your going to eat <USER> they have compleme...
9893     Watch this if you are one of those idiots who ...
34429    We need to have a risk management system more ...
29290    Markets plunge puts pension freedoms to the te...
Name: clean_text, dtype: object

In [8]:
class TweetsDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        sentiment_mapping = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4}
        self.texts = dataframe['clean_text'].tolist()
        self.labels = dataframe['Sentiment'].map(sentiment_mapping).tolist()
        self.tokenizer = tokenizer
        self.max_len = 512

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }



In [9]:
def train_and_validate(model, train_loader, val_loader, optimizer, epochs, trial, model_artifact_name):
    """
    Main function for training and validation loop.
    This version includes an outer progress bar for epochs.
    """
    best_val_f1 = 0.0
    
    # --- הוספת פס התקדמות חיצוני לאפוקים ---
    epoch_progress_bar = tqdm(range(1, epochs + 1), desc=f"Trial {trial.number} Epochs")
    
    for epoch in epoch_progress_bar:
        # --- עדכון התיאור של פס ההתקדמות ---
        epoch_progress_bar.set_description(f"Trial {trial.number} - Epoch {epoch}/{epochs}")

        # --- Training Loop (inner progress bar) ---
        model.train()
        train_loader_tqdm = tqdm(train_loader, desc="Training", leave=False)
        for batch in train_loader_tqdm:
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loader_tqdm.set_postfix(loss=loss.item())

        # --- Validation Loop (inner progress bar) ---
        model.eval()
        all_val_labels, all_val_preds = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation", leave=False):
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device)
                )
                preds = outputs.logits.argmax(dim=1)
                all_val_labels.extend(batch['labels'].cpu().numpy())
                all_val_preds.extend(preds.cpu().numpy())

        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        wandb.log({"Epoch": epoch, "Validation F1": val_f1})
        
        # Update the outer progress bar's postfix with the latest F1 score
        epoch_progress_bar.set_postfix(val_f1=f"{val_f1:.4f}")
        
        # --- Pruning Logic ---
        trial.report(val_f1, epoch)
        if trial.should_prune():
            del model, train_loader, val_loader, optimizer
            gc.collect()
            torch.cuda.empty_cache()
            raise optuna.TrialPruned()
        
        if val_f1 > best_val_f1:
            print(f"🚀 New best model found! F1: {val_f1:.4f} (Epoch {epoch})")
            best_val_f1 = val_f1
            artifact = wandb.Artifact(
                name=model_artifact_name, type='model',
                description=f'Best model from trial {trial.number} with F1: {val_f1:.4f}',
                metadata=dict(trial.params, epoch=epoch, val_f1=val_f1)
            )
            with artifact.new_file("model.pt", mode="wb") as f:
                torch.save(model.state_dict(), f)
            wandb.log_artifact(artifact, aliases=['best', f'trial-{trial.number}'])
            
    return best_val_f1

In [10]:
def objective(trial, model_name):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-7, 0.1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    num_unfreeze_layers = trial.suggest_int("num_unfreeze_layers", 1, 4)

    model_short_name = model_name.split('/')[-1]
    config = trial.params
    config['model_name'] = model_name # Add model_name to config

    wandb.init(
        project="moti-matan-tel-aviv-university",
        config=config,
        name=f"manual-{model_short_name}-trial-{trial.number}",
        reinit=True,
        group=f"Optuna-{model_short_name}"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True).to(device)

    # Layer Freezing
    for param in model.base_model.parameters():
        param.requires_grad = False

    encoder_layers = list(model.base_model.encoder.layer)
    for layer in encoder_layers[-num_unfreeze_layers:]:
        for param in layer.parameters():
            param.requires_grad = True
    for param in model.classifier.parameters():
        param.requires_grad = True

    train_subset_df = train_df.sample(frac=0.3, random_state=42)
    train_dataset = TweetsDataset(train_subset_df, tokenizer)
    val_dataset = TweetsDataset(val_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=weight_decay)

    best_f1 = train_and_validate(model, train_loader, val_loader, optimizer, epochs=5, trial=trial, model_artifact_name=f"manual-{model_short_name}")

    wandb.finish()
    del model, tokenizer, train_loader, val_loader, optimizer
    gc.collect()
    torch.cuda.empty_cache()

    return best_f1

In [11]:
# --- Model 1: cardiffnlp/twitter-roberta-base-sentiment ---
print("--- Starting Optuna study for cardiffnlp/twitter-roberta-base-sentiment ---")
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(lambda trial: objective(trial, model_name="cardiffnlp/twitter-roberta-base-sentiment"), n_trials=12)
print(f"Best trial for twitter-roberta-base-sentiment: {study_roberta.best_trial.value}")
print(f"Best parameters: {study_roberta.best_trial.params}")

[I 2025-08-09 07:10:27,572] A new study created in memory with name: no-name-83523677-886e-4d16-9617-665d88e8fb77


--- Starting Optuna study for cardiffnlp/twitter-roberta-base-sentiment ---




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 0 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5062 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5833 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5903 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6084 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6313 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▆▇█

0,1
Epoch,5.0
Validation F1,0.63131


[I 2025-08-09 07:23:45,746] Trial 0 finished with value: 0.6313095107620008 and parameters: {'learning_rate': 3.602978435454494e-05, 'weight_decay': 5.06165052892724e-06, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 1 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5505 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5708 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6067 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6262 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▃▆█▇

0,1
Epoch,5.0
Validation F1,0.61196


[I 2025-08-09 07:36:50,985] Trial 1 finished with value: 0.626160335915341 and parameters: {'learning_rate': 3.635357693684378e-05, 'weight_decay': 4.774943948309924e-07, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 2 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5137 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5498 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5803 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6028 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▄▆█▆

0,1
Epoch,5.0
Validation F1,0.57861


[I 2025-08-09 07:49:55,828] Trial 2 finished with value: 0.6027993159904803 and parameters: {'learning_rate': 1.6256757700109393e-05, 'weight_decay': 0.003996165239403762, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 3 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4302 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4720 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4898 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5031 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5285 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▄▅▆█

0,1
Epoch,5.0
Validation F1,0.52847


[I 2025-08-09 08:00:42,410] Trial 3 finished with value: 0.5284748261977539 and parameters: {'learning_rate': 3.1256493649358045e-05, 'weight_decay': 0.00032323590769821347, 'batch_size': 32, 'num_unfreeze_layers': 1}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 4 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4461 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4971 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5251 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5417 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5461 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▇██

0,1
Epoch,5.0
Validation F1,0.54611


[I 2025-08-09 08:12:12,301] Trial 4 finished with value: 0.5461139206600677 and parameters: {'learning_rate': 4.033259035390765e-05, 'weight_decay': 2.0488008496575643e-05, 'batch_size': 16, 'num_unfreeze_layers': 1}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 5 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5343 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5584 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5998 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6156 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▃▇██

0,1
Epoch,5.0
Validation F1,0.61064


[I 2025-08-09 08:25:17,031] Trial 5 finished with value: 0.6155614373069268 and parameters: {'learning_rate': 2.1846952565174106e-05, 'weight_decay': 0.0014649081936008194, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 0 with value: 0.6313095107620008.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 6 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

[I 2025-08-09 08:27:32,959] Trial 6 pruned. 


0,1
Epoch,▁
Validation F1,▁

0,1
Epoch,1.0
Validation F1,0.48094


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 7 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

[I 2025-08-09 08:29:58,416] Trial 7 pruned. 


0,1
Epoch,▁
Validation F1,▁

0,1
Epoch,1.0
Validation F1,0.4901


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 8 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

[I 2025-08-09 08:32:25,121] Trial 8 pruned. 


0,1
Epoch,▁
Validation F1,▁

0,1
Epoch,1.0
Validation F1,0.48725


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 9 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5106 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5610 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5856 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

[I 2025-08-09 08:42:55,785] Trial 9 pruned. 


0,1
Epoch,▁▃▆█
Validation F1,▁▆██

0,1
Epoch,4.0
Validation F1,0.58182


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 10 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5797 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6307 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6447 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6500 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▆▇█▇

0,1
Epoch,5.0
Validation F1,0.64021


[I 2025-08-09 08:56:53,561] Trial 10 finished with value: 0.6499960574725696 and parameters: {'learning_rate': 4.584013354323171e-05, 'weight_decay': 0.07357512825552659, 'batch_size': 16, 'num_unfreeze_layers': 4}. Best is trial 10 with value: 0.6499960574725696.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 11 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5734 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6324 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6381 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6488 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▆▅▇█

0,1
Epoch,5.0
Validation F1,0.64877


[I 2025-08-09 09:10:57,730] Trial 11 finished with value: 0.6487654929617506 and parameters: {'learning_rate': 4.782890276843486e-05, 'weight_decay': 0.08944919072752507, 'batch_size': 16, 'num_unfreeze_layers': 4}. Best is trial 10 with value: 0.6499960574725696.


Best trial for twitter-roberta-base-sentiment: 0.6499960574725696
Best parameters: {'learning_rate': 4.584013354323171e-05, 'weight_decay': 0.07357512825552659, 'batch_size': 16, 'num_unfreeze_layers': 4}


In [12]:
# --- Model 2: roberta-base ---
print("\n--- Starting Optuna study for roberta-base ---")
study_roberta_base = optuna.create_study(direction="maximize")
study_roberta_base.optimize(lambda trial: objective(trial, model_name="roberta-base"), n_trials=12)
print(f"Best trial for roberta-base: {study_roberta_base.best_trial.value}")
print(f"Best parameters: {study_roberta_base.best_trial.params}")


[I 2025-08-09 09:10:57,992] A new study created in memory with name: no-name-cc3f721f-d935-4793-a3b8-f4e639c1e699



--- Starting Optuna study for roberta-base ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 0 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4480 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5133 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5214 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5519 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5680 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▅▇█

0,1
Epoch,5.0
Validation F1,0.56803


[I 2025-08-09 09:23:26,487] Trial 0 finished with value: 0.5680272718070384 and parameters: {'learning_rate': 1.0397713196315484e-05, 'weight_decay': 1.2331674571361498e-07, 'batch_size': 32, 'num_unfreeze_layers': 3}. Best is trial 0 with value: 0.5680272718070384.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 1 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.3191 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4556 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4837 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5037 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5115 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▆▇██

0,1
Epoch,5.0
Validation F1,0.5115


[I 2025-08-09 09:34:10,656] Trial 1 finished with value: 0.511497924391657 and parameters: {'learning_rate': 1.847125941484408e-05, 'weight_decay': 1.1816346173642974e-06, 'batch_size': 32, 'num_unfreeze_layers': 1}. Best is trial 0 with value: 0.5680272718070384.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 2 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.3670 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4629 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4906 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5096 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5222 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▇▇█

0,1
Epoch,5.0
Validation F1,0.52224


[I 2025-08-09 09:44:53,536] Trial 2 finished with value: 0.5222353197293785 and parameters: {'learning_rate': 2.1644511042858013e-05, 'weight_decay': 0.008112133938337044, 'batch_size': 32, 'num_unfreeze_layers': 1}. Best is trial 0 with value: 0.5680272718070384.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 3 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5070 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5542 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5807 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5851 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5909 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▇██

0,1
Epoch,5.0
Validation F1,0.59094


[I 2025-08-09 09:56:27,364] Trial 3 finished with value: 0.5909436959756996 and parameters: {'learning_rate': 4.7929901289161835e-05, 'weight_decay': 1.3282074455042119e-06, 'batch_size': 32, 'num_unfreeze_layers': 2}. Best is trial 3 with value: 0.5909436959756996.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 4 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4097 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.4763 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5003 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5125 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5237 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▇▇█

0,1
Epoch,5.0
Validation F1,0.52366


[I 2025-08-09 10:07:09,996] Trial 4 finished with value: 0.5236636620044861 and parameters: {'learning_rate': 2.898650203441786e-05, 'weight_decay': 0.04908383211047201, 'batch_size': 32, 'num_unfreeze_layers': 1}. Best is trial 3 with value: 0.5909436959756996.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 5 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5220 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5675 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5733 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6025 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▅█▆

0,1
Epoch,5.0
Validation F1,0.58487


[I 2025-08-09 10:19:23,064] Trial 5 finished with value: 0.6025306055153575 and parameters: {'learning_rate': 3.1212952317354524e-05, 'weight_decay': 0.046414420922664394, 'batch_size': 32, 'num_unfreeze_layers': 3}. Best is trial 5 with value: 0.6025306055153575.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 6 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5051 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5665 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5909 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5961 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▆█▇█

0,1
Epoch,5.0
Validation F1,0.59608


[I 2025-08-09 10:32:32,840] Trial 6 finished with value: 0.5960806315903034 and parameters: {'learning_rate': 1.705079421574817e-05, 'weight_decay': 9.949476247942805e-06, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 5 with value: 0.6025306055153575.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 7 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5032 (Epoch 1)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5559 (Epoch 2)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5664 (Epoch 3)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5840 (Epoch 4)


Training:   0%|          | 0/309 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/258 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▆▆██

0,1
Epoch,5.0
Validation F1,0.58123


[I 2025-08-09 10:44:47,072] Trial 7 finished with value: 0.5839743679436532 and parameters: {'learning_rate': 2.520915028282754e-05, 'weight_decay': 3.161435172162095e-07, 'batch_size': 32, 'num_unfreeze_layers': 3}. Best is trial 5 with value: 0.6025306055153575.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 8 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5281 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5713 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6025 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6151 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

0,1
Epoch,▁▃▅▆█
Validation F1,▁▄▇██

0,1
Epoch,5.0
Validation F1,0.61457


[I 2025-08-09 10:57:48,529] Trial 8 finished with value: 0.6150582844160238 and parameters: {'learning_rate': 4.3535289049703364e-05, 'weight_decay': 0.01984568287057082, 'batch_size': 16, 'num_unfreeze_layers': 3}. Best is trial 8 with value: 0.6150582844160238.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 9 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

[I 2025-08-09 11:00:23,156] Trial 9 pruned. 


0,1
Epoch,▁
Validation F1,▁

0,1
Epoch,1.0
Validation F1,0.49526


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 10 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5659 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5986 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6022 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6167 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▅▆▆█

0,1
Epoch,5.0
Validation F1,0.61673


[I 2025-08-09 11:14:25,642] Trial 10 finished with value: 0.6167294556677039 and parameters: {'learning_rate': 4.7880923166976995e-05, 'weight_decay': 0.000633976505241131, 'batch_size': 16, 'num_unfreeze_layers': 4}. Best is trial 10 with value: 0.6167294556677039.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 11 Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5760 (Epoch 1)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.5802 (Epoch 2)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6006 (Epoch 3)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6166 (Epoch 4)


Training:   0%|          | 0/618 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Validation:   0%|          | 0/515 [00:00<?, ?it/s]

🚀 New best model found! F1: 0.6170 (Epoch 5)


0,1
Epoch,▁▃▅▆█
Validation F1,▁▂▅██

0,1
Epoch,5.0
Validation F1,0.61699


[I 2025-08-09 11:28:29,910] Trial 11 finished with value: 0.6169873961881869 and parameters: {'learning_rate': 4.862140955744093e-05, 'weight_decay': 0.0013571248930636039, 'batch_size': 16, 'num_unfreeze_layers': 4}. Best is trial 11 with value: 0.6169873961881869.


Best trial for roberta-base: 0.6169873961881869
Best parameters: {'learning_rate': 4.862140955744093e-05, 'weight_decay': 0.0013571248930636039, 'batch_size': 16, 'num_unfreeze_layers': 4}


In [13]:
def compute_metrics_for_trainer(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'f1': f1_score(labels, predictions, average='weighted')}

# הגדרת המודלים לאימון
models_to_train_with_hf = ["cardiffnlp/twitter-roberta-base-sentiment", "roberta-base"]

# הכנת הדאטהסטים
sentiment_mapping = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4}
train_temp_df = train_df[['clean_text', 'Sentiment']].rename(columns={'Sentiment': 'label'})
val_temp_df = val_df[['clean_text', 'Sentiment']].rename(columns={'Sentiment': 'label'})
train_temp_df['label'] = train_temp_df['label'].map(sentiment_mapping)
val_temp_df['label'] = val_temp_df['label'].map(sentiment_mapping)
train_dataset_hf = HFDataset.from_pandas(train_temp_df.sample(frac=0.3, random_state=42))
val_dataset_hf = HFDataset.from_pandas(val_temp_df)

# לולאה על המודלים
for model_name in models_to_train_with_hf:
    print(f"\n--- Training {model_name} with Hugging Face Trainer ---")
    model_short_name = model_name.split('/')[-1]
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    
    def tokenize_function(examples):
        return tokenizer(examples['clean_text'], truncation=True, padding='max_length', max_length=512)
        
    train_tokenized = train_dataset_hf.map(tokenize_function, batched=True)
    val_tokenized = val_dataset_hf.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
    
    training_args = TrainingArguments(
        output_dir=f'./results/{model_short_name}',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        logging_dir=f'./logs/{model_short_name}',
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="wandb",
        run_name=f"hf-trainer-{model_short_name}",
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model, args=training_args, train_dataset=train_tokenized,
        eval_dataset=val_tokenized, tokenizer=tokenizer, compute_metrics=compute_metrics_for_trainer
    )
    
    run = wandb.init(project="moti-matan-tel-aviv-university", name=f"hf-trainer-{model_short_name}", reinit=True)
    trainer.train()
    
    best_model_path = trainer.state.best_model_checkpoint
    if best_model_path:
        print(f"Saving best model from trainer found at: {best_model_path}")
        artifact = wandb.Artifact(
            name=f"trainer-{model_short_name}",
            type='model',
            # --- הוספת תיאור ברור ---
            description=f"Best model for {model_name} trained using the Hugging Face Trainer."
        )
        artifact.add_dir(best_model_path)
        run.log_artifact(artifact, aliases=['best'])
    
    wandb.finish()


--- Training cardiffnlp/twitter-roberta-base-sentiment with Hugging Face Trainer ---


Map:   0%|          | 0/9878 [00:00<?, ? examples/s]

Map:   0%|          | 0/8232 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1
1,0.8908,0.83912,0.669961
2,0.6375,0.729731,0.733533
3,0.394,0.746678,0.74608
4,0.2806,0.855474,0.753194
5,0.1756,0.944261,0.760618


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./results/twitter-roberta-base-sentiment/checkpoint-1545)... 

Saving best model from trainer found at: ./results/twitter-roberta-base-sentiment/checkpoint-1545


Done. 7.7s


0,1
eval/f1,▁▆▇▇█
eval/loss,▅▁▂▅█
eval/runtime,▃▇▁▇█
eval/samples_per_second,▆▂█▂▁
eval/steps_per_second,▆▂█▂▁
train/epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▃▃▄▃▂▆▆▄▅▁█▂▂▅▄
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▃▂▁▁
train/loss,█▆▆▄▄▄▃▃▂▂▂▂▁▁▁

0,1
eval/f1,0.76062
eval/loss,0.94426
eval/runtime,29.0734
eval/samples_per_second,283.145
eval/steps_per_second,4.437
total_flos,1.299540505629696e+16
train/epoch,5.0
train/global_step,1545.0
train/grad_norm,11.07288
train/learning_rate,0.0



--- Training roberta-base with Hugging Face Trainer ---


Map:   0%|          | 0/9878 [00:00<?, ? examples/s]

Map:   0%|          | 0/8232 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1
1,1.005,0.975701,0.609225
2,0.7518,0.749547,0.718719
3,0.5004,0.791112,0.709633
4,0.3985,0.743602,0.74982
5,0.2574,0.856273,0.754155


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./results/roberta-base/checkpoint-1545)... 

Saving best model from trainer found at: ./results/roberta-base/checkpoint-1545


Done. 7.7s


0,1
eval/f1,▁▆▆██
eval/loss,█▁▂▁▄
eval/runtime,▁▄█▂▂
eval/samples_per_second,█▅▁▇▇
eval/steps_per_second,█▅▁▇▇
train/epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▆▄▄▇▃▆▅▄▂█▇▅▇▁▄
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▃▂▁▁
train/loss,█▆▅▄▄▄▃▃▂▂▂▂▁▁▁

0,1
eval/f1,0.75415
eval/loss,0.85627
eval/runtime,28.878
eval/samples_per_second,285.061
eval/steps_per_second,4.467
total_flos,1.299540505629696e+16
train/epoch,5.0
train/global_step,1545.0
train/grad_norm,11.77544
train/learning_rate,0.0


In [14]:
def evaluate_on_test_set(model, test_loader, model_name="model"):
    """
    Evaluates a model's performance on the test set, returning a comprehensive
    set of metrics (Accuracy, F1, Precision, Recall).
    """
    model.to(device).eval() # ודא שהמודל במצב הערכה ועל המכשיר הנכון
    all_labels = []
    all_preds = []

    # אין צורך בחישוב גרדיאנטים בשלב ההערכה
    with torch.no_grad():
        # עטיפת ה-loader עם tqdm להצגת פס התקדמות
        for batch in tqdm(test_loader, desc=f"Evaluating {model_name} on Test Set"):
            # העברת המנה (batch) לאותו מכשיר שבו נמצא המודל
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'] # שמירת התוויות לחישוב המדדים

            # קבלת תחזיות המודל
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.argmax(dim=1)
            
            # איסוף התוויות והתחזיות מכל המנות
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())
    
    # חישוב כל המדדים הרלוונטיים
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    
    # החזרת המדדים במילון
    return {
        "Accuracy": accuracy,
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall
    }

In [15]:
# Compression helpers
def compress_prune_model(model, prune_percent=0.4):
    model_to_prune = deepcopy(model).to('cpu')
    parameters_to_prune = [(module, 'weight') for module in model_to_prune.modules() if isinstance(module, nn.Linear)]
    if parameters_to_prune:
        prune.global_unstructured(parameters_to_prune, pruning_method=prune.L1Unstructured, amount=prune_percent)
    return model_to_prune

def compress_quantize_model(model):
    return quantize_dynamic(deepcopy(model).to('cpu'), {nn.Linear}, dtype=torch.qint8)

def evaluate_compressed_model(model, test_loader, model_name="model"):
    """Evaluates a model's performance, size, and inference time."""
    eval_device = 'cpu' if 'quantized' in model_name.lower() else device
    model.to(eval_device).eval()

    all_labels, all_preds, inference_times = [], [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Evaluating {model_name}", leave=False):
            input_ids = batch['input_ids'].to(eval_device)
            attention_mask = batch['attention_mask'].to(eval_device)

            start_time = time.time()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            inference_times.append(time.time() - start_time)

            all_labels.extend(batch['labels'].numpy())
            all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())

    size_mb = sum(p.element_size() * p.numel() for p in model.parameters()) / (1024 * 1024)

    return {
        "F1 Score": f1_score(all_labels, all_preds, average='weighted'),
        "Accuracy": accuracy_score(all_labels, all_preds),
        "Avg Inference Time (ms)": np.mean(inference_times) * 1000,
        "Model Size (MB)": size_mb
    }

In [16]:
def get_best_model(model_name, method):
    """Downloads the best model artifact from W&B for a given method."""
    print(f"\n--- Loading best model for {model_name} (Method: {method}) ---")
    model_short_name = model_name.split('/')[-1]
    artifact_name = f"{method}-{model_short_name}:best"
    
    run = wandb.init(project="moti-matan-tel-aviv-university", job_type='evaluation', reinit=True)
    try:
        artifact = run.use_artifact(artifact_name)
        artifact_dir = artifact.download()
        
        if os.path.exists(os.path.join(artifact_dir, "config.json")):
            model = AutoModelForSequenceClassification.from_pretrained(artifact_dir, num_labels=5)
        else:
            model_path = os.path.join(artifact_dir, "model.pt")
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
            model.load_state_dict(torch.load(model_path, map_location=device))
        return model
    finally:
        run.finish()

def evaluate_on_test_set(model, test_loader, model_name="model"):
    """Evaluates a model's performance on the test set."""
    model.to(device).eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Evaluating {model_name} on Test Set"):
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device)
            )
            all_labels.extend(batch['labels'].numpy())
            all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
    return {
        "Accuracy": accuracy_score(all_labels, all_preds),
        "F1 Score": f1_score(all_labels, all_preds, average='weighted')
    }

def compress_prune_model(model, prune_percent=0.4):
    model_to_prune = deepcopy(model).to('cpu')
    parameters_to_prune = [(module, 'weight') for module in model_to_prune.modules() if isinstance(module, nn.Linear)]
    if parameters_to_prune:
        prune.global_unstructured(parameters_to_prune, pruning_method=prune.L1Unstructured, amount=prune_percent)
    return model_to_prune

def compress_quantize_model(model):
    return quantize_dynamic(deepcopy(model).to('cpu'), {nn.Linear}, dtype=torch.qint8)

def evaluate_compressed_model(model, test_loader, model_name="model"):
    """Evaluates a model's performance, size, and inference time."""
    eval_device = 'cpu' if 'Quantized' in model_name else device
    model.to(eval_device).eval()
    
    all_labels, all_preds, inference_times = [], [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Evaluating {model_name}", leave=False):
            input_ids = batch['input_ids'].to(eval_device)
            attention_mask = batch['attention_mask'].to(eval_device)
            
            start_time = time.time()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            inference_times.append(time.time() - start_time)
            
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())

    temp_model = deepcopy(model)
    for module in temp_model.modules():
        if isinstance(module, torch.nn.Linear) and prune.is_pruned(module):
            prune.remove(module, 'weight')
    size_mb = sum(p.element_size() * p.numel() for p in temp_model.parameters()) / (1024 * 1024)
    del temp_model

    return {
        "F1 Score": f1_score(all_labels, all_preds, average='weighted'),
        "Accuracy": accuracy_score(all_labels, all_preds),
        "Avg Inference Time (ms)": np.mean(inference_times) * 1000,
        "Model Size (MB)": size_mb
    }

In [17]:
# --- FINAL COMPETITION ---
all_test_results = {}
champion_models = {} # Dictionary to hold the two winning models
models_to_evaluate = ["cardiffnlp/twitter-roberta-base-sentiment", "roberta-base"]
tokenizer = AutoTokenizer.from_pretrained(models_to_evaluate[0])
test_dataset = TweetsDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# --- Evaluate all 4 models and find the champion for each base model ---
for model_name in models_to_evaluate:
    manual_model = get_best_model(model_name, "manual")
    manual_results = evaluate_on_test_set(manual_model, test_loader, f"{model_name} (manual)")
    all_test_results[f"{model_name} (manual)"] = manual_results

    trainer_model = get_best_model(model_name, "trainer")
    trainer_results = evaluate_on_test_set(trainer_model, test_loader, f"{model_name} (trainer)")
    all_test_results[f"{model_name} (trainer)"] = trainer_results
    
    # --- Tag the champion for this model type in W&B ---
    run = wandb.init(project="moti-matan-tel-aviv-university", job_type='tagging', reinit=True)
    model_short_name = model_name.split('/')[-1]
    champion_alias = f"champion-{model_short_name}"
    
    if trainer_results["F1 Score"] > manual_results["F1 Score"]:
        print(f"🏆 Trainer is the champion for {model_short_name}!")
        artifact_to_tag = f"trainer-{model_short_name}:best"
        champion_models[model_name] = trainer_model
    else:
        print(f"🏆 Manual/Optuna is the champion for {model_short_name}!")
        artifact_to_tag = f"manual-{model_short_name}:best"
        champion_models[model_name] = manual_model
            
    try:
        artifact = run.use_artifact(artifact_to_tag)
        if champion_alias not in artifact.aliases:
            artifact.aliases.append(champion_alias)
            artifact.save()
            print(f"Successfully added alias '{champion_alias}' to {artifact_to_tag}")
    except Exception as e:
        print(f"Could not apply champion alias: {e}")
    finally:
        run.finish()

print("\n--- CHAMPIONSHIP: Final Test Results ---")
results_df = pd.DataFrame.from_dict(all_test_results, orient='index')
display(results_df.round(4))


--- Loading best model for cardiffnlp/twitter-roberta-base-sentiment (Method: manual) ---


[34m[1mwandb[0m: Downloading large artifact manual-twitter-roberta-base-sentiment:best, 475.58MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:7.3 (65.1MB/s)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating cardiffnlp/twitter-roberta-base-sentiment (manual) on Test Set:   0%|          | 0/119 [00:00<?, ?i…

  return forward_call(*args, **kwargs)



--- Loading best model for cardiffnlp/twitter-roberta-base-sentiment (Method: trainer) ---


[34m[1mwandb[0m: Downloading large artifact trainer-twitter-roberta-base-sentiment:best, 1431.30MB. 12 files... 
[34m[1mwandb[0m:   12 of 12 files downloaded.  
Done. 0:0:1.0 (1479.0MB/s)


Evaluating cardiffnlp/twitter-roberta-base-sentiment (trainer) on Test Set:   0%|          | 0/119 [00:00<?, ?…

  return forward_call(*args, **kwargs)


🏆 Trainer is the champion for twitter-roberta-base-sentiment!
Successfully added alias 'champion-twitter-roberta-base-sentiment' to trainer-twitter-roberta-base-sentiment:best



--- Loading best model for roberta-base (Method: manual) ---


[34m[1mwandb[0m: Downloading large artifact manual-roberta-base:best, 475.58MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:8.6 (55.3MB/s)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating roberta-base (manual) on Test Set:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)



--- Loading best model for roberta-base (Method: trainer) ---


[34m[1mwandb[0m: Downloading large artifact trainer-roberta-base:best, 1431.30MB. 12 files... 
[34m[1mwandb[0m:   12 of 12 files downloaded.  
Done. 0:0:0.9 (1590.2MB/s)


Evaluating roberta-base (trainer) on Test Set:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


🏆 Trainer is the champion for roberta-base!
Successfully added alias 'champion-roberta-base' to trainer-roberta-base:best



--- CHAMPIONSHIP: Final Test Results ---


Unnamed: 0,Accuracy,F1 Score
cardiffnlp/twitter-roberta-base-sentiment (manual),0.6172,0.616
cardiffnlp/twitter-roberta-base-sentiment (trainer),0.7378,0.7388
roberta-base (manual),0.5948,0.599
roberta-base (trainer),0.7388,0.7397


In [18]:
# --- Run Compression Analysis on EACH of the two champions ---
for model_name, winning_model in champion_models.items():
    print(f"\n{'='*20} Compression Analysis for Champion: {model_name} {'='*20}")
    compression_results = {}
    
    # 1. Original Model (The Champion)
    compression_results['Original'] = evaluate_compressed_model(winning_model, test_loader, "Original")

    # 2. Pruned Model
    pruned_model = compress_prune_model(winning_model)
    compression_results['Pruned (40%)'] = evaluate_compressed_model(pruned_model, test_loader, "Pruned")

    # 3. Quantized Model
    quantized_model = compress_quantize_model(winning_model)
    compression_results['Quantized (INT8)'] = evaluate_compressed_model(quantized_model, test_loader, "Quantized")
    
    # 4. Distilled Model (simulated with a smaller architecture)
    distilled_model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=5)
    compression_results['Distilled (Untrained)'] = evaluate_compressed_model(distilled_model, test_loader, "Distilled")

    compression_df = pd.DataFrame.from_dict(compression_results, orient='index')
    display(compression_df.round(4))




Evaluating Original:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Evaluating Pruned:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Evaluating Quantized:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Distilled:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Unnamed: 0,F1 Score,Accuracy,Avg Inference Time (ms),Model Size (MB)
Original,0.7388,0.7378,6.2664,475.5
Pruned (40%),0.4524,0.5269,8.5305,475.5
Quantized (INT8),0.7246,0.7246,1388.7191,148.916
Distilled (Untrained),0.1181,0.273,3.8305,313.2715





Evaluating Original:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Evaluating Pruned:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Evaluating Quantized:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Distilled:   0%|          | 0/119 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Unnamed: 0,F1 Score,Accuracy,Avg Inference Time (ms),Model Size (MB)
Original,0.7397,0.7388,6.2824,475.5
Pruned (40%),0.3652,0.4476,8.6335,475.5
Quantized (INT8),0.7157,0.7146,1391.2147,148.916
Distilled (Untrained),0.1194,0.2738,3.7978,313.2715
