In [18]:
!pip install --upgrade pip
!pip install tokenizers
!pip install datasets --upgrade evaluate
!pip install transformers
!pip install numpy torch matplotlib pandas scikit-learn tqdm pillow
!pip install datasets evaluate transformers
!pip install torchvision
!pip install setuptools
!pip install wandb
!pip show wandb
!pip install schedulefree
!pip install nbformat


Name: wandb
Version: 0.19.9
Summary: A CLI and library for interacting with the Weights & Biases API.
Home-page: 
Author: 
Author-email: Weights & Biases <support@wandb.com>
License: MIT License

Copyright (c) 2021 Weights and Biases, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEM

In [19]:
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from evaluate import load
from transformers import (
    ViTFeatureExtractor,
    ViTForImageClassification,
    TrainingArguments,
    Trainer,
    get_scheduler,
    AutoImageProcessor
)

from torch.optim import AdamW, SGD
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torchvision.transforms as transforms
from PIL import Image
import random
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
from schedulefree import AdamWScheduleFree

In [20]:
# sweep_config = {
#     "method": "grid",  # we cna use "grid", "random", or "bayes"
#     "metric": {
#         "name": "val_accuracy",  # Metric to optimize
#         "goal": "maximize"       # Goal: maximize or minimize
#     },
#     "parameters": {
#         "optimizer_name": {
#             "values": ["AdamW", "SGD", "RMSProp", "AdaGrad", "schedule_free_adamw"]  # Optimizers to test
#         },
#         "learning_rate": {
#             "values": [2e-5, 2e-4, 2e-3, 2e-2, 2e-1]  # Fixed learning rate for simplicity
#         },
#         "batch_size": {
#             "values": [16]  # Fixed batch size
#         },
#         "num_epochs": {
#             "values": [3]  # Fixed number of epochs
#         },
#         "scheduler_name": {
#             "values": ["cosine"]  # Fixed scheduler for simplicity
#         }
#     }
# }

In [21]:
# Set seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

# Initialize experiment tracking
def init_wandb(project_name, experiment_name, config):
    return wandb.init(
        # entity="dl_project_sp25",
        project=project_name,
        name=experiment_name,
        config=config,
        reinit=True
    )

# Load and prepare dataset
def prepare_dataset(dataset_name, image_processor):
    """
    Load and prepare a dataset from Hugging Face for ViT fine-tuning
    """
    # Load the dataset
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name)

    # Get label information
    if "label" in dataset["train"].features:
        labels = dataset["train"].features["label"].names
    elif "labels" in dataset["train"].features:
        labels = dataset["train"].features["labels"].names
    else:
        # Count unique labels and create labels list
        all_labels = dataset["train"][0]["label"] if "label" in dataset["train"][0] else dataset["train"][0]["labels"]
        num_labels = len(set(all_labels))
        labels = [str(i) for i in range(num_labels)]

    # Create label mappings
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for i, label in enumerate(labels)}

    # Set up image transformations based on the model's requirements
    normalize = transforms.Normalize(
        mean=image_processor.image_mean,
        std=image_processor.image_std
    )

    # Get the expected image size
    if "shortest_edge" in image_processor.size:
        size = image_processor.size["shortest_edge"]
    else:
        size = (image_processor.size["height"], image_processor.size["width"])

    # Define transforms for training data
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    # Define transforms for validation/test data
    val_transforms = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop(size),
        transforms.ToTensor(),
        normalize,
    ])

    # Apply transformations to the dataset
    def preprocess_train(examples):
        examples["pixel_values"] = [
            train_transforms(image.convert("RGB"))
            for image in examples["image"]
        ]
        return examples

    def preprocess_val(examples):
        examples["pixel_values"] = [
            val_transforms(image.convert("RGB"))
            for image in examples["image"]
        ]
        return examples

    # Apply preprocessing to each split
    train_dataset = dataset["train"].map(
        preprocess_train, batched=True, remove_columns=["image"]
    )

    if "validation" in dataset:
        val_dataset = dataset["validation"].map(
            preprocess_val, batched=True, remove_columns=["image"]
        )

    else:
        # Create a validation split if none exists
        splits = train_dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = splits["train"]
        val_dataset = splits["test"]

    if "test" in dataset:
        test_dataset = dataset["test"].map(
            preprocess_val, batched=True, remove_columns=["image"]
        )
    else:
        # test_dataset = val_dataset    #split further rather than using validation as test dataset

        # Further split validation dataset to create a test dataset
        test_split = val_dataset.train_test_split(test_size=0.2, seed=42)
        val_dataset = test_split["train"]  # Update validation dataset
        test_dataset = test_split["test"]  # Create test dataset

    print(f"Dataset prepared with {len(train_dataset)} training, {len(val_dataset)} validation, and {len(test_dataset)} test examples")

    return train_dataset, val_dataset, test_dataset, id2label, label2id

# Define compute_metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# Main experiment pipeline
def run_vit_experiment(config):
    """
    Run a ViT experiment with the specified configuration
    """
    # Initialize wandb for experiment tracking
    run = init_wandb("ViT-LR-Schedulers", config["experiment_name"], config)

    # Load the image processor for the model
    image_processor = AutoImageProcessor.from_pretrained(config["model_name"], use_fast=True)

    # Prepare the dataset
    train_dataset, val_dataset, test_dataset, id2label, label2id = prepare_dataset(
        config["dataset_name"], image_processor
    )

    # # Visualize some images from the training dataset (do this w/o the remove_columns=["image"])
    # # Initialize a set to keep track of shown labels
    # shown_labels = set()

    # # Initialize the figure for plotting
    # plt.figure(figsize=(15, 10))

    # # Loop through the dataset and plot the first image of each label
    # for i, sample in enumerate(train_dataset):
    #     label = sample["label"]
    #     image = sample["image"]

    #     # Check if the label has already been shown
    #     if label not in shown_labels:
    #         plt.subplot(1, len(id2label), len(shown_labels) + 1)
    #         plt.imshow(image.convert("RGB"))  # Convert to RGB if necessary
    #         plt.title(id2label[label])  # Get label name
    #         plt.axis("off")
    #         shown_labels.add(label)

    #         # Stop if all labels have been shown
    #         if len(shown_labels) == len(id2label):
    #             break

    # plt.show()

    # Load the ViT model
    model = ViTForImageClassification.from_pretrained(
        config["model_name"],
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{config['experiment_name']}",
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=config["num_epochs"],
        weight_decay=config["weight_decay"],
        eval_strategy="steps",
        save_strategy="steps",
        logging_strategy="steps",  # Ensure logging is enabled
        logging_steps=10,          # Log every 10 steps (adjust as needed)
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        report_to="wandb",
        remove_unused_columns=False,
        learning_rate=config["learning_rate"],
    )

    # Setup optimizer
    if config["optimizer_name"] == "AdamW":
        optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
    else:  # SGD
        optimizer = SGD(model.parameters(), lr=config["learning_rate"], momentum=0.9)

    # Setup scheduler
    num_training_steps = len(train_dataset) // config["batch_size"] * config["num_epochs"]
    num_warmup_steps = int(num_training_steps * config["warmup_ratio"]) if "warmup_ratio" in config else 0

    scheduler_name = config["scheduler_name"]
    if scheduler_name == "linear":
        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
    elif scheduler_name == "cosine":
        scheduler = get_scheduler(
            "cosine",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
    elif scheduler_name == "cosine_with_restarts":
        scheduler = get_scheduler(
            "cosine_with_restarts",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
        )
    elif scheduler_name == "polynomial":
        scheduler = get_scheduler(
            "polynomial",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            # power=config.get("poly_power", 1.0),
        )
    elif scheduler_name == "constant":
        scheduler = get_scheduler(
            "constant",
            optimizer=optimizer,
        )
    elif scheduler_name == "constant_with_warmup":
        scheduler = get_scheduler(
            "constant_with_warmup",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
        )
    # add more experiments.


    else:
        raise ValueError(f"Scheduler {scheduler_name} not supported")

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler)
    )

    # # Loss validation curve in the training loop to log metrics to W&B
    # for epoch in range(config["num_epochs"]):
    #     print(f"Epoch {epoch + 1}/{config['num_epochs']}")

    #     # Train for one epoch
    #     trainer.train()

    #     # Evaluate on validation set
    #     eval_results = trainer.evaluate(val_dataset)

    #     print(trainer.state.log_history)
    #     if trainer.state.log_history and "loss" in trainer.state.log_history[-1]:
    #         train_loss = trainer.state.log_history[-1]["loss"]
    #     else:
    #         train_loss = None

    #     # Log training and validation metrics to W&B
    #     wandb.log({
    #         "epoch": epoch + 1,
    #         "train_loss": trainer.state.log_history[-1].get("loss", None),
    #         "val_loss": eval_results["eval_loss"],
    #         "val_accuracy": eval_results["eval_accuracy"],
    #     })

    # # Loss epoch curve in the training loop to log metrics to W&B
    # for epoch in range(config["num_epochs"]):
    #     print(f"Epoch {epoch + 1}/{config['num_epochs']}")

    #     # Train for one epoch
    #     trainer.train()

    #     # Evaluate on validation set
    #     eval_results = trainer.evaluate(val_dataset)

    #     # Extract training loss from the trainer's state
    #     if trainer.state.log_history and "loss" in trainer.state.log_history[-1]:
    #         train_loss = trainer.state.log_history[-1]["loss"]
    #     else:
    #         train_loss = None  # Handle missing loss gracefully

    #     # Log training and validation metrics to W&B
    #     wandb.log({
    #         "epoch": epoch + 1,
    #         "train_loss": train_loss,                  # Training loss
    #         "val_loss": eval_results["eval_loss"],    # Validation loss
    #         "val_accuracy": eval_results["eval_accuracy"],  # Validation accuracy
    #     })

    # Train the model
    print(f"Starting training for {config['experiment_name']}...")
    trainer.train()

    # Evaluate the model
    print(f"Evaluating {config['experiment_name']}...")
    eval_results = trainer.evaluate(test_dataset)


    # Log final metrics
    wandb.log({
        "final_accuracy": eval_results["eval_accuracy"],
        "final_f1": eval_results["eval_f1"],
        "final_precision": eval_results["eval_precision"],
        "final_recall": eval_results["eval_recall"],
    })

    # Compute confusion matrix for test set
    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=1)

    # Convert to lists
    labels = labels.tolist()
    predictions = predictions.tolist()

    # Log confusion matrix to W&B
    wandb.log({
        "confusion_matrix_test": wandb.plot.confusion_matrix(
            probs=None,
            y_true=labels,
            preds=predictions,
            class_names=[str(i) for i in range(len(np.unique(labels)))]
        )
    })



    # Save the model
    trainer.save_model(f"./saved_models/{config['experiment_name']}")

    # Finish wandb run
    wandb.finish()

    return eval_results

# Get experiment configurations for challenging datasets
def get_experiment_configs():
    # We'll use a more complex dataset from Hugging Face
    base_config = {
        "model_name": "google/vit-base-patch16-224-in21k",
        "dataset_name": "jbarat/plant_species",  # Any challenging dataset.
        "batch_size": 16,
        "num_epochs": 3, # let's keep smaller number to begin with.
        "weight_decay": 0.01,
        "optimizer_name": "AdamW",
    }

    # Different learning rate scheduler configurations
    configs = []

    # Constant learning rate (baseline)
    configs.append({
        **base_config,
        "experiment_name": "vit_constant_lr",
        "learning_rate": 5e-5,
        "scheduler_name": "constant",
    })

    # Linear decay
    configs.append({
        **base_config,
        "experiment_name": "vit_linear_decay",
        "learning_rate": 5e-5,
        "scheduler_name": "linear",
        "warmup_ratio": 0.1,
    })

    # Cosine decay (commonly used with ViT)
    configs.append({
        **base_config,
        "experiment_name": "vit_cosine_decay",
        "learning_rate": 5e-5,
        "scheduler_name": "cosine",
        "warmup_ratio": 0.1,
    })

    # Cosine with restarts
    configs.append({
        **base_config,
        "experiment_name": "vit_cosine_restarts",
        "learning_rate": 5e-5,
        "scheduler_name": "cosine_with_restarts",
        "warmup_ratio": 0.1,
    })

    # Polynomial decay
    configs.append({
        **base_config,
        "experiment_name": "vit_polynomial",
        "learning_rate": 5e-5,
        "scheduler_name": "polynomial",
        "warmup_ratio": 0.1,
        "poly_power": 2.0,
    })

    # Constant with warmup
    configs.append({
        **base_config,
        "experiment_name": "vit_constant_warmup",
        "learning_rate": 5e-5,
        "scheduler_name": "constant_with_warmup",
        "warmup_ratio": 0.1,
    })

    # Different learning rate experiments
    for lr in [1e-5, 3e-5, 1e-4]:
        configs.append({
            **base_config,
            "experiment_name": f"vit_cosine_lr_{lr}",
            "learning_rate": lr,
            "scheduler_name": "cosine",
            "warmup_ratio": 0.1,
        })

    # Different optimizer experiments
    configs.append({
        **base_config,
        "experiment_name": "vit_sgd_cosine",
        "learning_rate": 0.01,  # Higher LR for SGD
        "scheduler_name": "cosine",
        "warmup_ratio": 0.1,
        "optimizer_name": "SGD",
    })

    # here we can make changes to add new datasets to experiment.
    # or change batch_size to see the impact.
    # Other datasets to try (uncomment to use)
    #   Erik: We can use a data set as a strech. Maybe something less similar than plants for better contrasting comparison?
    # flowers dataset
    # configs.append({
    #     **base_config,
    #     "dataset_name": "huggan/flowers",
    #     "experiment_name": "vit_flowers_cosine",
    #     "learning_rate": 5e-5,
    #     "scheduler_name": "cosine",
    #     "warmup_ratio": 0.1,
    # })

    return configs

# Run experiments and visualize results
def run_all_experiments():
    configs = get_experiment_configs()
    results = []

    for config in configs:
        print(f"\n{'='*50}")
        print(f"Running experiment: {config['experiment_name']}")
        print(f"{'='*50}\n")

        eval_results = run_vit_experiment(config)
        results.append({
            "experiment": config['experiment_name'],
            "accuracy": eval_results["eval_accuracy"],
            "f1": eval_results["eval_f1"],
            "precision": eval_results["eval_precision"],
            "recall": eval_results["eval_recall"],
            "config": config
        })

    return results

# Visualize and compare results
def visualize_results(results):
    # Create DataFrame for easier plotting
    df = pd.DataFrame([
        {
            "Experiment": result["experiment"],
            "Accuracy": result["accuracy"],
            "F1 Score": result["f1"],
            "Precision": result["precision"],
            "Recall": result["recall"],
            "Learning Rate": result["config"]["learning_rate"],
            "Scheduler": result["config"]["scheduler_name"],
            "Optimizer": result["config"]["optimizer_name"],
            "Dataset": result["config"]["dataset_name"]
        }
        for result in results
    ])

    # Plot accuracy comparison
    plt.figure(figsize=(14, 8))
    ax = plt.bar(df["Experiment"], df["Accuracy"], color='skyblue')
    plt.xlabel('Experiment')
    plt.ylabel('Accuracy')
    plt.title('Comparison of Model Accuracy Across Experiments')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig("accuracy_comparison.png")
    plt.close()

    # Plot all metrics for a more comprehensive comparison
    plt.figure(figsize=(16, 10))
    metrics = ["Accuracy", "F1 Score", "Precision", "Recall"]
    x = np.arange(len(df["Experiment"]))
    width = 0.2

    for i, metric in enumerate(metrics):
        plt.bar(x + i*width, df[metric], width=width, label=metric)

    plt.xlabel('Experiment')
    plt.ylabel('Score')
    plt.title('Comparison of Metrics Across Experiments')
    plt.xticks(x + width*1.5, df["Experiment"], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig("metrics_comparison.png")
    plt.close()

    # Plot results by scheduler type
    plt.figure(figsize=(14, 8))
    schedulers = df["Scheduler"].unique()
    for scheduler in schedulers:
        scheduler_data = df[df["Scheduler"] == scheduler]
        plt.plot(scheduler_data["Learning Rate"], scheduler_data["Accuracy"], 'o-', label=scheduler)

    plt.xlabel('Learning Rate')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs. Learning Rate by Scheduler Type')
    plt.xscale('log')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("scheduler_comparison.png")
    plt.close()

    # Create a table with results
    print("Results Summary:")
    print(df[["Experiment", "Accuracy", "F1 Score", "Precision", "Recall", "Scheduler", "Learning Rate", "Optimizer", "Dataset"]])

    # Save results to CSV
    df.to_csv("experiment_results.csv", index=False)

    return df

# Function to run a single experiment (useful for testing)
def run_single_experiment(experiment_index=0):
    configs = get_experiment_configs()
    if experiment_index >= len(configs):
        print(f"Invalid experiment index. Choose between 0 and {len(configs)-1}")
        return

    config = configs[experiment_index]
    print(f"Running single experiment: {config['experiment_name']}")
    eval_results = run_vit_experiment(config)

    print(f"\nResults for {config['experiment_name']}:")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"F1 Score: {eval_results['eval_f1']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")

    return eval_results


In [None]:
def run_optimizer_sweep():
    # Initialize W&B run first, then access config
    with wandb.init() as run:
        print(f"W&B initialized: {run.name}")
        
        # Get config from sweep
        config = wandb.config
        
        # Set experiment name based on sweep parameters
        custom_name = f"vit_{config.optimizer_name}_{config.learning_rate}"
        # Update the run name after initialization
        wandb.run.name = custom_name
        wandb.run.save()
        
        print(f"Running experiment: {custom_name}")
        
        # Load model and processor
        model_name = "google/vit-base-patch16-224-in21k"
        dataset_name = "jbarat/plant_species"
        
        # Load the image processor
        image_processor = AutoImageProcessor.from_pretrained(model_name, use_fast=True)
        
        # Prepare dataset
        train_dataset, val_dataset, test_dataset, id2label, label2id = prepare_dataset(
            dataset_name, image_processor
        )
        
        # Load the ViT model
        model = ViTForImageClassification.from_pretrained(
            model_name,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        
        # Define training arguments
        training_args = TrainingArguments(
            output_dir=f"./results/{custom_name}",
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            num_train_epochs=config.num_epochs,
            weight_decay=0.01,
            eval_strategy="steps",
            save_strategy="steps",
            logging_strategy="steps",
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            push_to_hub=False,
            report_to="wandb",
            remove_unused_columns=False,
            learning_rate=config.learning_rate,
        )
        # Set up optimizer based on config
        if config.optimizer_name == "schedule_free_adamw":
            optimizer = AdamWScheduleFree(
                model.parameters(),
                lr=config.learning_rate,  # Learning rate
                # warmup_steps=500  # Optional: Adjust based on your dataset
            )
        elif config.optimizer_name == "AdamW":
            optimizer = AdamW(model.parameters(), lr=config.learning_rate)
        elif config.optimizer_name == "SGD":
            optimizer = SGD(model.parameters(), lr=config.learning_rate, momentum=0.9)
        elif config.optimizer_name == "RMSProp":
            optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)
        elif config.optimizer_name == "AdaGrad":
            optimizer = torch.optim.Adagrad(model.parameters(), lr=config.learning_rate)
        else:
            optimizer = AdamW(model.parameters(), lr=config.learning_rate)        
        
        # Setup scheduler
        num_training_steps = len(train_dataset) // config.batch_size * config.num_epochs
        num_warmup_steps = int(num_training_steps * 0.1)  # 10% warmup
        
        scheduler = get_scheduler(
            config.scheduler_name,
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
        
        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            optimizers=(optimizer, scheduler)
        )
        
        # Train the model
        print(f"Starting training...")
        # optimizer.train()  # Switch optimizer to training mode only for schedule_free
        trainer.train()
        
        # Evaluate on validation dataset
        print(f"Evaluating on validation set...")
        # optimizer.eval()  # Switch optimizer to evaluation mode only for schedule_free
        eval_results = trainer.evaluate(val_dataset)
        
        # Log validation metrics
        run.log({
            "val_accuracy": eval_results["eval_accuracy"],
            "val_f1": eval_results["eval_f1"],
            "val_precision": eval_results["eval_precision"],
            "val_recall": eval_results["eval_recall"],
            "val_loss": eval_results["eval_loss"]
        })
        
        # Evaluate on test dataset
        print(f"Evaluating on test set...")
        test_results = trainer.evaluate(test_dataset)
        
        # Log test metrics
        run.log({
            "test_accuracy": test_results["eval_accuracy"],
            "test_f1": test_results["eval_f1"],
            "test_precision": test_results["eval_precision"],
            "test_recall": test_results["eval_recall"],
            "test_loss": test_results["eval_loss"]
        })
        
        # Compute confusion matrix for test set
        predictions, labels, _ = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=1)
        
        # Log confusion matrix
        run.log({
            "confusion_matrix": wandb.plot.confusion_matrix(
                probs=None,
                y_true=labels.tolist(),
                preds=predictions.tolist(),
                class_names=[id2label[i] for i in range(len(id2label))]
            )
        })
        
        # Save the model
        model_path = f"./saved_models/{custom_name}"
        trainer.save_model(model_path)
        print(f"Model saved to {model_path}")

In [None]:

# Main execution
if __name__ == "__main__":
    print("Starting ViT experiments with different learning rate schedulers...")
    os.environ["WANDB_PROJECT"] = "ViT-LR-Schedulers"

    # Option 1: Run all experiments (time-consuming)
    # results = run_all_experiments()
    # results_df = visualize_results(results)

    # Option 2: Run a single experiment for testing
    # run_single_experiment(0)  # Try the baseline experiment first

    #option 3: Optimizer sweep:
    # Define sweep configuration
    sweep_config = {
        "method": "grid",  # we can use "grid", "random", or "bayes"
        "metric": {
            "name": "val_accuracy",  # Metric to optimize
            "goal": "maximize"       # Goal: maximize or minimize
        },
        "parameters": {
            "optimizer_name": {
                "values": ["schedule_free_adamw","AdamW", "SGD", "RMSProp", "AdaGrad"]  # Optimizers to test
            },
            "learning_rate": {
                "values": [2e-5, 2e-4, 2e-3, 2e-2, 2e-1]  # Learning rates to test
            },
            "batch_size": {
                "values": [16]  # Fixed batch size
            },
            "num_epochs": {
                "values": [3]  # Fixed number of epochs
            },
            "scheduler_name": {
                "values": ["cosine"]  # Fixed scheduler for simplicity
            }
        }
    }
    
    # Initialize the sweep
    sweep_id = wandb.sweep(sweep_config, project="ViT-Optimizer-Sweep")
    
    # Start the sweep agent
    wandb.agent(sweep_id, function=run_optimizer_sweep)
    

    print("Experiments completed!")

Starting ViT experiments with different learning rate schedulers...


Create sweep with ID: 8v6px2kf
Sweep URL: https://wandb.ai/dl_project_sp25/ViT-Optimizer-Sweep/sweeps/8v6px2kf


[34m[1mwandb[0m: Agent Starting Run: 06jzobdu with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0002
[34m[1mwandb[0m: 	num_epochs: 3
[34m[1mwandb[0m: 	optimizer_name: AdamW
[34m[1mwandb[0m: 	scheduler_name: cosine


W&B initialized: treasured-sweep-1
Running experiment: vit_AdamW_0.0002
Loading dataset: jbarat/plant_species
Dataset prepared with 640 training, 128 validation, and 32 test examples


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,2.012,1.826977,0.585938,0.544083,0.542073,0.585938
20,1.6424,1.390716,0.726562,0.716399,0.768291,0.726562
30,1.3427,1.110769,0.789062,0.784859,0.805987,0.789062
40,1.0385,0.947948,0.757812,0.760557,0.77536,0.757812
50,0.6881,0.888995,0.773438,0.770746,0.791871,0.773438
60,0.6648,0.726733,0.820312,0.816785,0.827519,0.820312
70,0.5282,0.795668,0.742188,0.745829,0.778188,0.742188
80,0.5451,0.610374,0.828125,0.823289,0.829726,0.828125
90,0.3585,0.593596,0.828125,0.82961,0.838593,0.828125
100,0.3167,0.603395,0.8125,0.812537,0.817605,0.8125


Evaluating on validation set...


Evaluating on test set...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved to ./saved_models/vit_AdamW_0.0002


0,1
eval/accuracy,▁▅▇▆▆█▅██▇███▅
eval/f1,▁▅▇▆▆█▆██▇███▄
eval/loss,█▆▄▃▃▂▂▁▁▁▁▁▁▂
eval/precision,▁▆▇▆▇█▆██▇███▃
eval/recall,▁▅▇▆▆█▅██▇███▅
eval/runtime,█▆▇▇▇▇▇▇▇▇▇▇█▁
eval/samples_per_second,▂█▇▆▅▄▄▇▆▃▇▇▁▄
eval/steps_per_second,▂█▇▆▅▄▄▇▆▃▇▇▁▄
test/accuracy,▁
test/f1,▁

0,1
eval/accuracy,0.71875
eval/f1,0.65841
eval/loss,0.69987
eval/precision,0.62153
eval/recall,0.71875
eval/runtime,1.8796
eval/samples_per_second,17.025
eval/steps_per_second,1.064
test/accuracy,0.71875
test/f1,0.65841


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


Experiments completed!


In [25]:
import pandas as pd

# Load the exported CSV file
df = pd.read_csv("wandb_export.csv")

# Group by optimizer and find the best learning rate for each
best_lr_per_optimizer = (
    df.groupby("optimizer_name")
    .apply(lambda group: group.loc[group["val_accuracy"].idxmax()])
    [["optimizer_name", "learning_rate", "val_accuracy"]]
)

print(best_lr_per_optimizer)

                          optimizer_name  learning_rate  val_accuracy
optimizer_name                                                       
AdaGrad                          AdaGrad         0.0002      0.796875
AdamW                              AdamW         0.0002      0.835938
RMSProp                          RMSProp         0.0002      0.820312
SGD                                  SGD         0.0200      0.812500
schedule_free_adamw  schedule_free_adamw         0.0002      0.843750


  .apply(lambda group: group.loc[group["val_accuracy"].idxmax()])
