## Setup Environment

In [None]:
#To save the output results to google drive, you can use the following step
from google.colab import drive
drive.mount('/content/drive')
out_dir = "drive/MyDrive/Colab Notebooks/"
!ls "drive/MyDrive/Colab Notebooks/"

In [None]:
!pip install --quiet --upgrade pip
!pip install -r requirements.txt
print("✅ Core packages installed!")

In [None]:
# Check GPU availability and system info
import torch
import os
import platform
import subprocess

print("🔍 System Information:")
# Ask the interpreter running this kernel for its version. Spawning `python`
# reports whatever executable is first on PATH, which may be a different
# interpreter (or may not exist at all).
print(f"Python version: Python {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    # Only device 0 is reported.
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")
else:
    print("❌ No GPU available! Please enable GPU runtime in Colab.")
    print("Runtime > Change runtime type > Hardware accelerator > GPU")

# Set environment variables for optimal performance
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRANSFORMERS_CACHE'] = '/content/transformers_cache'

## Zero-shot classification

Measure the performance of the CLIP zero-shot classifier on the TinyImageNet dataset, using the test split (the `val/` folder).

In [8]:
!python3 ../zeroshot.py --dataset TinyImageNet --batch-size 32 --data-location "/Users/Yang/Desktop/model-merge/model-soups/exp-clip_TinyImageNet/dataset"

Building zero-shot classifier.
100%|█████████████████████████████████████████| 200/200 [02:10<00:00,  1.53it/s]
[0% 0/313]	Acc: 53.12	Data (t) 5.761	Batch (t) 6.569
[6% 20/313]	Acc: 57.59	Data (t) 0.001	Batch (t) 0.419
[13% 40/313]	Acc: 59.83	Data (t) 0.001	Batch (t) 0.364
[19% 60/313]	Acc: 59.78	Data (t) 0.002	Batch (t) 0.348
[26% 80/313]	Acc: 60.49	Data (t) 0.001	Batch (t) 0.333
[32% 100/313]	Acc: 60.58	Data (t) 0.001	Batch (t) 0.333
[38% 120/313]	Acc: 61.11	Data (t) 0.001	Batch (t) 0.345
[45% 140/313]	Acc: 61.37	Data (t) 0.001	Batch (t) 0.358
[51% 160/313]	Acc: 61.51	Data (t) 0.001	Batch (t) 0.372
[58% 180/313]	Acc: 61.62	Data (t) 0.001	Batch (t) 0.339
[64% 200/313]	Acc: 61.61	Data (t) 0.001	Batch (t) 0.332
[70% 220/313]	Acc: 61.44	Data (t) 0.001	Batch (t) 0.342
[77% 240/313]	Acc: 61.50	Data (t) 0.001	Batch (t) 0.332
[83% 260/313]	Acc: 61.30	Data (t) 0.001	Batch (t) 0.336
[89% 280/313]	Acc: 61.30	Data (t) 0.001	Batch (t) 0.359
[96% 300/313]	Acc: 61.26	Data (t) 0.001	Batch (t) 0.343


## Fine-tuning
We start with the pretrained CLIP ViT-B/32 and fine-tune it on TinyImageNet. 90% of the images in `train/` are used as the training set, and the other 10% are used as the validation set. We sweep over different hyperparameters.

## Fine-tuning
We start with the pretrained CLIP ViT-B/32 and fine-tune it on TinyImageNet. 90% of the images in `train/` are used as the training set, and the other 10% are used as the validation set. We sweep over different hyperparameters to create 5 diverse models for model soups.

### Hyperparameter Configurations:
1. **Config 1**: lr=3e-5, wd=0.1, epochs=10, batch_size=256, timm_aug=False
2. **Config 2**: lr=1e-5, wd=0.1, epochs=10, batch_size=256, timm_aug=False  
3. **Config 3**: lr=3e-6, wd=0.1, epochs=10, batch_size=256, timm_aug=False
4. **Config 4**: lr=2e-5, wd=1e-3, epochs=10, batch_size=256, timm_aug=True
5. **Config 5**: lr=1e-6, wd=1e-4, epochs=10, batch_size=256, timm_aug=False

In [None]:
# Configuration 1: lr=3e-5, wd=0.1, epochs=10, batch_size=256, timm_aug=False
# (timm_aug=False: the --timm-aug flag is not passed.)
!python ../finetune.py --lr 3e-5 --wd 0.1 --epochs 10 --batch-size 256 --data-location "/content/dataset" --name "config1"

# Back up every file matching config1_*.pt in the working directory to Google Drive.
!cp config1_*.pt "/content/drive/MyDrive/Colab Notebooks/"
# NOTE(review): printed unconditionally; the exit status of the training run and of `cp` is not checked.
print("✅ Configuration 1 completed and backed up to Drive!")

In [None]:
# Configuration 2: lr=1e-5, wd=0.1, epochs=10, batch_size=256, timm_aug=False
# (timm_aug=False: the --timm-aug flag is not passed.)
!python ../finetune.py --lr 1e-5 --wd 0.1 --epochs 10 --batch-size 256 --data-location "/content/dataset" --name "config2"

# Back up every file matching config2_*.pt in the working directory to Google Drive.
!cp config2_*.pt "/content/drive/MyDrive/Colab Notebooks/"
# NOTE(review): printed unconditionally; the exit status of the training run and of `cp` is not checked.
print("✅ Configuration 2 completed and backed up to Drive!")

In [None]:
# Configuration 3: lr=3e-6, wd=0.1, epochs=10, batch_size=256, timm_aug=False
# (timm_aug=False: the --timm-aug flag is not passed.)
!python ../finetune.py --lr 3e-6 --wd 0.1 --epochs 10 --batch-size 256 --data-location "/content/dataset" --name "config3"

# Back up every file matching config3_*.pt in the working directory to Google Drive.
!cp config3_*.pt "/content/drive/MyDrive/Colab Notebooks/"
# NOTE(review): printed unconditionally; the exit status of the training run and of `cp` is not checked.
print("✅ Configuration 3 completed and backed up to Drive!")

In [None]:
# Configuration 4: lr=2e-5, wd=1e-3, epochs=10, batch_size=256, timm_aug=True
# (timm_aug=True: this is the only configuration that passes --timm-aug.)
!python ../finetune.py --lr 2e-5 --wd 1e-3 --epochs 10 --batch-size 256 --timm-aug --data-location "/content/dataset" --name "config4"

# Back up every file matching config4_*.pt in the working directory to Google Drive.
!cp config4_*.pt "/content/drive/MyDrive/Colab Notebooks/"
# NOTE(review): printed unconditionally; the exit status of the training run and of `cp` is not checked.
print("✅ Configuration 4 completed and backed up to Drive!")

In [None]:
# Configuration 5: lr=1e-6, wd=1e-4, epochs=10, batch_size=256, timm_aug=False
# (timm_aug=False: the --timm-aug flag is not passed.)
!python ../finetune.py --lr 1e-6 --wd 1e-4 --epochs 10 --batch-size 256 --data-location "/content/dataset" --name "config5"

# Back up every file matching config5_*.pt in the working directory to Google Drive.
!cp config5_*.pt "/content/drive/MyDrive/Colab Notebooks/"
# NOTE(review): printed unconditionally; the exit status of the training run and of `cp` is not checked.
print("✅ Configuration 5 completed and backed up to Drive!")

## Load Models and Create Soup
After training all 5 configurations, load the models and create model soups

In [None]:
import torch
import clip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm.notebook import tqdm

# Import TinyImageNet dataset classes
import sys
sys.path.append('..')
from dataset.tiny_imagenet import TinyImageNet
from utils import ModelWrapper

In [None]:
# Load all trained models (check both local and Drive backup)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_names = ['config1', 'config2', 'config3', 'config4', 'config5']
state_dicts = []
missing_models = []  # names whose checkpoint was found neither locally nor on Drive

for name in model_names:
    # Load the final checkpoint (after 10 epochs)
    model_path = f'{name}_10.pt'
    drive_path = f'/content/drive/MyDrive/Colab Notebooks/{name}_10.pt'

    # Try local first, then Drive backup
    if os.path.exists(model_path):
        print(f'Loading {model_path} (local)')
        state_dicts.append(torch.load(model_path, map_location=device))
    elif os.path.exists(drive_path):
        print(f'Loading {drive_path} (from Drive)')
        state_dicts.append(torch.load(drive_path, map_location=device))
    else:
        print(f'⚠️  Model {model_path} not found in local or Drive!')
        missing_models.append(name)

if missing_models:
    # Do not report success on a partial load: the evaluation cells number
    # models by their position in state_dicts ("Config {i+1}"), so a gap here
    # shifts every later label.
    print(f"⚠️  Loaded only {len(state_dicts)} of {len(model_names)} models; missing: {', '.join(missing_models)}")
    print("   'Config N' labels in later cells follow the position in state_dicts and may not match the original configuration numbers.")
else:
    print(f"✅ Loaded {len(state_dicts)} models successfully!")

# Also backup final soup results to Drive
def save_soup_results():
    """Pickle the soup evaluation results to a file on Google Drive.

    Reads the notebook-level variables ``individual_results``,
    ``uniform_accuracy``, ``best_accuracy`` and ``greedy_indices``, so it can
    only be called after the cells that define them have run.
    """
    import pickle

    # Human-readable description of each configuration, in config order.
    config_descriptions = [
        'lr=3e-5, wd=0.1, timm_aug=False',
        'lr=1e-5, wd=0.1, timm_aug=False',
        'lr=3e-6, wd=0.1, timm_aug=False',
        'lr=2e-5, wd=1e-3, timm_aug=True',
        'lr=1e-6, wd=1e-4, timm_aug=False',
    ]
    payload = dict(
        individual_results=individual_results,
        uniform_accuracy=uniform_accuracy,
        greedy_accuracy=best_accuracy,
        greedy_indices=greedy_indices,
        model_configs=config_descriptions,
    )

    target_path = '/content/drive/MyDrive/Colab Notebooks/soup_results.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)
    print("💾 Soup results saved to Drive!")

In [None]:
# Load the CLIP ViT-B/32 backbone with its preprocessing transform, then build
# the TinyImageNet wrapper whose loaders are used for evaluation.
base_model, preprocess = clip.load('ViT-B/32', device, jit=False)
test_dataset = TinyImageNet(
    preprocess,
    location="/content/dataset",
    batch_size=128,
    num_workers=4,
)

# Quick sanity check of what was loaded.
print("Number of classes: {}".format(len(test_dataset.classnames)))
print("Test dataset size: {}".format(len(test_dataset.test_dataset)))

In [ ]:
def get_model_from_state_dict(state_dict, base_model, num_classes):
    """Wrap ``base_model`` in a ``ModelWrapper`` and load ``state_dict`` into it.

    The feature dimension passed to the wrapper is read from the second axis
    of ``state_dict['classification_head.weight']``. The loaded wrapper is
    moved to the notebook-level ``device`` and returned.
    """
    # NOTE(review): every call wraps the same base_model object. If
    # ModelWrapper keeps a reference instead of copying, wrappers returned by
    # earlier calls presumably see their weights overwritten. Confirm against
    # utils.ModelWrapper.
    head_weight = state_dict['classification_head.weight']
    wrapper = ModelWrapper(base_model, head_weight.shape[1], num_classes, normalize=True)
    wrapper.load_state_dict(state_dict)
    return wrapper.to(device)

def create_soup(state_dicts, weights=None):
    """Create a model soup: a weighted sum of state dicts.

    Args:
        state_dicts: Non-empty sequence of state dicts (key -> tensor). Every
            dict is expected to contain only keys present in the first one.
        weights: Optional sequence with one scalar per state dict. Defaults to
            uniform weights ``1 / len(state_dicts)``.

    Returns:
        A new dict holding, for every key of the first state dict, the
        weighted sum of the corresponding tensors. The inputs are not modified.

    Raises:
        ValueError: If ``state_dicts`` is empty, or ``weights`` is given with
            a length different from ``state_dicts``.
    """
    if len(state_dicts) == 0:
        raise ValueError("create_soup needs at least one state dict")
    if weights is None:
        weights = [1.0 / len(state_dicts)] * len(state_dicts)
    elif len(weights) != len(state_dicts):
        raise ValueError(
            f"got {len(weights)} weights for {len(state_dicts)} state dicts"
        )

    # Multiplication already allocates a fresh tensor, so no explicit clone is
    # needed to keep the input tensors untouched.
    soup_state_dict = {k: v * weights[0] for k, v in state_dicts[0].items()}

    # Accumulate the remaining models in place into the soup.
    for weight, state_dict in zip(weights[1:], state_dicts[1:]):
        for k, v in state_dict.items():
            soup_state_dict[k] += v * weight

    return soup_state_dict

def evaluate_model(model, dataset):
    """Return the top-1 accuracy (in percent) of ``model`` over ``dataset.test_loader``.

    Batches may be dicts with ``'images'`` / ``'labels'`` entries or indexable
    ``(images, labels)`` sequences. Tensors are moved to the notebook-level
    ``device`` before the forward pass. The model is switched to eval mode.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch in tqdm(dataset.test_loader, desc="Evaluating"):
            # Normalise the two supported batch layouts to (images, labels).
            if isinstance(batch, dict):
                images, labels = batch['images'], batch['labels']
            else:
                images, labels = batch[0], batch[1]
            images = images.to(device)
            labels = labels.to(device)

            # Index of the largest output along dim 1 is the predicted class.
            predicted = model(images).data.max(1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    return 100 * n_correct / n_seen

## Individual Model Performance
Evaluate each individual fine-tuned model

In [None]:
# Score each fine-tuned checkpoint on its own; results are kept in load order.
num_classes = len(test_dataset.classnames)
individual_results = []

for config_no, weights in enumerate(state_dicts, start=1):
    print(f"\n📊 Evaluating Config {config_no}...")
    candidate = get_model_from_state_dict(weights, base_model, num_classes)
    score = evaluate_model(candidate, test_dataset)
    individual_results.append(score)
    print(f"Config {config_no} Accuracy: {score:.2f}%")

# Recap of all individual scores and the best one.
print(f"\n🎯 Individual Model Results:")
for config_no, score in enumerate(individual_results, start=1):
    print(f"Config {config_no}: {score:.2f}%")
print(f"Best Individual: {max(individual_results):.2f}%")

## Model Soup Results
Create and evaluate uniform and greedy model soups

In [None]:
# Create and evaluate uniform soup (simple average)
print("🍲 Creating Uniform Soup...")
uniform_soup_state = create_soup(state_dicts)
uniform_soup_model = get_model_from_state_dict(uniform_soup_state, base_model, num_classes)
uniform_accuracy = evaluate_model(uniform_soup_model, test_dataset)

print(f"\n🍲 Uniform Soup Accuracy: {uniform_accuracy:.2f}%")
print(f"Improvement over best individual: {uniform_accuracy - max(individual_results):.2f}%")

# Simple greedy soup implementation (add models if they improve performance)
# NOTE(review): candidates are accepted or rejected using evaluate_model on
# test_dataset, i.e. the same loader that produces the reported accuracy, so
# the greedy number is selected on the data it is reported on. Consider
# selecting on a held-out validation split if the dataset wrapper exposes one.
print(f"\n🧠 Creating Greedy Soup...")

# Start with the best individual model
best_idx = int(np.argmax(individual_results))  # plain int keeps the printed index list clean
greedy_indices = [best_idx]
best_state_dict = state_dicts[best_idx]
best_accuracy = individual_results[best_idx]

print(f"Starting with Config {best_idx + 1} (accuracy: {best_accuracy:.2f}%)")

# Try adding each remaining model
for i, state_dict in enumerate(state_dicts):
    if i == best_idx:
        continue

    # Average ALL ingredients selected so far plus this candidate with equal
    # weight. Averaging the running soup 50/50 with the candidate would give
    # later additions more weight than earlier ones.
    temp_soup = create_soup([state_dicts[j] for j in greedy_indices] + [state_dict])
    temp_model = get_model_from_state_dict(temp_soup, base_model, num_classes)
    temp_accuracy = evaluate_model(temp_model, test_dataset)

    print(f"Adding Config {i+1}: {temp_accuracy:.2f}%")

    if temp_accuracy > best_accuracy:
        print(f"✅ Improved! Adding Config {i+1}")
        greedy_indices.append(i)
        best_state_dict = temp_soup
        best_accuracy = temp_accuracy
    else:
        print(f"❌ No improvement, skipping Config {i+1}")

print(f"\n🧠 Greedy Soup includes configs: {[i+1 for i in greedy_indices]}")
print(f"🧠 Greedy Soup Accuracy: {best_accuracy:.2f}%")
print(f"Improvement over best individual: {best_accuracy - max(individual_results):.2f}%")

In [None]:
# Create summary results. Row labels and bar colours are derived from the
# number of models actually evaluated, so both columns always have the same
# length.
best_individual = max(individual_results)
config_labels = [f'Config {n + 1}' for n in range(len(individual_results))]
results_summary = {
    'Method': config_labels + ['Best Individual', 'Uniform Soup', 'Greedy Soup'],
    'Accuracy (%)': individual_results + [best_individual, uniform_accuracy, best_accuracy]
}

# Create DataFrame for easy visualization
results_df = pd.DataFrame(results_summary)
print("📊 Summary Results:")
print(results_df.to_string(index=False))

# Plot results
plt.figure(figsize=(12, 6))
colors = ['lightblue'] * len(individual_results) + ['orange', 'red', 'green']
bars = plt.bar(results_df['Method'], results_df['Accuracy (%)'], color=colors)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{height:.1f}%', ha='center', va='bottom')

plt.title('Model Soup Results on Tiny ImageNet', fontsize=16, fontweight='bold')
plt.ylabel('Test Accuracy (%)', fontsize=12)
plt.xlabel('Method', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()

# Add legend
legend_elements = [plt.Rectangle((0,0),1,1, color='lightblue', label='Individual Models'),
                   plt.Rectangle((0,0),1,1, color='orange', label='Best Individual'),
                   plt.Rectangle((0,0),1,1, color='red', label='Uniform Soup'),
                   plt.Rectangle((0,0),1,1, color='green', label='Greedy Soup')]
plt.legend(handles=legend_elements, loc='upper left')

plt.show()

# Print final summary. The '+' format flag writes the sign itself, so a
# regression shows as "-0.50%" rather than "+-0.50%".
print(f"\n🎉 Final Results Summary:")
print(f"Zero-shot CLIP: ~61.39% (from earlier)")
print(f"Best Individual Model: {best_individual:.2f}%")
print(f"Uniform Soup: {uniform_accuracy:.2f}% ({uniform_accuracy - best_individual:+.2f}%)")
print(f"Greedy Soup: {best_accuracy:.2f}% ({best_accuracy - best_individual:+.2f}%)")
# Only claim an improvement when at least one soup actually beat the best model.
if max(uniform_accuracy, best_accuracy) > best_individual:
    print(f"\n✨ Model soups successfully improved over the best individual model!")
else:
    print(f"\nℹ️ Model soups did not improve over the best individual model.")

In [ ]:
# Create summary results. Row labels and bar colours are derived from the
# number of models actually evaluated, so both columns always have the same
# length.
best_individual = max(individual_results)
config_labels = [f'Config {n + 1}' for n in range(len(individual_results))]
results_summary = {
    'Method': config_labels + ['Best Individual', 'Uniform Soup', 'Greedy Soup'],
    'Accuracy (%)': individual_results + [best_individual, uniform_accuracy, best_accuracy]
}

# Create DataFrame for easy visualization
results_df = pd.DataFrame(results_summary)
print("📊 Summary Results:")
print(results_df.to_string(index=False))

# Plot results
plt.figure(figsize=(12, 6))
colors = ['lightblue'] * len(individual_results) + ['orange', 'red', 'green']
bars = plt.bar(results_df['Method'], results_df['Accuracy (%)'], color=colors)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{height:.1f}%', ha='center', va='bottom')

plt.title('Model Soup Results on Tiny ImageNet', fontsize=16, fontweight='bold')
plt.ylabel('Test Accuracy (%)', fontsize=12)
plt.xlabel('Method', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()

# Add legend
legend_elements = [plt.Rectangle((0,0),1,1, color='lightblue', label='Individual Models'),
                   plt.Rectangle((0,0),1,1, color='orange', label='Best Individual'),
                   plt.Rectangle((0,0),1,1, color='red', label='Uniform Soup'),
                   plt.Rectangle((0,0),1,1, color='green', label='Greedy Soup')]
plt.legend(handles=legend_elements, loc='upper left')

# Save plot to Drive (before plt.show(), while the figure is still current)
plt.savefig('/content/drive/MyDrive/Colab Notebooks/model_soup_results.png', 
            dpi=300, bbox_inches='tight')
plt.show()

# Print final summary. The '+' format flag writes the sign itself, so a
# regression shows as "-0.50%" rather than "+-0.50%".
print(f"\n🎉 Final Results Summary:")
print(f"Zero-shot CLIP: ~61.39% (from earlier)")
print(f"Best Individual Model: {best_individual:.2f}%")
print(f"Uniform Soup: {uniform_accuracy:.2f}% ({uniform_accuracy - best_individual:+.2f}%)")
print(f"Greedy Soup: {best_accuracy:.2f}% ({best_accuracy - best_individual:+.2f}%)")
# Only claim an improvement when at least one soup actually beat the best model.
if max(uniform_accuracy, best_accuracy) > best_individual:
    print(f"\n✨ Model soups successfully improved over the best individual model!")
else:
    print(f"\nℹ️ Model soups did not improve over the best individual model.")

# Save all results to Drive
save_soup_results()
print("💾 All results backed up to Google Drive!")