# ResNet18 Finetuning on CIFAR-10

This notebook:
- Takes a ResNet18 network pretrained on ImageNet as base point, then finetune on CIFAR-10
- Uses different finetuning hyperparameters to obtain different model checkpoints
- Follows heDeepResidualLearning2016 training configuration

## Setup Environment

In [None]:
LOCAL = True

# if run locally:
if LOCAL:
    ROOT_DIR = "/Users/Yang/Desktop/research-model-merge/playground/merge_soup-resnet18-cifar10"
    DATA_DIR = "/Users/Yang/Desktop/research-model-merge/datasets"
    PROJECT_ROOT = "/Users/Yang/Desktop/research-model-merge"
else:
    # on Colab
    ROOT_DIR = "/content/research-model-merge/playground/merge_soup-resnet18-cifar10"
    DATA_DIR = "/content/research-model-merge/datasets"
    PROJECT_ROOT = "/content/research-model-merge"
    DRIVE_DIR = "/content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10"

If using Colab, mount Google Drive to save checkpoints persistently.

In [9]:
if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Clone the repository and install dependencies on Colab.

In [10]:
if not LOCAL:
    # remove the dir if exists
    !rm -rf research-model-merge
    # Clone the repository
    !git clone https://github.com/nbzy1995/research-model-merge.git /content/research-model-merge

    # Install dependencies
    !pip install --quiet --upgrade pip
    !pip install -q -r research-model-merge/requirements.txt
    print("✅ Repository cloned and dependencies installed!")

Cloning into '/content/research-model-merge'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 92 (delta 34), reused 90 (delta 32), pack-reused 0 (from 0)[K
Receiving objects: 100% (92/92), 1.54 MiB | 4.97 MiB/s, done.
Resolving deltas: 100% (34/34), done.
✅ Repository cloned and dependencies installed!


In [11]:
import os
import sys
import time
from typing import Dict, Any

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.models import resnet18, ResNet18_Weights
import numpy as np
from tqdm import tqdm

# Add project root to path
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Add utils to path
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

from datasets.cifar10 import CIFAR10

from datasets.cifar10 import CIFAR10

In [12]:
# Check GPU availability and system info
import subprocess

print("🔍 System Information:")
print(f"Python version: {subprocess.check_output(['python', '--version']).decode().strip()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")
    DEVICE = torch.device("cuda")
else:
    if LOCAL:
        print("⚠️ No GPU available! Training will be slow on CPU.")
    else:
        print("❌ No GPU available! Please enable GPU runtime in Colab.")
        print("Runtime > Change runtime type > Hardware accelerator > GPU")
    DEVICE = torch.device("cpu")

🔍 System Information:
Python version: Python 3.12.11
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU device: Tesla T4
GPU memory: 15.8 GB
CUDA version: 12.6


## Dataset Preparation

Using the shared CIFAR10 dataset class from `datasets/cifar10.py`:
- Training: 98% of original training set (49,000 images)
- Validation: 2% of original training set (1,000 images)  
- Test: Official CIFAR-10 test set (10,000 images)
- Persistent indices ensure consistent splits across all experiments

In [13]:
# Create CIFAR-10 dataset using shared dataset class
# This uses persistent indices for reproducible splits
dataset = CIFAR10(
    data_location=DATA_DIR,
    batch_size=256,
    num_workers=2
)

train_loader = dataset.train_loader
val_loader = dataset.val_loader
test_loader = dataset.test_loader

print(f"✅ Dataset loaded:")
print(f"   Train samples: {len(dataset.train_sampler)}")
print(f"   Val samples: {len(dataset.val_sampler)}")
print(f"   Test samples: {len(dataset.test_dataset)}")
print(f"   Classnames: {dataset.classnames}")

100%|██████████| 170M/170M [00:08<00:00, 20.7MB/s]


✅ Dataset loaded:
   Train samples: 49000
   Val samples: 1000
   Test samples: 10000
   Classnames: ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


## Finetuning Function

In [14]:
def cosine_lr_schedule(optimizer, epoch, total_epochs, warmup_epochs, base_lr):
    """
    Cosine learning rate schedule with linear warmup.
    Following Git Re-Basin configuration.
    """
    if epoch < warmup_epochs:
        # Linear warmup from 1e-6 to base_lr
        lr = 1e-6 + (base_lr - 1e-6) * epoch / warmup_epochs
    else:
        # Cosine decay from base_lr to 0
        progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
        lr = base_lr * 0.5 * (1 + np.cos(np.pi * progress))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return lr

In [15]:
def step_lr_schedule(optimizer, epoch, total_epochs, warmup_epochs, base_lr):
    """
    Following heDeepResidualLearning2016.
    """
    if epoch < warmup_epochs:
        lr = base_lr
    else:
        lr = base_lr * 0.1

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return lr

In [16]:
def finetune_resnet(
    train_loader: DataLoader,
    val_loader: DataLoader,
    model_save_location: str = '.',
    batch_size: int = 256,
    epochs: int = 10,
    warmup_epochs: int = 5,
    lr: float = 0.1,
    wd: float = 1e-4,
    momentum: float = 0.9,
    name: str = 'config1',
    log_interval: int = 20,
) -> Dict[str, Any]:
    """
    Finetune ResNet18 (pretrained on ImageNet) on CIFAR-10.

    Following He 2016 training configuration:
    - SGD optimizer with momentum=0.9
    - Weight decay (default 1e-4)
    - Step LR schedule with 5-epoch warmup
    - Warmup: 1e-6 -> lr over 5 epochs
    - Step decay: lr -> 0.1*lr after warmup
    """
    os.makedirs(model_save_location, exist_ok=True)

    # Load pretrained ResNet18 and modify for CIFAR-10
    model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

    # Modify first conv layer for 32x32 input (CIFAR-10)
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)

    # Remove maxpool layer (too aggressive for 32x32 images)
    model.maxpool = nn.Identity()

    # Replace final FC layer for CIFAR-10 (10 classes)
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 10)

    model = model.to(DEVICE)

    # Optimizer: SGD with momentum
    optimizer = optim.SGD(
        model.parameters(),
        lr=lr,
        momentum=momentum,
        weight_decay=wd
    )

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training history
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_acc': [],
        'lr': []
    }

    print(f"\n{'='*80}")
    print(f"Starting training: {name}")
    print(f"Config: lr={lr}, wd={wd}, epochs={epochs}, batch_size={batch_size}")
    print(f"{'='*80}\n")

    # Training loop
    for epoch in range(epochs):
        # Update learning rate
        current_lr = step_lr_schedule(optimizer, epoch, epochs, warmup_epochs, lr)
        history['lr'].append(current_lr)

        # Training phase
        model.train()
        train_loss_accum = 0.0
        train_batches = 0

        pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        for i, (inputs, labels) in enumerate(pbar):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss_accum += loss.item()
            train_batches += 1

            if i % log_interval == 0:
                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'lr': f'{current_lr:.6f}'
                })

        train_loss = train_loss_accum / train_batches
        history['train_loss'].append(train_loss)

        # Validation phase
        model.eval()
        val_loss_accum = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]')
            for inputs, labels in pbar:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss_accum += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{100.*correct/total:.2f}%'
                })

        val_loss = val_loss_accum / len(val_loader)
        val_acc = correct / total

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"\nEpoch {epoch+1}/{epochs} Summary:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss:   {val_loss:.4f}")
        print(f"  Val Acc:    {100*val_acc:.2f}%")
        print(f"  LR:         {current_lr:.6f}\n")

        # Save checkpoint after each epoch
        checkpoint_path = os.path.join(model_save_location, f'{name}_epoch{epoch+1}.pt')
        torch.save(model.state_dict(), checkpoint_path)
        print(f"✅ Saved checkpoint: {checkpoint_path}")

    result = {
        'history': history,
        'config': {
            'model_save_location': model_save_location,
            'batch_size': batch_size,
            'epochs': epochs,
            'warmup_epochs': warmup_epochs,
            'lr': lr,
            'wd': wd,
            'momentum': momentum,
            'name': name,
        },
    }

    return result

## Run Training with Multiple Configurations

We train 5 different configurations with varying learning rates and weight decay values:

1. **Config 1**: lr=0.1, wd=1e-4 (He 2016 baseline)
2. **Config 2**: lr=0.05, wd=1e-4
3. **Config 3**: lr=0.01, wd=1e-4
4. **Config 4**: lr=0.1, wd=1e-3
5. **Config 5**: lr=0.1, wd=1e-5

In [17]:
# Checkpoint directory
if LOCAL:
    checkpoint_dir = f"{ROOT_DIR}/checkpoints"
else:
    checkpoint_dir = f"{DRIVE_DIR}/checkpoints"

os.makedirs(checkpoint_dir, exist_ok=True)

# Define configurations
configs = [
    dict(lr=0.1, wd=1e-4, name='config1'),
    dict(lr=0.05, wd=1e-4, name='config2'),
    dict(lr=0.01, wd=1e-4, name='config3'),
    dict(lr=0.1, wd=1e-3, name='config4'),
    dict(lr=0.1, wd=1e-5, name='config5'),
]

# Common parameters
common = dict(
    train_loader=train_loader,
    val_loader=val_loader,
    model_save_location=checkpoint_dir,
    batch_size=258,
    epochs=10,
    warmup_epochs=5,
    momentum=0.9,
)

In [None]:
# Run all configurations
results = []

for config in configs:
    run_config = {**common, **config}
    print(f"\n{'#'*80}")
    print(f"Running configuration: {config['name']}")
    print(f"  LR: {config['lr']}, WD: {config['wd']}")
    print(f"{'#'*80}\n")

    result = finetune_resnet(**run_config)
    results.append(result)

    print(f"\n✅ {config['name']} completed!\n")

print("\n" + "="*80)
print("All configurations completed!")
print("="*80)


################################################################################
Running configuration: config1
  LR: 0.1, WD: 0.0001
################################################################################

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 111MB/s]



Starting training: config1
Config: lr=0.1, wd=0.0001, epochs=10, batch_size=258



Epoch 1/10 [Train]: 100%|██████████| 192/192 [00:47<00:00,  4.08it/s, loss=2.0856, lr=0.100000]
Epoch 1/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  7.86it/s, loss=2.1194, acc=19.30%]



Epoch 1/10 Summary:
  Train Loss: 3.3916
  Val Loss:   2.1219
  Val Acc:    19.30%
  LR:         0.100000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch1.pt


Epoch 2/10 [Train]: 100%|██████████| 192/192 [00:42<00:00,  4.52it/s, loss=1.8862, lr=0.100000]
Epoch 2/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  6.99it/s, loss=2.0189, acc=29.20%]



Epoch 2/10 Summary:
  Train Loss: 2.0032
  Val Loss:   1.9286
  Val Acc:    29.20%
  LR:         0.100000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch2.pt


Epoch 3/10 [Train]: 100%|██████████| 192/192 [00:42<00:00,  4.56it/s, loss=1.6816, lr=0.100000]
Epoch 3/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.16it/s, loss=1.7781, acc=36.40%]



Epoch 3/10 Summary:
  Train Loss: 1.7881
  Val Loss:   1.6944
  Val Acc:    36.40%
  LR:         0.100000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch3.pt


Epoch 4/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.75it/s, loss=1.6208, lr=0.100000]
Epoch 4/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.46it/s, loss=1.6536, acc=44.50%]



Epoch 4/10 Summary:
  Train Loss: 1.6342
  Val Loss:   1.5335
  Val Acc:    44.50%
  LR:         0.100000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch4.pt


Epoch 5/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.78it/s, loss=1.6547, lr=0.100000]
Epoch 5/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.54it/s, loss=1.5400, acc=48.10%]



Epoch 5/10 Summary:
  Train Loss: 1.5305
  Val Loss:   1.4313
  Val Acc:    48.10%
  LR:         0.100000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch5.pt


Epoch 6/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.76it/s, loss=1.5412, lr=0.010000]
Epoch 6/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.73it/s, loss=1.3661, acc=51.50%]



Epoch 6/10 Summary:
  Train Loss: 1.4176
  Val Loss:   1.3102
  Val Acc:    51.50%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch6.pt


Epoch 7/10 [Train]: 100%|██████████| 192/192 [00:39<00:00,  4.84it/s, loss=1.4743, lr=0.010000]
Epoch 7/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  6.83it/s, loss=1.3420, acc=52.80%]



Epoch 7/10 Summary:
  Train Loss: 1.3884
  Val Loss:   1.2945
  Val Acc:    52.80%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch7.pt


Epoch 8/10 [Train]: 100%|██████████| 192/192 [00:41<00:00,  4.60it/s, loss=1.4330, lr=0.010000]
Epoch 8/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  6.35it/s, loss=1.3265, acc=53.30%]



Epoch 8/10 Summary:
  Train Loss: 1.3623
  Val Loss:   1.2810
  Val Acc:    53.30%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch8.pt


Epoch 9/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.79it/s, loss=1.3260, lr=0.010000]
Epoch 9/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.55it/s, loss=1.2574, acc=55.80%]



Epoch 9/10 Summary:
  Train Loss: 1.3506
  Val Loss:   1.2496
  Val Acc:    55.80%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch9.pt


Epoch 10/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.76it/s, loss=1.3285, lr=0.010000]
Epoch 10/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.74it/s, loss=1.2544, acc=54.40%]



Epoch 10/10 Summary:
  Train Loss: 1.3290
  Val Loss:   1.2477
  Val Acc:    54.40%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config1_epoch10.pt

✅ config1 completed!


################################################################################
Running configuration: config2
  LR: 0.05, WD: 0.0001
################################################################################


Starting training: config2
Config: lr=0.05, wd=0.0001, epochs=10, batch_size=258



Epoch 1/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.73it/s, loss=0.7658, lr=0.050000]
Epoch 1/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.74it/s, loss=0.8860, acc=71.70%]



Epoch 1/10 Summary:
  Train Loss: 1.3856
  Val Loss:   0.8668
  Val Acc:    71.70%
  LR:         0.050000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch1.pt


Epoch 2/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.73it/s, loss=0.5989, lr=0.050000]
Epoch 2/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.49it/s, loss=0.4233, acc=82.70%]



Epoch 2/10 Summary:
  Train Loss: 0.6969
  Val Loss:   0.5022
  Val Acc:    82.70%
  LR:         0.050000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch2.pt


Epoch 3/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.72it/s, loss=0.5196, lr=0.050000]
Epoch 3/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.39it/s, loss=0.4116, acc=83.90%]



Epoch 3/10 Summary:
  Train Loss: 0.5513
  Val Loss:   0.4726
  Val Acc:    83.90%
  LR:         0.050000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch3.pt


Epoch 4/10 [Train]: 100%|██████████| 192/192 [00:42<00:00,  4.53it/s, loss=0.5107, lr=0.050000]
Epoch 4/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.54it/s, loss=0.3731, acc=85.80%]



Epoch 4/10 Summary:
  Train Loss: 0.4708
  Val Loss:   0.4287
  Val Acc:    85.80%
  LR:         0.050000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch4.pt


Epoch 5/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.78it/s, loss=0.4881, lr=0.050000]
Epoch 5/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  7.29it/s, loss=0.3124, acc=88.10%]



Epoch 5/10 Summary:
  Train Loss: 0.4173
  Val Loss:   0.3427
  Val Acc:    88.10%
  LR:         0.050000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch5.pt


Epoch 6/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.76it/s, loss=0.2718, lr=0.005000]
Epoch 6/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  6.50it/s, loss=0.1923, acc=92.50%]



Epoch 6/10 Summary:
  Train Loss: 0.2988
  Val Loss:   0.2132
  Val Acc:    92.50%
  LR:         0.005000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch6.pt


Epoch 7/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.77it/s, loss=0.2711, lr=0.005000]
Epoch 7/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  7.72it/s, loss=0.1908, acc=92.50%]



Epoch 7/10 Summary:
  Train Loss: 0.2567
  Val Loss:   0.2092
  Val Acc:    92.50%
  LR:         0.005000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch7.pt


Epoch 8/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.75it/s, loss=0.1695, lr=0.005000]
Epoch 8/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.87it/s, loss=0.1987, acc=92.70%]



Epoch 8/10 Summary:
  Train Loss: 0.2396
  Val Loss:   0.2070
  Val Acc:    92.70%
  LR:         0.005000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch8.pt


Epoch 9/10 [Train]: 100%|██████████| 192/192 [00:42<00:00,  4.53it/s, loss=0.2116, lr=0.005000]
Epoch 9/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.79it/s, loss=0.1777, acc=92.40%]



Epoch 9/10 Summary:
  Train Loss: 0.2273
  Val Loss:   0.1943
  Val Acc:    92.40%
  LR:         0.005000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch9.pt


Epoch 10/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.76it/s, loss=0.1929, lr=0.005000]
Epoch 10/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.01it/s, loss=0.1964, acc=92.10%]



Epoch 10/10 Summary:
  Train Loss: 0.2162
  Val Loss:   0.1913
  Val Acc:    92.10%
  LR:         0.005000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config2_epoch10.pt

✅ config2 completed!


################################################################################
Running configuration: config3
  LR: 0.01, WD: 0.0001
################################################################################


Starting training: config3
Config: lr=0.01, wd=0.0001, epochs=10, batch_size=258



Epoch 1/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.75it/s, loss=0.6690, lr=0.010000]
Epoch 1/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.51it/s, loss=0.5972, acc=78.60%]



Epoch 1/10 Summary:
  Train Loss: 1.2094
  Val Loss:   0.6344
  Val Acc:    78.60%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch1.pt


Epoch 2/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.74it/s, loss=0.5391, lr=0.010000]
Epoch 2/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.62it/s, loss=0.3834, acc=84.00%]



Epoch 2/10 Summary:
  Train Loss: 0.6808
  Val Loss:   0.4346
  Val Acc:    84.00%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch2.pt


Epoch 3/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.77it/s, loss=0.4509, lr=0.010000]
Epoch 3/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.60it/s, loss=0.2854, acc=87.90%]



Epoch 3/10 Summary:
  Train Loss: 0.5279
  Val Loss:   0.3552
  Val Acc:    87.90%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch3.pt


Epoch 4/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.74it/s, loss=0.4766, lr=0.010000]
Epoch 4/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.00it/s, loss=0.2876, acc=88.10%]



Epoch 4/10 Summary:
  Train Loss: 0.4307
  Val Loss:   0.3209
  Val Acc:    88.10%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch4.pt


Epoch 5/10 [Train]: 100%|██████████| 192/192 [00:41<00:00,  4.59it/s, loss=0.3054, lr=0.010000]
Epoch 5/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.67it/s, loss=0.2801, acc=88.90%]



Epoch 5/10 Summary:
  Train Loss: 0.3735
  Val Loss:   0.3256
  Val Acc:    88.90%
  LR:         0.010000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch5.pt


Epoch 6/10 [Train]: 100%|██████████| 192/192 [00:40<00:00,  4.74it/s, loss=0.2808, lr=0.001000]
Epoch 6/10 [Val]: 100%|██████████| 4/4 [00:00<00:00,  9.13it/s, loss=0.2248, acc=91.30%]



Epoch 6/10 Summary:
  Train Loss: 0.2928
  Val Loss:   0.2556
  Val Acc:    91.30%
  LR:         0.001000

✅ Saved checkpoint: /content/drive/MyDrive/research-model_merge-shared/merge_soup-resnet18-cifar10/checkpoints/config3_epoch6.pt


Epoch 7/10 [Train]:  57%|█████▋    | 109/192 [00:22<00:16,  4.92it/s, loss=0.2151, lr=0.001000]

## Summary of Results

In [None]:
import pandas as pd

# Create summary table
summary = []
for r in results:
    cfg = r['config']
    hist = r['history']
    summary.append({
        'name': cfg['name'],
        'lr': cfg['lr'],
        'wd': cfg['wd'],
        'final_train_loss': hist['train_loss'][-1],
        'final_val_loss': hist['val_loss'][-1],
        'final_val_acc': f"{100*hist['val_acc'][-1]:.2f}%",
        'best_val_acc': f"{100*max(hist['val_acc']):.2f}%",
    })

df = pd.DataFrame(summary)
print("\n" + "="*80)
print("Training Summary")
print("="*80)
print(df.to_string(index=False))
print("="*80)

# Save summary
df.to_csv(f"{checkpoint_dir}/training_summary.csv", index=False)
print(f"\n✅ Summary saved to {checkpoint_dir}/training_summary.csv")

## Plot Training Curves

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Training Loss
for r in results:
    axes[0, 0].plot(r['history']['train_loss'], label=r['config']['name'])
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Train Loss')
axes[0, 0].set_title('Training Loss')
axes[0, 0].legend()
axes[0, 0].grid(True)

# Plot 2: Validation Loss
for r in results:
    axes[0, 1].plot(r['history']['val_loss'], label=r['config']['name'])
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Validation Loss')
axes[0, 1].set_title('Validation Loss')
axes[0, 1].legend()
axes[0, 1].grid(True)

# Plot 3: Validation Accuracy
for r in results:
    axes[1, 0].plot([100*x for x in r['history']['val_acc']], label=r['config']['name'])
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('Validation Accuracy (%)')
axes[1, 0].set_title('Validation Accuracy')
axes[1, 0].legend()
axes[1, 0].grid(True)

# Plot 4: Learning Rate
for r in results:
    axes[1, 1].plot(r['history']['lr'], label=r['config']['name'])
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Learning Rate')
axes[1, 1].set_title('Learning Rate Schedule')
axes[1, 1].legend()
axes[1, 1].grid(True)
axes[1, 1].set_yscale('log')

plt.tight_layout()
plt.savefig(f"{checkpoint_dir}/training_curves.png", dpi=150, bbox_inches='tight')
plt.show()

print(f"✅ Training curves saved to {checkpoint_dir}/training_curves.png")