# 🧠 ResNet-50 Training on ImageNet 100 Subset
This notebook (run on Kaggle; the download cell still extracts to a Colab-style `/content` path) will:
- 📥 Download ImageNet 100 dataset
- 🧠 Train a ResNet-50 at 224×224 resolution
- 💾 Save the best checkpoint
- 🧪 Evaluate accuracy

In [None]:
# Optional: restore a previously saved checkpoint from an attached Kaggle input dataset.
# The directory name must be `checkpoints` (plural) to match the copy target and the `ls` below.
# !mkdir -p /kaggle/working/checkpoints
# !cp /kaggle/input/checkpoint1/*.pth /kaggle/working/checkpoints/
!ls /kaggle/working/checkpoints/model_checkpoint.pth

/kaggle/working/checkpoints/model_checkpoint.pth


In [15]:

# 🧰 Kaggle Environment Setup
import os

# Create the checkpoint directory up front; exist_ok=True makes this cell safe to re-run
os.makedirs("/kaggle/working/checkpoints", exist_ok=True)

# Show the attached GPU (shell command; its output is informational only)
!nvidia-smi

# Optional: Install libraries if needed
# !pip install torch torchvision pytorch-lightning --quiet


Mon Oct 20 07:58:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:

# 💾 Helper functions to save and load training state (model + optimizer + epoch)
import torch

def save_checkpoint(model, optimizer, epoch, path="/kaggle/working/checkpoints/model_checkpoint.pth",
                    scheduler=None, best_val_acc=None):
    """Persist the training state so a later session can resume from it.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under ``'optimizer_state_dict'``.
        epoch: Index of the epoch that just finished; stored under ``'epoch'``.
        path: Destination file. Its parent directory is created if missing.
        scheduler: Optional LR scheduler. When given, its ``state_dict()`` is stored under
            ``'scheduler_state_dict'`` (the key the resume cell looks for).
        best_val_acc: Optional best validation accuracy so far. When given, it is stored
            under ``'best_val_acc'``.

    The optional keys are only written when the corresponding argument is supplied, so
    calls using the original three/four-argument form produce the same file layout as before.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }
    if scheduler is not None:
        checkpoint['scheduler_state_dict'] = scheduler.state_dict()
    if best_val_acc is not None:
        checkpoint['best_val_acc'] = best_val_acc

    # torch.save does not create missing directories
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file and rename, so an interrupted session cannot leave a
    # truncated checkpoint in place of the previous good one
    tmp_path = f"{path}.tmp"
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, path)
    print(f"✅ Checkpoint saved at {path}")

def load_checkpoint(model, optimizer, path="/kaggle/working/checkpoints/model_checkpoint.pth", scheduler=None):
    """Restore training state from ``path`` and return the epoch to resume at.

    Args:
        model: Module that receives ``checkpoint['model_state_dict']``.
        optimizer: Optimizer that receives ``checkpoint['optimizer_state_dict']``.
        path: Checkpoint file to read.
        scheduler: Optional LR scheduler. When given and the checkpoint contains
            ``'scheduler_state_dict'``, the scheduler is restored too, so the learning-rate
            schedule continues instead of restarting from its initial state.

    Returns:
        ``checkpoint['epoch'] + 1`` when the file exists, otherwise ``0``.
    """
    if not os.path.exists(path):
        print("⚠️ No checkpoint found, starting from scratch")
        return 0

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source
    checkpoint = torch.load(path, map_location="cuda" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Checkpoints without scheduler state are still accepted
    if scheduler is not None and 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    start_epoch = checkpoint['epoch'] + 1
    print(f"✅ Checkpoint loaded from {path}, starting at epoch {start_epoch}")
    return start_epoch

# 📤 Save a copy of the checkpoint to root for Kaggle Output persistence
import shutil

def export_checkpoint_for_kaggle(src="/kaggle/working/checkpoints/model_checkpoint.pth",
                                 dst="/kaggle/working/model_checkpoint.pth"):
    """Copy a checkpoint file to ``dst`` so it is kept with the Kaggle output.

    Args:
        src: Checkpoint file to copy. Defaults to the rolling training checkpoint.
        dst: Destination path. Defaults to the root of ``/kaggle/working``.

    Returns:
        None. When ``src`` does not exist nothing is copied and a warning is printed,
        rather than silently doing nothing.
    """
    if not os.path.exists(src):
        print(f"⚠️ Nothing to export: {src} does not exist")
        return

    shutil.copy(src, dst)
    print(f"✅ Exported checkpoint to {dst} for Kaggle Output persistence")

In [None]:
import json, os

# ✅ Hardcoded Kaggle credentials (replace with your actual values)
kaggle_credentials = {"username":"<placeholder>","key":"<placeholder>"}

# Write kaggle.json file
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
with open(os.path.expanduser("~/.kaggle/kaggle.json"), "w") as f:
    json.dump(kaggle_credentials, f)

# Set permissions
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)
!kaggle datasets list -s imagenet | head

ref                                                       title                                                size  lastUpdated                 downloadCount  voteCount  usabilityRating  
--------------------------------------------------------  --------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  
wanghaohan/imagenetsketch                                 ImageNet-Sketch                               15219198396  2019-06-18 13:45:50.200000           4239         62  0.625            
vitaliykinakh/stable-imagenet1k                           Stable ImageNet-1K                            10519983320  2022-09-08 22:19:54.953000           3665         47  0.9375           
akash2sharma/tiny-imagenet                                Tiny ImageNet                                   497536564  2018-09-27 12:11:20.917000          20604        219  0.3125           
deeptrial/miniimagenet                                 

In [None]:
# 📥 Step 1: Download ImageNet-100 (Kaggle dataset wilyzh/imagenet100)
# NOTE(review): this extracts to /content/imagenet_mini, but the later cells read the data
# from /kaggle/input/imagenet100/ — this download looks unnecessary when that input
# dataset is attached; confirm before running.
!kaggle datasets download -d wilyzh/imagenet100
!unzip -q imagenet100.zip -d /content/imagenet_mini
!ls /content/imagenet_mini

Dataset URL: https://www.kaggle.com/datasets/wilyzh/imagenet100
License(s): unknown
^C
User cancelled operation


In [None]:
# # 📥 Step 1: Download ImageNet Mini (1000 classes) 3GB
# !kaggle datasets download -d ifigotin/imagenetmini-1000
# !unzip -q imagenetmini-1000.zip -d /content/imagenet_mini
# !ls /content/imagenet_mini

In [None]:
# 📂 Step 2: Locate the ImageNet-100 train/val folders and list the available classes
# (no subset is picked here — every class directory under train/ is used)
import os, random, shutil

src_train = '/kaggle/input/imagenet100/ImageNet100/train/'
src_val = '/kaggle/input/imagenet100/ImageNet100/val/'

# Only sub-directories count as classes; a stray file (e.g. a label list or README)
# in the train folder would otherwise inflate the class count
all_classes = sorted(
    entry for entry in os.listdir(src_train)
    if os.path.isdir(os.path.join(src_train, entry))
)
print(f"Total classes available: {len(all_classes)}")


Total classes available: 100


In [None]:
# 🧠 Step 4: Data loaders with advanced augmentation
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.data import RandomSampler
from model import ResNet50
from train import train, train_transforms, test_transforms
from test import evaluate

# Loader configuration shared by the training and validation pipelines
BATCH_SIZE = 128
NUM_WORKERS = 4

# ImageFolder datasets: augmented transforms for training, test transforms for validation
train_dataset = datasets.ImageFolder(src_train, transform=train_transforms(augment=True))
val_dataset = datasets.ImageFolder(src_val, transform=test_transforms())

# Settings common to both loaders; only `shuffle` differs between them.
# pin_memory is intended to speed up host-to-GPU copies; prefetch_factor=2 keeps two
# batches queued per worker.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    prefetch_factor=2,
)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# Class count as seen by the dataset that actually produces the labels
num_classes = len(train_dataset.classes)
print(f'✅ {num_classes} classes | {len(train_dataset)} train | {len(val_dataset)} val')

✅ 100 classes | 126689 train | 5000 val


In [None]:
# 🏋️ Step 5: Train ResNet-50 with optimizations
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np

# Training hyperparameters (consumed by the model/optimizer/scheduler setup and the training loop)
EPOCHS = 150  # Total epochs: sizes the OneCycleLR schedule and bounds the training loop
LABEL_SMOOTHING = 0.15  # Passed to nn.CrossEntropyLoss(label_smoothing=...)
MAX_LR = 2e-3  # Used both as the AdamW lr and as the OneCycleLR max_lr
MIN_LR = 1e-6  # NOTE(review): not referenced by any visible cell — confirm it is still needed
WEIGHT_DECAY = 2e-4  # AdamW weight decay
GRAD_CLIP = 1.0  # NOTE(review): not referenced by any visible cell; presumably a gradient-clipping norm used inside train() — confirm
MIXUP_ALPHA = 0.4  # Default Beta(alpha, alpha) parameter of mixup_data

# Initialize model using the custom ResNet50 implementation.
# The head is sized from `num_classes` (taken from train_dataset.classes), i.e. from the
# dataset that actually produces the labels, rather than from a raw directory listing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet50(num_classes=num_classes)
model = model.to(device)

# Loss function with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)

# Optimizer with decoupled weight decay
optimizer = optim.AdamW(
    model.parameters(),
    lr=MAX_LR,
    weight_decay=WEIGHT_DECAY,
    betas=(0.9, 0.999)
)

# One-cycle learning-rate schedule sized as steps_per_epoch * EPOCHS scheduler steps.
# NOTE(review): a schedule sized this way expects scheduler.step() once per batch;
# the training loop in this notebook steps it once per epoch — confirm which is intended.
steps_per_epoch = len(train_loader)
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=MAX_LR,
    steps_per_epoch=steps_per_epoch,
    epochs=EPOCHS,
    pct_start=0.15,  # 15% of the schedule is warmup
    anneal_strategy='cos',
    cycle_momentum=True,
    base_momentum=0.85,
    max_momentum=0.95,
    div_factor=25.0,  # initial_lr = max_lr / div_factor
    final_div_factor=1000.0  # min_lr = initial_lr / final_div_factor
)

# Enhanced Mixup augmentation
def mixup_data(x, y, alpha=MIXUP_ALPHA):
    """Blend each sample in a batch with another sample drawn from the same batch.

    Args:
        x: Batch tensor; the first dimension is indexed as the batch dimension.
        y: Targets indexed along the same first dimension as ``x``.
        alpha: Parameter of the ``Beta(alpha, alpha)`` distribution the mixing weight is
            drawn from. When ``alpha <= 0`` no mixing happens (weight 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is the original targets, ``y_b`` the permuted targets and ``lam`` the
        mixing weight.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    batch_size = x.size(0)
    # Build the permutation on the input's own device instead of the notebook-global
    # `device`, so the function also works for tensors that live elsewhere
    index = torch.randperm(batch_size, device=x.device)

    # Plain first-axis indexing works for inputs of any rank
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the loss against both mixup targets, weighted by ``lam`` and ``1 - lam``.

    Args:
        criterion: Callable invoked as ``criterion(pred, target)``.
        pred: Predictions passed unchanged to ``criterion``.
        y_a: First set of targets, weighted by ``lam``.
        y_b: Second set of targets, weighted by ``1 - lam``.
        lam: Mixing weight.

    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# Best validation accuracy seen so far; starts at zero
best_val_acc = 0

# Checkpoint locations: the rolling "latest" file and the best-scoring snapshot
checkpoint_dir = '/kaggle/working/checkpoints'
checkpoint_path = checkpoint_dir + '/model_checkpoint.pth'
best_model_path = checkpoint_dir + '/model_best.pth'

# Make sure the directory exists before anything is written into it
os.makedirs(checkpoint_dir, exist_ok=True)

In [33]:
# 🔁 Auto Resume Training if Checkpoint Exists
# Called with only the model and optimizer, so this call restores neither the scheduler
# state nor best_val_acc; the following cell reloads the same file for those.
# start_epoch is 0 when no checkpoint file is found.
start_epoch = load_checkpoint(model, optimizer)


✅ Checkpoint loaded from /kaggle/working/checkpoints/model_checkpoint.pth, starting at epoch 68


In [35]:
# 🔁 Full resume: restore model, optimizer, scheduler and best accuracy from the latest checkpoint
resume_path = '/kaggle/working/checkpoints/model_checkpoint.pth'

if os.path.exists(resume_path):
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source
    checkpoint = torch.load(resume_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Checkpoints that carry no scheduler state are tolerated
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    # Cell code already runs at module scope, so a plain assignment updates the
    # notebook-level best_val_acc (no `global` statement is needed)
    best_val_acc = checkpoint.get('best_val_acc', 0)
    start_epoch = checkpoint['epoch'] + 1

    print(f"📊 Resumed from epoch {checkpoint['epoch']}")
    print(f"🎯 Best validation accuracy: {best_val_acc:.2f}%")
else:
    # Fresh run: keep the notebook runnable top-to-bottom when nothing has been saved yet
    start_epoch = 0
    print("⚠️ No checkpoint found, starting from scratch")

📊 Resumed from epoch 67
🎯 Best validation accuracy: 84.20%


In [None]:
# Required imports for mixup
import numpy as np

# Training loop with per-epoch validation, checkpointing and early stopping.
# Early-stopping state: stop after `patience` consecutive epochs without a new best accuracy.
patience = 10
patience_counter = 0
best_epoch = 0

print(f"🚀 Starting training for {EPOCHS} epochs...")
print(f"💡 Using device: {device}")
print(f"📊 Training samples: {len(train_dataset)}")
print(f"📊 Validation samples: {len(val_dataset)}")

for epoch in range(start_epoch, EPOCHS):
    print(f"\n📅 EPOCH: {epoch+1}/{EPOCHS}")

    # Release cached GPU memory between epochs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Log the learning rate the epoch starts with
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current learning rate: {current_lr:.6f}")

    train(model, train_loader, criterion, optimizer, device, epoch)

    # Validation phase. The import cell brings in `evaluate` from the `test` module; the
    # bare name `test` is never bound, so calling it would raise NameError on a fresh kernel.
    # NOTE(review): assumes evaluate(model, loader, device) returns accuracy in percent — confirm.
    val_acc = evaluate(model, test_loader, device)

    # Advance the scheduler BEFORE snapshotting, so the saved scheduler state is the one the
    # next epoch starts from; saving first would make every resume lag the schedule by a step.
    # NOTE(review): the OneCycleLR above is sized for steps_per_epoch * EPOCHS steps but is
    # stepped once per epoch here, and train() is not handed the scheduler — confirm intent.
    scheduler.step()

    # Update best-accuracy bookkeeping first so the checkpoint records the new best value
    improved = val_acc > best_val_acc
    if improved:
        improvement = val_acc - best_val_acc
        best_val_acc = val_acc
        best_epoch = epoch
        patience_counter = 0
    else:
        patience_counter += 1

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_val_acc': best_val_acc,
    }

    # Save regular (rolling) checkpoint every epoch
    torch.save(checkpoint, checkpoint_path)
    print(f"💾 Checkpoint saved at epoch {epoch+1}")

    if improved:
        # Save best model
        torch.save(checkpoint, best_model_path)
        print(f"🏆 New best model saved! (val_acc={val_acc:.2f}%, improvement: +{improvement:.2f}%)")

        # Export both checkpoint files to the Kaggle output root
        kaggle_output = '/kaggle/working'
        for filename in ['model_checkpoint.pth', 'model_best.pth']:
            src = os.path.join(checkpoint_dir, filename)
            dst = os.path.join(kaggle_output, filename)
            if os.path.exists(src):
                shutil.copy(src, dst)
                print(f"📤 Exported {filename} to Kaggle output")
    else:
        print(f"⏳ No improvement for {patience_counter} epochs. "
              f"Best accuracy: {best_val_acc:.2f}% (epoch {best_epoch+1})")

    # Early stopping
    if patience_counter >= patience:
        print(f"\n⚠️ Early stopping triggered! No improvement for {patience} epochs.")
        print(f"🎯 Best validation accuracy: {best_val_acc:.2f}% (achieved at epoch {best_epoch+1})")
        break

print("\n✅ Training completed!")
print(f"🎯 Best validation accuracy: {best_val_acc:.2f}% (achieved at epoch {best_epoch+1})")

🚀 Starting training for 100 epochs...
💡 Using device: cuda
📊 Training samples: 126689
📊 Validation samples: 5000

📅 EPOCH: 69/100


Epoch 69/100: 100%|██████████| 990/990 [15:43<00:00,  1.05it/s, loss=1.716, acc=43.47%, lr=0.000265]
Evaluating: 100%|██████████| 40/40 [00:35<00:00,  1.13it/s, loss=0.011, acc=84.20%]



Test set: Average loss: 0.011, Accuracy: 84.20%
💾 Checkpoint saved at epoch 69
⏳ No improvement for 1 epochs. Best accuracy: 84.20%

📅 EPOCH: 70/100


Epoch 70/100: 100%|██████████| 990/990 [11:16<00:00,  1.46it/s, loss=1.725, acc=43.18%, lr=0.000250]
Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.77it/s, loss=0.011, acc=83.90%]



Test set: Average loss: 0.011, Accuracy: 83.90%
💾 Checkpoint saved at epoch 70
⏳ No improvement for 2 epochs. Best accuracy: 84.20%

📅 EPOCH: 71/100


Epoch 71/100: 100%|██████████| 990/990 [11:16<00:00,  1.46it/s, loss=1.745, acc=42.46%, lr=0.000235]
Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.75it/s, loss=0.011, acc=83.94%]



Test set: Average loss: 0.011, Accuracy: 83.94%
💾 Checkpoint saved at epoch 71
⏳ No improvement for 3 epochs. Best accuracy: 84.20%

📅 EPOCH: 72/100


Epoch 72/100: 100%|██████████| 990/990 [11:15<00:00,  1.47it/s, loss=1.691, acc=42.56%, lr=0.000220]
Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.76it/s, loss=0.011, acc=83.68%]



Test set: Average loss: 0.011, Accuracy: 83.68%
💾 Checkpoint saved at epoch 72
⏳ No improvement for 4 epochs. Best accuracy: 84.20%

📅 EPOCH: 73/100


Epoch 73/100: 100%|██████████| 990/990 [11:36<00:00,  1.42it/s, loss=1.711, acc=44.66%, lr=0.000206]
Evaluating: 100%|██████████| 40/40 [00:24<00:00,  1.66it/s, loss=0.011, acc=83.88%]



Test set: Average loss: 0.011, Accuracy: 83.88%
💾 Checkpoint saved at epoch 73
⏳ No improvement for 5 epochs. Best accuracy: 84.20%

⚠️ Early stopping triggered! No improvement for 5 epochs.
🎯 Best validation accuracy: 84.20%

✅ Training completed!
🎯 Best validation accuracy: 84.20%


In [None]:
# # Plotting results
# import matplotlib.pyplot as plt
# fig, axs = plt.subplots(2,2,figsize=(15,10))
# axs[0, 0].plot([t.item() for t in train_losses])
# axs[0, 0].set_title("Training Loss")
# axs[1, 0].plot(train_acc)
# axs[1, 0].set_title("Training Accuracy")
# axs[0, 1].plot(test_losses)
# axs[0, 1].set_title("Test Loss")
# axs[1, 1].plot(test_acc)
# axs[1, 1].set_title("Test Accuracy")
# plt.show()