# CIFAR-100 Training - Session 05 (Conservative Optimization)

## Target: 74-75% Accuracy in 100 Epochs

### Conservative Optimization Strategy:
1. **Progressive Augmentation** (2 phases): Full → Reduced
2. **Progressive Dropout**: 0.3 → 0.2
3. **Two-Phase LR**: Cosine warm restarts (T_0=25) for epochs 1–50 → smooth cosine decay for epochs 51–100
4. **Keep Max LR**: 0.1 (same as Session-02)
5. **Progressive MixUp**: 0.2 → 0.15
6. **Keep Label Smoothing**: 0.1 (no change)

### Storage Strategy:
- **Google Drive**: A checkpoint is saved every epoch; auto-cleanup keeps only the 5 most recent, and `best_model.pth` is always kept
- **HuggingFace**: Upload every 10 epochs + best model

### Improvements from Session-02 (71.20%):
- Expected gain: +3-4% (target: 74-75%)

In [None]:
# Import Libraries
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn as nn
from torchsummary import summary
from torch.cuda.amp import autocast, GradScaler
import matplotlib.pyplot as plt
import os
import json
import shutil
from datetime import datetime

# Normalization statistics for CIFAR-100 (three values each, presumably one per RGB channel).
cifar100_std = (0.2673, 0.2564, 0.2761)
cifar100_mean = (0.5071, 0.4865, 0.4409)

# Run on CUDA when it is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")
print(f"PyTorch version: {torch.__version__}")

In [None]:
# Mount Google Drive for checkpoint storage.
# Defines GDRIVE_DIR (where checkpoints are mirrored) and USE_GDRIVE (whether a
# real Drive mount backs that directory).
try:
    from google.colab import drive
    drive.mount('/content/drive')
    GDRIVE_DIR = '/content/drive/MyDrive/cifar100_checkpoints_session05'
    os.makedirs(GDRIVE_DIR, exist_ok=True)
    print(f"✓ Google Drive mounted: {GDRIVE_DIR}")
    USE_GDRIVE = True
except Exception as e:
    # Catch Exception rather than using a bare ``except`` so KeyboardInterrupt and
    # SystemExit still propagate; any import/mount failure falls back to a local folder.
    print("⚠ Google Drive not available (not running in Colab)")
    print(f"  Reason: {e}")
    GDRIVE_DIR = './checkpoints_gdrive_backup'
    os.makedirs(GDRIVE_DIR, exist_ok=True)
    USE_GDRIVE = False

In [None]:
# HuggingFace Setup
# Defines HF_TOKEN (None when no token could be read), REPO_ID and the shared
# HfApi client `api`; creates the model repository when a token is available.
try:
    # The token is read from the Colab secrets store under the key 'HF_TOKEN'.
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("✓ HuggingFace token retrieved from Colab secrets")
except Exception as e:
    # Any failure (including not running in Colab) leaves HF_TOKEN as None.
    print(f"Warning: Could not retrieve HF_TOKEN: {e}")
    HF_TOKEN = None

try:
    from huggingface_hub import HfApi, create_repo
except ImportError:
    # NOTE: `!pip` is IPython/Jupyter shell syntax; this cell only runs inside a notebook.
    !pip install -q huggingface_hub
    from huggingface_hub import HfApi, create_repo

REPO_ID = 'pandurangpatil/cifar100-wideresnet-session05'
api = HfApi()

if HF_TOKEN:
    try:
        # exist_ok=True is passed so an already existing repository is accepted.
        create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)
        print(f"✓ HuggingFace repository ready: https://huggingface.co/{REPO_ID}")
    except Exception as e:
        # Best effort: a repository failure is reported but does not stop the notebook.
        print(f"Warning: Could not create repository: {e}")
else:
    print("⚠ HuggingFace upload will be skipped (no token)")

In [None]:
# Progressive Data Augmentation (2 Phases - Conservative)
class ProgressiveAlbumentationsTransforms:
    """Callable Albumentations pipeline whose strength depends on ``phase``.

    ``phase == 'full'`` builds the stronger pipeline (the training schedule in
    this notebook uses it for epochs 1-60); any other value builds the reduced
    pipeline (used for epochs 61-100). Both end with normalization by
    ``mean``/``std`` followed by ``ToTensorV2``.
    """

    def __init__(self, mean, std, phase='full'):
        if phase == 'full':
            # Stronger geometric, cutout and colour perturbations.
            steps = [
                A.HorizontalFlip(p=0.5),
                A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=0.5),
                A.CoarseDropout(max_holes=1, max_height=8, max_width=8, p=0.5, fill_value=0),
                A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3),
                A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.3),
                A.Normalize(mean=mean, std=std),
                ToTensorV2(),
            ]
        else:
            # Milder limits and probabilities; no hue/saturation jitter.
            steps = [
                A.HorizontalFlip(p=0.5),
                A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.07, rotate_limit=10, p=0.35),
                A.CoarseDropout(max_holes=1, max_height=6, max_width=6, p=0.35, fill_value=0),
                A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.2),
                A.Normalize(mean=mean, std=std),
                ToTensorV2(),
            ]
        self.phase = phase
        self.aug = A.Compose(steps)

    def __call__(self, img):
        """Convert ``img`` to a NumPy array, augment it and return the resulting image."""
        augmented = self.aug(image=np.array(img))
        return augmented["image"]

def get_augmentation_phase(epoch):
    """Return 'full' for epochs up to and including 60, otherwise 'reduced'."""
    return 'full' if epoch <= 60 else 'reduced'

# Evaluation pipeline: tensor conversion plus normalization only (no augmentation).
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(std=cifar100_std, mean=cifar100_mean),
])

# Training starts in the 'full' augmentation phase.
train_transforms = ProgressiveAlbumentationsTransforms(
    phase='full',
    mean=cifar100_mean,
    std=cifar100_std,
)

print("✓ Progressive augmentation initialized (Phase: full)")

In [None]:
# Load CIFAR-100 Dataset
# Both splits are downloaded to ./data if missing; each split gets its own transform.
train_dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=train_transforms)
test_dataset = datasets.CIFAR100(root='./data', train=False, download=True, transform=test_transforms)

BATCH_SIZE = 256
# Only the training loader shuffles; evaluation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

# Reuse the already-loaded test split instead of instantiating the dataset again
# just to read its class names.
cifar100_classes = test_dataset.classes
print(f"Training batches: {len(train_loader)}, Test batches: {len(test_loader)}")

In [None]:
# WideResNet Architecture with Dynamic Dropout
class BasicBlock(nn.Module):
    """Residual block applying BN -> ReLU -> conv twice, with optional dropout.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride of the first 3x3 convolution (and of the 1x1 shortcut).
        dropRate: Dropout probability applied before the second convolution;
            read on every forward pass, so it may be changed after construction.

    When ``in_planes != out_planes`` a 1x1 convolution projects the shortcut;
    otherwise the input is added back unchanged and ``shortcut`` is ``None``.
    """

    def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
        super().__init__()
        self.equalInOut = in_planes == out_planes
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.dropRate = dropRate
        # Explicit conditional expression instead of the fragile ``cond and a or b`` idiom.
        self.shortcut = (
            None
            if self.equalInOut
            else nn.Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)
        )

    def forward(self, x):
        """Return the residual sum of the transformed input and the (projected) shortcut."""
        activated = self.relu1(self.bn1(x))
        if not self.equalInOut:
            # With a projection shortcut, the shortcut also consumes the pre-activated input.
            x = activated
        out = self.conv1(activated)
        out = self.relu2(self.bn2(out))
        if self.dropRate > 0:
            out = F.dropout(out, p=self.dropRate, training=self.training)
        out = self.conv2(out)
        residual = x if self.equalInOut else self.shortcut(x)
        return out + residual

class NetworkBlock(nn.Module):
    """Sequential stack of ``nb_layers`` blocks built from the ``block`` factory.

    Only the first block receives ``in_planes`` and ``stride``; every following
    block maps ``out_planes`` to ``out_planes`` with stride 1.
    """

    def __init__(self, nb_layers, in_planes, out_planes, block, stride, dropRate=0.0):
        super().__init__()
        self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, dropRate)

    def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, dropRate):
        """Instantiate the blocks and wrap them in an ``nn.Sequential``."""
        layers = []
        for i in range(nb_layers):
            is_first = i == 0
            # Conditional expressions instead of ``cond and a or b``, which silently
            # picks the fallback whenever the first operand is falsy.
            layers.append(block(
                in_planes if is_first else out_planes,
                out_planes,
                stride if is_first else 1,
                dropRate,
            ))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked blocks."""
        return self.layer(x)

class WideResNet(nn.Module):
    """Wide residual network built from three ``NetworkBlock`` groups.

    Args:
        depth: Total depth; ``depth - 4`` must be divisible by 6 because each of
            the three groups holds ``(depth - 4) // 6`` blocks.
        num_classes: Size of the final linear layer's output.
        widen_factor: Multiplier applied to the group widths (16, 32, 64).
        dropRate: Dropout probability forwarded to every block.

    Raises:
        ValueError: If ``depth`` does not satisfy ``(depth - 4) % 6 == 0``.
    """

    def __init__(self, depth=28, num_classes=100, widen_factor=10, dropRate=0.3):
        super().__init__()
        # Raise explicitly rather than assert: asserts are stripped under ``python -O``.
        if (depth - 4) % 6 != 0:
            raise ValueError(f"depth must satisfy (depth - 4) % 6 == 0, got {depth}")
        nChannels = [16, 16*widen_factor, 32*widen_factor, 64*widen_factor]
        n = (depth - 4) // 6
        block = BasicBlock
        self.conv1 = nn.Conv2d(3, nChannels[0], kernel_size=3, stride=1, padding=1, bias=False)
        # The first group keeps the resolution; the second and third use stride 2.
        self.block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, dropRate)
        self.block2 = NetworkBlock(n, nChannels[1], nChannels[2], block, 2, dropRate)
        self.block3 = NetworkBlock(n, nChannels[2], nChannels[3], block, 2, dropRate)
        self.bn1 = nn.BatchNorm2d(nChannels[3])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(nChannels[3], num_classes)
        self.nChannels = nChannels[3]

        # Weight initialisation per layer type.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        # Global average pooling down to 1x1, then flatten to (batch, channels).
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(-1, self.nChannels)
        return self.fc(out)

def update_model_dropout(model, new_dropout):
    """Set ``dropRate`` to ``new_dropout`` on every ``BasicBlock`` in ``model`` and report the count."""
    residual_blocks = [m for m in model.modules() if isinstance(m, BasicBlock)]
    for blk in residual_blocks:
        blk.dropRate = new_dropout
    print(f"✓ Updated dropout to {new_dropout} in {len(residual_blocks)} blocks")

# Build the WideResNet (depth 28, widen factor 10) and move it to the selected device.
model = WideResNet(num_classes=100, depth=28, widen_factor=10, dropRate=0.3)
model = model.to(device)
param_total = sum(param.numel() for param in model.parameters())
print(f"✓ Model initialized with {param_total:,} parameters")

In [None]:
# MixUp function
def mixup_data(x, y, alpha=0.2, device=None):
    """Mix each sample with a randomly chosen partner from the same batch.

    Args:
        x: Input batch; mixed along its first dimension.
        y: Targets aligned with ``x``.
        alpha: Beta-distribution parameter; ``alpha <= 0`` disables mixing (``lam = 1``).
        device: Device for the permutation index. Defaults to ``x.device`` so the
            function also works on CPU-only hosts when the argument is omitted.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y`` and ``y_b`` is ``y[perm]``.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index_device = x.device if device is None else device
    # The permutation is still drawn on the CPU and then moved, so the random
    # stream matches what the previous implementation consumed.
    index = torch.randperm(x.size(0)).to(index_device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

# Warmup Scheduler
class WarmupScheduler:
    """Linear learning-rate ramp applied directly to an optimizer's param groups.

    The ramp spans ``warmup_epochs * steps_per_epoch`` calls to ``step()``. Each
    call during warmup writes the rate for the current step count and then
    advances the counter; later calls only advance the counter.
    """

    def __init__(self, optimizer, warmup_epochs, initial_lr, target_lr, steps_per_epoch):
        self.optimizer = optimizer
        self.initial_lr = initial_lr
        self.target_lr = target_lr
        self.warmup_steps = warmup_epochs * steps_per_epoch
        self.current_step = 0

    def is_warmup(self):
        """Return True while fewer than ``warmup_steps`` steps have been taken."""
        return self.current_step < self.warmup_steps

    def step(self):
        """Write the interpolated rate to every param group (during warmup) and advance."""
        if self.is_warmup():
            span = self.target_lr - self.initial_lr
            new_lr = self.initial_lr + span * self.current_step / self.warmup_steps
            for group in self.optimizer.param_groups:
                group['lr'] = new_lr
        self.current_step += 1

In [None]:
# Google Drive Checkpoint Manager
class GoogleDriveCheckpointManager:
    """Mirror checkpoints into a directory, keeping a rolling window plus the best model.

    Args:
        gdrive_dir: Destination directory (created if missing).
        max_keep: Number of most recent per-epoch checkpoints to retain.

    Only checkpoints saved through this instance are tracked for cleanup;
    ``best_model.pth`` is never removed by the cleanup.
    """

    def __init__(self, gdrive_dir, max_keep=5):
        self.gdrive_dir = gdrive_dir
        self.max_keep = max_keep
        self.checkpoints = []  # (epoch, path) pairs, oldest first
        os.makedirs(gdrive_dir, exist_ok=True)

    def save(self, checkpoint_path, epoch, is_best=False):
        """Copy ``checkpoint_path`` into the managed directory.

        Every call stores ``checkpoint_epochNNN.pth`` and prunes the rolling
        window; when ``is_best`` is true the file is additionally copied to
        ``best_model.pth``. Previously a best epoch was stored only as
        ``best_model.pth`` and therefore never entered the rolling window.
        Failures are reported and swallowed so that a storage problem does not
        abort training.
        """
        try:
            gdrive_path = os.path.join(self.gdrive_dir, f'checkpoint_epoch{epoch:03d}.pth')
            shutil.copy(checkpoint_path, gdrive_path)
            self.checkpoints.append((epoch, gdrive_path))
            self._prune()
            print(f"  → Saved to GDrive: checkpoint_epoch{epoch:03d}.pth")

            if is_best:
                # Always keep best model
                best_path = os.path.join(self.gdrive_dir, 'best_model.pth')
                shutil.copy(checkpoint_path, best_path)
                print(f"  → Saved to GDrive: best_model.pth")
        except Exception as e:
            print(f"  ✗ GDrive save failed: {e}")

    def _prune(self):
        """Delete the oldest tracked checkpoints until at most ``max_keep`` remain."""
        while len(self.checkpoints) > self.max_keep:
            old_epoch, old_path = self.checkpoints.pop(0)
            if os.path.exists(old_path):
                os.remove(old_path)
                print(f"  → Cleaned up old checkpoint: epoch{old_epoch:03d}")

    def list_checkpoints(self):
        """Return the sorted names of all ``.pth`` files in the managed directory."""
        return sorted(f for f in os.listdir(self.gdrive_dir) if f.endswith('.pth'))

# Shared manager that mirrors checkpoints into GDRIVE_DIR.
gdrive_manager = GoogleDriveCheckpointManager(gdrive_dir=GDRIVE_DIR, max_keep=5)
print("✓ Google Drive checkpoint manager initialized")

In [None]:
# HuggingFace Upload Functions
def upload_to_huggingface(file_path, path_in_repo, commit_message="Upload checkpoint"):
    """Upload one file to the HuggingFace model repository ``REPO_ID``.

    Does nothing when ``HF_TOKEN`` is falsy. Upload errors are printed and
    swallowed, so the caller is never interrupted. Returns ``None``.
    """
    if HF_TOKEN:
        upload_kwargs = {
            'path_or_fileobj': file_path,
            'path_in_repo': path_in_repo,
            'repo_id': REPO_ID,
            'repo_type': "model",
            'token': HF_TOKEN,
            'commit_message': commit_message,
        }
        try:
            api.upload_file(**upload_kwargs)
            print(f"  → Uploaded to HF: {path_in_repo}")
        except Exception as err:
            print(f"  ✗ HF upload failed: {err}")

# Epochs at which the periodic checkpoint and metrics are uploaded: 10, 20, ..., 100.
HF_UPLOAD_EPOCHS = list(range(10, 101, 10))
print(f"✓ HuggingFace uploads scheduled for epochs: {HF_UPLOAD_EPOCHS}")

In [None]:
# Training and Testing Functions
def train(model, device, train_loader, optimizer, scheduler, warmup_scheduler, scaler, epoch,
          use_mixup=True, mixup_alpha=0.2, label_smoothing=0.1):
    """Train ``model`` for one pass over ``train_loader`` under autocast.

    Per batch: forward under ``autocast`` (optionally with MixUp), scaled
    backward, unscale, clip gradients to norm 1.0, optimizer step, then one
    step of ``warmup_scheduler`` while it reports warmup, otherwise one step of
    ``scheduler``.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        accuracy in percent. With MixUp, correct counts are weighted by
        ``lam`` between the two target sets.
    """
    model.train()
    progress = tqdm(train_loader)
    n_correct = 0
    n_seen = 0
    loss_sum = 0

    def smoothed_ce(logits, labels):
        # Cross-entropy with the label smoothing configured for this epoch.
        return F.cross_entropy(logits, labels, label_smoothing=label_smoothing)

    for data, target in progress:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        with autocast():
            if use_mixup:
                inputs, targets_a, targets_b, lam = mixup_data(data, target, alpha=mixup_alpha, device=device)
                outputs = model(inputs)
                loss = lam * smoothed_ce(outputs, targets_a) + (1 - lam) * smoothed_ce(outputs, targets_b)
            else:
                outputs = model(data)
                loss = smoothed_ce(outputs, target)

        # Gradients are unscaled before clipping so the norm limit applies to true values.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Exactly one of the two schedulers advances per batch.
        if warmup_scheduler.is_warmup():
            warmup_scheduler.step()
        else:
            scheduler.step()

        pred = outputs.max(1)[1]
        if use_mixup:
            n_correct += lam * pred.eq(targets_a).sum().item() + (1 - lam) * pred.eq(targets_b).sum().item()
        else:
            n_correct += pred.eq(target).sum().item()
        n_seen += len(data)
        batch_loss = loss.item()
        loss_sum += batch_loss

        lr_now = optimizer.param_groups[0]['lr']
        progress.set_description(f"Epoch {epoch} Loss={batch_loss:.4f} Acc={100*n_correct/n_seen:.2f}% LR={lr_now:.6f}")

    return loss_sum / len(train_loader), 100. * n_correct / n_seen

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print a summary line.

    Returns:
        ``(test_loss, accuracy)``: the summed cross-entropy divided by the
        dataset size, and the accuracy in percent.
    """
    model.eval()
    n_samples = len(test_loader.dataset)
    loss_total = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            # reduction='sum' so the final division by the dataset size gives a per-sample mean.
            loss_total += F.cross_entropy(logits, labels, reduction='sum').item()
            predicted = logits.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()
    test_loss = loss_total / n_samples
    accuracy = 100. * n_correct / n_samples
    print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {n_correct}/{n_samples} ({accuracy:.2f}%)\n")
    return test_loss, accuracy

In [None]:
# Training Configuration
print("="*70)
print("TRAINING CONFIGURATION - SESSION 05 (CONSERVATIVE)")
print("="*70)
print(f"Model: WideResNet-28-10 (36.5M parameters)")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Dropout: 0.3 → 0.2 (epoch 61)")
print(f"Augmentation: Full (1-60) → Reduced (61-100)")
print(f"MixUp: 0.2 (1-60) → 0.15 (61-100)")
print(f"Label Smoothing: 0.1 (constant)")
print(f"LR Schedule: Phase1 (1-50) T_0=25, Phase2 (51-100) CosineDecay")
print(f"Max LR: 0.1 (same as Session-02)")
print(f"Storage: GDrive (all) + HuggingFace (every 10 epochs)")
print("="*70)

EPOCHS = 100
WARMUP_EPOCHS = 5
INITIAL_LR = 0.01
MAX_LR = 0.1  # Conservative - same as Session-02
MIN_LR = 1e-4

# train() advances the LR scheduler once per batch, so scheduler periods must be
# expressed in batches, not epochs.
STEPS_PER_EPOCH = len(train_loader)

# The optimizer is created at MAX_LR because PyTorch schedulers record the
# optimizer's learning rate at construction as their base (peak) rate. Building
# the scheduler while the rate was INITIAL_LR capped every cosine cycle at 0.01.
optimizer = optim.SGD(model.parameters(), lr=MAX_LR, momentum=0.9, weight_decay=1e-3)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=25 * STEPS_PER_EPOCH, T_mult=1, eta_min=MIN_LR)

# Drop back to INITIAL_LR so the very first batches use the warmup start rate;
# the warmup scheduler then ramps the rate towards MAX_LR.
for param_group in optimizer.param_groups:
    param_group['lr'] = INITIAL_LR
warmup_scheduler = WarmupScheduler(optimizer, WARMUP_EPOCHS, INITIAL_LR, MAX_LR, STEPS_PER_EPOCH)
scaler = GradScaler()

# Tracking (one entry appended per epoch)
train_losses, test_losses = [], []
train_accuracies, test_accuracies = [], []
learning_rates = []
dropout_history = []
mixup_history = []
aug_phase_history = []

# Early-stopping state and local checkpoint directory
best_test_acc = 0.0
patience = 15
patience_counter = 0
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Training Loop with Conservative Progressive Strategies
# Per epoch: apply scheduled phase transitions, train, evaluate, record metrics,
# checkpoint (local + Google Drive), upload periodically to HuggingFace, then
# check the early-stopping and target-accuracy exit conditions.
print("\n" + "="*70)
print("STARTING TRAINING")
print("="*70 + "\n")

current_aug_phase = 'full'
current_dropout = 0.3
current_mixup_alpha = 0.2
current_label_smoothing = 0.1
current_use_mixup = True

for epoch in range(1, EPOCHS + 1):
    # === PHASE TRANSITIONS (Conservative) ===

    # LR scheduler transition (epoch 51)
    if epoch == 51:
        print(f"\n{'='*70}")
        print(f"📍 LR SCHEDULER TRANSITION AT EPOCH {epoch}")
        print(f"   CosineAnnealingWarmRestarts → CosineAnnealingLR")
        print(f"{'='*70}\n")
        # train() steps the scheduler once per batch, so T_max is given in
        # batches covering all remaining epochs. With T_max=50 the cosine
        # would complete within 50 batches and then climb back up.
        remaining_steps = (EPOCHS - epoch + 1) * len(train_loader)
        scheduler = CosineAnnealingLR(optimizer, T_max=remaining_steps, eta_min=1e-5)

    # Combined transition at epoch 61
    if epoch == 61:
        print(f"\n{'='*70}")
        print(f"📍 MAJOR PHASE TRANSITION AT EPOCH {epoch}")
        print(f"   • Augmentation: Full → Reduced")
        print(f"   • Dropout: 0.3 → 0.2")
        print(f"   • MixUp: 0.2 → 0.15")
        print(f"{'='*70}\n")

        # Update augmentation
        current_aug_phase = 'reduced'
        train_transforms = ProgressiveAlbumentationsTransforms(mean=cifar100_mean, std=cifar100_std, phase=current_aug_phase)
        train_dataset.transform = train_transforms

        # Update dropout
        current_dropout = 0.2
        update_model_dropout(model, current_dropout)

        # Update MixUp
        current_mixup_alpha = 0.15

    # === TRAINING ===
    train_loss, train_acc = train(
        model, device, train_loader, optimizer, scheduler, warmup_scheduler, scaler, epoch,
        use_mixup=current_use_mixup, mixup_alpha=current_mixup_alpha, label_smoothing=current_label_smoothing
    )

    test_loss, test_acc = test(model, device, test_loader)

    # Record metrics
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)
    learning_rates.append(optimizer.param_groups[0]['lr'])
    dropout_history.append(current_dropout)
    mixup_history.append(current_mixup_alpha if current_use_mixup else 0.0)
    aug_phase_history.append(current_aug_phase)

    # Save checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'train_loss': train_loss,
        'test_loss': test_loss,
        'timestamp': datetime.now().isoformat(),
        'config': {
            'dropout': current_dropout,
            'mixup_alpha': current_mixup_alpha if current_use_mixup else 0.0,
            'label_smoothing': current_label_smoothing,
            'aug_phase': current_aug_phase
        }
    }
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch{epoch}.pth')
    torch.save(checkpoint, checkpoint_path)

    # Update best-accuracy bookkeeping before anything reports it, so the
    # uploaded metrics carry the current best rather than the previous one.
    is_best = test_acc > best_test_acc
    if is_best:
        best_test_acc = test_acc
        patience_counter = 0
    else:
        patience_counter += 1

    # Save to Google Drive every epoch
    gdrive_manager.save(checkpoint_path, epoch, is_best=is_best)

    # Upload to HuggingFace every 10 epochs
    if epoch in HF_UPLOAD_EPOCHS:
        print(f"\n📤 Uploading to HuggingFace (epoch {epoch})...")
        upload_to_huggingface(checkpoint_path, f'checkpoint_epoch{epoch}.pth', f"Epoch {epoch}: {test_acc:.2f}%")

        # Upload metrics
        metrics = {
            'epochs': list(range(1, epoch + 1)),
            'train_losses': train_losses,
            'test_losses': test_losses,
            'train_accuracies': train_accuracies,
            'test_accuracies': test_accuracies,
            'learning_rates': learning_rates,
            'dropout_history': dropout_history,
            'mixup_history': mixup_history,
            'aug_phase_history': aug_phase_history,
            'best_test_accuracy': best_test_acc
        }
        metrics_path = os.path.join(checkpoint_dir, 'metrics.json')
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2)
        upload_to_huggingface(metrics_path, 'metrics.json', f"Metrics update (epoch {epoch})")

    # Best model handling
    if is_best:
        print(f"\n*** NEW BEST MODEL! Test Accuracy: {best_test_acc:.2f}% ***")

        # Upload best model to HF immediately
        best_path = os.path.join(checkpoint_dir, 'best_model.pth')
        torch.save(checkpoint, best_path)
        upload_to_huggingface(best_path, 'best_model.pth', f"New best: {best_test_acc:.2f}% (epoch {epoch})")

    # Early stopping
    if patience_counter >= patience:
        print(f"\nEarly stopping triggered at epoch {epoch}")
        break

    # Target reached
    if test_acc >= 74.0:
        print(f"\n{'='*70}")
        print(f"TARGET REACHED! Test Accuracy: {test_acc:.2f}% at epoch {epoch}")
        print(f"{'='*70}")
        break

    print(f"Best: {best_test_acc:.2f}% | Patience: {patience_counter}/{patience}\n")

print(f"\n{'='*70}")
print(f"TRAINING COMPLETED")
print(f"Best Test Accuracy: {best_test_acc:.2f}%")
print(f"{'='*70}")

In [None]:
# Plot Training Curves
# Six panels: accuracy, loss, learning rate, dropout, MixUp alpha and train-test gap.
# Curves are plotted against 1-based epoch numbers so the x-axis matches the
# epoch numbering used in the training loop (the histories hold one entry per epoch).
epochs_axis = range(1, len(train_accuracies) + 1)
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Accuracy
axes[0, 0].plot(epochs_axis, train_accuracies, label='Train', linewidth=2)
axes[0, 0].plot(epochs_axis, test_accuracies, label='Test', linewidth=2)
axes[0, 0].axhline(y=74, color='r', linestyle='--', label='Target (74%)')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Accuracy (%)')
axes[0, 0].set_title('Accuracy')
axes[0, 0].legend()
axes[0, 0].grid(True)

# Loss
axes[0, 1].plot(epochs_axis, train_losses, label='Train', linewidth=2)
axes[0, 1].plot(epochs_axis, test_losses, label='Test', linewidth=2)
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].set_title('Loss')
axes[0, 1].legend()
axes[0, 1].grid(True)

# Train-Test Gap
accuracy_gap = [train - test for train, test in zip(train_accuracies, test_accuracies)]

# Single-series panels share the same layout, so they are drawn from a table:
# (axis, series, colour, y-label, title)
single_series_panels = [
    (axes[0, 2], learning_rates, 'green', 'Learning Rate', 'Learning Rate Schedule'),
    (axes[1, 0], dropout_history, 'purple', 'Dropout Rate', 'Progressive Dropout'),
    (axes[1, 1], mixup_history, 'orange', 'MixUp Alpha', 'Progressive MixUp'),
    (axes[1, 2], accuracy_gap, 'red', 'Accuracy Gap (%)', 'Train-Test Gap (Overfitting Indicator)'),
]
for ax, series, colour, y_label, title in single_series_panels:
    ax.plot(epochs_axis, series, linewidth=2, color=colour)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)

plt.tight_layout()
curves_path = os.path.join(checkpoint_dir, 'training_curves.png')
plt.savefig(curves_path, dpi=300, bbox_inches='tight')
plt.show()

upload_to_huggingface(curves_path, 'training_curves.png', "Training curves")

print(f"\nFinal Statistics:")
print(f"Best Test Accuracy: {best_test_acc:.2f}%")
print(f"Final Train Accuracy: {train_accuracies[-1]:.2f}%")
print(f"Final Test Accuracy: {test_accuracies[-1]:.2f}%")
print(f"Final Train-Test Gap: {accuracy_gap[-1]:.2f}%")
print(f"\nGoogle Drive checkpoints: {len(gdrive_manager.list_checkpoints())} files")

In [None]:
# Load best model and final evaluation
# map_location=device remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU runtime can still be loaded on a CPU-only one.
best_checkpoint = torch.load(os.path.join(checkpoint_dir, 'best_model.pth'), map_location=device)
model.load_state_dict(best_checkpoint['model_state_dict'])
print(f"Loaded best model from epoch {best_checkpoint['epoch']}")
print(f"Best test accuracy: {best_checkpoint['test_accuracy']:.2f}%")

# Re-evaluate the restored weights on the test set.
test_loss, test_acc = test(model, device, test_loader)
print(f"\nFinal evaluation: {test_acc:.2f}%")