# CIFAR-100 Training - Optimized for 74% Accuracy with HuggingFace Integration

## Key Improvements:
- Fixed data augmentation (proper Cutout implementation)
- Cosine annealing scheduler with warmup
- Optimized hyperparameters (batch size 256, MixUp alpha 0.2)
- Label smoothing (0.1)
- Gradient clipping
- Mixed precision training
- Model checkpointing and early stopping
- WideResNet-28-10 (36.5M parameters)
- **HuggingFace Hub integration with checkpoint uploads at breakpoints**

In [None]:
# Import Libraries
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn as nn
from torchsummary import summary
from torch.cuda.amp import autocast, GradScaler
import matplotlib.pyplot as plt
import os
import json
from datetime import datetime

# Per-channel normalization statistics for CIFAR-100 (one value per image channel).
cifar100_mean = (0.5071, 0.4865, 0.4409)
cifar100_std = (0.2673, 0.2564, 0.2761)

# Run on CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
# HuggingFace Setup - Get token from Colab secrets
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("✓ HuggingFace token retrieved from Colab secrets")
except Exception as e:
    print(f"Warning: Could not retrieve HF_TOKEN from secrets: {e}")
    print("Please add HF_TOKEN to Colab secrets for model upload")
    HF_TOKEN = None

# Install huggingface_hub if needed
try:
    from huggingface_hub import HfApi, create_repo, upload_file
except ImportError:
    !pip install -q huggingface_hub
    from huggingface_hub import HfApi, create_repo, upload_file

# Configure HuggingFace repository
REPO_ID = 'pandurangpatil/cifar100-wideresnet-session8'
api = HfApi()

# Login and create repository
if HF_TOKEN:
    try:
        api.set_access_token(HF_TOKEN)
        create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)
        print(f"✓ Repository ready: https://huggingface.co/{REPO_ID}")
    except Exception as e:
        print(f"Warning: Could not create repository: {e}")
else:
    print("⚠ HuggingFace upload will be skipped (no token available)")

In [None]:
# Improved Data Augmentation
class ImprovedAlbumentationsTransforms:
    """Callable training transform backed by an Albumentations pipeline.

    The pipeline applies, in order: horizontal flip, shift/scale/rotate,
    a single 8x8 coarse-dropout hole (Cutout), brightness/contrast jitter,
    hue/saturation/value jitter, normalization with the given statistics,
    and conversion to a tensor.
    """

    def __init__(self, mean, std):
        """Build the pipeline; `mean` and `std` are forwarded to `A.Normalize`."""
        steps = [
            A.HorizontalFlip(p=0.5),
            A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=0.5),
            A.CoarseDropout(max_holes=1, max_height=8, max_width=8, p=0.5, fill_value=0),  # Cutout
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3),
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.3),
            A.Normalize(mean=mean, std=std),
            ToTensorV2(),
        ]
        self.aug = A.Compose(steps)

    def __call__(self, img):
        """Convert `img` to a NumPy array, run the pipeline, return its "image" output."""
        augmented = self.aug(image=np.array(img))
        return augmented["image"]

# Instantiate transforms
train_transforms = ImprovedAlbumentationsTransforms(mean=cifar100_mean, std=cifar100_std)

# Evaluation uses no augmentation: tensor conversion and normalization only.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=cifar100_mean, std=cifar100_std)
])

# CIFAR-100 datasets (downloaded into ./data on first use)
train_dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=train_transforms)
test_dataset = datasets.CIFAR100(root='./data', train=False, download=True, transform=test_transforms)

# DataLoaders with batch size 256; only the training loader shuffles.
BATCH_SIZE = 256
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# Reuse the class names from the dataset that is already loaded instead of
# constructing (and re-reading from disk) a third CIFAR100 instance.
cifar100_classes = test_dataset.classes
print(f"Training batches: {len(train_loader)}, Test batches: {len(test_loader)}")

In [None]:
# Visualize augmented samples
import matplotlib.pyplot as plt
import numpy as np

# Grab one augmented training batch and make sure it lives on the CPU for plotting.
batch_data, batch_label = next(iter(train_loader))
batch_data = batch_data.cpu().detach()

fig = plt.figure(figsize=(12, 8))
for i in range(12):
    # Channel-first tensor -> channel-last array, as expected by imshow.
    img = np.transpose(batch_data[i].numpy(), (1, 2, 0))
    # Undo the normalization, then clamp to the displayable [0, 1] range.
    img = np.clip(img * np.array(cifar100_std) + np.array(cifar100_mean), 0, 1)

    # Place the sample in a 3x4 grid (subplot indices are 1-based).
    plt.subplot(3, 4, i + 1)
    plt.imshow(img)
    plt.title(f"Class: {cifar100_classes[batch_label[i]]}")
    plt.axis('off')

plt.tight_layout()
plt.savefig('sample_augmentations.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# WideResNet Architecture
class BasicBlock(nn.Module):
    """Pre-activation residual block: (BN -> ReLU -> 3x3 conv) twice plus a shortcut.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride of the first 3x3 convolution (and of the shortcut conv).
        dropRate: Dropout probability applied between the two convolutions;
            0 disables dropout.

    When `in_planes == out_planes` the shortcut is the identity on the raw
    input. Otherwise a 1x1 convolution (with the same stride) is applied to
    the BN-ReLU-activated input so the shapes match for the residual sum.
    """

    def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
        super().__init__()
        self.equalInOut = in_planes == out_planes
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.dropRate = dropRate
        # Explicit conditional instead of the fragile `cond and a or b` idiom.
        self.shortcut = (
            None
            if self.equalInOut
            else nn.Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)
        )

    def forward(self, x):
        """Return the block output for input `x`."""
        # The in-place ReLU acts on the fresh BatchNorm output, so `x` itself
        # is left untouched and can still serve as the identity shortcut.
        activated = self.relu1(self.bn1(x))
        residual = x if self.equalInOut else self.shortcut(activated)

        out = self.conv1(activated)
        out = self.relu2(self.bn2(out))
        if self.dropRate > 0:
            out = F.dropout(out, p=self.dropRate, training=self.training)
        out = self.conv2(out)
        return out + residual


class NetworkBlock(nn.Module):
    """A sequential stack of `nb_layers` residual blocks.

    Args:
        nb_layers: Number of blocks in the stack.
        in_planes: Input channels of the first block.
        out_planes: Output channels of every block (and input channels of
            every block after the first).
        block: Block class, called as `block(in_planes, out_planes, stride, dropRate)`.
        stride: Stride of the first block; later blocks use stride 1.
        dropRate: Dropout probability forwarded to every block.
    """

    def __init__(self, nb_layers, in_planes, out_planes, block, stride, dropRate=0.0):
        super().__init__()
        self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, dropRate)

    def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, dropRate):
        """Build the `nn.Sequential` holding the blocks."""
        layers = []
        for i in range(nb_layers):
            is_first = i == 0
            # Conditional expressions rather than `cond and a or b`, which
            # silently swaps in the fallback whenever `a` is falsy.
            layers.append(block(in_planes if is_first else out_planes,
                                out_planes,
                                stride if is_first else 1,
                                dropRate))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through all blocks in order."""
        return self.layer(x)


class WideResNet(nn.Module):
    """Wide residual network classifier built from three `NetworkBlock` groups.

    A 3x3 stem convolution produces 16 channels, followed by three groups of
    `(depth - 4) // 6` `BasicBlock`s with widths 16k, 32k and 64k
    (k = `widen_factor`); the second and third groups use stride 2. A final
    BatchNorm-ReLU, global average pooling and a linear layer produce
    `num_classes` logits.

    `depth` must satisfy `(depth - 4) % 6 == 0`; this is checked with an assert.
    """

    def __init__(self, depth=28, num_classes=100, widen_factor=10, dropRate=0.3):
        super().__init__()
        widths = [16] + [base * widen_factor for base in (16, 32, 64)]
        assert ((depth - 4) % 6 == 0)
        blocks_per_group = (depth - 4) // 6

        self.conv1 = nn.Conv2d(3, widths[0], kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.block1 = NetworkBlock(blocks_per_group, widths[0], widths[1], BasicBlock, 1, dropRate)
        self.block2 = NetworkBlock(blocks_per_group, widths[1], widths[2], BasicBlock, 2, dropRate)
        self.block3 = NetworkBlock(blocks_per_group, widths[2], widths[3], BasicBlock, 2, dropRate)
        self.bn1 = nn.BatchNorm2d(widths[3])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(widths[3], num_classes)
        # Kept as an attribute because forward() uses it to flatten the pooled features.
        self.nChannels = widths[3]

        # Weight initialization: Kaiming-normal (fan_out) for convolutions,
        # unit scale / zero shift for BatchNorm, Xavier-uniform with zero bias
        # for the classifier.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for the image batch `x`."""
        features = self.block3(self.block2(self.block1(self.conv1(x))))
        features = self.relu(self.bn1(features))
        pooled = F.adaptive_avg_pool2d(features, 1)
        return self.fc(pooled.view(-1, self.nChannels))

In [None]:
# Build WideResNet-28-10 for 100 classes, move it to the selected device,
# and print a per-layer summary for a 3x32x32 input.
model = WideResNet(num_classes=100, depth=28, widen_factor=10, dropRate=0.3)
model = model.to(device)
summary(model, input_size=(3, 32, 32))

In [None]:
# MixUp function
def mixup_data(x, y, alpha=0.2, device=None):
    """Apply MixUp to a batch.

    Args:
        x: Input batch; the first dimension is the batch dimension.
        y: Targets, indexable along the batch dimension.
        alpha: Beta-distribution parameter. When `alpha <= 0` no mixing is
            done (`lam` is fixed to 1.0).
        device: Device for the permutation index. Defaults to `x.device`, so
            the function also works for CPU batches and on machines without
            CUDA (the previous default of 'cuda' failed there).

    Returns:
        `(mixed_x, y_a, y_b, lam)` where `mixed_x = lam * x + (1 - lam) * x[perm]`,
        `y_a` are the original targets and `y_b` the permuted targets.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    batch_size = x.size(0)
    index_device = x.device if device is None else device
    # The permutation is still drawn on the CPU and then moved, so the random
    # stream is the same as before for a given seed.
    index = torch.randperm(batch_size).to(index_device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

In [None]:
# Learning Rate Warmup Scheduler
class WarmupScheduler:
    """Linear learning-rate warmup applied directly to an optimizer.

    Each `step()` during warmup writes
    `initial_lr + (target_lr - initial_lr) * current_step / warmup_steps`
    into every parameter group, then advances the step counter. Once
    `warmup_steps` (= `warmup_epochs * steps_per_epoch`) calls have been made,
    `step()` only advances the counter and leaves the learning rate alone.
    """

    def __init__(self, optimizer, warmup_epochs, initial_lr, target_lr, steps_per_epoch):
        self.optimizer = optimizer
        self.warmup_steps = warmup_epochs * steps_per_epoch
        self.initial_lr = initial_lr
        self.target_lr = target_lr
        self.current_step = 0

    def step(self):
        """Set the warmup learning rate for the current step (if any) and advance."""
        if self.is_warmup():
            lr = self.initial_lr + (self.target_lr - self.initial_lr) * self.current_step / self.warmup_steps
            for group in self.optimizer.param_groups:
                group['lr'] = lr
        self.current_step = self.current_step + 1

    def is_warmup(self):
        """Return True while fewer than `warmup_steps` steps have been taken."""
        return self.warmup_steps > self.current_step

In [None]:
# HuggingFace Upload Functions
def upload_to_huggingface(file_path, path_in_repo, repo_id=REPO_ID, commit_message="Upload checkpoint"):
    """Upload one local file to the HuggingFace model repository.

    Best-effort: when no token is configured the upload is skipped, and any
    exception raised by the Hub client is reported on stdout instead of being
    propagated. Always returns None.
    """
    if HF_TOKEN:
        upload_kwargs = {
            "path_or_fileobj": file_path,
            "path_in_repo": path_in_repo,
            "repo_id": repo_id,
            "repo_type": "model",
            "token": HF_TOKEN,
            "commit_message": commit_message,
        }
        try:
            api.upload_file(**upload_kwargs)
            print(f"✓ Uploaded: {path_in_repo}")
        except Exception as e:
            # An upload failure must not interrupt training.
            print(f"✗ Upload failed for {path_in_repo}: {e}")
    else:
        print("⚠ Skipping upload (no HF token)")

def save_and_upload_checkpoint(model, optimizer, epoch, train_acc, test_acc, train_loss, test_loss, 
                                checkpoint_dir, checkpoint_name, is_best=False):
    """Save a training checkpoint locally and upload it to HuggingFace.

    Args:
        model: Model whose `state_dict()` is stored.
        optimizer: Optimizer whose `state_dict()` is stored.
        epoch: Epoch number recorded in the checkpoint and the commit message.
        train_acc, test_acc: Accuracies (percent) recorded in the checkpoint.
        train_loss, test_loss: Losses recorded in the checkpoint.
        checkpoint_dir: Directory the file is written to; created if missing.
        checkpoint_name: File name, also used as the path inside the repo.
        is_best: When True the commit message is prefixed with "New best model! ".

    Returns:
        The local path of the written checkpoint file.
    """
    # Create the target directory on demand so a direct call cannot fail with
    # FileNotFoundError (an empty directory string means "current directory").
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    
    # Save checkpoint with metadata
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'train_loss': train_loss,
        'test_loss': test_loss,
        'timestamp': datetime.now().isoformat(),
        # NOTE: apart from batch_size these are literal values, not read from
        # the live training objects; keep them in sync with the training setup.
        'config': {
            'model': 'WideResNet-28-10',
            'batch_size': BATCH_SIZE,
            'mixup_alpha': 0.2,
            'label_smoothing': 0.1,
            'weight_decay': 1e-3,
            'dropout': 0.3
        }
    }
    
    torch.save(checkpoint, checkpoint_path)
    
    # Upload to HuggingFace
    commit_msg = f"Epoch {epoch}: Test Acc {test_acc:.2f}%"
    if is_best:
        commit_msg = "New best model! " + commit_msg
    
    upload_to_huggingface(checkpoint_path, checkpoint_name, commit_message=commit_msg)
    
    return checkpoint_path

def create_and_upload_model_card(best_acc, total_epochs, train_accuracies, test_accuracies, checkpoint_dir):
    """Create the README.md model card, save it locally and upload it.

    Args:
        best_acc: Best test accuracy (percent) reported in the card.
        total_epochs: Number of epochs trained, reported in the card.
        train_accuracies: Per-epoch train accuracies; the last entry is
            reported as the final train accuracy (must be non-empty).
        test_accuracies: Per-epoch test accuracies; the last entry is
            reported as the final test accuracy (must be non-empty).
        checkpoint_dir: Directory in which README.md is written.

    The hyperparameters and checkpoint list in the card are literal text, not
    derived from the training objects.
    """
    model_card = f"""---
tags:
- image-classification
- cifar100
- wideresnet
- pytorch
datasets:
- cifar100
metrics:
- accuracy
---

# CIFAR-100 WideResNet-28-10

## Model Description

WideResNet-28-10 trained on CIFAR-100 dataset with advanced augmentation techniques.

### Model Architecture
- **Architecture**: WideResNet-28-10
- **Parameters**: 36.5M
- **Depth**: 28 layers
- **Width Factor**: 10
- **Dropout**: 0.3

### Training Configuration
- **Batch Size**: 256
- **Optimizer**: SGD (momentum=0.9, weight_decay=1e-3)
- **Learning Rate**: Cosine annealing with warmup (0.01→0.1, min=1e-4)
- **Scheduler**: CosineAnnealingWarmRestarts (T_0=25)
- **Augmentation**: HorizontalFlip, ShiftScaleRotate, Cutout, ColorJitter
- **MixUp**: Alpha=0.2
- **Label Smoothing**: 0.1
- **Mixed Precision**: Enabled
- **Gradient Clipping**: 1.0

### Performance
- **Best Test Accuracy**: {best_acc:.2f}%
- **Total Epochs Trained**: {total_epochs}
- **Final Train Accuracy**: {train_accuracies[-1]:.2f}%
- **Final Test Accuracy**: {test_accuracies[-1]:.2f}%

### Available Checkpoints
- `best_model.pth` - Best performing model
- `checkpoint_epoch10.pth` - Epoch 10 checkpoint
- `checkpoint_epoch25.pth` - Epoch 25 checkpoint (end of cycle 1)
- `checkpoint_epoch50.pth` - Epoch 50 checkpoint (mid-training)
- `checkpoint_epoch75.pth` - Epoch 75 checkpoint (late training)
- `final_model.pth` - Final epoch model

### Usage

```python
import torch
from huggingface_hub import hf_hub_download

# Download model
checkpoint_path = hf_hub_download(
    repo_id="{REPO_ID}",
    filename="best_model.pth"
)

# Load checkpoint
checkpoint = torch.load(checkpoint_path, map_location='cpu')

# Load model (define WideResNet class first)
model = WideResNet(depth=28, widen_factor=10, num_classes=100)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
```

### Training Details
- **Dataset**: CIFAR-100 (50,000 train, 10,000 test)
- **Classes**: 100
- **Image Size**: 32×32
- **Normalization**: mean=(0.5071, 0.4865, 0.4409), std=(0.2673, 0.2564, 0.2761)

### Files
- `training_curves.png` - Training/test accuracy and loss curves
- `metrics.json` - Complete training history
- `config.json` - Hyperparameter configuration

### License
MIT

### Citation
```bibtex
@misc{{wideresnet-cifar100,
  author = {{Pandurang Patil}},
  title = {{CIFAR-100 WideResNet-28-10}},
  year = {{2025}},
  publisher = {{HuggingFace}},
  url = {{https://huggingface.co/{REPO_ID}}}
}}
```
"""
    
    # Save README locally. The card contains non-ASCII characters ("→", "×"),
    # so the encoding is pinned to UTF-8 instead of the platform default,
    # which may be unable to encode them.
    readme_path = os.path.join(checkpoint_dir, 'README.md')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(model_card)
    
    # Upload to HuggingFace
    upload_to_huggingface(readme_path, 'README.md', commit_message="Update model card")
    print("✓ Model card created and uploaded")

In [None]:
# Training function with mixed precision and gradient clipping
def train(model, device, train_loader, optimizer, scheduler, warmup_scheduler, scaler, epoch, 
          use_mixup=True, mixup_alpha=0.2, label_smoothing=0.1):
    """Train `model` for one epoch with mixed precision and gradient clipping.

    Args:
        model: Network to train (switched to train mode).
        device: Device that each batch is moved to.
        train_loader: Iterable of `(data, target)` batches that supports `len()`.
        optimizer: Optimizer stepped once per batch through `scaler`.
        scheduler: Main LR scheduler, stepped once per batch after warmup.
        warmup_scheduler: Object with `is_warmup()` / `step()`; it owns the
            learning rate while `is_warmup()` is true.
        scaler: `GradScaler` used for loss scaling.
        epoch: 1-based epoch number (used for the progress bar and to position
            the cosine schedule).
        use_mixup: Whether to apply MixUp to every batch.
        mixup_alpha: Beta parameter forwarded to `mixup_data`.
        label_smoothing: Label-smoothing factor for the cross-entropy loss.

    Returns:
        `(avg_loss, accuracy)`: mean per-batch loss and accuracy in percent.
        With MixUp the accuracy is the lam-weighted match against both
        target sets, so it is not directly comparable to test accuracy.
    """
    model.train()
    pbar = tqdm(train_loader)
    num_batches = len(train_loader)
    correct = 0
    processed = 0
    total_loss = 0

    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        # Mixed precision training
        with autocast():
            if use_mixup:
                inputs, targets_a, targets_b, lam = mixup_data(data, target, alpha=mixup_alpha, device=device)
                outputs = model(inputs)
                loss = lam * F.cross_entropy(outputs, targets_a, label_smoothing=label_smoothing) + \
                       (1 - lam) * F.cross_entropy(outputs, targets_b, label_smoothing=label_smoothing)
            else:
                outputs = model(data)
                loss = F.cross_entropy(outputs, target, label_smoothing=label_smoothing)

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        
        # Gradients must be unscaled before clipping so max_norm applies to
        # their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        scaler.step(optimizer)
        scaler.update()
        
        # Update learning rate
        if warmup_scheduler.is_warmup():
            warmup_scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            # This function runs once per batch, so a bare `scheduler.step()`
            # would make T_0 count iterations instead of epochs and restart
            # the cosine cycle every T_0 batches. Passing the fractional epoch
            # keeps T_0 in epochs; the value set here applies to the next
            # batch, hence `batch_idx + 1`.
            scheduler.step(max(epoch - 1, 0) + (batch_idx + 1) / num_batches)
        else:
            scheduler.step()

        # Accuracy tracking
        _, pred = outputs.max(1)
        if use_mixup:
            correct += lam * pred.eq(targets_a).sum().item() + (1 - lam) * pred.eq(targets_b).sum().item()
        else:
            correct += pred.eq(target).sum().item()
        processed += len(data)
        # Read the loss from the device once per batch.
        loss_value = loss.item()
        total_loss += loss_value

        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_description(f"Epoch {epoch} Loss={loss_value:.4f} Acc={100*correct/processed:.2f}% LR={current_lr:.6f}")

    avg_loss = total_loss / len(train_loader)
    accuracy = 100. * correct / processed
    return avg_loss, accuracy


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)\n")
    return test_loss, accuracy

In [None]:
# Training Configuration
print("=" * 70)
print("Training Configuration:")
print("=" * 70)
print(f"Model: WideResNet-28-10 (36.5M parameters)")
print(f"Batch Size: {BATCH_SIZE}")
print(f"MixUp Alpha: 0.2")
print(f"Label Smoothing: 0.1")
print(f"Weight Decay: 1e-3")
print(f"Gradient Clipping: 1.0")
print(f"Scheduler: CosineAnnealingWarmRestarts")
print(f"Warmup: 5 epochs (0.01 -> 0.1)")
print(f"Mixed Precision: Enabled")
print(f"HuggingFace Upload: {'Enabled' if HF_TOKEN else 'Disabled'}")
print("=" * 70)

# Optimizer and Scheduler
EPOCHS = 100
WARMUP_EPOCHS = 5
INITIAL_LR = 0.01
MAX_LR = 0.1
MIN_LR = 1e-4

# The optimizer is created at MAX_LR because CosineAnnealingWarmRestarts takes
# its base (peak) learning rate from the optimizer's LR at construction time.
# Creating it at INITIAL_LR made every cosine cycle peak at 0.01, so the LR
# collapsed back to 0.01 right after warmup had ramped it up to 0.1.
optimizer = optim.SGD(model.parameters(), lr=MAX_LR, momentum=0.9, weight_decay=1e-3)

# Cosine annealing with warm restarts. T_0 is measured in whatever unit drives
# scheduler.step(): it is 25 epochs only if step() is called once per epoch or
# with fractional epoch values.
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=25, T_mult=1, eta_min=MIN_LR)

# The scheduler has now recorded MAX_LR as its base LR; drop the live LR to
# INITIAL_LR so the very first batch already runs at the warmup start value.
for param_group in optimizer.param_groups:
    param_group['lr'] = INITIAL_LR

# Warmup scheduler
warmup_scheduler = WarmupScheduler(optimizer, WARMUP_EPOCHS, INITIAL_LR, MAX_LR, len(train_loader))

# Gradient scaler for mixed precision
scaler = GradScaler()

# Tracking variables
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []
learning_rates = []

# Early stopping and checkpointing
best_test_acc = 0.0
patience = 15
patience_counter = 0
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Define checkpoint breakpoints
CHECKPOINT_EPOCHS = [10, 20, 25, 30, 40, 50, 60, 75, 90]

In [None]:
# Training Loop with HuggingFace Checkpoint Uploads
print("\nStarting training...\n")

for epoch in range(1, EPOCHS + 1):
    # Train
    train_loss, train_acc = train(
        model, device, train_loader, optimizer, scheduler, warmup_scheduler, scaler, epoch,
        use_mixup=True, mixup_alpha=0.2, label_smoothing=0.1
    )
    
    # Test
    test_loss, test_acc = test(model, device, test_loader)
    
    # Record metrics
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)
    learning_rates.append(optimizer.param_groups[0]['lr'])
    
    # Save best model
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        patience_counter = 0
        
        # Save and upload best model
        print(f"*** New best model! Test Accuracy: {best_test_acc:.2f}% ***")
        save_and_upload_checkpoint(
            model, optimizer, epoch, train_acc, test_acc, train_loss, test_loss,
            checkpoint_dir, 'best_model.pth', is_best=True
        )
    else:
        patience_counter += 1
    
    # Upload checkpoint at breakpoints
    if epoch in CHECKPOINT_EPOCHS:
        checkpoint_name = f'checkpoint_epoch{epoch}.pth'
        print(f"📍 Breakpoint checkpoint at epoch {epoch}")
        save_and_upload_checkpoint(
            model, optimizer, epoch, train_acc, test_acc, train_loss, test_loss,
            checkpoint_dir, checkpoint_name
        )
    
    # Decide up front whether this is the last epoch that will run, so that
    # metrics.json is always refreshed before the loop exits. Previously an
    # early stop or target hit on a non-checkpoint epoch left the uploaded
    # metrics file missing the most recent epochs.
    early_stop = patience_counter >= patience
    target_reached = test_acc >= 74.0
    is_last_epoch = early_stop or target_reached or epoch == EPOCHS
    
    # Save metrics periodically (and on the final epoch)
    if epoch % 10 == 0 or epoch in CHECKPOINT_EPOCHS or is_last_epoch:
        metrics = {
            'epochs': list(range(1, epoch + 1)),
            'train_losses': train_losses,
            'test_losses': test_losses,
            'train_accuracies': train_accuracies,
            'test_accuracies': test_accuracies,
            'learning_rates': learning_rates,
            'best_test_accuracy': best_test_acc
        }
        metrics_path = os.path.join(checkpoint_dir, 'metrics.json')
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=2)
        
        upload_to_huggingface(metrics_path, 'metrics.json', commit_message=f"Update metrics (epoch {epoch})")
    
    # Early stopping
    if early_stop:
        print(f"\nEarly stopping triggered at epoch {epoch}. No improvement for {patience} epochs.")
        break
    
    # Check if target reached
    if target_reached:
        print(f"\n{'=' * 70}")
        print(f"Target accuracy of 74% reached at epoch {epoch}!")
        print(f"Final test accuracy: {test_acc:.2f}%")
        print(f"{'=' * 70}")
        break
    
    print(f"Best Test Accuracy so far: {best_test_acc:.2f}% | Patience: {patience_counter}/{patience}\n")

# Save final model (the weights after the last epoch that ran, which is not
# necessarily the best-scoring epoch).
print("\n📦 Saving final model...")
save_and_upload_checkpoint(
    model, optimizer, epoch, train_acc, test_acc, train_loss, test_loss,
    checkpoint_dir, 'final_model.pth'
)

print(f"\nTraining completed. Best test accuracy: {best_test_acc:.2f}%")

In [None]:
# Plot training curves on a 2x2 grid: loss, accuracy, learning rate, train-test gap.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
loss_ax, acc_ax = axes[0, 0], axes[0, 1]
lr_ax, gap_ax = axes[1, 0], axes[1, 1]

# Top-left: per-epoch loss for both splits.
loss_ax.plot(train_losses, label='Train Loss', linewidth=2)
loss_ax.plot(test_losses, label='Test Loss', linewidth=2)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Test Loss')
loss_ax.legend()
loss_ax.grid(True)

# Top-right: per-epoch accuracy with the 74% target drawn as a dashed line.
acc_ax.plot(train_accuracies, label='Train Accuracy', linewidth=2)
acc_ax.plot(test_accuracies, label='Test Accuracy', linewidth=2)
acc_ax.axhline(y=74, color='r', linestyle='--', label='Target (74%)')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_title('Training and Test Accuracy')
acc_ax.legend()
acc_ax.grid(True)

# Bottom-left: learning rate recorded at the end of each epoch.
lr_ax.plot(learning_rates, linewidth=2, color='green')
lr_ax.set_xlabel('Epoch')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate Schedule')
lr_ax.grid(True)

# Bottom-right: train minus test accuracy per epoch (overfitting indicator).
accuracy_gap = [tr_acc - te_acc for tr_acc, te_acc in zip(train_accuracies, test_accuracies)]
gap_ax.plot(accuracy_gap, linewidth=2, color='orange')
gap_ax.set_xlabel('Epoch')
gap_ax.set_ylabel('Accuracy Gap (%)')
gap_ax.set_title('Train-Test Accuracy Gap (Overfitting Indicator)')
gap_ax.grid(True)

plt.tight_layout()
curves_path = os.path.join(checkpoint_dir, 'training_curves.png')
plt.savefig(curves_path, dpi=300, bbox_inches='tight')
plt.show()

# Upload training curves
upload_to_huggingface(curves_path, 'training_curves.png', commit_message="Upload training curves")

print(f"\nFinal Statistics:")
print(f"Best Test Accuracy: {best_test_acc:.2f}%")
print(f"Final Train Accuracy: {train_accuracies[-1]:.2f}%")
print(f"Final Test Accuracy: {test_accuracies[-1]:.2f}%")
print(f"Final Train-Test Gap: {accuracy_gap[-1]:.2f}%")

In [None]:
# Create and upload model card from the recorded training history.
create_and_upload_model_card(
    best_acc=best_test_acc,
    total_epochs=len(train_accuracies),
    train_accuracies=train_accuracies,
    test_accuracies=test_accuracies,
    checkpoint_dir=checkpoint_dir,
)

# Save final config: architecture, optimizer/schedule settings and final metrics.
config = dict(
    model='WideResNet-28-10',
    depth=28,
    widen_factor=10,
    dropout=0.3,
    num_classes=100,
    batch_size=BATCH_SIZE,
    epochs=len(train_accuracies),
    optimizer='SGD',
    momentum=0.9,
    weight_decay=1e-3,
    initial_lr=INITIAL_LR,
    max_lr=MAX_LR,
    min_lr=MIN_LR,
    scheduler='CosineAnnealingWarmRestarts',
    T_0=25,
    warmup_epochs=WARMUP_EPOCHS,
    mixup_alpha=0.2,
    label_smoothing=0.1,
    gradient_clipping=1.0,
    mixed_precision=True,
    best_test_accuracy=best_test_acc,
    final_train_accuracy=train_accuracies[-1],
    final_test_accuracy=test_accuracies[-1],
)

config_path = os.path.join(checkpoint_dir, 'config.json')
with open(config_path, 'w') as f:
    f.write(json.dumps(config, indent=2))

upload_to_huggingface(config_path, 'config.json', commit_message="Upload training configuration")

banner = '=' * 70
print(f"\n{banner}")
print(f"All files uploaded to: https://huggingface.co/{REPO_ID}")
print(f"{banner}")

In [None]:
# Load best model and evaluate
# map_location=device lets the checkpoint load even when it was saved from a
# device that is not available now (e.g. saved on GPU, reloaded on CPU),
# matching the resume snippet at the end of this notebook.
checkpoint = torch.load(os.path.join(checkpoint_dir, 'best_model.pth'), map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Loaded best model from epoch {checkpoint['epoch']} with test accuracy: {checkpoint['test_accuracy']:.2f}%")

# Final evaluation of the restored best weights on the test set
test_loss, test_acc = test(model, device, test_loader)
print(f"\nFinal Test Accuracy: {test_acc:.2f}%")

In [None]:
# Optional: Download and resume from checkpoint
# Uncomment to use

# from huggingface_hub import hf_hub_download

# # Download a specific checkpoint
# checkpoint_path = hf_hub_download(
#     repo_id=REPO_ID,
#     filename="checkpoint_epoch50.pth",
#     token=HF_TOKEN
# )

# # Load checkpoint
# checkpoint = torch.load(checkpoint_path, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# print(f"Resumed from epoch {checkpoint['epoch']} with test accuracy {checkpoint['test_accuracy']:.2f}%")