# CIFAR-10 Competition - Data Exploration

This notebook explores the CIFAR-10 dataset (sample images and class distribution), builds a train/validation/test split, and trains a simple baseline CNN on it.

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from tqdm.notebook import tqdm

# Seed PyTorch's global RNG so that everything drawing from it (weight
# initialisation, random splits, loader shuffling) starts from the same state
# on every Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
# `torch.backends.mps` is looked up defensively because PyTorch builds that
# predate the MPS backend do not have that attribute at all.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

# Human-readable class names; the list position is the integer label.
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Classes: {classes}")

## Load and Visualize Data

In [None]:
# Load CIFAR-10 dataset
# Only convert PIL images to tensors here; no normalization, so the images
# can be passed straight to imshow in the cells below.
basic_transform = transforms.Compose([transforms.ToTensor()])

# Build the train split first, then the test split, with identical settings.
train_dataset, test_dataset = (
    datasets.CIFAR10(
        root="./data",
        train=is_train,
        download=True,
        transform=basic_transform,
    )
    for is_train in (True, False)
)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

In [None]:
# Visualize sample images from each class
# Find the first occurrence of every label in one pass over `targets`.
# This reads only the stored integer labels, so no image is decoded or
# transformed until we know exactly which ten samples to draw.
first_index = {}
for i, label in enumerate(train_dataset.targets):
    first_index.setdefault(label, i)
    if len(first_index) == len(classes):
        break  # every class has a representative; stop scanning

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for idx, class_name in enumerate(classes):
    if idx not in first_index:
        continue  # no sample for this class: leave its panel untouched
    img, label = train_dataset[first_index[idx]]
    ax = axes[idx // 5, idx % 5]
    # Tensor is channel-first (C, H, W); imshow expects channel-last (H, W, C).
    img_np = img.permute(1, 2, 0).numpy()
    ax.imshow(img_np)
    ax.set_title(class_name, fontsize=12)
    ax.axis("off")
plt.tight_layout()
plt.show()

In [None]:
# Class distribution
# Take the labels straight from `targets`. Indexing the dataset item by item
# would decode and transform every image only to throw the pixels away.
train_labels = list(train_dataset.targets)
label_counts = Counter(train_labels)

plt.figure(figsize=(12, 4))
# Bars follow the order of `classes`; a class with no samples plots as 0
# because Counter returns 0 for missing keys.
plt.bar(classes, [label_counts[i] for i in range(len(classes))], color='steelblue')
plt.xlabel("Class")
plt.ylabel("Number of Samples")
plt.title("Training Set Class Distribution")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Train/Validation/Test Split

Adjust `VAL_SPLIT` below to change how much of the training set is held out for validation; the test set is always the official CIFAR-10 test split.

In [None]:
# Configure splits
VAL_SPLIT = 0.1  # Change this to adjust validation split
SPLIT_SEED = 42  # Fixes which samples land in the validation subset across re-runs
assert 0.0 <= VAL_SPLIT < 1.0, "VAL_SPLIT must be a fraction in [0, 1)"

# Per-channel normalization constants shared by both pipelines.
# NOTE(review): presumably the CIFAR-10 training-set channel mean/std; verify.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Prepare transforms
train_tfms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

test_tfms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Load datasets
# This rebinds `train_dataset` / `test_dataset` to normalized versions.
train_dataset = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_tfms)
test_dataset = datasets.CIFAR10(root="./data", train=False, download=True, transform=test_tfms)

# Split train into train and validation
val_size = int(len(train_dataset) * VAL_SPLIT)
train_size = len(train_dataset) - val_size
# A dedicated seeded generator makes the split identical on every run,
# independent of how much global RNG state earlier cells consumed.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
# Both subsets are views of `train_dataset`, so the validation subset goes
# through `train_tfms`: any augmentation added there will also hit validation.
train_subset, val_subset = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

train_loader = DataLoader(train_subset, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(val_subset, batch_size=256, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

print(f"Train: {len(train_subset)}, Val: {len(val_subset)}, Test: {len(test_dataset)}")

## SimpleCNN Example

Here's a simple baseline CNN architecture:

In [None]:
class SimpleCNN(nn.Module):
    """Small three-stage convolutional baseline for 3-channel images.

    Every stage is Conv2d(3x3, padding=1) -> ReLU -> BatchNorm2d. The first
    two stages finish with a 2x2 max-pool; the third finishes with global
    average pooling, leaving one 128-dimensional vector per image. That
    vector passes through dropout (p=0.5) and a single linear layer.

    Args:
        num_classes: Number of output logits produced per image.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        stage_widths = (32, 64, 128)
        layers = []
        in_channels = 3
        for stage, out_channels in enumerate(stage_widths):
            is_last_stage = stage == len(stage_widths) - 1
            layers.append(nn.Conv2d(in_channels, out_channels, 3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.BatchNorm2d(out_channels))
            # Halve the resolution after early stages; collapse to 1x1 at the end.
            if is_last_stage:
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            else:
                layers.append(nn.MaxPool2d(2))
            in_channels = out_channels
        # Flat Sequential so submodule indices (features.0 ... features.11)
        # stay the same as a hand-written layer list.
        self.features = nn.Sequential(*layers)

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_channels, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch ``x``."""
        pooled = self.features(x)
        # (batch, 128, 1, 1) -> (batch, 128)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)

# Build the baseline network and move its parameters to the selected device
# (nn.Module.to moves the module in place and returns the same object).
model = SimpleCNN()
model = model.to(device)

# Multi-class classification loss and Adam with a 1e-3 learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")

## Train the Model

In [None]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return sample-averaged metrics.

    Args:
        model: Module to train; switched to training mode by this call.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(logits, targets)``. Its result is
            weighted by batch size below, so it is assumed to be a per-batch
            mean (confirm if a different reduction is used).
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)`` over all samples seen. Both are NaN when the
        loader yields no samples, instead of raising ZeroDivisionError.
    """
    model.train()
    total, correct, loss_sum = 0, 0, 0.0

    for x, y in tqdm(loader, desc="Training"):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += loss.item() * x.size(0)
        pred = logits.argmax(1)
        correct += (pred == y).sum().item()
        total += x.size(0)

    if total == 0:
        # Nothing was processed; report "undefined" rather than crash.
        return float("nan"), float("nan")
    return correct / total, loss_sum / total

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradients and return averaged metrics.

    Args:
        model: Module to evaluate; switched to eval mode by this call.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(logits, targets)``. Its result is
            weighted by batch size below, so it is assumed to be a per-batch
            mean (confirm if a different reduction is used).
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(accuracy, mean_loss)`` over all samples seen. Both are NaN when the
        loader yields no samples (e.g. an empty validation subset), instead of
        raising ZeroDivisionError.
    """
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0

    with torch.no_grad():
        for x, y in tqdm(loader, desc="Validating"):
            x, y = x.to(device), y.to(device)

            logits = model(x)
            loss = criterion(logits, y)

            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_sum += loss.item() * x.size(0)
            pred = logits.argmax(1)
            correct += (pred == y).sum().item()
            total += x.size(0)

    if total == 0:
        # Nothing was evaluated; report "undefined" rather than crash.
        return float("nan"), float("nan")
    return correct / total, loss_sum / total

# Short baseline run: each epoch is one full training pass followed by one
# validation pass, with both sets of metrics reported afterwards.
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_acc, train_loss = train_epoch(
        model=model,
        loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    val_acc, val_loss = validate(
        model=model,
        loader=val_loader,
        criterion=criterion,
        device=device,
    )
    print(f"Train Acc: {train_acc:.4f}, Loss: {train_loss:.4f}")
    print(f"Val   Acc: {val_acc:.4f}, Loss: {val_loss:.4f}")