# CN7023 - Individual Assignment: Image Multi-Class Classifier

## Research Question:
**How do different data augmentation techniques (rotation, horizontal flip, combined transformations) affect CNN classification performance and generalization on Fashion-MNIST?**

## Experiments:
1. **Baseline**: No augmentation
2. **Rotation**: Random rotation (±30°)
3. **Horizontal Flip**: Random horizontal flip
4. **Combined**: Random rotation (±30°) + random horizontal flip + random affine translation (up to 10% of the image size)

## 1. Import Libraries and Setup

In [None]:
# PyTorch Libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as dsets

# Data handling and visualization
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

# Utilities
import os
import json
from datetime import datetime

# Seed PyTorch first, then NumPy, so repeated runs draw the same random numbers
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(42)

# Use the GPU when CUDA reports one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 2. Configuration and Hyperparameters

In [None]:
# Input geometry and label space
NUM_CLASSES = 10
IMAGE_SIZE = 28  # Fashion-MNIST images are 28x28; they are used at native size

# Optimisation settings
NUM_EPOCHS = 5  # Raise this if more training time is available
LEARNING_RATE = 0.1
BATCH_SIZE = 100

# Width of the two convolutional layers (number of output channels)
OUT_CHANNELS_1 = 16
OUT_CHANNELS_2 = 32

# Locations for the dataset and for everything the notebook writes out
DATA_DIR = '../data'
MODELS_DIR = '../models'
RESULTS_DIR = '../results'
PLOTS_DIR = '../results/plots'
METRICS_DIR = '../results/metrics'

# Make sure every output folder exists before any cell tries to write to it
for output_dir in (MODELS_DIR, PLOTS_DIR, METRICS_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Human-readable label for each Fashion-MNIST class index (0-9)
CLASS_NAMES = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

print("Configuration loaded successfully!")

## 3. Define CNN Architecture

In [None]:
class CNN(nn.Module):
    """
    Convolutional Neural Network for Fashion-MNIST Classification

    Architecture:
    - Conv Layer 1: 1 -> out_1 channels, 5x5 kernel, padding=2
    - ReLU + MaxPool (2x2)
    - Conv Layer 2: out_1 -> out_2 channels, 5x5 kernel, padding=2
    - ReLU + MaxPool (2x2)
    - Fully Connected: out_2 * s * s -> num_classes, where s is the
      spatial size left after the two pooling layers

    Spatial size calculation (default 28x28 input):
    - Input: 28x28
    - After Conv1 (padding=2): 28x28
    - After MaxPool1: 14x14
    - After Conv2 (padding=2): 14x14
    - After MaxPool2: 7x7

    Args:
        out_1: Output channels of the first convolution.
        out_2: Output channels of the second convolution.
        num_classes: Number of output logits. When None, the module-level
            NUM_CLASSES constant is used (the original behaviour).
        image_size: Height/width of the square, single-channel input images.
            The default of 28 reproduces the original 32*7*7 classifier.
    """

    def __init__(self, out_1=16, out_2=32, num_classes=None, image_size=28):
        super().__init__()

        if num_classes is None:
            # Looked up at construction time, exactly as the original code did
            num_classes = NUM_CLASSES

        # First convolutional block (5x5 kernel with padding=2 keeps H and W)
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=out_1,
                              kernel_size=5, padding=2)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        # Second convolutional block
        self.cnn2 = nn.Conv2d(in_channels=out_1, out_channels=out_2,
                              kernel_size=5, stride=1, padding=2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Each 2x2 max-pool halves the spatial size, rounding down
        pooled_size = (image_size // 2) // 2

        # Fully connected classifier over the flattened feature maps
        self.fc1 = nn.Linear(out_2 * pooled_size * pooled_size, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of images (N, 1, H, W)."""
        # First conv block
        x = self.maxpool1(torch.relu(self.cnn1(x)))

        # Second conv block
        x = self.maxpool2(torch.relu(self.cnn2(x)))

        # Flatten everything except the batch dimension, then classify
        x = torch.flatten(x, 1)
        return self.fc1(x)

    def count_parameters(self):
        """Count total number of trainable parameters"""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

# Smoke test: build one network and report its layers and parameter count
test_model = CNN(out_1=OUT_CHANNELS_1, out_2=OUT_CHANNELS_2)
n_trainable = test_model.count_parameters()
print(f"Model architecture:\n{test_model}")
print(f"\nTotal trainable parameters: {n_trainable:,}")

## 4. Data Loading and Exploration

In [None]:
# Transformation pipelines, one per experiment.
# Every pipeline ends with the same two steps: convert the PIL image to a
# tensor, then normalise the single channel with mean 0.5 and std 0.5.
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]

# Baseline: no augmentation, only tensor conversion + normalisation
transform_baseline = transforms.Compose([*_tensor_steps])

# Rotation: random rotation of up to 30 degrees in either direction
transform_rotation = transforms.Compose([
    transforms.RandomRotation(30),
    *_tensor_steps,
])

# Flip: mirror the image left-right with probability 0.5
transform_flip = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    *_tensor_steps,
])

# Combined: rotation, then flip, then a random shift of up to 10% per axis
transform_combined = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    *_tensor_steps,
])

# Validation / test data is never augmented
transform_test = transforms.Compose([*_tensor_steps])

print("Data transformations defined successfully!")

In [None]:
# Fetch Fashion-MNIST (downloaded into DATA_DIR on first use)

# Full 'train' portion; the un-augmented baseline transform is enough for exploration
train_dataset_full = dsets.FashionMNIST(
    root=DATA_DIR, train=True, download=True, transform=transform_baseline
)

# Official 'test' portion
test_dataset = dsets.FashionMNIST(
    root=DATA_DIR, train=False, download=True, transform=transform_test
)

# Hold out 20% of the training portion for validation, keep 80% for training
train_size = int(0.8 * len(train_dataset_full))
val_size = len(train_dataset_full) - train_size

# A dedicated, seeded generator makes the split repeatable
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset_full, [train_size, val_size], generator=split_generator
)

print("Dataset loaded successfully!")
for split_name, split in (("Training", train_dataset),
                          ("Validation", val_dataset),
                          ("Test", test_dataset)):
    print(f"{split_name} samples: {len(split)}")

In [None]:
# Visualize sample images from each class

def show_data(data_sample, label):
    """Draw one image in grayscale on the current axes, titled with its class name."""
    image = data_sample.squeeze()  # drop size-1 dimensions before plotting
    plt.imshow(image, cmap='gray')
    class_name = CLASS_NAMES[label]
    plt.title(f'Class: {class_name}')
    plt.axis('off')

def visualize_samples(dataset, n_samples=10):
    """
    Visualize one sample from each class.

    Shows the first example found for each of the first ``n_samples`` class
    indices, saves the figure to PLOTS_DIR/sample_images.png and displays it.

    Args:
        dataset: Iterable of (image, label) pairs; labels are used as indices
            into CLASS_NAMES.
        n_samples: How many classes (one image each) to show. Values larger
            than len(CLASS_NAMES) are clamped. The default of 10 yields the
            original 2x5 grid.
    """
    n_classes = max(0, min(n_samples, len(CLASS_NAMES)))
    n_cols = 5
    n_rows = max(1, -(-n_classes // n_cols))  # ceiling division, at least one row

    # squeeze=False always returns a 2-D array, so ravel() works for any grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    axes = axes.ravel()

    # Find one example of each requested class, stopping as soon as all are found
    samples_found = {}
    for img, label in dataset:
        if label < n_classes and label not in samples_found:
            samples_found[label] = img
        if len(samples_found) == n_classes:
            break

    # Plot samples; grid cells without an image are left blank
    for idx, axis in enumerate(axes):
        if idx in samples_found:
            axis.imshow(samples_found[idx].squeeze(), cmap='gray')
            axis.set_title(f'{CLASS_NAMES[idx]}', fontsize=10)
        axis.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, 'sample_images.png'), dpi=150, bbox_inches='tight')
    plt.show()

# Show one example image per class from the full training set
visualize_samples(dataset=train_dataset_full)

In [None]:
# Analyze class distribution

def analyze_class_distribution(dataset, title="Class Distribution"):
    """
    Analyze and plot class distribution.

    Counts how many samples belong to each class, draws a bar chart (saved to
    PLOTS_DIR/class_distribution.png) and prints a per-class summary.

    Args:
        dataset: Iterable of (image, label) pairs. If it exposes a ``targets``
            attribute and has no ``target_transform``, the labels are read
            from ``targets`` directly.
        title: Title of the bar chart.

    Returns:
        (unique, counts): the class indices present and their sample counts,
        as returned by ``np.unique(..., return_counts=True)``.
    """
    targets = getattr(dataset, 'targets', None)
    if targets is not None and getattr(dataset, 'target_transform', None) is None:
        # Fast path: avoids loading and transforming every image just to read
        # its label. Assumes `targets` lists the labels in dataset order, as
        # torchvision's MNIST-style datasets do.
        labels = np.asarray(targets)
    else:
        # Generic path (e.g. Subset objects): iterate and collect the labels
        labels = np.asarray([label for _, label in dataset])

    # Count occurrences
    unique, counts = np.unique(labels, return_counts=True)

    # Plot
    plt.figure(figsize=(12, 5))
    plt.bar([CLASS_NAMES[i] for i in unique], counts)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.title(title)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, 'class_distribution.png'), dpi=150, bbox_inches='tight')
    plt.show()

    # Print statistics
    n_labels = len(labels)
    print("\nClass distribution:")
    for class_idx, count in zip(unique, counts):
        print(f"{CLASS_NAMES[class_idx]:15s}: {count:5d} samples ({count/n_labels*100:.1f}%)")

    return unique, counts

# Check how balanced the classes are in the full training set
analyze_class_distribution(dataset=train_dataset_full,
                           title="Training Set Class Distribution")

## 5. Training and Evaluation Functions

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """
    Run one full pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    With an optimizer the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for data, target in loader:
            data, target = data.to(device), target.to(device)

            if training:
                optimizer.zero_grad()

            output = model(data)
            loss = criterion(output, target)

            if training:
                loss.backward()
                optimizer.step()

            # Weight each batch by its size so a smaller final batch does not
            # skew the epoch mean (assumes criterion returns the batch mean,
            # which is the default reduction of nn.CrossEntropyLoss).
            batch_size = target.size(0)
            loss_sum += loss.item() * batch_size
            correct += (output.argmax(dim=1) == target).sum().item()
            total += batch_size

    return loss_sum / total, 100 * correct / total


def train_model(model, train_loader, val_loader, criterion, optimizer,
                num_epochs, device, experiment_name="baseline"):
    """
    Train the model and track metrics

    Args:
        model: Network to train; it is moved to ``device``.
        train_loader: Iterable of (data, target) batches used for training.
        val_loader: Iterable of (data, target) batches used for validation.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which batches and the model are placed.
        experiment_name: Label used in the printed progress header.

    Returns:
        history: dict with per-epoch lists under the keys 'train_loss',
            'train_acc', 'val_loss', 'val_acc' (accuracies in percent) and
            'epoch_times' (seconds, covering training plus validation).
    """
    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': [],
        'epoch_times': []
    }

    model = model.to(device)

    print(f"\nTraining model: {experiment_name}")
    print("=" * 60)

    for epoch in range(num_epochs):
        start_time = datetime.now()

        # Training phase, then validation phase on the held-out data
        avg_train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )
        avg_val_loss, val_accuracy = _run_epoch(
            model, val_loader, criterion, device
        )

        # Record history
        history['train_loss'].append(avg_train_loss)
        history['train_acc'].append(train_accuracy)
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_accuracy)

        epoch_time = (datetime.now() - start_time).total_seconds()
        history['epoch_times'].append(epoch_time)

        # Print progress
        print(f"Epoch [{epoch+1}/{num_epochs}] | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}% | "
              f"Time: {epoch_time:.1f}s")

    print("=" * 60)
    print("Training completed!")

    return history

In [None]:
def evaluate_model(model, test_loader, device, class_names):
    """
    Evaluate model on test set and return detailed metrics

    Args:
        model: Trained network; it is moved to ``device`` and put in eval mode.
        test_loader: Iterable of (data, target) batches.
        device: Device on which inference runs.
        class_names: Class names ordered by class index; its length fixes the
            number of rows/columns of the confusion matrix.

    Returns:
        results: dict with
            'accuracy' (fraction in [0, 1]),
            'predictions' and 'targets' (lists of int class indices),
            'confusion_matrix' (len(class_names) x len(class_names) array),
            'classification_report' (dict from sklearn, output_dict=True).
    """
    model = model.to(device)
    model.eval()

    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for data, target in test_loader:
            output = model(data.to(device))
            # tolist() yields plain Python ints rather than numpy scalars
            all_predictions.extend(output.argmax(dim=1).cpu().tolist())
            all_targets.extend(target.tolist())

    # Fix the label set explicitly so the matrix and report always cover every
    # class in class_names, even when one is missing from targets/predictions
    label_ids = list(range(len(class_names)))

    # Calculate metrics
    accuracy = float(accuracy_score(all_targets, all_predictions))
    conf_matrix = confusion_matrix(all_targets, all_predictions, labels=label_ids)
    class_report = classification_report(all_targets, all_predictions,
                                         labels=label_ids,
                                         target_names=class_names,
                                         output_dict=True)

    results = {
        'accuracy': accuracy,
        'predictions': all_predictions,
        'targets': all_targets,
        'confusion_matrix': conf_matrix,
        'classification_report': class_report
    }

    return results

In [None]:
def plot_training_history(history, experiment_name):
    """
    Plot training and validation loss/accuracy curves

    Draws loss (left) and accuracy (right) against the 1-based epoch number,
    saves the figure to PLOTS_DIR as '<experiment_name>_training_curves.png'
    (lower-cased, spaces replaced by underscores) and displays it.

    Args:
        history: dict with per-epoch lists under 'train_loss', 'val_loss',
            'train_acc' and 'val_acc'.
        experiment_name: Used in the subplot titles and the output filename.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Number epochs from 1 so the axis matches the "Epoch [1/N]" training log
    epochs = range(1, len(history['train_loss']) + 1)

    # Plot loss
    ax1.plot(epochs, history['train_loss'], label='Train Loss', marker='o')
    ax1.plot(epochs, history['val_loss'], label='Val Loss', marker='o')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title(f'{experiment_name} - Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot accuracy
    ax2.plot(epochs, history['train_acc'], label='Train Accuracy', marker='o')
    ax2.plot(epochs, history['val_acc'], label='Val Accuracy', marker='o')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title(f'{experiment_name} - Accuracy')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    filename = f"{experiment_name.lower().replace(' ', '_')}_training_curves.png"
    plt.savefig(os.path.join(PLOTS_DIR, filename), dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
def plot_confusion_matrix(conf_matrix, class_names, experiment_name):
    """
    Draw the confusion matrix as an annotated heatmap, save it under PLOTS_DIR
    and display it. Rows are true labels, columns are predicted labels.
    """
    # Output file: lower-cased experiment name with spaces turned into underscores
    slug = experiment_name.lower().replace(' ', '_')
    output_path = os.path.join(PLOTS_DIR, f"{slug}_confusion_matrix.png")

    plt.figure(figsize=(12, 10))
    heatmap_options = dict(annot=True, fmt='d', cmap='Blues',
                           xticklabels=class_names, yticklabels=class_names)
    sns.heatmap(conf_matrix, **heatmap_options)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'{experiment_name} - Confusion Matrix')
    plt.tight_layout()

    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()

## 6. Experiment 1: Baseline (No Augmentation)

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 1: BASELINE (NO AUGMENTATION)")
print("="*60)

# Create datasets with baseline transform
train_dataset_baseline = dsets.FashionMNIST(root=DATA_DIR, train=True,
                                            download=True, transform=transform_baseline)

# Split into train/val (seeded generator -> same indices in every experiment)
train_baseline, val_baseline = torch.utils.data.random_split(
    train_dataset_baseline, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# Create data loaders
train_loader_baseline = torch.utils.data.DataLoader(train_baseline,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True)
val_loader_baseline = torch.utils.data.DataLoader(val_baseline,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=BATCH_SIZE,
                                          shuffle=False)

# Reset the global RNG right before building the model so that this experiment
# starts from the same initial weights and random stream no matter which cells
# ran before it (or how often this cell is re-run).
torch.manual_seed(42)

# Initialize model, criterion, and optimizer
model_baseline = CNN(OUT_CHANNELS_1, OUT_CHANNELS_2)
criterion = nn.CrossEntropyLoss()
optimizer_baseline = optim.SGD(model_baseline.parameters(), lr=LEARNING_RATE)

# Train model
history_baseline = train_model(model_baseline, train_loader_baseline, val_loader_baseline,
                               criterion, optimizer_baseline, NUM_EPOCHS,
                               device, "Baseline")

# Plot training history
plot_training_history(history_baseline, "Baseline")

# Evaluate on test set
results_baseline = evaluate_model(model_baseline, test_loader, device, CLASS_NAMES)

print(f"\nBaseline Test Accuracy: {results_baseline['accuracy']*100:.2f}%")

# Plot confusion matrix
plot_confusion_matrix(results_baseline['confusion_matrix'], CLASS_NAMES, "Baseline")

# Save model weights together with optimizer state, history and test metrics
torch.save({
    'model_state_dict': model_baseline.state_dict(),
    'optimizer_state_dict': optimizer_baseline.state_dict(),
    'history': history_baseline,
    'test_results': results_baseline
}, os.path.join(MODELS_DIR, 'baseline_model.pth'))

print("Baseline model saved!")

## 7. Experiment 2: Rotation Augmentation

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 2: ROTATION AUGMENTATION (±30°)")
print("="*60)

# Create datasets with rotation transform
train_dataset_rotation = dsets.FashionMNIST(root=DATA_DIR, train=True,
                                            download=True, transform=transform_rotation)

# Same seeded split as the baseline, so the training indices match. The
# augmented validation part is discarded: validation uses the un-augmented
# baseline loader instead.
train_rotation, _ = torch.utils.data.random_split(
    train_dataset_rotation, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# Create data loaders (reuse validation loader from baseline)
train_loader_rotation = torch.utils.data.DataLoader(train_rotation,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True)

# Reset the global RNG right before building the model so that this experiment
# starts from the same initial weights and random stream no matter which cells
# ran before it; otherwise differences in initialisation would be mixed into
# the augmentation comparison.
torch.manual_seed(42)

# Initialize model and optimizer (criterion is shared with the baseline cell)
model_rotation = CNN(OUT_CHANNELS_1, OUT_CHANNELS_2)
optimizer_rotation = optim.SGD(model_rotation.parameters(), lr=LEARNING_RATE)

# Train model
history_rotation = train_model(model_rotation, train_loader_rotation, val_loader_baseline,
                               criterion, optimizer_rotation, NUM_EPOCHS,
                               device, "Rotation Augmentation")

# Plot training history
plot_training_history(history_rotation, "Rotation Augmentation")

# Evaluate on test set
results_rotation = evaluate_model(model_rotation, test_loader, device, CLASS_NAMES)

print(f"\nRotation Test Accuracy: {results_rotation['accuracy']*100:.2f}%")

# Plot confusion matrix
plot_confusion_matrix(results_rotation['confusion_matrix'], CLASS_NAMES, "Rotation Augmentation")

# Save model weights together with optimizer state, history and test metrics
torch.save({
    'model_state_dict': model_rotation.state_dict(),
    'optimizer_state_dict': optimizer_rotation.state_dict(),
    'history': history_rotation,
    'test_results': results_rotation
}, os.path.join(MODELS_DIR, 'rotation_model.pth'))

print("Rotation model saved!")

## 8. Experiment 3: Horizontal Flip Augmentation

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 3: HORIZONTAL FLIP AUGMENTATION")
print("="*60)

# Create datasets with flip transform
train_dataset_flip = dsets.FashionMNIST(root=DATA_DIR, train=True,
                                        download=True, transform=transform_flip)

# Same seeded split as the baseline, so the training indices match. The
# augmented validation part is discarded: validation uses the un-augmented
# baseline loader instead.
train_flip, _ = torch.utils.data.random_split(
    train_dataset_flip, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# Create data loaders
train_loader_flip = torch.utils.data.DataLoader(train_flip,
                                                batch_size=BATCH_SIZE,
                                                shuffle=True)

# Reset the global RNG right before building the model so that this experiment
# starts from the same initial weights and random stream no matter which cells
# ran before it; otherwise differences in initialisation would be mixed into
# the augmentation comparison.
torch.manual_seed(42)

# Initialize model and optimizer (criterion is shared with the baseline cell)
model_flip = CNN(OUT_CHANNELS_1, OUT_CHANNELS_2)
optimizer_flip = optim.SGD(model_flip.parameters(), lr=LEARNING_RATE)

# Train model
history_flip = train_model(model_flip, train_loader_flip, val_loader_baseline,
                           criterion, optimizer_flip, NUM_EPOCHS,
                           device, "Horizontal Flip")

# Plot training history
plot_training_history(history_flip, "Horizontal Flip")

# Evaluate on test set
results_flip = evaluate_model(model_flip, test_loader, device, CLASS_NAMES)

print(f"\nFlip Test Accuracy: {results_flip['accuracy']*100:.2f}%")

# Plot confusion matrix
plot_confusion_matrix(results_flip['confusion_matrix'], CLASS_NAMES, "Horizontal Flip")

# Save model weights together with optimizer state, history and test metrics
torch.save({
    'model_state_dict': model_flip.state_dict(),
    'optimizer_state_dict': optimizer_flip.state_dict(),
    'history': history_flip,
    'test_results': results_flip
}, os.path.join(MODELS_DIR, 'flip_model.pth'))

print("Flip model saved!")

## 9. Experiment 4: Combined Augmentation

In [None]:
print("\n" + "="*60)
print("EXPERIMENT 4: COMBINED AUGMENTATION (Rotation + Flip + Affine)")
print("="*60)

# Create datasets with combined transform
train_dataset_combined = dsets.FashionMNIST(root=DATA_DIR, train=True,
                                            download=True, transform=transform_combined)

# Same seeded split as the baseline, so the training indices match. The
# augmented validation part is discarded: validation uses the un-augmented
# baseline loader instead.
train_combined, _ = torch.utils.data.random_split(
    train_dataset_combined, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# Create data loaders
train_loader_combined = torch.utils.data.DataLoader(train_combined,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True)

# Reset the global RNG right before building the model so that this experiment
# starts from the same initial weights and random stream no matter which cells
# ran before it; otherwise differences in initialisation would be mixed into
# the augmentation comparison.
torch.manual_seed(42)

# Initialize model and optimizer (criterion is shared with the baseline cell)
model_combined = CNN(OUT_CHANNELS_1, OUT_CHANNELS_2)
optimizer_combined = optim.SGD(model_combined.parameters(), lr=LEARNING_RATE)

# Train model
history_combined = train_model(model_combined, train_loader_combined, val_loader_baseline,
                               criterion, optimizer_combined, NUM_EPOCHS,
                               device, "Combined Augmentation")

# Plot training history
plot_training_history(history_combined, "Combined Augmentation")

# Evaluate on test set
results_combined = evaluate_model(model_combined, test_loader, device, CLASS_NAMES)

print(f"\nCombined Test Accuracy: {results_combined['accuracy']*100:.2f}%")

# Plot confusion matrix
plot_confusion_matrix(results_combined['confusion_matrix'], CLASS_NAMES, "Combined Augmentation")

# Save model weights together with optimizer state, history and test metrics
torch.save({
    'model_state_dict': model_combined.state_dict(),
    'optimizer_state_dict': optimizer_combined.state_dict(),
    'history': history_combined,
    'test_results': results_combined
}, os.path.join(MODELS_DIR, 'combined_model.pth'))

print("Combined model saved!")

## 10. Comparative Analysis

In [None]:
# Build one summary row per experiment and collect them in a table

experiments = ['Baseline', 'Rotation', 'Horizontal Flip', 'Combined']
results_list = [results_baseline, results_rotation, results_flip, results_combined]
histories_list = [history_baseline, history_rotation, history_flip, history_combined]

# Each row reports the last epoch's train/val metrics, the test accuracy and
# the mean time per epoch, all pre-formatted as strings
comparison_data = [
    {
        'Experiment': name,
        'Train Accuracy (%)': f"{hist['train_acc'][-1]:.2f}",
        'Val Accuracy (%)': f"{hist['val_acc'][-1]:.2f}",
        'Test Accuracy (%)': f"{res['accuracy']*100:.2f}",
        'Final Train Loss': f"{hist['train_loss'][-1]:.4f}",
        'Final Val Loss': f"{hist['val_loss'][-1]:.4f}",
        'Avg Epoch Time (s)': f"{np.mean(hist['epoch_times']):.2f}",
    }
    for name, res, hist in zip(experiments, results_list, histories_list)
]

comparison_df = pd.DataFrame(comparison_data)
banner = "=" * 80
print("\n" + banner)
print("COMPARATIVE RESULTS")
print(banner)
print(comparison_df.to_string(index=False))

# Persist the table next to the other metrics
csv_path = os.path.join(METRICS_DIR, 'comparison_results.csv')
comparison_df.to_csv(csv_path, index=False)
print(f"\nComparison table saved to {METRICS_DIR}/comparison_results.csv")

In [None]:
# Bar chart of the test accuracy reached by each augmentation strategy

test_accuracies = [r['accuracy']*100 for r in results_list]
bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

plt.figure(figsize=(12, 6))
bars = plt.bar(experiments, test_accuracies, color=bar_colors)
plt.xlabel('Augmentation Strategy', fontsize=12)
plt.ylabel('Test Accuracy (%)', fontsize=12)
plt.title('Test Accuracy Comparison Across Augmentation Strategies', fontsize=14, fontweight='bold')

# Zoom the y-axis to a 5-point margin around the observed accuracies
lowest, highest = min(test_accuracies), max(test_accuracies)
plt.ylim([lowest - 5, highest + 5])

# Write each accuracy just above the centre of its bar
for bar, acc in zip(bars, test_accuracies):
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, bar.get_height(),
             f'{acc:.2f}%', ha='center', va='bottom', fontsize=11)

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, 'accuracy_comparison.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Plot the validation curves of all experiments together

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# One panel per validation metric: (axis, history key, y-axis label, title)
panels = (
    (ax1, 'val_acc', 'Validation Accuracy (%)', 'Validation Accuracy Comparison'),
    (ax2, 'val_loss', 'Validation Loss', 'Validation Loss Comparison'),
)

for axis, metric, y_label, panel_title in panels:
    for exp_name, history, color in zip(experiments, histories_list, colors):
        values = history[metric]
        # Take the epoch axis from the recorded history rather than from
        # NUM_EPOCHS, so the plot still works if the config was changed after
        # an experiment was trained.
        axis.plot(range(1, len(values) + 1), values,
                  label=exp_name, marker='o', color=color, linewidth=2)

    axis.set_xlabel('Epoch', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.set_title(panel_title, fontsize=14, fontweight='bold')
    axis.legend(fontsize=10)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, 'all_experiments_comparison.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Per-class accuracy comparison

def get_per_class_accuracy(results):
    """Return each class's recall from the classification report, in percent, ordered as CLASS_NAMES."""
    report = results['classification_report']
    return [report[class_name]['recall'] * 100 for class_name in CLASS_NAMES]

# One list of per-class accuracies for every experiment, in experiment order
per_class_data = [get_per_class_accuracy(res) for res in results_list]

# Grouped bar chart: one group per class, one bar per experiment
x = np.arange(len(CLASS_NAMES))
width = 0.2

fig, ax = plt.subplots(figsize=(16, 8))
for position, (label, accuracies, bar_color) in enumerate(zip(experiments, per_class_data, colors)):
    # Shift each experiment's bars by one bar width within the group
    ax.bar(x + position*width, accuracies, width, label=label, color=bar_color)

ax.set_title('Per-Class Accuracy Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Accuracy (%)', fontsize=12)

# Centre the tick under the middle of each four-bar group
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(CLASS_NAMES, rotation=45, ha='right')
ax.legend(fontsize=10)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
output_path = os.path.join(PLOTS_DIR, 'per_class_accuracy_comparison.png')
plt.savefig(output_path, dpi=150, bbox_inches='tight')
plt.show()

print("\nPer-class accuracy saved!")

## 11. Save Summary Report

In [None]:
# Assemble the run configuration and every experiment's metrics into one report

architecture_info = {
    'type': 'CNN',
    'conv1_out_channels': OUT_CHANNELS_1,
    'conv2_out_channels': OUT_CHANNELS_2,
    'total_parameters': model_baseline.count_parameters()
}
hyperparameter_info = {
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    'num_epochs': NUM_EPOCHS,
    'optimizer': 'SGD'
}
split_info = {
    'train_size': train_size,
    'val_size': val_size,
    'test_size': len(test_dataset)
}

summary_report = {
    'experiment_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'dataset': 'Fashion-MNIST',
    'architecture': architecture_info,
    'hyperparameters': hyperparameter_info,
    'data_split': split_info,
    'experiments': {}
}

# Final-epoch metrics, test accuracy (percent) and full report per experiment
summary_report['experiments'].update({
    name: {
        'final_train_accuracy': hist['train_acc'][-1],
        'final_val_accuracy': hist['val_acc'][-1],
        'test_accuracy': res['accuracy'] * 100,
        'final_train_loss': hist['train_loss'][-1],
        'final_val_loss': hist['val_loss'][-1],
        'avg_epoch_time': np.mean(hist['epoch_times']),
        'classification_report': res['classification_report']
    }
    for name, res, hist in zip(experiments, results_list, histories_list)
})

# Write the report as indented JSON
report_path = os.path.join(METRICS_DIR, 'summary_report.json')
with open(report_path, 'w') as f:
    json.dump(summary_report, f, indent=4)

# Closing console summary: where the report went and which strategy won
best_index = np.argmax(test_accuracies)
best_accuracy = max(test_accuracies)
rule = "=" * 80

print("\n" + rule)
print("SUMMARY REPORT SAVED")
print(rule)
print(f"Location: {METRICS_DIR}/summary_report.json")
print("\nAll experiments completed successfully!")
print(f"\nBest performing model: {experiments[best_index]}")
print(f"Best test accuracy: {best_accuracy:.2f}%")
print(f"\nImprovement over baseline: {best_accuracy - test_accuracies[0]:.2f}%")

## Next Steps

Now that you have completed all experiments, you should:

1. **Review Results**: Examine all plots and metrics
2. **Start Writing Report**: Use the results to write your 5000-word report
3. **Critical Analysis**: Analyze why each augmentation performed better or worse than the baseline
4. **Create Presentation**: Prepare slides based on your findings

All your results are saved in:
- Models: `../models/`
- Plots: `../results/plots/`
- Metrics: `../results/metrics/`