# Fashion-MNIST: Feature Engineering + Tiny CNN


## Overview
This notebook walks through a **complete image classification pipeline** on **Fashion-MNIST**:
1. Data loading & visualization
2. Preprocessing (images are already grayscale; pixel values are normalized to [-1, 1])
3. **Feature engineering** with HOG + Logistic Regression
4. **Tiny CNN** baseline (PyTorch)
5. Evaluation & comparison (accuracy, confusion matrix)

> Runs locally or on Google Colab. If on Colab, enable GPU (Runtime → Change runtime type → GPU).


In [None]:
# If running in a clean environment, uncomment to install packages:
# !pip install torch torchvision torchaudio -q --extra-index-url https://download.pytorch.org/whl/cu126
# !pip install scikit-image scikit-learn matplotlib pandas seaborn tqdm ipywidgets notebook -q

import torch, torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, roc_auc_score
from sklearn.preprocessing import label_binarize
from skimage.feature import hog
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from tqdm.notebook import tqdm

# Pick the compute backend in priority order:
# CUDA (NVIDIA GPU) first, then MPS (Apple Silicon), otherwise plain CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)
print("Device:", device)

Device: cuda


In [2]:
# 1) Data loading & visualization
# Preprocessing applied to every image when it is read from the dataset:
#   - ToTensor: PIL image -> tensor, rescaling [0, 255] to [0, 1]
#   - Normalize(0.5, 0.5): (pixel - 0.5) / 0.5, i.e. [0, 1] -> [-1, 1]
# Scaling the inputs this way helps the model converge faster.
to_tensor = transforms.ToTensor()
center_and_scale = transforms.Normalize((0.5,), (0.5,))
transform = transforms.Compose([to_tensor, center_and_scale])

# Fashion-MNIST: 28x28 grayscale images of clothing items.
# Both splits share the same storage location and preprocessing.
shared_args = {"root": "./data", "download": True, "transform": transform}
train_dataset = datasets.FashionMNIST(train=True, **shared_args)
test_dataset = datasets.FashionMNIST(train=False, **shared_args)

# Class names (10 kinds of clothing) are taken from the dataset itself
classes = train_dataset.classes
print(f"Classes: {classes}")
print(f"Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

Classes: ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
Training samples: 60000, Test samples: 10000


In [None]:
# Visualize one sample image per class, then plot the class distribution.
# Labels are read from `train_dataset.targets` instead of indexing the dataset,
# because every `train_dataset[i]` call decodes and transforms the image;
# doing that for all samples just to read labels is needlessly slow.
from collections import Counter

all_labels = torch.as_tensor(train_dataset.targets)

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for idx, class_name in enumerate(classes):
    # Position of the first training image carrying this label
    matches = torch.nonzero(all_labels == idx).flatten()
    if matches.numel() == 0:
        continue  # no example of this class: leave the panel untouched
    img, _ = train_dataset[int(matches[0])]
    ax = axes[idx // 5, idx % 5]
    # Denormalize for display: undo (pixel - 0.5) / 0.5 to get back to [0, 1]
    ax.imshow(img.squeeze().numpy() * 0.5 + 0.5, cmap="gray")
    ax.set_title(class_name, fontsize=12)
    ax.axis("off")
plt.tight_layout()
plt.show()

# Display class distribution in training set
train_labels = all_labels.tolist()  # plain Python ints, one per training sample
label_counts = Counter(train_labels)

plt.figure(figsize=(12, 4))
plt.bar(list(classes), [label_counts[i] for i in range(len(classes))], color='steelblue')
plt.xlabel("Class")
plt.ylabel("Number of Samples")
plt.title("Training Set Class Distribution")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Split a validation set from train

In [4]:
# Split training data into train and validation sets
# Validation set helps us monitor overfitting during training
val_size = 10000
train_size = len(train_dataset) - val_size

# A fixed-seed generator makes the split reproducible: the same samples end up
# in train/validation on every run, so scores are comparable across reruns.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

# DataLoaders handle batching, shuffling, and parallel data loading.
# Pinned host memory only speeds up host-to-GPU copies, so enable it on CUDA only.
loader_kwargs = dict(num_workers=2, pin_memory=(device.type == "cuda"))

# Shuffle training data to prevent the model from learning the order
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, **loader_kwargs)
val_loader   = DataLoader(val_ds, batch_size=256, shuffle=False, **loader_kwargs)
test_loader  = DataLoader(test_dataset, batch_size=256, shuffle=False, **loader_kwargs)

print(f"Train samples: {len(train_ds)}, Validation samples: {len(val_ds)}, Test samples: {len(test_dataset)}")

Train samples: 50000, Validation samples: 10000, Test samples: 10000


## Feature Engineering: HOG + Logistic Regression

In [5]:
# HOG (Histogram of Oriented Gradients) features capture edge and shape
# information, which is useful for traditional ML methods.
# Only a subset of each split is used to keep feature extraction fast.
N_train, N_val = 12_000, 4_000

def dataset_to_numpy(ds, N):
    """Compute HOG descriptors and labels for the first `N` samples of `ds`.

    Args:
        ds: Indexable dataset whose items are `(image_tensor, label)` pairs.
        N: Number of leading samples (indices 0..N-1) to process.

    Returns:
        Tuple `(X, y)` of numpy arrays: one HOG feature vector per sample and
        the matching integer labels.
    """

    def describe(image):
        # Drop singleton dimensions and hand HOG a plain numpy array.
        # pixels_per_cell sets the size of each histogram cell;
        # cells_per_block sets the size of the normalization blocks.
        pixels = image.squeeze().numpy()
        return hog(pixels, orientations=9, pixels_per_cell=(4, 4),
                   cells_per_block=(2, 2), block_norm="L2-Hys")

    samples = (ds[position] for position in tqdm(range(N), desc="Extracting HOG features"))
    pairs = [(describe(image), int(target)) for image, target in samples]

    features = np.array([descriptor for descriptor, _ in pairs])
    targets = np.array([target for _, target in pairs])
    return features, targets

# Extract HOG features for the leading N_train / N_val samples of each split
Xtr, ytr = dataset_to_numpy(train_ds, min(N_train, len(train_ds)))
Xva, yva = dataset_to_numpy(val_ds,   min(N_val, len(val_ds)))

# Standardize features: zero mean, unit variance
# This is important for logistic regression to converge properly.
# The scaler is fitted on the training features only and then reused for
# validation, so no validation statistics leak into preprocessing.
scaler = StandardScaler().fit(Xtr)
Xtr_s = scaler.transform(Xtr)
Xva_s = scaler.transform(Xva)

print(f"HOG feature dimension: {Xtr_s.shape[1]}")

# Train Logistic Regression classifier
clf = LogisticRegression(max_iter=200, n_jobs=-1, verbose=0)
clf.fit(Xtr_s, ytr)

# Evaluate on validation set
pred_va = clf.predict(Xva_s)
acc_va = accuracy_score(yva, pred_va)
# `acc_va` is reassigned by the CNN training loop further down the notebook,
# so keep the HOG+LR score under its own name for the final comparison.
acc_va_hog = acc_va
print(f"\nHOG+LR Validation Accuracy: {acc_va:.4f}")
print("\nClassification Report:")
print(classification_report(yva, pred_va, target_names=classes))

Extracting HOG features:   0%|          | 0/12000 [00:00<?, ?it/s]

Extracting HOG features:   0%|          | 0/4000 [00:00<?, ?it/s]

HOG feature dimension: 1296

HOG+LR Validation Accuracy: 0.8290

Classification Report:
              precision    recall  f1-score   support

 T-shirt/top       0.76      0.80      0.78       400
     Trouser       0.95      0.96      0.96       411
    Pullover       0.74      0.72      0.73       402
       Dress       0.85      0.83      0.84       408
        Coat       0.68      0.74      0.71       388
      Sandal       0.94      0.92      0.93       380
       Shirt       0.57      0.54      0.55       396
     Sneaker       0.91      0.91      0.91       411
         Bag       0.97      0.92      0.94       407
  Ankle boot       0.94      0.94      0.94       397

    accuracy                           0.83      4000
   macro avg       0.83      0.83      0.83      4000
weighted avg       0.83      0.83      0.83      4000



## Tiny CNN Baseline (PyTorch)

In [7]:
# Define a Tiny CNN (Convolutional Neural Network)
# CNNs are designed to automatically learn spatial hierarchies of features from images

class TinyCNN(nn.Module):
    """Small CNN: two conv/ReLU/pool stages followed by a dense classifier.

    The flatten size of 64*7*7 corresponds to single-channel 28x28 inputs
    after two 2x poolings. `forward` returns raw logits with one column per class.

    Args:
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stage 1: 1 -> 32 channels, 28x28 -> 14x14 after pooling
        stage_one = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        ]
        # Stage 2: 32 -> 64 channels, 14x14 -> 7x7 after pooling
        stage_two = [
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        ]
        # Convolutions learn filters, ReLU adds non-linearity, pooling downsamples
        self.features = nn.Sequential(*stage_one, *stage_two)

        # Dense head: flatten the feature maps, one hidden layer, then logits.
        # Dropout zeros half of the hidden activations during training only.
        head = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a batch of images to class logits."""
        return self.classifier(self.features(x))

# Seed PyTorch's global RNG before building the model so that weight
# initialization (and later RNG-driven steps such as batch shuffling and
# dropout) start from a known state on every run.
# NOTE: some GPU kernels can still be non-deterministic, so results may
# differ slightly between runs on CUDA.
torch.manual_seed(42)

# Initialize model and move to device (GPU/MPS/CPU)
model = TinyCNN().to(device)

# Loss function: measures how wrong the predictions are
criterion = nn.CrossEntropyLoss()

# Optimizer: adjusts model weights to minimize loss
# Adam is an adaptive learning rate optimizer (works well in practice)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")

Model has 421642 parameters


In [8]:
# Training loop with progress tracking
def run_epoch(loader, train=True):
    """Run one pass over `loader` and return `(accuracy, mean_loss)`.

    Uses the notebook-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: Iterable yielding `(inputs, targets)` batches.
        train: If True, the model is put in training mode and its weights are
            updated after every batch. If False, the model is put in
            evaluation mode and gradient tracking is switched off.

    Returns:
        Tuple `(accuracy, mean_loss)`, both averaged over all samples seen.

    Raises:
        ZeroDivisionError: If `loader` yields no samples.
    """
    model.train(train)  # Set model to training or evaluation mode
    total, correct, loss_sum = 0, 0, 0.0

    # tqdm creates a progress bar for visual feedback
    pbar = tqdm(loader, desc=f"{'Training' if train else 'Validating'}")

    # Only track gradients when training; during evaluation this avoids
    # building an autograd graph that would never be used.
    with torch.set_grad_enabled(train):
        for x, y in pbar:
            x, y = x.to(device), y.to(device)

            if train:
                optimizer.zero_grad()  # Clear previous gradients

            logits = model(x)  # Forward pass: compute predictions
            loss = criterion(logits, y)  # Calculate loss

            if train:
                loss.backward()  # Backward pass: compute gradients
                optimizer.step()  # Update weights

            # The loss is a per-batch mean, so weight it by batch size to get
            # a correct per-sample average at the end
            loss_sum += loss.item() * x.size(0)
            pred = logits.argmax(dim=1)
            correct += (pred == y).sum().item()
            total += x.size(0)

            # Update progress bar with current accuracy
            pbar.set_postfix({'acc': f'{correct/total:.3f}'})

    return correct/total, loss_sum/total

# Train for a fixed number of epochs, validating after each one
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # Each call returns (accuracy, mean loss) for one full pass over the loader
    train_metrics = run_epoch(train_loader, train=True)
    val_metrics = run_epoch(val_loader, train=False)
    acc_tr, loss_tr = train_metrics
    acc_va, loss_va = val_metrics

    train_summary = f"Train Acc: {acc_tr:.3f}, Loss: {loss_tr:.3f}"
    val_summary = f"Val Acc: {acc_va:.3f}, Loss: {loss_va:.3f}"
    print(f"{train_summary} | {val_summary}")


Epoch 1/5


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Validating:   0%|          | 0/40 [00:00<?, ?it/s]

Train Acc: 0.773, Loss: 0.634 | Val Acc: 0.857, Loss: 0.382

Epoch 2/5


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Validating:   0%|          | 0/40 [00:00<?, ?it/s]

Train Acc: 0.856, Loss: 0.403 | Val Acc: 0.881, Loss: 0.319

Epoch 3/5


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Validating:   0%|          | 0/40 [00:00<?, ?it/s]

Train Acc: 0.877, Loss: 0.347 | Val Acc: 0.893, Loss: 0.286

Epoch 4/5


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Validating:   0%|          | 0/40 [00:00<?, ?it/s]

Train Acc: 0.887, Loss: 0.317 | Val Acc: 0.900, Loss: 0.266

Epoch 5/5


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Validating:   0%|          | 0/40 [00:00<?, ?it/s]

Train Acc: 0.897, Loss: 0.287 | Val Acc: 0.906, Loss: 0.249


In [9]:
# Evaluate on test set with comprehensive metrics.
# Predictions, probabilities and loss are all collected in a single pass over
# the test loader, so each batch goes through the model only once.
model.eval()
y_true, y_pred, y_probs = [], [], []
test_loss = 0.0
total = 0

with torch.no_grad():
    for x, y in tqdm(test_loader, desc="Testing"):
        x, y = x.to(device), y.to(device)
        logits = model(x)

        # criterion returns the batch mean; weight by batch size so the final
        # division by `total` gives a per-sample average
        loss = criterion(logits, y)
        test_loss += loss.item() * x.size(0)
        total += x.size(0)

        logits_cpu = logits.cpu()
        probs = torch.softmax(logits_cpu, dim=1)  # Convert logits to probabilities

        y_pred.extend(logits_cpu.argmax(1).tolist())
        y_true.extend(y.tolist())
        y_probs.append(probs.numpy())

test_loss /= total
y_probs = np.concatenate(y_probs, axis=0)  # one row of class probabilities per test sample

# Calculate comprehensive metrics
acc_test = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)

# AUC-ROC: measures model's ability to distinguish between classes
# Need to binarize labels for multi-class AUC calculation (one-vs-rest)
y_true_bin = label_binarize(y_true, classes=list(range(len(classes))))
auc_score = roc_auc_score(y_true_bin, y_probs, average='weighted', multi_class='ovr')

# Summary table: each row is (label, value, trailing note).
# Labels are left-aligned in a 16-character column so the values line up.
divider = "=" * 50
summary_rows = [
    ("Test Loss:", test_loss, ""),
    ("Accuracy:", acc_test, "  # Proportion of correct predictions"),
    ("Precision:", precision, "  # Of all positive predictions, how many were correct"),
    ("Recall:", recall, "  # Of all actual positives, how many were found"),
    ("F1 Score:", f1, "  # Harmonic mean of precision and recall"),
    ("AUC-ROC:", auc_score, "  # Area under ROC curve (1.0 = perfect classifier)"),
]

print(divider)
print("TINY CNN TEST RESULTS")
print(divider)
for row_label, row_value, row_note in summary_rows:
    print(f"{row_label:<16}{row_value:.4f}{row_note}")
print(divider)

# Plain-language explanation of each metric
metric_notes = (
    "- Test Loss: How wrong the model's predictions are (lower is better)",
    "- Accuracy: Percentage of correctly classified samples",
    "- Precision: When model predicts a class, how often is it right?",
    "- Recall: Of all samples in a class, how many did we find?",
    "- F1 Score: Balance between precision and recall (useful for imbalanced data)",
    "- AUC-ROC: Model's ability to distinguish between classes (0.5=random, 1.0=perfect)",
)
print("\nMetric Explanations:")
for note in metric_notes:
    print(note)

Testing:   0%|          | 0/40 [00:00<?, ?it/s]

TINY CNN TEST RESULTS
Test Loss:      0.2627
Accuracy:       0.9045  # Proportion of correct predictions
Precision:      0.9041  # Of all positive predictions, how many were correct
Recall:         0.9045  # Of all actual positives, how many were found
F1 Score:       0.9042  # Harmonic mean of precision and recall
AUC-ROC:        0.9933  # Area under ROC curve (1.0 = perfect classifier)

Metric Explanations:
- Test Loss: How wrong the model's predictions are (lower is better)
- Accuracy: Percentage of correctly classified samples
- Precision: When model predicts a class, how often is it right?
- Recall: Of all samples in a class, how many did we find?
- F1 Score: Balance between precision and recall (useful for imbalanced data)
- AUC-ROC: Model's ability to distinguish between classes (0.5=random, 1.0=perfect)
