# Notebook 1: 1_data_preprocessing.ipynb
Goal: Load, preprocess, and visualize the image dataset.

Sections:

Data Loading
- Load the dataset (e.g., from folders or URLs)
- Split into train/validation/test sets

Preprocessing
- Resize images, normalize pixel values, and apply augmentations (flip, rotate, etc.)

Visualization
- Show sample images with labels using matplotlib

In [3]:
import os
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split

# Select the compute device: use CUDA when a GPU is visible to PyTorch,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [4]:
# Path to your dataset
dataset_path = '../../data/animals10/'

# Seed for the train/validation split so that Restart & Run All reproduces
# exactly the same split membership.
SPLIT_SEED = 42

# Normalization statistics: (x - 0.5) / 0.5 maps ToTensor's [0, 1] range to [-1, 1].
# One value per RGB channel is spelled out instead of relying on a
# single-element tuple being broadcast across the channels.
NORM_MEAN = (0.5, 0.5, 0.5)
NORM_STD = (0.5, 0.5, 0.5)

# Training transformations: Resize, Augment, Convert to Tensor, Normalize
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize to 128x128
    transforms.RandomHorizontalFlip(),  # Data augmentation (training only)
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Normalize(NORM_MEAN, NORM_STD)  # Normalize to [-1, 1]
])

# Validation transformations: identical pipeline minus the random augmentation,
# so validation metrics are deterministic and measured on unaltered images.
val_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD)
])

# Load dataset (training view: applies the augmenting transform)
dataset = ImageFolder(root=dataset_path, transform=transform)

# Split dataset (80% training, 20% validation) with a seeded generator
val_size = int(0.2 * len(dataset))
train_size = len(dataset) - val_size
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_split = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# random_split subsets share the parent dataset's transform, which would apply
# the random flip to validation images too. Re-wrap the validation indices
# around a second view of the same folder that uses the augmentation-free
# transform. Both views index the same files as long as the folder is unchanged
# between the two scans.
val_dataset = torch.utils.data.Subset(
    ImageFolder(root=dataset_path, transform=val_transform),
    val_split.indices,
)

# Batch both splits; only the training batches are reshuffled every epoch,
# validation keeps a fixed order.
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# Report the size of the full dataset and of each split
print(f"Total images: {len(dataset)}")
print(f"Training set: {len(train_dataset)}")
print(f"Validation set: {len(val_dataset)}")


Total images: 26179
Training set: 20944
Validation set: 5235
