# Build a Feedforward Neural Network from Scratch

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchvision import datasets
import numpy as np
from typing import Tuple

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed torch's global generator so random draws made through it are repeatable.
torch.manual_seed(42)

## Dataset




In [None]:
# Fetch Fashion-MNIST (downloaded into ./data when missing): first the
# training split, then the held-out test split.
train_full, test_ds = (
    datasets.FashionMNIST(root='./data', train=is_train, download=True)
    for is_train in (True, False)
)


In [None]:
# Mini-batch size shared by every DataLoader
batch_size = 10

# Number of samples assigned to the training and validation subsets
train_len = 50_000
val_len = 10_000

# Dedicated generator (seed 42) so the train/validation split is repeatable
split_generator = torch.Generator()
split_generator.manual_seed(42)

In [None]:
# Compute normalization statistics, then build the datasets and loaders.
# Scale to [0,1]
train_pixels = train_full.data.float() / 255.0
# NOTE: the statistics are taken over all of train_full, i.e. before the
# train/validation split is made further down.
mean = train_pixels.mean()
std = train_pixels.std() + 1e-7  # small offset guards the division below

# Standardize train and test with the same statistics. The already-scaled
# training tensor is reused rather than recomputed from the raw data.
x_train_full = (train_pixels - mean) / std
y_train_full = train_full.targets.clone()

x_test = ((test_ds.data.float() / 255.0) - mean) / std
y_test = test_ds.targets.clone()

# Insert a channel axis at position 1
x_train_full = x_train_full.unsqueeze(1)
x_test = x_test.unsqueeze(1)

# Wrap the tensors and carve the training data into train/validation subsets
full_ds = TensorDataset(x_train_full, y_train_full)
train_ds, val_ds = random_split(full_ds, [train_len, val_len], generator=split_generator)

test_tensor_ds = TensorDataset(x_test, y_test)

# DataLoaders: only the training loader shuffles
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_tensor_ds, batch_size=batch_size, shuffle=False)

## Build a Feedforward Neural Network

In [None]:
class DenseLayer:
    """Fully connected layer ``y = x @ W + b`` with a hand-written backward pass.

    ``W`` has shape ``(in_features, out_features)`` and ``b`` has shape
    ``(out_features,)``. ``dW`` and ``db`` are gradient buffers of matching
    shape. They are allocated once and always updated *in place*, so any
    external holder of these tensors keeps seeing the latest gradients.
    """

    def __init__(self, in_features: int, out_features: int):
        # He uniform initialization without torch.empty/zeros/zeros_like.
        # U(-limit, limit) has variance limit**2 / 3, so limit = sqrt(6 / fan_in)
        # gives the He variance 2 / fan_in.
        limit = float(np.sqrt(6.0 / in_features))
        self.W = (torch.rand(in_features, out_features, device=device) * (2.0 * limit)) - limit
        self.b = torch.rand(out_features, device=device)
        self.b = self.b - self.b  # zero bias without torch.zeros
        self.dW = self.W * 0.0    # zero grad without torch.zeros_like
        self.db = self.b * 0.0    # zero grad without torch.zeros_like
        self.input_cache = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x @ W + b`` and remember ``x`` for the backward pass."""
        self.input_cache = x
        return x @ self.W + self.b

    def backward(self, grad_output: torch.Tensor) -> torch.Tensor:
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            grad_output: Gradient of the loss w.r.t. this layer's output,
                shaped like the tensor returned by the last ``forward`` call.

        Returns:
            Gradient of the loss w.r.t. the input of the last ``forward`` call.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        x = self.input_cache
        if x is None:
            raise RuntimeError("DenseLayer.backward() called before forward()")
        # Write into the existing buffers instead of rebinding the attributes:
        # rebinding would leave earlier references to dW/db stuck at stale values.
        self.dW.copy_(x.transpose(0, 1) @ grad_output)
        self.db.copy_(grad_output.sum(dim=0))
        # Gradient w.r.t. inputs
        return grad_output @ self.W.transpose(0, 1)

In [None]:
class ReLU:
    """Element-wise rectifier that remembers which inputs were positive."""

    def __init__(self):
        # Boolean tensor set by forward(); True where the input was > 0.
        self.mask = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Clamp negative entries of ``x`` to zero and record the positive mask."""
        positive = torch.gt(x, 0)
        self.mask = positive
        return torch.clamp_min(x, 0.0)

    def backward(self, grad_output: torch.Tensor) -> torch.Tensor:
        """Pass gradients through only where the forward input was positive."""
        gate = self.mask.to(grad_output.dtype)
        return grad_output * gate

In [None]:
def softmax(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """Return the softmax of ``x`` along ``dim``.

    The maximum along ``dim`` is subtracted before exponentiating so that
    large inputs do not overflow.
    """
    peak, _ = torch.max(x, dim=dim, keepdim=True)
    numerators = (x - peak).exp()
    denominators = numerators.sum(dim=dim, keepdim=True)
    return numerators / denominators

In [None]:
class Flatten:
    """Collapse every axis after the first into one, and undo it on the way back."""

    def __init__(self):
        # Shape of the most recent forward() input, needed by backward().
        self.input_shape = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` reshaped to ``(x.shape[0], -1)`` and remember its shape."""
        self.input_shape = x.shape
        # reshape() also accepts non-contiguous inputs (e.g. permuted tensors),
        # for which view() raises.
        return x.reshape(x.shape[0], -1)

    def backward(self, grad_output: torch.Tensor) -> torch.Tensor:
        """Reshape ``grad_output`` back to the shape seen by the last forward()."""
        return grad_output.reshape(self.input_shape)

In [None]:
class FeedForwardNet:
    """Two-layer perceptron: Flatten -> Dense -> ReLU -> Dense (logits).

    Args:
        in_features: Number of values per sample after flattening.
        hidden_features: Width of the hidden layer.
        num_classes: Number of output logits.

    The defaults reproduce the original 784 -> 128 -> 10 architecture.
    """

    def __init__(self, in_features: int = 28 * 28, hidden_features: int = 128, num_classes: int = 10):
        self.flatten = Flatten()
        self.fc1 = DenseLayer(in_features, hidden_features)
        self.relu1 = ReLU()
        self.fc2 = DenseLayer(hidden_features, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the layers in order and return the unnormalized class scores."""
        x = self.flatten.forward(x)
        x = self.fc1.forward(x)
        x = self.relu1.forward(x)
        logits = self.fc2.forward(x)
        return logits

    def backward(self, grad_logits: torch.Tensor) -> None:
        """Propagate ``grad_logits`` through the layers in reverse order.

        Each dense layer stores its own parameter gradients; the gradient
        w.r.t. the network input is computed but discarded.
        """
        grad = self.fc2.backward(grad_logits)
        grad = self.relu1.backward(grad)
        grad = self.fc1.backward(grad)
        _ = self.flatten.backward(grad)

    def zero_grad(self) -> None:
        """Reset every gradient buffer to zero, in place.

        ``zero_()`` is used rather than multiplying by 0.0 because NaN or Inf
        entries survive a multiplication by zero.
        """
        for _, grad in self.parameters():
            grad.zero_()

    def parameters(self):
        """Return ``(parameter, gradient)`` tensor pairs for both dense layers."""
        return [
            (self.fc1.W, self.fc1.dW),
            (self.fc1.b, self.fc1.db),
            (self.fc2.W, self.fc2.dW),
            (self.fc2.b, self.fc2.db),
        ]

# Instantiate the network once here; the training cell further down builds a fresh one.
model = FeedForwardNet()

## Train an Artificial Neural Network

In [None]:
def cross_entropy_loss(logits: torch.Tensor, targets: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Mean cross-entropy of ``logits`` against integer class ``targets``.

    Args:
        logits: Class scores, one row per sample (softmax is taken over dim 1).
        targets: Class index for each row of ``logits``.

    Returns:
        A pair ``(loss, probs)``: the batch-mean negative log-probability of
        the target classes, and the softmax probabilities it was computed from.
    """
    probs = softmax(logits, dim=1)
    # Pick, for every row, the probability assigned to its target class.
    rows = torch.arange(logits.shape[0], device=logits.device)
    picked = probs[rows, targets]
    # The 1e-12 offset keeps log() finite when a probability underflows to 0.
    per_sample = -torch.log(picked + 1e-12)
    return per_sample.mean(), probs

In [None]:
class SGD:
    """Plain stochastic gradient descent over ``(parameter, gradient)`` pairs.

    Args:
        params: Iterable of ``(weight, grad)`` tensor pairs. The tensors are
            updated in place, so the caller's tensors are the ones modified.
        lr: Step size applied to every gradient.
    """

    def __init__(self, params, lr: float = 1e-3):
        # Materialize the iterable: a generator would otherwise be exhausted
        # after the first step()/zero_grad() and later calls would do nothing.
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Apply ``weight -= lr * grad`` in place to every pair."""
        for weight, grad in self.params:
            weight -= self.lr * grad

    def zero_grad(self):
        """Zero every gradient tensor in place."""
        for _, grad in self.params:
            grad.zero_()

In [None]:
def accuracy_from_logits(logits: torch.Tensor, targets: torch.Tensor) -> float:
    """Return the percentage of rows whose arg-max along dim 1 equals the target."""
    predicted = torch.argmax(logits, dim=1)
    hits = torch.eq(predicted, targets)
    fraction_correct = hits.float().mean().item()
    return fraction_correct * 100.0

In [None]:
num_epochs = 30
learning_rate = 1e-3

model = FeedForwardNet()
optimizer = SGD(model.parameters(), lr=learning_rate)

for epoch in range(1, num_epochs + 1):
    # Train. Metrics are accumulated weighted by batch size so the epoch
    # averages stay exact even when the last batch is smaller than the rest.
    running_loss = 0.0
    running_acc = 0.0
    total_samples = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        n_batch = yb.size(0)

        logits = model.forward(xb)
        loss, probs = cross_entropy_loss(logits, yb)

        # Backprop: gradient of the batch-mean loss w.r.t. the logits is
        # (probs - one_hot(target)) / batch. probs is not read again below,
        # so its storage is reused for the gradient.
        grad_logits = probs
        grad_logits[torch.arange(n_batch, device=device), yb] -= 1.0
        grad_logits /= n_batch

        model.zero_grad()
        model.backward(grad_logits)
        optimizer.step()

        running_loss += loss.item() * n_batch
        running_acc += accuracy_from_logits(logits, yb) * n_batch
        total_samples += n_batch

    train_loss = running_loss / total_samples
    train_acc = running_acc / total_samples

    # Validation (no parameter updates), with the same sample weighting
    with torch.no_grad():
        val_loss_accum = 0.0
        val_acc_accum = 0.0
        val_samples = 0
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            n_batch = yb.size(0)
            logits = model.forward(xb)
            vloss, _ = cross_entropy_loss(logits, yb)
            val_loss_accum += vloss.item() * n_batch
            val_acc_accum += accuracy_from_logits(logits, yb) * n_batch
            val_samples += n_batch
        val_loss = val_loss_accum / val_samples
        val_acc = val_acc_accum / val_samples

    # Report on the first epoch and then every fifth one
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:02d}/{num_epochs} | train_loss={train_loss:.4f} acc={train_acc:.2f}% | val_loss={val_loss:.4f} acc={val_acc:.2f}%")

## Evaluate an Artificial Neural Network

In [None]:
# Evaluate on the test loader. Loss and accuracy are weighted by batch size
# so the averages are per-sample even if the last batch is smaller.
with torch.no_grad():
    test_loss_accum = 0.0
    test_acc_accum = 0.0
    test_samples = 0
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        n_batch = yb.size(0)
        logits = model.forward(xb)
        tloss, _ = cross_entropy_loss(logits, yb)
        test_loss_accum += tloss.item() * n_batch
        test_acc_accum += accuracy_from_logits(logits, yb) * n_batch
        test_samples += n_batch
    test_loss = test_loss_accum / test_samples
    test_acc = test_acc_accum / test_samples

print(f"Test loss={test_loss:.4f} acc={test_acc:.2f}%")