<a href="https://colab.research.google.com/github/manandharshairu-wq/deep-learning-assignment-1/blob/main/ShairaManandhar_Assignment1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# 0) MOUNT DRIVE + SETUP
# =========================
# Mount Google Drive so everything the notebook writes lands under base_path.
from google.colab import drive
drive.mount('/content/drive')

# json is not used in this cell; the config cell further down calls json.dump
# and relies on this import.
import os, json, torch

base_path = "/content/drive/MyDrive/shairamanandhar_assignment1_final"
os.makedirs(base_path, exist_ok=True)
# Switch the working directory: every relative path used from here on
# (folders below, %%writefile targets, `python main.py`) resolves against base_path.
%cd $base_path

# Project structure
# Relative paths -> must run after the %cd above.
for folder in ["models", "data_loaders", "config", "results", "data"]:
    os.makedirs(folder, exist_ok=True)

print("Project at:", base_path)


In [None]:
# Install deps
# -q keeps pip quiet.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones that produced the recorded results — consider pinning.
!pip -q install torch torchvision numpy pandas scikit-learn matplotlib tqdm tensorflow-datasets

In [None]:
# TFDS cache location (the PCam loader reads TFDS_DATA_DIR from the environment)
import os

tfds_cache_dir = "/content/tfds_cache"
os.environ["TFDS_DATA_DIR"] = tfds_cache_dir
os.makedirs(tfds_cache_dir, exist_ok=True)
print("TFDS cache:", tfds_cache_dir)

TFDS cache: /content/tfds_cache


In [None]:
# =========================
# 1) CONFIG FILE
# =========================
# json is imported here (not only in the setup cell) so that this cell carries
# every import it needs for the json.dump call below.
import json

import torch

# Single source of truth for the benchmark: main.py reads this dict back from
# config/config.json, so every key below is part of that contract.
config = {
    "seed": 42,
    # Decided once, when this cell runs, and stored as a plain string.
    "device": "cuda" if torch.cuda.is_available() else "cpu",

    # Training (consistent across all models)
    "optimizer": "adam",
    "lr": 0.001,
    "batch_size": 128,
    "epochs": 12,

    # Early stopping (consistent)
    "early_stopping": {
        "enabled": True,
        "monitor": "val_loss",
        "patience": 3,
        "min_delta": 0.0
    },

    # What to run (config-driven)
    # NOTE: PCam is implemented but not run by default
    "run_datasets": ["Adult", "CIFAR-100(0-9)"],

    # "ViT" means:
    # - Adult -> TabularAttention
    # - CIFAR/PCam -> TinyViT
    "run_architectures": ["MLP", "CNN", "ViT"],

    # Splits
    "adult_test_size": 0.2,
    "adult_val_size": 0.15,
    "val_fraction": 0.15,

    # PCam limits (for when you do try it)
    "pcam_train_limit": 10000,
    "pcam_val_limit": 2000,
    "pcam_test_limit": 2000,

    # Arch 3 configs
    "tabattn": {"d_model": 64, "heads": 4, "layers": 2, "dropout": 0.1},
    "vit": {"patch": 4, "dim": 128, "heads": 4, "layers": 2, "dropout": 0.1},

    # Bonus
    "bonus": {
        "make_learning_curve_comparison": True,
        "save_param_counts": True
    }
}

# Relative path: resolves against the current working directory.
with open("config/config.json", "w") as f:
    json.dump(config, f, indent=2)

print("Saved config/config.json")
print("Device detected:", config["device"])

Saved config/config.json
Device detected: cuda


In [None]:
# =========================
# 2) MODELS (architectures.py)
# =========================
%%writefile models/architectures.py
import torch
import torch.nn as nn
import numpy as np

# -------------------------
# Architecture 1: MLP
# -------------------------
class MLP(nn.Module):
    """
    Fully connected baseline: two hidden blocks followed by a linear output layer.

    Each hidden block is Linear -> ReLU -> BatchNorm1d -> Dropout. The input is
    flattened first, so the same class accepts tabular batches (B, D) and image
    batches (B, C, H, W).

    Args:
        in_shape: int feature count, or a sequence of dimensions such as
            (C, H, W) whose product gives the flattened input size.
        out_dim: number of output logits.
        hidden1, hidden2: widths of the first and second hidden layer.
        dropout: dropout probability applied after each hidden block.
    """
    def __init__(self, in_shape, out_dim, hidden1=512, hidden2=256, dropout=0.3):
        super().__init__()
        if isinstance(in_shape, int):
            n_inputs = in_shape
        else:
            n_inputs = int(np.prod(in_shape))

        # The position of each layer in this list is its index under `net`
        # in the state_dict (1, 5 and 9 are the Linear layers).
        layers = [nn.Flatten()]
        fan_in = n_inputs
        for width in (hidden1, hidden2):
            layers += [
                nn.Linear(fan_in, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
                nn.Dropout(dropout),
            ]
            fan_in = width
        layers.append(nn.Linear(fan_in, out_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw (un-activated) logits of shape (B, out_dim)."""
        logits = self.net(x)
        return logits

# -------------------------
# Architecture 2: CNN
# -------------------------
class CNN(nn.Module):
    """
    Convolutional baseline: two conv + max-pool stages and a two-layer classifier.

    The variant is chosen from `in_shape`:
      - int D      -> tabular batch (B, D), reshaped to (B, 1, D) and fed to Conv1d
      - (C, H, W)  -> image batch (B, C, H, W) fed to Conv2d

    Args:
        in_shape: int feature count or a (C, H, W) triple.
        out_dim: number of output logits.
        dropout: dropout probability inside the classifier head.
    """
    def __init__(self, in_shape, out_dim, dropout=0.2):
        super().__init__()

        if isinstance(in_shape, int):
            stages = [
                nn.Unflatten(1, (1, in_shape)),  # (B, D) -> (B, 1, D)
                nn.Conv1d(1, 16, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(2),
                nn.Conv1d(16, 32, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(2),
                nn.Flatten(),
            ]
            # Two pools of size 2 leave D // 4 positions with 32 channels each.
            flat_dim = 32 * (in_shape // 4)
            hidden = 128
        else:
            channels, height, width = in_shape
            stages = [
                nn.Conv2d(channels, 32, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Conv2d(32, 64, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Flatten(),
            ]
            # Each spatial side is halved twice; 64 channels remain.
            flat_dim = 64 * (height // 4) * (width // 4)
            hidden = 256

        self.feat = nn.Sequential(*stages)
        self.head = nn.Sequential(
            nn.Linear(flat_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out_dim),
        )

    def forward(self, x):
        """Extract convolutional features, then classify; returns raw logits."""
        features = self.feat(x)
        return self.head(features)

# --------------------------------------------------------
# Architecture 3a (Bonus): Tabular Transformer-style Attention
# Used for Adult
# --------------------------------------------------------
class TabularAttention(nn.Module):
    """
    Transformer-style classifier for tabular rows: one token per feature.

    Pipeline: each scalar feature is projected to a `d_model` embedding, a
    learned per-feature positional embedding is added, the token sequence goes
    through a Transformer encoder, and the layer-normalised tokens are
    mean-pooled into a vector for the classifier head.

    Args:
        num_features: number of columns in an input row.
        out_dim: number of output logits.
        d_model: token embedding width.
        n_heads: attention heads per encoder layer.
        n_layers: number of encoder layers.
        dropout: dropout probability in the encoder and in the head.
    """
    def __init__(self, num_features, out_dim, d_model=64, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        # The same Linear(1 -> d_model) is shared by every feature; the
        # positional embedding is what tells the features apart.
        self.feature_embed = nn.Linear(1, d_model)
        self.pos_embed = nn.Parameter(torch.zeros(1, num_features, d_model))
        nn.init.trunc_normal_(self.pos_embed, std=0.02)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )
        block = nn.TransformerEncoderLayer(**layer_kwargs)
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, out_dim),
        )

    def forward(self, x):
        """Map a batch of rows [B, D] to logits [B, out_dim]."""
        tokens = self.feature_embed(x.unsqueeze(-1))               # [B, D, d_model]
        tokens = tokens + self.pos_embed[:, :tokens.size(1), :]
        encoded = self.norm(self.encoder(tokens))                  # [B, D, d_model]
        pooled = encoded.mean(dim=1)                               # average over the feature tokens
        return self.head(pooled)

# --------------------------------------------------------
# Architecture 3b (Bonus): Vision Transformer-style (TinyViT)
# Used for CIFAR + PCam
# --------------------------------------------------------
class TinyViT(nn.Module):
    """
    Small Vision-Transformer-style classifier for images, trained from scratch.

    A strided convolution cuts the image into non-overlapping `patch` x `patch`
    tiles and embeds each one. A learnable CLS token is prepended, learned
    positional embeddings are added, and the layer-normalised CLS output of
    the Transformer encoder is passed to a linear classifier.

    Args:
        in_shape: (C, H, W) of the input images; an int is rejected.
        out_dim: number of output logits.
        patch: side length of a square patch; must divide both H and W.
        d_model: token embedding width.
        n_heads: attention heads per encoder layer.
        n_layers: number of encoder layers.
        dropout: dropout probability for the embeddings and the encoder.
    """
    def __init__(self, in_shape, out_dim, patch=4, d_model=128, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        assert not isinstance(in_shape, int), "TinyViT is for images only."

        channels, height, width = in_shape
        assert height % patch == 0 and width % patch == 0, "H and W must be divisible by patch size."
        grid_h, grid_w = height // patch, width // patch
        seq_len = 1 + grid_h * grid_w  # all patches plus the CLS token

        self.patch_embed = nn.Conv2d(channels, d_model, kernel_size=patch, stride=patch)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, d_model))
        self.drop = nn.Dropout(dropout)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )
        block = nn.TransformerEncoderLayer(**layer_kwargs)
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)

        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, out_dim)

        # Both learned embeddings start from a narrow truncated normal.
        for learned in (self.pos_embed, self.cls_token):
            nn.init.trunc_normal_(learned, std=0.02)

    def forward(self, x):
        """Map an image batch [B, C, H, W] to logits [B, out_dim]."""
        patches = self.patch_embed(x)                      # [B, d, H/p, W/p]
        tokens = patches.flatten(2).transpose(1, 2)        # [B, n_patches, d]
        batch = tokens.size(0)

        cls = self.cls_token.expand(batch, -1, -1)         # [B, 1, d]
        seq = torch.cat([cls, tokens], dim=1)              # [B, 1 + n_patches, d]
        seq = seq + self.pos_embed[:, :seq.size(1), :]
        seq = self.drop(seq)

        encoded = self.encoder(seq)
        cls_repr = self.norm(encoded[:, 0])                # keep only the CLS position
        return self.head(cls_repr)


Writing models/architectures.py


In [None]:
# =========================
# 3) DATA LOADERS (loaders.py)
# =========================
%%writefile data_loaders/loaders.py
import os
import torch
from torch.utils.data import TensorDataset, random_split, Dataset, Subset
from torchvision import datasets, transforms
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds

# -----------------------
# Adult (tabular) - binary
# -----------------------
def get_adult(seed=42, test_size=0.2, val_size=0.15):
    """
    Download the UCI Adult data and return stratified train/val/test datasets.

    Steps: read the CSV, turn `income` into a 0/1 label, label-encode every
    object-typed column, split (test first, then validation out of the
    remainder), and standardise the features.

    The StandardScaler is fit on the training split only and then applied to
    the validation and test splits, so no held-out statistics leak into
    preprocessing.

    Args:
        seed: random_state for both stratified splits.
        test_size: fraction of all rows held out for testing.
        val_size: fraction of the remaining (train + val) rows used for validation.

    Returns:
        (train_ds, val_ds, test_ds, num_features, 1, "binary") where each
        dataset is a TensorDataset of float features and float labels shaped (N, 1).
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
            'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
            'hours-per-week', 'native-country', 'income']
    # NOTE(review): dropna() only removes NaN cells. If the file marks missing
    # values with a sentinel string (e.g. '?'), those rows are kept and the
    # sentinel is encoded as its own category — confirm that this is intended.
    df = pd.read_csv(url, names=cols, skipinitialspace=True).dropna()

    # Label
    df['income'] = df['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

    # Encode categoricals (a pure string -> integer mapping, no statistics involved)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    X = df.drop('income', axis=1).values.astype("float64")
    y = df['income'].values

    # Train/test then train/val
    X_tr_val, X_te, y_tr_val, y_te = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_tr_val, y_tr_val, test_size=val_size, random_state=seed, stratify=y_tr_val
    )

    # Fit the scaler on training rows only, then reuse it for the held-out splits.
    scaler = StandardScaler().fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_te)

    def _to_dataset(features, labels):
        # Labels become a float column vector, the shape BCEWithLogitsLoss is fed with.
        return TensorDataset(torch.FloatTensor(features), torch.FloatTensor(labels).view(-1, 1))

    train_ds = _to_dataset(X_tr, y_tr)
    val_ds = _to_dataset(X_va, y_va)
    test_ds = _to_dataset(X_te, y_te)

    return train_ds, val_ds, test_ds, X.shape[1], 1, "binary"

# -----------------------------------------
# CIFAR-100 but only classes 0–9 (10-class)
# -----------------------------------------
def get_cifar100_10class(val_fraction=0.15, seed=42):
    """
    Build a 10-class image task from CIFAR-100 by keeping labels 0..9 only.

    The official training split (filtered) is divided into train/validation
    with a seeded `random_split`; the official test split (filtered) is the
    test set. Images are converted to tensors and normalised with mean 0.5 and
    std 0.5 per channel.

    Args:
        val_fraction: share of the filtered training images used for validation.
        seed: seed of the generator that drives the train/val split.

    Returns:
        (train, val, test, (3, 32, 32), 10, "multiclass")
    """
    normalize = transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
    pipeline = transforms.Compose([transforms.ToTensor(), normalize])

    def _first_ten_classes(train_flag):
        # Load one official split and keep only the samples labelled 0..9.
        base = datasets.CIFAR100(root="./data", train=train_flag, download=True, transform=pipeline)
        keep = [idx for idx, label in enumerate(base.targets) if 0 <= label < 10]
        return Subset(base, keep)

    train_pool = _first_ten_classes(True)
    test_10 = _first_ten_classes(False)

    # Validation size is rounded down; the training part takes the rest.
    n_val = int(len(train_pool) * val_fraction)
    n_train = len(train_pool) - n_val
    split_rng = torch.Generator().manual_seed(seed)
    train, val = random_split(train_pool, [n_train, n_val], generator=split_rng)

    return train, val, test_10, (3, 32, 32), 10, "multiclass"

# -----------------------
# PCam (TFDS) - binary
# -----------------------
class PCamTorch(Dataset):
    """
    In-memory PyTorch view of a TFDS `patch_camelyon` split.

    The constructor loads the split through TFDS (using TFDS_DATA_DIR from the
    environment as the data directory), optionally truncates it to the first
    `limit` examples, and materialises all (image, label) pairs as a Python
    list of NumPy values.
    """
    def __init__(self, split="train", limit=None):
        tf_split = tfds.load(
            "patch_camelyon",
            split=split,
            as_supervised=True,
            data_dir=os.environ.get("TFDS_DATA_DIR")
        )
        if limit is not None:
            tf_split = tf_split.take(limit)
        # Everything is pulled into RAM once, so __getitem__ is a plain list lookup.
        self.samples = list(tfds.as_numpy(tf_split))

    def __len__(self):
        """Number of examples held in memory."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (image, label): a float CHW image tensor and a float label of shape (1,)."""
        pixels, target = self.samples[idx]
        # Divide by 255 (assumes 8-bit pixel values — TODO confirm), then HWC -> CHW.
        image = torch.tensor(pixels, dtype=torch.float32) / 255.0
        image = image.permute(2, 0, 1)
        label = torch.tensor(target, dtype=torch.float32).view(1)
        return image, label

def get_pcam(limit_train=10000, limit_val=2000, limit_test=2000):
    """
    Load capped PCam train/validation/test splits as in-memory datasets.

    Each limit is the maximum number of examples taken from the corresponding
    TFDS split.

    Returns:
        (train, val, test, (3, 96, 96), 1, "binary")
    """
    split_limits = (
        ("train", limit_train),
        ("validation", limit_val),
        ("test", limit_test),
    )
    train, val, test = [PCamTorch(split=name, limit=cap) for name, cap in split_limits]
    return train, val, test, (3, 96, 96), 1, "binary"

Writing data_loaders/loaders.py


In [None]:
# =========================
# 4) MAIN TRAINING SCRIPT (main.py)
# =========================
%%writefile main.py
import os, json, time
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

from data_loaders.loaders import get_adult, get_cifar100_10class, get_pcam
from models.architectures import MLP, CNN, TinyViT, TabularAttention

# Experiment configuration, read from the JSON file next to this script.
with open("config/config.json", "r") as f:
    CFG = json.load(f)

# The device string is stored in config.json, i.e. it was chosen when the
# config was written, not when this script starts. If it asks for CUDA on a
# runtime without a GPU, fall back to CPU instead of crashing on `.to(DEVICE)`.
DEVICE = CFG["device"]
if str(DEVICE).startswith("cuda") and not torch.cuda.is_available():
    print("CUDA requested in config but not available; falling back to CPU.")
    DEVICE = "cpu"

def set_seed(seed):
    """
    Seed every random number generator this project can draw from.

    Covers Python's `random`, NumPy's global generator, and PyTorch on CPU and
    on all CUDA devices, so the same seed yields the same random stream from
    each library.

    Args:
        seed: integer seed shared by all generators.
    """
    # Imported locally so the function does not depend on module-level imports.
    import random
    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def count_params(model):
    """Return the total number of scalar entries in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

def make_loaders(train_ds, val_ds, test_ds):
    """
    Wrap the three datasets in DataLoaders that share the configured batch size.

    Only the training loader shuffles. Memory pinning is enabled exactly when
    DEVICE is "cuda".

    Returns:
        (train_loader, val_loader, test_loader)
    """
    shared = dict(
        batch_size=CFG["batch_size"],
        num_workers=0,
        pin_memory=(DEVICE == "cuda"),
    )
    train_loader = DataLoader(train_ds, shuffle=True, **shared)
    val_loader = DataLoader(val_ds, shuffle=False, **shared)
    test_loader = DataLoader(test_ds, shuffle=False, **shared)
    return (train_loader, val_loader, test_loader)

def criterion_for(task):
    """Return the loss module for a task: BCE-with-logits for "binary", cross-entropy for anything else."""
    if task == "binary":
        return nn.BCEWithLogitsLoss()
    return nn.CrossEntropyLoss()

def metric_from_logits(logits, y, task):
    """
    Compute the single tracked metric for a set of predictions.

    Args:
        logits: model outputs; one logit per sample for "binary", one row of
            class logits per sample otherwise.
        y: ground-truth labels matching `logits`.
        task: "binary" -> F1 of the positive class at a 0.5 probability
            threshold; anything else -> accuracy of the argmax class.

    Returns:
        The metric value as returned by scikit-learn.
    """
    # consistent metric reporting
    if task != "binary":
        predicted = logits.argmax(dim=1).cpu().numpy()
        actual = y.cpu().numpy()
        return accuracy_score(actual, predicted)

    positive_prob = torch.sigmoid(logits.view(-1))
    predicted = (positive_prob >= 0.5).long().cpu().numpy()
    actual = y.view(-1).long().cpu().numpy()
    return f1_score(actual, predicted, average="binary")

@torch.no_grad()
def test_metrics(model, test_loader, task):
    """
    Evaluate `model` on the whole test loader and return (accuracy, F1).

    The model is put in eval mode and run without gradients; all logits and
    labels are gathered on the CPU before scoring. F1 is the positive-class
    F1 at a 0.5 threshold for "binary" and the weighted F1 otherwise.
    """
    model.eval()
    logit_chunks = []
    label_chunks = []
    for inputs, targets in test_loader:
        batch_logits = model(inputs.to(DEVICE))
        logit_chunks.append(batch_logits.cpu())
        label_chunks.append(targets.cpu())
    logits = torch.cat(logit_chunks)
    labels = torch.cat(label_chunks)

    if task != "binary":
        y_pred = logits.argmax(1).numpy()
        y_true = labels.numpy()
        return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average="weighted")

    y_pred = (torch.sigmoid(logits.view(-1)) >= 0.5).long().numpy()
    y_true = labels.view(-1).long().numpy()
    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average="binary")

def save_curves(history, name, metric_name):
    """
    Save the loss curve and the metric curve of one experiment as PNG files.

    Writes `results/{name}_loss_curve.png` and
    `results/{name}_{metric_name.lower()}_curve.png`, each showing the
    per-epoch train and validation series from `history`.
    """
    os.makedirs("results", exist_ok=True)

    def _train_val_plot(train_key, val_key, quantity, out_path):
        # One figure: train vs. validation series of a single quantity.
        plt.figure()
        plt.plot(history[train_key], label=f"Train {quantity}")
        plt.plot(history[val_key], label=f"Val {quantity}")
        plt.title(f"{name} {quantity}")
        plt.legend()
        plt.savefig(out_path)
        plt.close()

    _train_val_plot("train_loss", "val_loss", "Loss", f"results/{name}_loss_curve.png")
    _train_val_plot(
        "train_metric", "val_metric", metric_name,
        f"results/{name}_{metric_name.lower()}_curve.png",
    )

def train_eval(model, train_loader, val_loader, task):
    """
    Train `model` with Adam and return its learning curves.

    Runs up to CFG["epochs"] epochs. After every epoch the model is evaluated
    on `val_loader`; the weights with the lowest validation loss are
    snapshotted and loaded back into `model` before returning. When
    CFG["early_stopping"]["enabled"] is true, training stops once the
    validation loss has failed to improve by more than `min_delta` for
    `patience` consecutive epochs.

    Args:
        model: nn.Module; inputs and targets are moved to DEVICE before the
            forward pass, so the model is expected to live there too.
        train_loader, val_loader: loaders yielding (inputs, targets) batches.
        task: "binary" (BCE-with-logits on a single logit) or anything else
            (cross-entropy over class logits).

    Returns:
        (history, best_epoch): `history` maps "train_loss", "val_loss",
        "train_metric" and "val_metric" to per-epoch lists; `best_epoch` is
        the 1-based epoch whose weights were restored (0 if no epoch improved).
    """
    # optimizer family consistent
    assert CFG["optimizer"].lower() == "adam"
    optimizer = optim.Adam(model.parameters(), lr=CFG["lr"])
    crit = criterion_for(task)

    es = CFG["early_stopping"]
    best_val = float("inf")
    best_state = None
    bad = 0          # consecutive epochs without improvement
    best_epoch = 0

    history = {"train_loss": [], "val_loss": [], "train_metric": [], "val_metric": []}

    for epoch in range(CFG["epochs"]):
        # ---- train
        model.train()
        train_loss_sum = 0.0
        tr_logits, tr_y = [], []

        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(x)

            if task == "binary":
                loss = crit(logits.view(-1, 1), y.float().view(-1, 1))
            else:
                loss = crit(logits, y.long())

            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            # Logits produced in train mode are kept to score the epoch's train metric.
            tr_logits.append(logits.detach().cpu())
            tr_y.append(y.detach().cpu())

        # Epoch loss is the mean of the per-batch mean losses.
        train_loss = train_loss_sum / len(train_loader)
        train_metric = metric_from_logits(torch.cat(tr_logits), torch.cat(tr_y), task)

        # ---- val
        model.eval()
        val_loss_sum = 0.0
        va_logits, va_y = [], []

        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(DEVICE), y.to(DEVICE)
                logits = model(x)

                if task == "binary":
                    loss = crit(logits.view(-1, 1), y.float().view(-1, 1))
                else:
                    loss = crit(logits, y.long())

                val_loss_sum += loss.item()
                va_logits.append(logits.detach().cpu())
                va_y.append(y.detach().cpu())

        val_loss = val_loss_sum / len(val_loader)
        val_metric = metric_from_logits(torch.cat(va_logits), torch.cat(va_y), task)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_metric"].append(train_metric)
        history["val_metric"].append(val_metric)

        # early stopping on val loss
        improved = (best_val - val_loss) > es.get("min_delta", 0.0)
        if improved:
            best_val = val_loss
            # clone() is essential: for a model that already lives on the CPU,
            # `.cpu()` hands back the very same tensor, so without a copy the
            # "snapshot" would keep changing with every later optimizer step.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
            best_epoch = epoch + 1
        else:
            bad += 1
            if es["enabled"] and bad >= es["patience"]:
                break

    # Roll the model back to the best validation epoch (if any epoch improved).
    if best_state is not None:
        model.load_state_dict(best_state)

    return history, best_epoch

def load_dataset(dataset_name):
    """
    Resolve a dataset name from the config to its loader and call it.

    Recognised names:
      - "Adult"
      - "CIFAR-100(0-9)"
      - "PCam" (also "PatchCamelyon" / "patch_camelyon", case-insensitive)

    Returns:
        The loader's tuple
        (train_ds, val_ds, test_ds, in_shape, out_dim, task).

    Raises:
        ValueError: if the name is not recognised. A mistyped name is reported
            instead of silently triggering the PCam download.
    """
    if dataset_name == "Adult":
        return get_adult(CFG["seed"], CFG["adult_test_size"], CFG["adult_val_size"])
    if dataset_name == "CIFAR-100(0-9)":
        return get_cifar100_10class(CFG["val_fraction"], CFG["seed"])

    # Normalise spelling variants of the PCam name before comparing.
    normalized = str(dataset_name).lower().replace("_", "").replace("-", "")
    if normalized in ("pcam", "patchcamelyon"):
        return get_pcam(CFG["pcam_train_limit"], CFG["pcam_val_limit"], CFG["pcam_test_limit"])

    raise ValueError(
        f"Unknown dataset: {dataset_name!r}. "
        "Expected 'Adult', 'CIFAR-100(0-9)' or 'PCam'."
    )

def build_model(arch, dataset_name, in_shape, out_dim):
    """
    Instantiate the model for an (architecture, dataset) pair.

    "MLP" and "CNN" are built directly from `in_shape`/`out_dim`. "ViT" is the
    attention slot: TabularAttention (hyper-parameters from CFG["tabattn"])
    for the "Adult" dataset and TinyViT (hyper-parameters from CFG["vit"])
    for every other dataset.

    Raises:
        ValueError: if `arch` is none of "MLP", "CNN", "ViT".
    """
    if arch in ("MLP", "CNN"):
        model_cls = MLP if arch == "MLP" else CNN
        return model_cls(in_shape, out_dim)

    if arch != "ViT":
        raise ValueError(f"Unknown architecture: {arch}")

    # Arch 3 (Bonus): the attention model depends on the data modality.
    if dataset_name == "Adult":
        hp = CFG["tabattn"]
        return TabularAttention(
            num_features=in_shape,
            out_dim=out_dim,
            d_model=hp["d_model"],
            n_heads=hp["heads"],
            n_layers=hp["layers"],
            dropout=hp["dropout"],
        )

    hp = CFG["vit"]
    return TinyViT(
        in_shape,
        out_dim,
        patch=hp["patch"],
        d_model=hp["dim"],
        n_heads=hp["heads"],
        n_layers=hp["layers"],
        dropout=hp["dropout"],
    )

def run_experiment(dataset_name, arch):
    """
    Run one dataset x architecture experiment end to end.

    Loads the data, builds the model on DEVICE, trains it (timed), evaluates
    it on the test split, saves the model weights to
    `results/{exp_name}_best.pt`, and writes the learning-curve plots.

    Returns:
        A dict with the table fields "Dataset", "Architecture", "Accuracy",
        "F1" (both rounded to 4 decimals) and "Notes", plus the raw per-epoch
        curves under "_history".
    """
    dataset_bundle = load_dataset(dataset_name)
    train_ds, val_ds, test_ds, in_shape, out_dim, task = dataset_bundle
    train_loader, val_loader, test_loader = make_loaders(train_ds, val_ds, test_ds)

    model = build_model(arch, dataset_name, in_shape, out_dim)
    model = model.to(DEVICE)
    n_params = count_params(model)

    # Only the training loop is timed.
    t0 = time.time()
    history, best_epoch = train_eval(model, train_loader, val_loader, task)
    elapsed = time.time() - t0

    acc, f1 = test_metrics(model, test_loader, task)

    # Friendly naming: Adult + "ViT" is reported under the model actually used.
    if arch == "ViT" and dataset_name == "Adult":
        shown_arch = "TabularAttention"
    else:
        shown_arch = arch

    exp_name = f"{dataset_name}_{shown_arch}".replace("/", "_")
    os.makedirs("results", exist_ok=True)
    checkpoint_path = f"results/{exp_name}_best.pt"
    torch.save(model.state_dict(), checkpoint_path)

    if task == "binary":
        metric_name = "F1"
    else:
        metric_name = "Accuracy"
    save_curves(history, exp_name, metric_name)

    return {
        "Dataset": dataset_name,
        "Architecture": shown_arch,
        "Accuracy": round(acc, 4),
        "F1": round(f1, 4),
        "Notes": f"time={elapsed:.1f}s; best_epoch={best_epoch}; params={n_params}",
        "_history": history,
    }

def plot_compare(histories, title, out_path, key):
    """
    Overlay one history series from several runs in a single figure.

    Args:
        histories: mapping of legend label -> history dict.
        title: figure title.
        out_path: file the figure is saved to.
        key: which history series to draw (e.g. "val_loss").
    """
    plt.figure()
    for run_label in histories:
        series = histories[run_label][key]
        plt.plot(series, label=run_label)
    plt.title(title)
    plt.legend()
    plt.savefig(out_path)
    plt.close()

if __name__ == "__main__":
    # Entry point: run every configured dataset x architecture pair, write the
    # metrics table, then (optionally) the per-dataset comparison plots.
    set_seed(CFG["seed"])
    print("Using device:", DEVICE)

    run_datasets = CFG["run_datasets"]
    run_archs = CFG["run_architectures"]
    table_columns = ["Dataset", "Architecture", "Accuracy", "F1", "Notes"]

    results = []
    histories_by_dataset = {}

    for d in run_datasets:
        per_arch_histories = {}
        histories_by_dataset[d] = per_arch_histories
        for a in run_archs:
            print(f"Running: {d} + {a}")
            r = run_experiment(d, a)
            # Curves are keyed by the displayed architecture name.
            per_arch_histories[r["Architecture"]] = r["_history"]
            results.append({column: r[column] for column in table_columns})

    df = pd.DataFrame(results)
    df.to_csv("results/final_metrics.csv", index=False)

    print("\nFinal Results Table:\n")
    print(df.to_markdown(index=False))

    # Bonus: per-dataset learning curve comparisons
    bonus_cfg = CFG.get("bonus", {})
    if bonus_cfg.get("make_learning_curve_comparison", False):
        for d, hdict in histories_by_dataset.items():
            # A comparison needs at least two runs on the same dataset.
            if len(hdict) > 1:
                plot_compare(hdict, f"{d} - Val Loss Comparison", f"results/{d}_VAL_LOSS_COMPARISON.png", "val_loss")
                plot_compare(hdict, f"{d} - Val Metric Comparison", f"results/{d}_VAL_METRIC_COMPARISON.png", "val_metric")


Writing main.py


In [None]:
# Launch the benchmark as a script so that main.py's `if __name__ == "__main__"` block runs.
!python main.py

Using device: cuda
Running: Adult + MLP
Running: Adult + CNN
Running: Adult + ViT
Running: CIFAR-100(0-9) + MLP
100% 169M/169M [00:03<00:00, 43.7MB/s]
Running: CIFAR-100(0-9) + CNN
Running: CIFAR-100(0-9) + ViT

Final Results Table:

| Dataset        | Architecture     |   Accuracy |     F1 | Notes                                     |
|:---------------|:-----------------|-----------:|-------:|:------------------------------------------|
| Adult          | MLP              |     0.8535 | 0.6669 | time=8.5s; best_epoch=12; params=140801   |
| Adult          | CNN              |     0.8548 | 0.6702 | time=9.0s; best_epoch=10; params=14177    |
| Adult          | TabularAttention |     0.8511 | 0.6769 | time=14.7s; best_epoch=7; params=105345   |
| CIFAR-100(0-9) | MLP              |     0.555  | 0.5541 | time=16.7s; best_epoch=8; params=1708810  |
| CIFAR-100(0-9) | CNN              |     0.69   | 0.6884 | time=17.4s; best_epoch=11; params=1070794 |
| CIFAR-100(0-9) | ViT              | 

In [None]:
%%writefile README.md

# Deep Learning Assignment
## Shaira Manandhar
**Datasets × Architectures Benchmark (PyTorch)**

## Objective
This project benchmarks how different neural network architectures perform across tabular and image datasets, highlighting how data modality and model inductive bias affect learning.
All models are trained from scratch (no pretraining) with a consistent experimental setup.

---

## Datasets
- **Adult Income (UCI)**
  - Modality: Tabular
  - Task: Binary classification (income >50K)
  - Metrics: Accuracy, F1-score

- **CIFAR-100 (classes 0–9)**
  - Modality: Image (32×32 RGB)
  - Task: 10-class classification
  - Metrics: Accuracy, weighted F1-score

- **PatchCamelyon (PCam)** *(implemented, not run by default)*
  - Modality: Histopathology images (96×96 RGB)
  - Task: Binary classification (tumor vs normal)

---

## Architectures
### 1. Multilayer Perceptron (MLP)
- Fully connected feedforward network
- 2 hidden layers (ReLU)
- Batch Normalization + Dropout
- Used on **all datasets** (after preprocessing)

### 2. Convolutional Neural Network (CNN)
- ≥2 convolution layers with pooling
- Conv1D for tabular data, Conv2D for images
- Fully connected classifier head
- Used on **all datasets**

### 3. Attention-Based Model
- **Adult**: Transformer-style Tabular Attention (features treated as tokens)
- **CIFAR / PCam**: Vision Transformer (TinyViT)
- No pretrained weights

---

## Experimental Setup
- Framework: PyTorch
- Optimizer: Adam (same across all models)
- Batch size: 128
- Learning rate: 0.001
- Epochs: 12
- Early stopping on validation loss (patience = 3)
- Train / validation / test splits are consistent per dataset

---
**Note:** PCam is implemented but not run by default due to compute constraints in Colab.

---

## Results Summary
| Dataset         | Architecture     | Accuracy | F1 |
|-----------------|------------------|----------|----|
| Adult           | MLP              | 0.8535   | 0.6669 |
| Adult           | CNN              | 0.8548   | 0.6702 |
| Adult           | TabularAttention | 0.8511   | 0.6769 |
| CIFAR-100 (0–9) | MLP              | 0.5550   | 0.5541 |
| CIFAR-100 (0–9) | CNN              | 0.6900   | 0.6884 |
| CIFAR-100 (0–9) | ViT              | 0.6230   | 0.6266 |

Full metrics, training time, parameter counts, and learning curves are saved in `results/`.

---

## Key Insights
- **MLPs perform well on tabular data**, where feature interactions are relatively simple.
- **CNNs outperform MLPs on images** due to spatial inductive bias.
- **Attention-based models require more data or pretraining** to outperform CNNs on images; without pretraining, ViT underperforms CNN on CIFAR.
- On tabular data, **attention offers marginal gains** but does not drastically outperform simpler models.

> Importantly, **I intentionally avoided pretraining**, as required by the assignment. Lower ViT performance is expected and correctly interpreted.

---

## Reproducibility
1. Open the notebook in **Google Colab**
2. Ensure **GPU is enabled**
3. Run all cells one by one
4. Run:
```bash
python main.py
```

---

## Project Structure

```
shairamanandhar_assignment1_final/
├── models/            # Architectures
├── data_loaders/      # Dataset loaders
├── config/            # Config file
├── results/           # Metrics, plots, checkpoints
├── main.py            # Training + evaluation
└── README.md
```



Overwriting README.md
