In [1]:
pip install --quiet torch torchvision pytorch-lightning wandb matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

In [3]:
from torchvision.datasets import MNIST

# Download both MNIST splits into the current directory up front, so the
# later cells can construct the datasets without `download=True`.
MNIST(root=".", train=True, download=True)
MNIST(root=".", train=False, download=True)

Dataset MNIST
    Number of datapoints: 10000
    Root location: .
    Split: Test

In [4]:
import pytorch_lightning as lightning

# Fix the global seed so the random splits and the training run are repeatable.
lightning.seed_everything(seed=42)

Global seed set to 42


42

In [5]:
import torchvision

# One tensor-conversion transform, shared by the training and test datasets.
to_tensor = torchvision.transforms.ToTensor()

# Training split of MNIST, divided into 55,000 training and 5,000 validation samples.
mnist_train = MNIST(root=".", train=True, transform=to_tensor)
train_ds, validation_ds = torch.utils.data.random_split(mnist_train, [55000, 5000])

# Official MNIST test split, used only for final evaluation.
test_ds = MNIST(root=".", train=False, transform=to_tensor)

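Take only 3 batches (3 × 128 samples) of the training data, so that a model can be overfitted on purpose; the remaining samples serve as a large validation set.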

In [6]:
# Size of the deliberately tiny training set: 3 batches of 128 samples.
# The batch size here should match the one used by the model's dataloaders.
OVERFIT_NUM_BATCHES = 3
OVERFIT_BATCH_SIZE = 128
small_train_size = OVERFIT_NUM_BATCHES * OVERFIT_BATCH_SIZE

# Everything that is not in the tiny training set becomes validation data.
# The remainder is derived from the dataset length instead of a hard-coded 60000.
small_train_ds, big_validation_ds = torch.utils.data.random_split(
    mnist_train,
    [small_train_size, len(mnist_train) - small_train_size],
)

In [7]:
import wandb
# Authenticate against Weights & Biases; the training cell logs to it through WandbLogger.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmfa[0m (use `wandb login --relogin` to force relogin)


True

In [8]:
from torchmetrics import Accuracy
from torch.nn import functional as F

# ITERATIONS

1. hidden_size = 5  - min extreme
2. hidden_size = 128
3. CNN (not in the notebook anymore!)
4. Train on only 3 batches: set `OVERFIT = True` (see the change in `train_dataloader()`) and increase the number of epochs to `100`

In [9]:
class MNISTModel(lightning.LightningModule):
    """Small fully connected MNIST classifier used for the overfitting experiments.

    The network is Flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear, trained
    with cross-entropy loss and Adam. The dataloaders read the module-level
    datasets (`train_ds`, `validation_ds`, `test_ds`, and in overfit mode
    `small_train_ds` / `big_validation_ds`).

    Args:
        hidden_size: Width of both hidden linear layers.
        learning_rate: Learning rate passed to the Adam optimizer.
        overfit: Initial value of `self.OVERFIT`. When true, training uses
            `small_train_ds` and validation uses `big_validation_ds`.
            The attribute can still be toggled after construction.
    """

    def __init__(self, hidden_size=128, learning_rate=1e-2, overfit=False):
        super().__init__()

        # Switches the train/val dataloaders to the tiny-train / big-validation split.
        self.OVERFIT = overfit

        # Records the constructor arguments as hyperparameters.
        self.save_hyperparameters()

        self.learning_rate = learning_rate
        self.batch_size = 128

        # 10 different numbers; images are 28x28 greyscale
        self.num_classes = 10
        channels, width, height = (1, 28, 28)

        # simple model
        self.model = torch.nn.Sequential(
            torch.nn.Flatten(),
            torch.nn.Linear(channels * width * height, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, self.num_classes),
        )

        self.loss = torch.nn.CrossEntropyLoss()

        # One metric object per stage so accumulated state is never shared
        # between training, validation and test.
        acc = Accuracy()
        self.train_acc = acc.clone()
        self.valid_acc = acc.clone()
        self.test_acc = acc.clone()

    def forward(self, x):
        """Return the class logits for a batch of images `x`."""
        return self.model(x)

    def _shared_step(self, batch, stage, metric):
        """Run one forward pass, then log `<stage>/loss` and `<stage>/acc`.

        Args:
            batch: An `(x, y)` pair of inputs and integer class labels.
            stage: Prefix for the logged metric names ("train", "val" or "test").
            metric: The accuracy metric object that belongs to this stage.

        Returns:
            A dict with the loss under the key "loss".
        """
        x, y = batch
        # A single forward pass feeds both the loss and the accuracy.
        logits = self(x)
        loss = self.loss(logits, y)
        self.log(f"{stage}/loss", loss)

        preds = torch.argmax(logits, dim=1)
        metric(preds, y)
        self.log(f"{stage}/acc", metric)
        return {"loss": loss}

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss and accuracy for one batch."""
        return self._shared_step(batch, "train", self.train_acc)

    def validation_step(self, batch, batch_idx, name="val"):
        """Compute and log loss and accuracy under the `name/` prefix.

        `name` is "val" for validation; `test_step` calls this with "test",
        in which case the dedicated test metric is used.
        """
        metric = self.test_acc if name == "test" else self.valid_acc
        return self._shared_step(batch, name, metric)

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch; metrics are logged as `test/loss` and `test/acc`."""
        return self.validation_step(batch, batch_idx, name="test")

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using `self.learning_rate`."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def train_dataloader(self):
        """Return the training loader: `small_train_ds` in overfit mode, else `train_ds`."""
        ds = small_train_ds if self.OVERFIT else train_ds
        return torch.utils.data.DataLoader(ds, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return the validation loader: `big_validation_ds` in overfit mode, else `validation_ds`."""
        ds = big_validation_ds if self.OVERFIT else validation_ds
        return torch.utils.data.DataLoader(ds, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the loader over the MNIST test split (`test_ds`)."""
        return torch.utils.data.DataLoader(test_ds, batch_size=self.batch_size)

The default for `log_every_n_steps` is `50`. Setting it to `1` makes training a lot slower, but it is needed to get a training curve for the small dataset (`small_train_ds` has only 3 batches per epoch).

In [10]:
# Number of training epochs; the 3-batch overfit experiment uses 100 instead.
MAX_EPOCHS = 10
# Use one GPU when CUDA is available, otherwise fall back to CPU instead of
# failing on a hard-coded `gpus=1`.
NUM_GPUS = 1 if torch.cuda.is_available() else 0

model = MNISTModel()

# Log metrics to the W&B project and track the model via `watch(log="all")`.
wandb_logger = lightning.loggers.WandbLogger(project="mlugs-overfitting")
wandb_logger.watch(model, log="all")

trainer = lightning.Trainer(
    gpus=NUM_GPUS,
    max_epochs=MAX_EPOCHS,
    logger=wandb_logger,
    log_every_n_steps=1,  # log every step so tiny datasets still produce a train curve
)
trainer.fit(model)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | Sequential       | 118 K 
1 | loss      | CrossEntropyLoss | 0     
2 | train_acc | Accuracy         | 0     
3 | valid_acc | Accuracy         | 0     
-----------------------------------------------
118 K     Trainable params
0         Non-trainable params
118 K     Total params
0.473     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [11]:
# Finish the active W&B run; the run history and summary are printed below.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train/acc,▃▁▅▆▆█▆▇▆▆▆▆▆▆▇▆▅▇█▇▆▇▇███▇██▆▇█▆▆▆▇▇▆██
train/loss,▆█▄▃▂▁▃▂▂▃▃▃▄▂▂▂▄▂▃▂▃▂▂▁▁▁▂▁▂▄▂▁▂▅▃▂▂▂▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/acc,▁▄▆▇▇▇▇█▅█
val/loss,▅▃▁▃▂▄▇▆█▇

0,1
epoch,9.0
train/acc,0.97727
train/loss,0.2841
trainer/global_step,4299.0
val/acc,0.9678
val/loss,0.18323


## Result plots

https://wandb.ai/mfa/mlugs-overfitting/reports/MLUGS-2021-01-overfitting--VmlldzoxNDQ3MDM3?accessToken=uckvmib0s1zbifx1vpjkl0um2wg15fek2ske6mzkuts5blg0rzmy67uka8ckgbcn