In [1]:
import os
import torch
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
import torch.nn.functional as F
from torchvision.transforms import ToTensor
import lightning.pytorch as pl
import mlflow
import multiprocessing

In [2]:
# Point the client at the tracking server BEFORE selecting the experiment:
# set_experiment() looks up / creates the experiment in whatever tracking
# store is active at call time, so doing it first would register the
# experiment in the default local store instead of on the server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("mnist-image-classification")
# Let MLflow record params/metrics/models for the Lightning run automatically.
mlflow.pytorch.autolog()

In [7]:
# ---- training configuration (read by the cells below) ----
batch_size = 64   # samples per mini-batch for every DataLoader
lr = 1e-3         # learning rate handed to the SGD optimizer
dropout = 0.2     # drop probability of the network's Dropout layer
momentum = 0.3    # momentum handed to the SGD optimizer
epochs = 20       # passed to the Trainer as max_epochs
# Currently unused settings, kept for reference:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# path = "model_v1.pth"

In [8]:
# Single transform shared by all datasets: convert each image to a tensor
transform = ToTensor()

# MNIST training split, downloaded into ./data when not already present
train = MNIST(root="data", train=True, download=True, transform=transform)

# Carve a validation subset out of the training data (90% train / 10% valid).
# The generator is seeded so the same split is produced on every run.
split = .9
n_total = len(train)
train_size = int(n_total * split)
valid_size = n_total - train_size
split_rng = torch.Generator().manual_seed(40)
trainset, validset = utils.data.random_split(
    train, [train_size, valid_size], generator=split_rng
)

# Arguments common to every loader: same batch size, one worker per CPU core
loader_kwargs = dict(batch_size=batch_size, num_workers=multiprocessing.cpu_count())

# Only the training loader shuffles; validation keeps dataset order
trainloader = utils.data.DataLoader(trainset, shuffle=True, **loader_kwargs)
validloader = utils.data.DataLoader(validset, **loader_kwargs)

# MNIST test split and its (unshuffled) loader
testset = MNIST(root="data", train=False, download=True, transform=transform)
testloader = utils.data.DataLoader(testset, **loader_kwargs)

In [9]:
# define the LightningModule
class Net(pl.LightningModule):
    """Convolutional digit classifier packaged as a LightningModule.

    Architecture: two 3x3 convolutions (1 -> 32 -> 64 channels), each followed
    by ReLU and 2x2 max pooling, then four fully connected layers
    (64*7*7 -> 512 -> 256 -> 64 -> 10) with dropout after the first three.
    ``forward`` returns log-probabilities, which is why every step uses
    ``nll_loss``.

    The dropout probability, learning rate and momentum are read from the
    module-level variables ``dropout``, ``lr`` and ``momentum``.
    """

    def __init__(self):
        super().__init__()
        # convolutional layers; padding=1 keeps the spatial size for 3x3 kernels
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        # max pooling layer (halves height and width), applied after each conv
        self.pool = nn.MaxPool2d(2, 2)
        # fully connected layers; fc1 expects 64 channels of 7x7 features,
        # i.e. a 28x28 input after the two poolings
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, 10)
        # dropout, shared by all hidden fully connected layers
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: image batch with one channel (assumed (N, 1, 28, 28) so the
                flattened features match ``fc1`` -- TODO confirm for other data).

        Returns:
            Tensor with 10 log-probabilities per sample (log_softmax over dim 1).
        """
        # convolution -> ReLU -> pooling, twice
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # flatten everything except the batch dimension
        x = torch.flatten(x, 1)
        # hidden layers: linear -> ReLU -> dropout
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        # output layer as log-probabilities
        x = F.log_softmax(self.fc4(x), dim=1)

        return x

    def _compute_loss(self, batch):
        """Run the model on one ``(data, target)`` batch.

        Returns:
            Tuple ``(output, target, loss)`` where ``loss`` is the NLL loss of
            the log-probabilities ``output`` against ``target``.
        """
        data, target = batch
        # Call the module itself rather than .forward() so that hooks
        # registered on the module are executed.
        output = self(data)
        return output, target, F.nll_loss(output, target)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the training loss."""
        _, _, train_loss = self._compute_loss(batch)
        # Sent to whichever logger(s) the Trainer is configured with
        self.log("train_loss", train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it as ``val_loss``."""
        _, _, val_loss = self._compute_loss(batch)
        self.log("val_loss", val_loss)

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` and the batch ``accuracy`` for one test batch."""
        output, target, test_loss = self._compute_loss(batch)
        self.log("test_loss", test_loss)
        # predicted class = index of the max log-probability
        pred = output.argmax(dim=1)
        # fraction of correct predictions, kept as a tensor so logging does
        # not force a device-to-host synchronisation via .item()
        accuracy = (pred == target).float().mean()
        self.log("accuracy", accuracy)

    def configure_optimizers(self):
        """Return the SGD optimizer built from the global ``lr`` and ``momentum``."""
        # optimizer = optim.Adam(self.parameters(), lr=lr)
        optimizer = optim.SGD(self.parameters(), lr=lr, momentum=momentum)
        return optimizer

In [10]:
# Instantiate the network; Net.__init__ takes no arguments
model = Net()
# Bare last expression so the notebook displays the module's layer summary
model

Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [11]:
# Fit the network: training stops after at most `epochs` epochs, with
# validation run on validloader during fitting
trainer = pl.Trainer(max_epochs=epochs)
trainer.fit(
    model=model,
    train_dataloaders=trainloader,
    val_dataloaders=validloader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2023/08/17 07:29:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bc7fa3b6afb348eeb98d9c846dbaf603', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` whic

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [12]:
# Evaluate the fitted model on the MNIST test loader; the reported metrics
# ("test_loss", "accuracy") are the ones logged in Net.test_step
trainer.test(model=model, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.3052712082862854, 'accuracy': 0.9104999899864197}]