## MNIST 

MNIST stuff.

## Get Data

from Azure? 

In [1]:
from adlfs import AzureBlobFileSystem

container_name = "datasets"
storage_options = {"account_name": "azuremlexamples"}

In [2]:
fs = AzureBlobFileSystem(**storage_options)
fs

<adlfs.spec.AzureBlobFileSystem at 0x7f7eb037a820>

In [3]:
files = fs.ls(f"{container_name}/mnist")
files

['datasets/mnist/t10k-images-idx3-ubyte.gz',
 'datasets/mnist/t10k-labels-idx1-ubyte.gz',
 'datasets/mnist/train-images-idx3-ubyte.gz',
 'datasets/mnist/train-labels-idx1-ubyte.gz']

## Create a LightningDataModule 

This is tricky! Not!

In [4]:
import gzip
import numpy as np 

import pytorch_lightning as pl

from adlfs import AzureBlobFileSystem
from torch.utils.data import DataLoader
from sklearn.preprocessing import OneHotEncoder

class AzureMLMNISTDataModule(pl.LightningModule):

    def __init__(self, batch_size: int=10):
        super().__init__()
        self.batch_size = batch_size

    def setup(self, stage=None):

        data_dir = "datasets/mnist"        
        storage_options = {"account_name": "azuremlexamples"}
        fs = AzureBlobFileSystem(**storage_options)
        files = fs.ls(data_dir)

        train_len = 60000
        test_len = 10000
        
        for f in files:
            if "train-images" in f:
                self.X_train = self._read_images(gzip.open(fs.open(f)), train_len)
            elif "train-labels" in f:
                self.y_train = self._read_labels(gzip.open(fs.open(f)), train_len)
            elif "images" in f:
                self.X_test = self._read_images(gzip.open(fs.open(f)), test_len)
            elif "labels" in f:
                self.y_test = self._read_labels(gzip.open(fs.open(f)), test_len)

        self.ohe = OneHotEncoder().fit(self.y_train.reshape(-1, 1))

        self.mnist_train = list(zip(self.X_train, self.ohe.transform(self.y_train.reshape(-1, 1)).toarray()))
        self.mnist_test = list(zip(self.X_test, self.ohe.transform(self.y_test.reshape(-1, 1)).toarray()))

    def _read_images(self, f, images):
        image_size = 28
        
        f.read(16) # magic
        buf = f.read(image_size * image_size * images)
        data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
        data = data.reshape(images, image_size, image_size, 1)

        return data

    def _read_labels(self, f, labels):

        f.read(8) # magic

        buf = f.read(1 * labels)
        labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
        return labels

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size)

In [5]:
mnist = AzureMLMNISTDataModule()
mnist.setup()

In [None]:
import matplotlib.pyplot as plt

for batch in mnist.mnist_test:
    x, y = batch

    plt.imshow(x.squeeze())
    print(f"Label: {y}")
    break

## Fun Time!

the work pays off?

In [6]:
import torch

import pytorch_lightning as pl

from torch import nn
from torch.nn import functional as F 

class System1(pl.LightningModule):

    def __init__(self, batch_size):
        # magic
        super().__init__()

        self.batch_size = batch_size

        self.net = nn.Sequential(
                        nn.Linear(28 * 28, 128),
                        nn.ReLU(),
                        nn.Linear(128, 256),
                        nn.ReLU(),
                        nn.Linear(256, 10),
                        nn.Softmax()
        )

    def forward(self, x):
        x = self.net(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(self.batch_size, -1)
        y = y.view(self.batch_size, -1)
        y_hat = self.forward(x)
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [7]:
system1 = System1(mnist.batch_size)
system1

System1(
  (net): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
    (5): Softmax(dim=None)
  )
)

In [8]:
%%time
trainer = pl.Trainer(max_epochs=100)
trainer.fit(system1, mnist.train_dataloader())

GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name | Type       | Params
------------------------------------
0 | net  | Sequential | 136 K 
------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params


Epoch 0:  24%|██▍       | 14589/60000 [01:08<03:33, 212.45it/s, loss=17, v_num=8]

In [None]:
from torchvision.datasets import MNIST 

test = MNIST(".", download=True)

In [None]:
test

In [None]:
for batch in test:
    print(batch)
    break