# MNIST Digits Dataset

In this example, we'll show several different models operating on the MNIST digits dataset. We use the customized `Workshop` type `MNISTDigitsWorkshop` to define the auxiliary methods we need. The most important of these is the `data_prep` method, which produces a `Dataset` with the train and test sets we need.

In [91]:
import tensorflow as tf
import torch
import numpy as np

import dryml
import dryml.context
import dryml.models.torch
import dryml.models.tf
import dryml.data.torch
import dryml.examples.mnist_digits as dry_digits
import sklearn.neighbors
import dryml.models.sklearn
import dryml.models.xgb

## Local analysis Prep

We define testing and training methods so that evaluation and training can run in a remote process.

In [2]:
# Working directory path (the same './models' path the workshop below receives as `work_dir`)
work_dir = './models'

In [3]:
@dryml.compute
def test_method(trainable):
    """Return the categorical accuracy of `trainable` on the MNIST test set.

    A fresh `MNISTDigitsWorkshop` is created and prepared on every call, and
    the metric is evaluated against its `test_ds`.

    NOTE(review): imports are local to the function body, presumably so the
    function is self-contained when dryml runs it in a separate process —
    confirm against the dryml context documentation.
    """
    from dryml.examples.mnist_digits import MNISTDigitsWorkshop
    import dryml.metrics

    workshop = MNISTDigitsWorkshop()
    workshop.data_prep()

    accuracy = dryml.metrics.scalar.categorical_accuracy(
        trainable, workshop.test_ds)
    return accuracy

In [42]:
@dryml.context.compute_context(ctx_update_objs=True)
def train_method(trainable):
    """Train `trainable` on the MNIST train set and return its test accuracy.

    A fresh `MNISTDigitsWorkshop` is created and prepared on every call. The
    trainable is prepared with `prep_train()`, trained on `train_ds`, and then
    scored with categorical accuracy on `test_ds`.

    NOTE(review): `ctx_update_objs=True` presumably makes dryml copy the
    trained state back onto the caller's object — confirm against the dryml
    context documentation.
    """
    from dryml.examples.mnist_digits import MNISTDigitsWorkshop
    import dryml.metrics

    workshop = MNISTDigitsWorkshop()
    workshop.data_prep()

    # Fit the model
    trainable.prep_train()
    trainable.train(workshop.train_ds)

    # Score the freshly trained model on held-out data
    score = dryml.metrics.scalar.categorical_accuracy(
        trainable, workshop.test_ds)
    return score

In [5]:
# Reuse the `work_dir` variable defined earlier instead of repeating the literal path
shop = dry_digits.MNISTDigitsWorkshop(work_dir=work_dir)

In [6]:
# Preprocessing step that most of the models below need in order to train well
def normalize_img(x, y):
    """Scale `x` to float32 divided by 255 and cast `y` to int32."""
    scaled = tf.cast(x, dtype=tf.float32) / 255.
    labels = tf.cast(y, dtype=tf.int32)
    return scaled, labels


# Wrap the function as a dryml transform usable inside a Pipe
img_prep = dryml.data.tf.FuncMap.from_function(normalize_img)

In [7]:
# Let's make a dictionary to save the trained models so we can measure them later.
# Each entry is itself a dict holding the pipeline under 'model' and a display
# string under 'label'; the comparison section later adds 'acc' and 'err'.
model_dict = {}

# pytorch

First, we'll do a couple of pytorch models.

## Simple 2d convolutional model

First, we'll do a simple 2d convolutional model.

In [8]:
import torch
import dryml.models.torch
import dryml.data.torch
import dryml.data.transforms

In [43]:
# Network definition: each entry looks like [layer class, positional args, keyword args].
# Two lazy 3x3 convolutions with 32 output channels (ReLU in between), then
# flatten and a 10-unit linear output layer.
model = dryml.models.torch.generic.Sequential(
    layer_defs = [
        [ torch.nn.LazyConv2d, (32, 3), {} ],
        [ torch.nn.ReLU, (), {}],
        [ torch.nn.LazyConv2d, (32, 3), {} ],
        [ torch.nn.Flatten, (), {}],
        [ torch.nn.LazyLinear, (10,), {}],
    ]
)

# Pair the network with its training recipe: 3 epochs of Adam with cross-entropy loss
trainable = dryml.models.torch.generic.Trainable(
    model=model,
    train_fn=dryml.models.torch.generic.BasicTraining(
        epochs=3,
        optimizer = dryml.models.torch.generic.TorchOptimizer(torch.optim.Adam, model),
        loss = dryml.models.torch.Wrapper(torch.nn.CrossEntropyLoss),
    ),
)

# Full pipeline: cast X to float32, reorder X axes with (2, 0, 1)
# (presumably channels-last -> channels-first; TODO confirm), run the model,
# apply BestCat (presumably picks the best-scoring category; TODO confirm),
# and place the result on the CPU.
torch_model_1 = dryml.models.Pipe(
    dryml.data.transforms.Cast(mode='X', dtype='float32'),
    dryml.data.transforms.Transpose(mode='X', axes=(2, 0, 1)),
    trainable,
    dryml.data.transforms.BestCat(),
    dryml.data.torch.transforms.TorchDevice(device='cpu')
)

In [44]:
# Train the pipeline; the call returns the categorical accuracy on the test set.
# NOTE(review): `call_context_reqs` presumably tells dryml which framework
# contexts/devices to provide (here tf, plus torch on gpu/0) — confirm.
train_method(torch_model_1, call_context_reqs={'tf': {}, 'torch': {'gpu/0': 1.}})

100%|██████████| 1875/1875 [00:17<00:00, 107.56it/s, loss=0.0521]
  1%|          | 10/1875 [00:00<00:19, 94.25it/s, loss=0.0162]

Epoch 1 - Average Loss: 0.05208663081971193


100%|██████████| 1875/1875 [00:15<00:00, 117.96it/s, loss=0.00733]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.0059] 

Epoch 2 - Average Loss: 0.007329490900244894


100%|██████████| 1875/1875 [00:15<00:00, 117.32it/s, loss=0.00554]

Epoch 3 - Average Loss: 0.00554444462057637





0.9701522435897436

In [66]:
# Register the trained pipeline with the label used in the comparison plot
model_dict['torch_simple_2layer_conv2d'] = {
    'model': torch_model_1,
    'label': 'Simple 2 layer convolutional (PyTorch)',
}

## Pytorch Lenet5

Let's create a LeNet-5 PyTorch model and wrap it with `ModelWrapper` so it can be used directly with `dryml`.

In [12]:
%%writefile torchlenet5_temp.py

import torch

# From medium post:
# https://towardsdatascience.com/implementing-yann-lecuns-lenet-5-in-pytorch-5e05a0911320
class TorchLenet5(torch.nn.Module):
    """LeNet-5 style convolutional classifier.

    Three convolution stages with Tanh activations (average pooling after the
    first two) feed a two-layer fully connected head. ``forward`` returns raw
    logits; no softmax is applied, which is what
    ``torch.nn.CrossEntropyLoss`` expects.

    The head takes exactly 120 features after flattening, which with these
    kernel sizes corresponds to 32x32 spatial inputs.

    Args:
        n_classes: Number of output classes (length of the logits vector).
        in_channels: Number of channels of the input images. Defaults to 1
            (grayscale), matching the original hard-coded value.
    """

    def __init__(self, n_classes, in_channels=1):
        super().__init__()

        self.feature_extractor = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2),
            torch.nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2),
            torch.nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
            torch.nn.Tanh(),
        )

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(in_features=120, out_features=84),
            torch.nn.Tanh(),
            torch.nn.Linear(in_features=84, out_features=n_classes),
        )

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for the batch `x`."""
        features = self.feature_extractor(x)
        # Collapse (channels, height, width) into one feature axis per example
        features = torch.flatten(features, start_dim=1)
        return self.classifier(features)

Overwriting torchlenet5_temp.py


In [13]:
from torchlenet5_temp import TorchLenet5

In [14]:
# LeNet-5 needs its input at a specific size, so add a resize operation that
# runs before the image reaches the model.

def resize_img(img):
    """Resize `img` to 32x32 using bilinear interpolation."""
    target_size = [32, 32]
    return tf.image.resize(img, target_size, method='bilinear')


# Wrap the function as a dryml transform usable inside a Pipe
resize_step = dryml.data.tf.transforms.FuncXMap.from_function(resize_img)

In [15]:
# Permute operation: reorder axes with (2, 0, 1)
# (presumably channels-last -> channels-first, as the variable name suggests; TODO confirm)
channels_first_step = dryml.data.transforms.Transpose(axes=(2, 0, 1))

In [46]:
# We can now build and train a pipeline containing this model!
# Wrap the class together with its constructor argument (10 -> n_classes)
torch_lenet5_mdl = dryml.models.torch.generic.ModelWrapper(
    TorchLenet5,
    10
)

# Training recipe: 5 epochs of Adam with cross-entropy loss
torch_lenet5_trainable = dryml.models.torch.generic.Trainable(
    model=torch_lenet5_mdl,
    train_fn=dryml.models.torch.generic.BasicTraining(
        optimizer=dryml.models.torch.generic.TorchOptimizer(
            torch.optim.Adam,
            torch_lenet5_mdl
        ),
        loss=dryml.models.torch.Wrapper(
            torch.nn.CrossEntropyLoss
        ),
        epochs=5
    )
)

In [47]:
# We need to change the device of the final result
to_cpu = dryml.data.torch.transforms.TorchDevice(device='cpu')

# Pipeline order: resize to 32x32, normalize, reorder axes, cast labels to
# int64, run the model, apply BestCat, then place the result on the CPU.
torch_lenet5_pipe = dryml.models.Pipe(
    resize_step,
    img_prep,
    channels_first_step,
    dryml.data.transforms.Cast(mode='Y', dtype='int64'),
    torch_lenet5_trainable,
    dryml.data.transforms.BestCat(),
    to_cpu
)

In [48]:
# Train the LeNet-5 pipeline; the call returns the categorical accuracy on the test set
train_method(torch_lenet5_pipe, call_context_reqs={'tf': {}, 'torch': {'gpu/0': 1.}})

100%|██████████| 1875/1875 [00:18<00:00, 98.95it/s, loss=0.00735] 
  0%|          | 7/1875 [00:00<00:26, 69.66it/s, loss=0.00259]

Epoch 1 - Average Loss: 0.007348409230766508


100%|██████████| 1875/1875 [00:18<00:00, 102.50it/s, loss=0.00272]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.00137] 

Epoch 2 - Average Loss: 0.002719182376524744


100%|██████████| 1875/1875 [00:18<00:00, 103.21it/s, loss=0.00188]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.00123] 

Epoch 3 - Average Loss: 0.0018814885082111383


100%|██████████| 1875/1875 [00:18<00:00, 102.04it/s, loss=0.00141]
  0%|          | 7/1875 [00:00<00:27, 67.68it/s, loss=0.00106]

Epoch 4 - Average Loss: 0.0014061697780518444


100%|██████████| 1875/1875 [00:17<00:00, 106.54it/s, loss=0.0011] 

Epoch 5 - Average Loss: 0.0010957789681652987





0.9785657051282052

In [68]:
# Register the trained pipeline with the label used in the comparison plot
model_dict['torch_lenet5'] = {
    'model': torch_lenet5_pipe,
    'label': 'Lenet-5 (PyTorch)',
}

## Sklearn Test

Let's create a k-nearest-neighbors model.

In [20]:
# K-nearest-neighbors classifier (10 neighbors, ball-tree search) wrapped for dryml
sklearn_knn_10_2000_bt = dryml.models.sklearn.Trainable(
    model=dryml.models.sklearn.ClassifierModel(
        sklearn.neighbors.KNeighborsClassifier,
        n_neighbors=10,
        algorithm='ball_tree'
    ),
    # presumably fits on 2000 examples drawn after shuffling with a 20000-element buffer — TODO confirm
    train_fn=dryml.models.sklearn.BasicTraining(
        num_examples=2000, shuffle=True, shuffle_buffer_size=20000
    ),
)

In [50]:
# Pipeline order: normalize the images, flatten them, run the K-NN model, apply BestCat
sklearn_pipe = dryml.models.Pipe(
    img_prep,
    dryml.data.transforms.Flatten(),
    sklearn_knn_10_2000_bt,
    dryml.data.transforms.BestCat(),
)

In [51]:
# Train the K-NN pipeline; the call returns the categorical accuracy on the test set
train_method(sklearn_pipe, call_context_reqs={'tf': {}})

2022-11-21 14:46:16.646859: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


0.9019431089743589

In [69]:
# Register the trained pipeline with the label used in the comparison plot
model_dict['sklearn_knn_10_2000'] = {
    'model': sklearn_pipe,
    'label': 'K-NN 10 Neighbors (2000 points)',
}

## XGB Test

In [24]:
# XGB classifier with default settings, trained through the sklearn-style
# Trainable and BasicTraining wrappers (same training settings as the K-NN model)
xgb_2000_mdl = dryml.models.sklearn.Trainable(
    model=dryml.models.xgb.ClassifierModel(),
    train_fn=dryml.models.sklearn.BasicTraining(
        num_examples=2000, shuffle=True, shuffle_buffer_size=20000
    ),
)

In [25]:
# Pipeline order: flatten the images, run the XGB model, apply BestCat
xgb_pipe = dryml.models.Pipe(
    dryml.data.transforms.Flatten(),
    xgb_2000_mdl,
    dryml.data.transforms.BestCat(),
)

In [53]:
# Train the XGB pipeline; the call returns the categorical accuracy on the test set
train_method(xgb_pipe, call_context_reqs={'tf': {}})

2022-11-21 14:46:38.158913: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




  from pandas import MultiIndex, Int64Index


0.9122596153846154

In [70]:
# Register the trained pipeline with the label used in the comparison plot
model_dict['xgb'] = {
    'model': xgb_pipe,
    'label': 'XGB (2000 points)',
}

## Tensorflow convolutional transformer

In [98]:
%%writefile conv_transformer_test.py

import tensorflow as tf
import numpy as np

# Define the scaled dotproduct attention
class ScaledDotProductAttention(tf.keras.layers.Layer):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all Dense projections of the same input.
    The attention weights are ``softmax(q @ k^T / sqrt(qk_dim))`` taken over
    the last axis, and the output is those weights applied to the values.

    Args:
        qk_dim: Width of the query and key projections. Required; the
            ``None`` default is kept only for signature compatibility.
        v_dim: Width of the value projection (and of the output's last
            axis). Defaults to ``qk_dim``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If ``qk_dim`` is not given.
    """

    def __init__(self, qk_dim=None, v_dim=None, **kwargs):
        super().__init__(**kwargs)
        # qk_dim sizes both projections and the scale factor, so fail early
        # with a clear message instead of an opaque error from np.sqrt(None).
        if qk_dim is None:
            raise ValueError("qk_dim must be provided")
        if v_dim is None:
            v_dim = qk_dim
        self.scale_factor = np.sqrt(qk_dim)
        self.query_layer = tf.keras.layers.Dense(qk_dim)
        self.key_layer = tf.keras.layers.Dense(qk_dim)
        self.value_layer = tf.keras.layers.Dense(v_dim)

    def call(self, inputs):
        """Apply self-attention over the second-to-last axis of `inputs`."""
        q = self.query_layer(inputs)
        k = self.key_layer(inputs)
        v = self.value_layer(inputs)
        # transpose_b swaps the last two axes of k, giving pairwise q.k scores
        scores = tf.matmul(q, k, transpose_b=True) / self.scale_factor
        weights = tf.math.softmax(scores, axis=-1)
        return tf.matmul(weights, v)

# Multihead attention
class MultiHeadScaledDotProductAttention(tf.keras.layers.Layer):
    """Run several attention heads on the same input and concatenate them.

    Each head is a `ScaledDotProductAttention` whose query/key width is
    ``qk_dim // n_heads`` and whose value width is the full ``v_dim``, so the
    concatenated output has ``n_heads * v_dim`` features on its last axis.

    Args:
        n_heads: Number of attention heads.
        qk_dim: Total query/key width, split evenly across heads.
        v_dim: Value width of every individual head.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, n_heads, qk_dim, v_dim, **kwargs):
        super().__init__(**kwargs)
        self.n_heads = n_heads
        per_head_qk = qk_dim // n_heads
        self.heads = [
            ScaledDotProductAttention(qk_dim=per_head_qk, v_dim=v_dim)
            for _ in range(n_heads)
        ]

    def call(self, inputs):
        """Return the per-head outputs joined along the last axis."""
        return tf.concat([head(inputs) for head in self.heads], axis=-1)

class ConvTransformerTest(tf.keras.Model):
    """Small convolution + self-attention classifier with 10 outputs.

    Two 3x3 ReLU convolutions produce a feature map whose spatial positions
    are treated as a sequence. A 4-head attention layer is added to that
    sequence as a residual, and the result is flattened and passed through
    three Dense layers. The last Dense layer applies a softmax, so the model
    outputs class probabilities rather than logits.

    Args:
        conv2d_filters: Number of filters of both convolutions; also sets the
            attention and Dense layer widths.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, conv2d_filters, **kwargs):
        super().__init__(**kwargs)
        per_head = conv2d_filters // 4
        self.conv2d_1 = tf.keras.layers.Conv2D(conv2d_filters, 3, activation='relu')
        self.conv2d_2 = tf.keras.layers.Conv2D(conv2d_filters, 3, activation='relu')
        # 4 heads, each emitting `per_head` features, so the concatenated
        # attention output matches the conv channels when conv2d_filters is
        # a multiple of 4
        self.at_layer = MultiHeadScaledDotProductAttention(4, per_head, per_head)
        self.dense_1 = tf.keras.layers.Dense(conv2d_filters*4)
        self.dense_2 = tf.keras.layers.Dense(conv2d_filters)
        self.dense_final = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, inputs):
        """Return per-class probabilities for a batch of images."""
        feature_map = self.conv2d_2(self.conv2d_1(inputs))

        # Merge the two spatial axes into a single sequence axis
        dims = tf.shape(feature_map)
        tokens = tf.reshape(feature_map, [dims[0], dims[1]*dims[2], dims[3]])

        # Residual connection around the attention layer
        tokens = tokens+self.at_layer(tokens)

        flat = tf.keras.layers.Flatten()(tokens)
        hidden = self.dense_2(self.dense_1(flat))
        return self.dense_final(hidden)

Overwriting conv_transformer_test.py


In [99]:
from conv_transformer_test import ConvTransformerTest

In [103]:
# ConvTransformerTest ends in a Dense layer with a softmax activation, so it
# outputs probabilities, not logits. The loss must therefore be built with
# from_logits=False; from_logits=True would apply a second softmax inside
# the loss.
tf_transformer_test_mdl = dryml.models.tf.keras.Trainable(
    model=dryml.models.tf.keras.ModelWrapper(
        ConvTransformerTest,
        32
    ),
    optimizer=dryml.models.tf.Wrapper(
        tf.keras.optimizers.Adam
    ),
    loss=dryml.models.tf.Wrapper(
        tf.keras.losses.SparseCategoricalCrossentropy,
        from_logits=False
    ),
    train_fn=dryml.models.tf.keras.BasicEarlyStoppingTraining(
    )
)

In [104]:
# Pipeline order: normalize the images, run the Keras model, apply BestCat
tf_transformer_test_pipe = dryml.models.Pipe(
    img_prep,
    tf_transformer_test_mdl,
    dryml.data.transforms.BestCat()
)

In [None]:
# Train the TensorFlow pipeline; the call returns the categorical accuracy on the test set
train_method(
    tf_transformer_test_pipe,
    call_context_reqs={'tf': {'gpu/1': 1.}},
)

2022-11-21 16:50:26.505289: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7368 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1


Epoch 1/10000


2022-11-21 16:50:35.775042: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8500


Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000

In [None]:
# Register the trained pipeline with the label used in the comparison plot
model_dict['tf_transformer_test_1'] = {
    'model': tf_transformer_test_pipe,
    'label': 'Convolutional Transformer Test (Tensorflow)'
}

# Model Comparison

Let's compare these models directly and create a plot!

In [28]:
import matplotlib.pyplot as plt

In [None]:
# Compute each registered model's accuracy on the test set and store it on
# the model's own entry under 'acc'
for entry in model_dict.values():
    entry['acc'] = test_method(
        entry['model'], call_context_reqs={'tf': {}, 'torch': {}})

In [None]:
# Convert each stored accuracy into an error rate in percent
for entry in model_dict.values():
    entry['err'] = (1. - entry['acc']) * 100.

In [None]:
# Keys of every registered model
model_names = model_dict.keys()

In [None]:
# Sort the model names by ascending error rate, so the lowest-error model
# comes first in the list.
model_names = sorted(model_names, key=lambda v: model_dict[v]['err'])

In [None]:
# One bar position per model: 0, 1, ..., len(model_names) - 1
y_pos = np.arange(len(model_names))

In [None]:
# Horizontal bar chart of error rate per model, in the sorted order
fig, ax = plt.subplots()

errs = [model_dict[n]['err'] for n in model_names]
labels = [model_dict[n]['label'] for n in model_names]

ax.barh(y_pos, errs, align='center')
ax.set_yticks(y_pos, labels=labels)
# Errors are stored as percentages, so show the unit on the axis
ax.set_xlabel('Error rate (%)')
ax.set_title('MNIST digits test error by model')

plt.show()