# MNIST Digits Dataset

In this example, we'll show several different models operating on the MNIST digits dataset. We use the customized `Workshop` type `MNISTDigitsWorkshop` to define the auxiliary methods we need, most importantly the `data_prep` method, which produces a `Dataset` with the train and test sets we need. Additionally, we'll look at the effect that normalizing the input has on the performance of the models.

In [1]:
import tensorflow as tf
import torch
import numpy as np

import dryml
import dryml.context
import dryml.models.torch
import dryml.models.tf
import dryml.data.torch
import dryml.examples.mnist_digits as dry_digits
import sklearn.neighbors
import dryml.models.sklearn
import dryml.models.xgb

  from .autonotebook import tqdm as notebook_tqdm
  from pandas import MultiIndex, Int64Index


## Local analysis Prep

We define evaluation and training methods that can be run in a remote process.

In [2]:
work_dir = './models'

In [3]:
@dryml.compute
def test_method(trainable):
    """Return the categorical accuracy of ``trainable`` on the MNIST test set.

    Args:
        trainable: a dryml model/pipeline to evaluate.

    Returns:
        The value produced by
        ``dryml.metrics.scalar.categorical_accuracy`` on the workshop's
        ``test_ds``.
    """
    # Imports are kept local to the function body; presumably this keeps the
    # function self-contained when dryml executes it in another process.
    import dryml.examples.mnist_digits as dry_digits
    import dryml.metrics

    # Build a fresh workshop so the datasets are created where this runs.
    workshop = dry_digits.MNISTDigitsWorkshop()
    workshop.data_prep()

    # Put the model into evaluation mode before measuring it.
    trainable.prep_eval()
    accuracy = dryml.metrics.scalar.categorical_accuracy(
        trainable, workshop.test_ds)
    return accuracy

In [4]:
@dryml.context.compute_context(ctx_update_objs=True)
def train_method(trainable):
    """Train ``trainable`` on the MNIST train set and return its test accuracy.

    Args:
        trainable: a dryml model/pipeline to train.

    Returns:
        The value produced by
        ``dryml.metrics.scalar.categorical_accuracy`` on the workshop's
        ``test_ds`` after training.
    """
    # Imports are kept local to the function body; presumably this keeps the
    # function self-contained when dryml executes it in another process.
    import dryml.examples.mnist_digits as dry_digits
    import dryml.metrics

    # Build a fresh workshop so the datasets are created where this runs.
    workshop = dry_digits.MNISTDigitsWorkshop()
    workshop.data_prep()

    # Fit on the training split.
    trainable.prep_train()
    trainable.train(workshop.train_ds)

    # Report accuracy on the held-out split.
    accuracy = dryml.metrics.scalar.categorical_accuracy(
        trainable, workshop.test_ds)
    return accuracy

In [5]:
# Reuse the `work_dir` defined above instead of repeating the path literal,
# so the model directory is configured in exactly one place.
shop = dry_digits.MNISTDigitsWorkshop(work_dir=work_dir)

In [6]:
# Let's make a dictionary to save the trained models so we can measure them later
model_dict = {}

In [7]:
gpu_req = {'num_gpus': 2}

# Preprocessing step preparation 

Here we prepare several preprocessing steps we're going to reuse for our models.

In [8]:
# Create a necessary preprocessing step for most models to be successful
def normalize_img(x, y):
    """Cast the image to float32 and divide by 255; cast the label to int32.

    Assumes ``x`` holds 8-bit pixel intensities (0-255), in which case the
    result lies in [0, 1] -- TODO confirm against the dataset.

    Returns:
        A ``(image, label)`` tuple of tensors.
    """
    scaled_x = tf.cast(x, dtype=tf.float32)/255.
    int_y = tf.cast(y, dtype=tf.int32)
    return scaled_x, int_y
img_prep_step = dryml.data.tf.FuncMap.from_function(normalize_img)

In [9]:
# For the Lenet5 models, we need to create a resize step.
def resize_img(x):
    """Resize (with padding) an image to 32x32 using bilinear interpolation."""
    target_height, target_width = 32, 32
    return tf.image.resize_with_pad(
        x, target_height, target_width, method='bilinear')
resize_step = dryml.data.tf.FuncXMap.from_function(resize_img)

In [10]:
# Permute operation since the tf dataset is in channels last format, and we need channels first for pytorch models using convolutions.
channels_first_step = dryml.data.transforms.Transpose(axes=(2, 0, 1))

In [11]:
# A flatten operation we need for the sklearn based models
flatten_step = dryml.data.transforms.Flatten()

In [12]:
# A best cat operation we need for all of our models.
best_cat_step = dryml.data.transforms.BestCat()

In [13]:
# A needed step for pytorch
to_cpu_step = dryml.data.torch.transforms.TorchDevice(device='cpu')

In [14]:
# We need to cast the target to an int64 for pytorch models trained with CrossEntropyLoss
torch_target_cast_step = dryml.data.transforms.Cast(mode='Y', dtype='int64')

In [15]:
# We need to cast the input to float32 for pytorch models when the normalization step (which otherwise performs this cast) is skipped
torch_x_cast_step = dryml.data.transforms.Cast(mode='X', dtype='float32')

## Sklearn Test

Let's create a nearest neighbor version

In [16]:
sklearn_knn_10_bt_def = dryml.ObjectDef(
    dryml.models.sklearn.ClassifierModel,
    sklearn.neighbors.KNeighborsClassifier,
    n_neighbors=10,
    algorithm='ball_tree'
)

sklearn_knn_10_2000_bt_def = dryml.ObjectDef(
    dryml.models.sklearn.Trainable,
    model=sklearn_knn_10_bt_def,
    train_fn=dryml.models.sklearn.BasicTraining(
        num_examples=2000, shuffle=True, shuffle_buffer_size=20000
    ),
)

In [17]:
sklearn_pipe = dryml.models.Pipe(
    img_prep_step,
    flatten_step,
    sklearn_knn_10_2000_bt_def.build(),
    best_cat_step,
)

In [18]:
train_method(sklearn_pipe, call_context_reqs={'tf': {}})

2022-12-09 17:26:21.128365: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


0.9002403846153846

In [19]:
model_dict['sklearn_knn_10_2000_with_norm'] = {
    'model': sklearn_pipe,
    'label': 'K-NN 10 Neighbors (2000 points) (With Norm)',
}

In [20]:
sklearn_pipe = dryml.models.Pipe(
    flatten_step,
    sklearn_knn_10_2000_bt_def.build(),
    best_cat_step,
)

In [21]:
train_method(sklearn_pipe, call_context_reqs={'tf': {}})

2022-12-09 17:26:41.747449: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


0.8989383012820513

In [22]:
model_dict['sklearn_knn_10_2000_no_norm'] = {
    'model': sklearn_pipe,
    'label': 'K-NN 10 Neighbors (2000 points) (No Norm)',
}

## XGB Test

In [23]:
xgb_2000_def = dryml.ObjectDef(
    dryml.models.xgb.ClassifierModel
)

In [24]:
# Reference the model through its ObjectDef (`xgb_2000_def`, defined above),
# mirroring the K-NN definition, so that each `.build()` constructs its own
# classifier instead of capturing one pre-built `ClassifierModel()` instance.
xgb_2000_mdl_def = dryml.ObjectDef(
    dryml.models.sklearn.Trainable,
    model=xgb_2000_def,
    train_fn=dryml.models.sklearn.BasicTraining(
        num_examples=2000, shuffle=True, shuffle_buffer_size=20000
    ),
)

In [25]:
xgb_pipe = dryml.models.Pipe(
    flatten_step,
    xgb_2000_mdl_def.build(),
    best_cat_step)

In [26]:
train_method(xgb_pipe, call_context_reqs={'tf': {}})

2022-12-09 17:27:02.163678: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




  from pandas import MultiIndex, Int64Index


0.9079527243589743

In [27]:
model_dict['xgb_no_norm'] = {
    'model': xgb_pipe,
    'label': 'XGB (2000 points) (No Norm)',
}

In [28]:
xgb_pipe = dryml.models.Pipe(
    img_prep_step,
    flatten_step,
    xgb_2000_mdl_def.build(),
    best_cat_step)

In [29]:
train_method(xgb_pipe, call_context_reqs={'tf': {}})

2022-12-09 17:27:18.091440: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




  from pandas import MultiIndex, Int64Index


0.9050480769230769

In [30]:
model_dict['xgb_norm'] = {
    'model': xgb_pipe,
    'label': 'XGB (2000 points) (With Norm)',
}

# pytorch

First, we'll do a couple of pytorch models.

## Simple 2d convolutional model

First, we'll do a simple 2d convolutional model.

In [31]:
import torch
import dryml.models.torch
import dryml.data.torch
import dryml.data.transforms

In [32]:
mdl_def = dryml.ObjectDef(
    dryml.models.torch.generic.Sequential,
    layer_defs = [
        [ torch.nn.LazyConv2d, (32, 3), {} ],
        [ torch.nn.ReLU, (), {}],
        [ torch.nn.LazyConv2d, (32, 3), {} ],
        [ torch.nn.ReLU, (), {}],
        [ torch.nn.Flatten, (), {}],
        [ torch.nn.LazyLinear, (10,), {}],
    ]
)

trainable_def = dryml.ObjectDef(
    dryml.models.torch.generic.Trainable,
    model=mdl_def,
    train_fn=dryml.ObjectDef(
        dryml.models.torch.generic.BasicTraining,
        epochs=3,
        optimizer=dryml.ObjectDef(
            dryml.models.torch.generic.TorchOptimizer,
            torch.optim.Adam,
            mdl_def),
        loss=dryml.models.torch.Wrapper(torch.nn.CrossEntropyLoss),
    ),
)

In [33]:
torch_model = dryml.models.Pipe(
    img_prep_step,
    channels_first_step,
    torch_target_cast_step,
    trainable_def.build(),
    best_cat_step,
    to_cpu_step
)

In [34]:
train_method(torch_model, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [00:16<00:00, 110.40it/s, loss=0.00454]
  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 1 - Average Loss: 0.004541296210648337


100%|██████████| 1875/1875 [00:14<00:00, 126.88it/s, loss=0.00158]
  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 0.0015752436498412863


100%|██████████| 1875/1875 [00:14<00:00, 126.72it/s, loss=0.000975]

Epoch 3 - Average Loss: 0.0009754337974461426





0.9834735576923077

In [35]:
model_dict['torch_simple_2layer_conv2d_with_norm'] = {
    'model': torch_model,
    'label': 'Simple 2 layer convolutional (With Norm) (PyTorch)',
}

In [36]:
torch_model = dryml.models.Pipe(
    channels_first_step,
    torch_x_cast_step,
    torch_target_cast_step,
    trainable_def.build(),
    best_cat_step,
    to_cpu_step
)

In [37]:
train_method(torch_model, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [00:16<00:00, 112.20it/s, loss=0.00841]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.00199] 

Epoch 1 - Average Loss: 0.00840864681971046


100%|██████████| 1875/1875 [00:14<00:00, 127.89it/s, loss=0.00265]
  1%|          | 10/1875 [00:00<00:19, 96.60it/s, loss=0.00161]

Epoch 2 - Average Loss: 0.0026458228423231352


100%|██████████| 1875/1875 [00:14<00:00, 127.95it/s, loss=0.00169]

Epoch 3 - Average Loss: 0.0016949125157928392





0.9665464743589743

In [38]:
model_dict['torch_simple_2layer_conv2d_no_norm'] = {
    'model': torch_model,
    'label': 'Simple 2 layer convolutional (No Norm) (PyTorch)',
}

## Pytorch Lenet5

Let's create a lenet5 pytorch model, and use the `ModelWrapper` to use it directly with `dryml`.

In [39]:
%%writefile torchlenet5_temp.py

import torch

# From medium post:
# https://towardsdatascience.com/implementing-yann-lecuns-lenet-5-in-pytorch-5e05a0911320
class TorchLenet5(torch.nn.Module):
    """LeNet-5 style classifier for single-channel images.

    The three unpadded 5x5 convolutions and two 2x2 average pools reduce a
    32x32 input to a 120x1x1 feature map, which is what the first linear
    layer (``in_features=120``) requires.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        nn = torch.nn

        # Convolutional trunk: conv -> tanh -> avg-pool, twice, then a final
        # conv -> tanh that collapses the spatial dimensions.
        conv_stack = [
            nn.Conv2d(1, 6, kernel_size=5, stride=1),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(6, 16, kernel_size=5, stride=1),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(16, 120, kernel_size=5, stride=1),
            nn.Tanh(),
        ]
        self.feature_extractor = nn.Sequential(*conv_stack)

        # Fully connected head producing one logit per class.
        dense_stack = [
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, n_classes),
        ]
        self.classifier = nn.Sequential(*dense_stack)

    def forward(self, x):
        """Return raw (un-normalized) class logits for a batch of images."""
        features = self.feature_extractor(x)
        flat_features = features.flatten(start_dim=1)
        return self.classifier(flat_features)

Overwriting torchlenet5_temp.py


In [40]:
from torchlenet5_temp import TorchLenet5

In [41]:
# We can now build and train a pipeline containing this model!
torch_lenet5_mdl_def = dryml.ObjectDef(
    dryml.models.torch.generic.ModelWrapper,
    TorchLenet5,
    10
)

torch_lenet5_trainable_def = dryml.ObjectDef(
    dryml.models.torch.generic.Trainable,
    model=torch_lenet5_mdl_def,
    train_fn=dryml.ObjectDef(
        dryml.models.torch.generic.BasicTraining,
        optimizer=dryml.ObjectDef(
            dryml.models.torch.generic.TorchOptimizer,
            torch.optim.Adam,
            torch_lenet5_mdl_def
        ),
        loss=dryml.models.torch.Wrapper(
            torch.nn.CrossEntropyLoss
        ),
        epochs=5
    )
)

In [42]:
torch_lenet5_pipe = dryml.models.Pipe(
    resize_step,
    img_prep_step,
    channels_first_step,
    torch_target_cast_step,
    torch_lenet5_trainable_def.build(),
    best_cat_step,
    to_cpu_step,
)

In [43]:
train_method(torch_lenet5_pipe, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [00:17<00:00, 106.85it/s, loss=0.00703]
  0%|          | 6/1875 [00:00<00:33, 55.93it/s, loss=0.00258]

Epoch 1 - Average Loss: 0.007032527755666524


100%|██████████| 1875/1875 [00:16<00:00, 112.00it/s, loss=0.00245]
  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 0.0024532282122022783


100%|██████████| 1875/1875 [00:16<00:00, 112.87it/s, loss=0.00173]
  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 3 - Average Loss: 0.0017285096634256964


100%|██████████| 1875/1875 [00:17<00:00, 110.00it/s, loss=0.00132]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.00144] 

Epoch 4 - Average Loss: 0.0013239631879065808


100%|██████████| 1875/1875 [00:16<00:00, 110.78it/s, loss=0.00108]

Epoch 5 - Average Loss: 0.001078672293821243





0.9814703525641025

In [44]:
model_dict['torch_lenet5_with_norm'] = {
    'model': torch_lenet5_pipe,
    'label': 'Lenet-5 (With Norm) (PyTorch)',
}

In [45]:
torch_lenet5_pipe = dryml.models.Pipe(
    resize_step,
    channels_first_step,
    torch_x_cast_step,
    torch_target_cast_step,
    torch_lenet5_trainable_def.build(),
    best_cat_step,
    to_cpu_step,
)

In [46]:
train_method(torch_lenet5_pipe, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [00:17<00:00, 106.77it/s, loss=0.00513]
  0%|          | 7/1875 [00:00<00:27, 67.18it/s, loss=0.00185]

Epoch 1 - Average Loss: 0.005134942951511281


100%|██████████| 1875/1875 [00:16<00:00, 112.21it/s, loss=0.00195]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.0005]  

Epoch 2 - Average Loss: 0.0019529745913188284


100%|██████████| 1875/1875 [00:16<00:00, 112.00it/s, loss=0.00142]
  0%|          | 7/1875 [00:00<00:27, 67.39it/s, loss=0.00157]

Epoch 3 - Average Loss: 0.0014228214631885445


100%|██████████| 1875/1875 [00:16<00:00, 112.44it/s, loss=0.00112]
  0%|          | 7/1875 [00:00<00:28, 66.14it/s, loss=0.00128]

Epoch 4 - Average Loss: 0.0011156551241263515


100%|██████████| 1875/1875 [00:16<00:00, 112.91it/s, loss=0.000899]

Epoch 5 - Average Loss: 0.000898826320404866





0.9846754807692307

In [47]:
model_dict['torch_lenet5_no_norm'] = {
    'model': torch_lenet5_pipe,
    'label': 'Lenet-5 (No Norm) (PyTorch)',
}

## Convolutional transformer

In [48]:
%%writefile torch_conv_transformer_test.py

import torch
import torch.nn.functional as F
import numpy as np

# Define the scaled dotproduct attention
class TorchScaledDotProductAttention(torch.nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input.

    Args:
        in_features: size of the last dimension of the input.
        qk_dim: size of the query/key projections. Defaults to
            ``in_features`` when omitted.
        v_dim: size of the value projection (and of the output's last
            dimension). Defaults to ``qk_dim`` when omitted.
        **kwargs: forwarded to ``torch.nn.Module.__init__``.

    Raises:
        ValueError: if the resolved ``qk_dim`` is not positive.
    """

    def __init__(self, in_features, qk_dim=None, v_dim=None, **kwargs):
        super().__init__(**kwargs)
        # The signature advertises `qk_dim=None`, but np.sqrt(None) and
        # Linear(in_features, None) both fail; fall back to the input width.
        if qk_dim is None:
            qk_dim = in_features
        if qk_dim <= 0:
            # A zero-sized projection would give a scale factor of 0 and
            # NaNs in the softmax; fail early with a clear message instead.
            raise ValueError(f"qk_dim must be positive, got {qk_dim}")
        if v_dim is None:
            v_dim = qk_dim
        self.scale_factor = np.sqrt(qk_dim)
        self.query_layer = torch.nn.Linear(in_features, qk_dim)
        self.key_layer = torch.nn.Linear(in_features, qk_dim)
        self.value_layer = torch.nn.Linear(in_features, v_dim)

    def forward(self, inputs):
        """Apply self-attention to a 3-D ``inputs`` tensor.

        ``torch.bmm`` is used, so ``inputs`` must have exactly three
        dimensions; attention is taken over dimension 1.
        """
        q = self.query_layer(inputs)
        k = self.key_layer(inputs)
        v = self.value_layer(inputs)
        # Pairwise query/key scores, scaled by sqrt(qk_dim) and normalized
        # over the key axis.
        scores = torch.bmm(q, k.transpose(1, 2))/self.scale_factor
        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, v)

# Multihead attention
class TorchMultiHeadScaledDotProductAttention(torch.nn.Module):
    """Several independent attention heads whose outputs are concatenated.

    Each head projects queries/keys to ``qk_dim // n_heads`` features and
    values to ``v_dim`` features, so the output's last dimension is
    ``n_heads * v_dim``.

    Args:
        n_heads: number of attention heads.
        in_features: size of the last dimension of the input.
        qk_dim: total query/key width, split evenly across heads.
        v_dim: value width of each individual head.
        **kwargs: forwarded to ``torch.nn.Module.__init__``.
    """

    def __init__(self, n_heads, in_features, qk_dim, v_dim, **kwargs):
        super().__init__(**kwargs)
        self.n_heads = n_heads
        per_head_qk = qk_dim // n_heads
        # ModuleList registers every head so its parameters are tracked.
        self.heads = torch.nn.ModuleList([
            TorchScaledDotProductAttention(
                in_features, qk_dim=per_head_qk, v_dim=v_dim)
            for _ in range(n_heads)
        ])

    def forward(self, inputs):
        """Run every head on ``inputs`` and join results on the last axis."""
        per_head_outputs = [head(inputs) for head in self.heads]
        return torch.concat(per_head_outputs, dim=-1)

class TorchConvTransformerTest(torch.nn.Module):
    """Two conv layers, one self-attention block, and a 10-way classifier.

    The final linear layer is sized for a 24x24 feature map, i.e. a 28x28
    single-channel input after two unpadded 3x3 convolutions. The residual
    connection around the attention block needs the attention output to be
    ``conv2d_filters`` wide, which holds when ``conv2d_filters`` is
    divisible by 4 (four heads of ``conv2d_filters // 4`` values each).

    Args:
        conv2d_filters: number of channels produced by both convolutions.
        **kwargs: forwarded to ``torch.nn.Module.__init__``.
    """

    def __init__(self, conv2d_filters, **kwargs):
        super().__init__(**kwargs)
        n_filters = conv2d_filters
        self.conv2d_1 = torch.nn.Conv2d(1, n_filters, 3)
        self.relu = torch.nn.ReLU()
        self.conv2d_2 = torch.nn.Conv2d(n_filters, n_filters, 3)
        self.at_layer = TorchMultiHeadScaledDotProductAttention(
            4, n_filters, n_filters//4, n_filters//4)
        # Position-wise expansion (x4) and contraction back to n_filters.
        self.dense_1 = torch.nn.Linear(n_filters, n_filters*4)
        self.dense_2 = torch.nn.Linear(n_filters*4, n_filters)
        self.dense_final = torch.nn.Linear(24*24*n_filters, 10)

    def forward(self, inputs):
        """Return class logits for a channels-first batch of images."""
        feature_map = self.relu(self.conv2d_1(inputs))
        feature_map = self.relu(self.conv2d_2(feature_map))

        # Turn (batch, channels, h, w) into a (batch, h*w, channels) sequence.
        fm_shape = feature_map.shape
        tokens = feature_map.reshape(
            [fm_shape[0], fm_shape[1], fm_shape[2]*fm_shape[3]])
        tokens = tokens.permute(0, 2, 1)

        # Self-attention with a residual connection.
        tokens = tokens+self.at_layer(tokens)

        # Expansion block, applied to every position independently.
        hidden = self.relu(self.dense_1(tokens))
        hidden = self.relu(self.dense_2(hidden))

        # Flatten all positions into one vector per example and classify.
        # No softmax here: raw logits are returned, which is the input
        # torch.nn.CrossEntropyLoss works with.
        flat_hidden = torch.flatten(hidden, start_dim=1)
        return self.dense_final(flat_hidden)

Overwriting torch_conv_transformer_test.py


In [49]:
from torch_conv_transformer_test import TorchConvTransformerTest

In [50]:
torch_transformer_mdl_def = dryml.ObjectDef(
    dryml.models.torch.generic.ModelWrapper,
    TorchConvTransformerTest,
    32)

torch_transformer_test_mdl_def = dryml.ObjectDef(
    dryml.models.torch.generic.Trainable,
    model=torch_transformer_mdl_def,
    train_fn=dryml.ObjectDef(
        dryml.models.torch.generic.BasicTraining,
        optimizer=dryml.ObjectDef(
            dryml.models.torch.generic.TorchOptimizer,
            torch.optim.Adam,
            model=torch_transformer_mdl_def
        ),
        loss=dryml.models.torch.generic.Wrapper(
            torch.nn.CrossEntropyLoss
        ),
        epochs=3
    )
)

In [51]:
torch_transformer_test_pipe = dryml.models.Pipe(
    img_prep_step,
    channels_first_step,
    torch_target_cast_step,
    torch_transformer_test_mdl_def.build(),
    best_cat_step,
    to_cpu_step,
)

In [52]:
train_method(torch_transformer_test_pipe, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [00:59<00:00, 31.27it/s, loss=0.00638]
  0%|          | 3/1875 [00:00<01:18, 23.75it/s, loss=0.00201] 

Epoch 1 - Average Loss: 0.006383662279639005


100%|██████████| 1875/1875 [00:59<00:00, 31.68it/s, loss=0.00192]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.000151]

Epoch 2 - Average Loss: 0.001918178413102578


100%|██████████| 1875/1875 [00:59<00:00, 31.74it/s, loss=0.00121]

Epoch 3 - Average Loss: 0.0012148515647911458





0.9845753205128205

In [53]:
model_dict['torch_transformer_test_norm'] = {
    'model': torch_transformer_test_pipe,
    'label': 'Convolutional Transformer Test (With Norm) (PyTorch)'
}

In [54]:
torch_transformer_test_pipe = dryml.models.Pipe(
    channels_first_step,
    torch_target_cast_step,
    torch_x_cast_step,
    torch_transformer_test_mdl_def.build(),
    best_cat_step,
    to_cpu_step,
)

In [55]:
train_method(torch_transformer_test_pipe, call_context_reqs={'tf': {}, 'torch': gpu_req})

100%|██████████| 1875/1875 [01:00<00:00, 31.22it/s, loss=0.00557]
  0%|          | 0/1875 [00:00<?, ?it/s, loss=0.00127] 

Epoch 1 - Average Loss: 0.005565646504743199


100%|██████████| 1875/1875 [01:00<00:00, 31.10it/s, loss=0.00249]
  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 0.00248566318638647


100%|██████████| 1875/1875 [01:00<00:00, 31.10it/s, loss=0.00172]

Epoch 3 - Average Loss: 0.0017240369595531472





0.9736578525641025

In [56]:
model_dict['torch_transformer_test_no_norm'] = {
    'model': torch_transformer_test_pipe,
    'label': 'Convolutional Transformer Test (No Norm) (PyTorch)'
}

# Tensorflow

Now we'll do the same for tensorflow.

## Simple 2-layer convolutional

In [16]:
import tensorflow as tf
import dryml.models.tf
import dryml.data.tf
import dryml.data.transforms

In [17]:
model_def = dryml.ObjectDef(
    dryml.models.tf.keras.Sequential,
    layer_defs = [
        [ 'Conv2D', (), {'filters': 32, 'kernel_size': 3, 'activation': 'relu', 'input_shape': (28, 28, 1)} ],
        [ 'Conv2D', (), {'filters': 32, 'kernel_size': 3, 'activation': 'relu'} ],
        [ 'Flatten', (), {}],
        [ 'Dense', (), {'units': 10, 'activation': 'softmax'}],
    ]
)

trainable_def = dryml.ObjectDef(
    dryml.models.tf.keras.Trainable,
    model=model_def,
    optimizer=dryml.models.tf.Wrapper(tf.keras.optimizers.Adam),
    loss=dryml.models.tf.Wrapper(
        tf.keras.losses.SparseCategoricalCrossentropy),
    train_fn=dryml.models.tf.keras.BasicTraining(
        epochs=3),
)

In [18]:
tf_model_1 = dryml.models.Pipe(
    img_prep_step,
    trainable_def.build(),
    best_cat_step,
)

In [19]:
train_method(tf_model_1, call_context_reqs={'tf': gpu_req })

2022-12-09 17:40:57.653236: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6293 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1
2022-12-09 17:40:57.653986: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7368 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1
2022-12-09 17:41:04.288051: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:547] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


Epoch 1/3


2022-12-09 17:41:08.411161: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8600
2022-12-09 17:41:08.683418: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8600




2022-12-09 17:41:14.024228: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:547] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


Epoch 2/3
Epoch 3/3


0.9856770833333334

In [None]:
model_dict['tf_simple_2layer_conv2d_norm'] = {
    'model': tf_model_1,
    'label': 'Simple 2 layer convolutional (With Norm) (Tensorflow)',
}

In [None]:
tf_model_1 = dryml.models.Pipe(
    trainable_def.build(),
    best_cat_step,
)

In [None]:
train_method(tf_model_1, call_context_reqs={'tf': gpu_req })

In [None]:
model_dict['tf_simple_2layer_conv2d_no_norm'] = {
    'model': tf_model_1,
    'label': 'Simple 2 layer convolutional (No Norm) (Tensorflow)',
}

## Lenet5

Let's create a lenet5 tensorflow model, and use the `ModelWrapper` to use it directly with `dryml`.

In [None]:
%%writefile tflenet5_temp.py

import tensorflow as tf

# From medium post:
# https://towardsdatascience.com/implementing-yann-lecuns-lenet-5-in-pytorch-5e05a0911320
class TFLenet5(tf.keras.Model):
    """LeNet-5 style classifier for 32x32 single-channel images.

    The hidden dense layer has 84 units, matching LeNet-5 and the PyTorch
    implementation this notebook compares against.

    Args:
        n_classes: number of output classes.
        **kwargs: forwarded to ``tf.keras.Model.__init__``.
    """

    def __init__(self, n_classes, **kwargs):
        super().__init__(**kwargs)

        # Convolutional trunk: conv -> avg-pool, twice, then a final conv.
        self.feature_extractor = tf.keras.Sequential([
            tf.keras.layers.Conv2D(filters=6, kernel_size=5, activation='tanh', input_shape=(32, 32, 1)),
            tf.keras.layers.AveragePooling2D(pool_size=2),
            tf.keras.layers.Conv2D(filters=16, kernel_size=5, activation='tanh'),
            tf.keras.layers.AveragePooling2D(pool_size=2),
            tf.keras.layers.Conv2D(filters=120, kernel_size=5, activation='tanh'),
        ])

        # Created once here instead of building a new layer on every call.
        self.flatten = tf.keras.layers.Flatten()

        # Fully connected head producing one logit per class.
        self.classifier = tf.keras.Sequential([
            tf.keras.layers.Dense(84, input_shape=(120,), activation='tanh'),
            tf.keras.layers.Dense(n_classes, activation='linear'),
        ])

    def call(self, x, training=True):
        """Return logits when ``training`` is truthy, else softmax probabilities.

        Because logits are returned during training, the loss used to train
        this model must be configured with ``from_logits=True``.
        """
        x = self.feature_extractor(x)
        x = self.flatten(x)
        logits = self.classifier(x)
        if not training:
            return tf.nn.softmax(logits, axis=-1)
        return logits

In [None]:
from tflenet5_temp import TFLenet5

In [None]:
# We can now build and train a pipeline containing this model!
tf_lenet5_mdl_def = dryml.ObjectDef(
    dryml.models.tf.keras.ModelWrapper,
    TFLenet5,
    10
)

tf_lenet5_trainable_def = dryml.ObjectDef(
    dryml.models.tf.keras.Trainable,
    model=tf_lenet5_mdl_def,
    optimizer=dryml.models.tf.Wrapper(
        tf.keras.optimizers.Adam
    ),
    loss=dryml.models.tf.Wrapper(
        tf.keras.losses.SparseCategoricalCrossentropy,
        from_logits=True,
    ),
    train_fn=dryml.models.tf.keras.BasicTraining(
        epochs=5
    )
)

In [None]:
# Lenet-5 expects 32x32 inputs, so resize before normalizing
tf_lenet5_pipe = dryml.models.Pipe(
    resize_step,
    img_prep_step,
    tf_lenet5_trainable_def.build(),
    best_cat_step,
)

In [None]:
train_method(tf_lenet5_pipe, call_context_reqs={'tf': gpu_req})

In [None]:
model_dict['tf_lenet5_norm'] = {
    'model': tf_lenet5_pipe,
    'label': 'Lenet-5 (With Norm) (Tensorflow)',
}

In [None]:
# Same Lenet-5 pipeline, this time without the normalization step
tf_lenet5_pipe = dryml.models.Pipe(
    resize_step,
    tf_lenet5_trainable_def.build(),
    best_cat_step,
)

In [None]:
train_method(tf_lenet5_pipe, call_context_reqs={'tf': gpu_req})

In [None]:
model_dict['tf_lenet5_no_norm'] = {
    'model': tf_lenet5_pipe,
    'label': 'Lenet-5 (No Norm) (Tensorflow)',
}

## Convolutional transformer

In [None]:
%%writefile tf_conv_transformer_test.py

import tensorflow as tf
import numpy as np

# Define the scaled dotproduct attention
class TFScaledDotProductAttention(tf.keras.layers.Layer):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all dense projections of the same input.

    Args:
        qk_dim: size of the query/key projections. Although the default is
            ``None``, a value must be supplied: it is passed to ``np.sqrt``.
        v_dim: size of the value projection; defaults to ``qk_dim``.
        **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
    """

    def __init__(self, qk_dim=None, v_dim=None, **kwargs):
        super().__init__(**kwargs)
        if qk_dim is not None and v_dim is None:
            v_dim = qk_dim
        self.scale_factor = np.sqrt(qk_dim)
        make_projection = tf.keras.layers.Dense
        self.query_layer = make_projection(qk_dim)
        self.key_layer = make_projection(qk_dim)
        self.value_layer = make_projection(v_dim)

    def call(self, inputs):
        """Apply self-attention to a rank-3 ``inputs`` tensor."""
        queries = self.query_layer(inputs)
        keys = self.key_layer(inputs)
        values = self.value_layer(inputs)
        # Pairwise query/key scores, scaled and normalized over the key axis.
        keys_t = tf.transpose(keys, perm=[0, 2, 1])
        scores = tf.matmul(queries, keys_t)/self.scale_factor
        weights = tf.math.softmax(scores, axis=-1)
        return tf.matmul(weights, values)

# Multihead attention
class TFMultiHeadScaledDotProductAttention(tf.keras.layers.Layer):
    """Several independent attention heads whose outputs are concatenated.

    Each head projects queries/keys to ``qk_dim // n_heads`` features and
    values to ``v_dim`` features, so the output's last dimension is
    ``n_heads * v_dim``.

    Args:
        n_heads: number of attention heads.
        qk_dim: total query/key width, split evenly across heads.
        v_dim: value width of each individual head.
        **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
    """

    def __init__(self, n_heads, qk_dim, v_dim, **kwargs):
        super().__init__(**kwargs)
        self.n_heads = n_heads
        per_head_qk = qk_dim // n_heads
        self.heads = [
            TFScaledDotProductAttention(qk_dim=per_head_qk, v_dim=v_dim)
            for _ in range(n_heads)
        ]

    def call(self, inputs):
        """Run every head on ``inputs`` and join results on the last axis."""
        per_head_outputs = [head(inputs) for head in self.heads]
        return tf.concat(per_head_outputs, axis=-1)

class TFConvTransformerTest(tf.keras.Model):
    """Two conv layers, one self-attention block, and a 10-way classifier.

    The final layer applies softmax, so this model outputs class
    probabilities; train it with a loss that expects probabilities
    (``from_logits=False``).

    The residual connection around the attention block needs the attention
    output to be ``conv2d_filters`` wide, which holds when
    ``conv2d_filters`` is divisible by 4 (four heads of
    ``conv2d_filters // 4`` values each).

    Args:
        conv2d_filters: number of channels produced by both convolutions.
        **kwargs: forwarded to ``tf.keras.Model.__init__``.
    """

    def __init__(self, conv2d_filters, **kwargs):
        super().__init__(**kwargs)
        self.conv2d_1 = tf.keras.layers.Conv2D(conv2d_filters, 3, activation='relu')
        self.conv2d_2 = tf.keras.layers.Conv2D(conv2d_filters, 3, activation='relu')
        # Collapses (height, width) into one sequence axis. A Reshape layer
        # is used instead of tf.reshape with tf.shape() components so the
        # static sequence length stays available to the layers below
        # (Flatten -> Dense needs a known input size to build its kernel).
        self.to_tokens = tf.keras.layers.Reshape((-1, conv2d_filters))
        self.at_layer = TFMultiHeadScaledDotProductAttention(4, conv2d_filters//4, conv2d_filters//4)
        # Created once here instead of building a new layer on every call.
        self.flatten = tf.keras.layers.Flatten()
        self.dense_1 = tf.keras.layers.Dense(conv2d_filters*4)
        self.dense_2 = tf.keras.layers.Dense(conv2d_filters)
        self.dense_final = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, inputs):
        """Return class probabilities for a channels-last batch of images."""
        x = self.conv2d_1(inputs)
        x = self.conv2d_2(x)
        # (batch, h, w, channels) -> (batch, h*w, channels)
        x = self.to_tokens(x)
        # Self-attention with a residual connection.
        at_x = self.at_layer(x)
        x = x+at_x
        # Flatten all positions into one vector per example and classify.
        x = self.flatten(x)
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.dense_final(x)
        return x

In [None]:
from tf_conv_transformer_test import TFConvTransformerTest

In [None]:
tf_transformer_mdl_def = dryml.ObjectDef(
    dryml.models.tf.keras.ModelWrapper,
    TFConvTransformerTest,
    32
)

# Trainable definition for the TF convolutional transformer.
# * The model's last layer already applies softmax, so the loss is left at
#   its default (from_logits=False) rather than treating probabilities as
#   logits.
# * The EarlyStopping callback is wrapped as class + keyword arguments, the
#   same way the optimizer and loss are wrapped.
tf_transformer_test_mdl_def = dryml.ObjectDef(
    dryml.models.tf.keras.Trainable,
    model=tf_transformer_mdl_def,
    optimizer=dryml.models.tf.Wrapper(
        tf.keras.optimizers.Adam
    ),
    loss=dryml.models.tf.Wrapper(
        tf.keras.losses.SparseCategoricalCrossentropy,
    ),
    train_fn=dryml.models.tf.keras.BasicTraining(
        callbacks=[
            dryml.models.tf.Wrapper(
                tf.keras.callbacks.EarlyStopping,
                patience=2,
                restore_best_weights=True,
            )
        ]
    )
)

In [None]:
tf_transformer_test_pipe = dryml.models.Pipe(
    img_prep_step,
    tf_transformer_test_mdl_def.build(),
    best_cat_step,
)

In [None]:
train_method(tf_transformer_test_pipe, call_context_reqs={'tf': gpu_req})

In [None]:
model_dict['tf_transformer_test_1_norm'] = {
    'model': tf_transformer_test_pipe,
    'label': 'Convolutional Transformer Test (With Norm) (Tensorflow)'
}

In [None]:
tf_transformer_test_pipe = dryml.models.Pipe(
    dryml.data.transforms.Cast(mode='X', dtype='float32'),
    tf_transformer_test_mdl_def.build(),
    best_cat_step,
)

In [None]:
train_method(tf_transformer_test_pipe, call_context_reqs={'tf': gpu_req})

In [None]:
model_dict['tf_transformer_test_1_no_norm'] = {
    'model': tf_transformer_test_pipe,
    'label': 'Convolutional Transformer Test (No Norm) (Tensorflow)'
}

# Model Comparison

Let's compare these models directly and create a plot! We'll create a function which takes a dictionary with models and their labels, and creates the plot.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# define method for computing accuracy
def compute_accuracies(model_dict):
    """Fill in the ``'acc'`` entry of every model that does not have one yet.

    Args:
        model_dict: maps a model name to a dict holding at least ``'model'``.
            Entries are updated in place with an ``'acc'`` key; entries that
            already carry ``'acc'`` are left untouched.
    """
    announced = False
    for mdl_name, entry in model_dict.items():
        if 'acc' in entry:
            continue
        # Print the header once, and only if there is something to compute.
        if not announced:
            print("Computing Accuracies")
            announced = True
        print(f"model: {mdl_name}")
        entry['acc'] = test_method(
            entry['model'],
            call_context_reqs={'tf': {'gpu/1': 1.}, 'torch': {'gpu/0': 1.}})

In [None]:
# refresh accuracies
compute_accuracies(model_dict)

In [None]:
# Define a method to build a full plot
def plot_model_errors(model_dict):
    """Draw a horizontal bar chart of each model's error rate (in percent).

    Args:
        model_dict: maps a model name to a dict with a ``'label'`` and,
            once evaluated, an ``'acc'`` entry. Every entry that has
            ``'acc'`` gets an ``'err'`` key written in place. Entries
            without an error rate are left out of the plot.
    """
    # Compute error rate
    for entry in model_dict.values():
        if 'acc' in entry:
            entry['err'] = (1.-entry['acc'])*100.

    # Only models with an error rate can be sorted and plotted; skipping the
    # rest avoids a KeyError for models that have not been evaluated yet.
    model_names = [name for name, entry in model_dict.items()
                   if 'err' in entry]
    # Ascending error rate: the lowest error rate is drawn at y position 0.
    model_names.sort(key=lambda name: model_dict[name]['err'])
    y_pos = np.arange(len(model_names))

    fig, ax = plt.subplots()

    errs = [model_dict[n]['err'] for n in model_names]
    labels = [model_dict[n]['label'] for n in model_names]

    ax.barh(y_pos, errs, align='center')
    ax.set_yticks(y_pos, labels=labels)
    ax.set_xlabel("Error rate [Percentage]")

    plt.show()

In [None]:
plot_model_errors(model_dict)

# Can we do better?

There are a few techniques we've left on the cutting room floor so-to-speak.

* **Data Augmentation**: We can apply transformations to the dataset to expose the model to 'new' image examples
* **L2 Regularization**: We can restrict the size of the weights of some layers by applying an L2 regularization term. This term penalizes the model for having weights with large magnitudes.
* **Learning Rate Scheduling**: As training proceeds, the model will reach a plateau. This is a sign we should reduce the learning rate.
* **Dropout Regularization**: Commonly, Dropout is used to make the model more robust. Dropout encourages the model to learn useful features so it can still do well when a random selection of features is removed.
* **Batch Normalization**: This technique helps gradients propagate through the network.

Models and techniques throughout this section are taken from the following article: (Thanks to Jay Gupta)
https://towardsdatascience.com/going-beyond-99-mnist-handwritten-digits-recognition-cfff96337392

## Data Augmentation

First, let's apply some data augmentation to the dataset to give the model more 'examples' to learn from. We'll add random scaling, rotations, translations, and some noise. We'll have to create our own data transform object for that.

In [20]:
%%writefile image_augmentation.py
import tensorflow as tf
import dryml.models
import dryml.data

class ImageAugmentation(dryml.models.Trainable):
    """Random image augmentation applied to the X component of a dataset.

    A small keras model is assembled in `compute_prepare_imp` which chains,
    in order: Gaussian noise, random zoom, random rotation, random
    translation and a final clip of the values to `min_max`.

    Args:
        noise_stddev: Standard deviation handed to
            `tf.keras.layers.GaussianNoise`.
        min_max: `(low, high)` pair the augmented values are clipped to.
        zoom: Factor handed to `tf.keras.layers.RandomZoom` as its first
            (height) factor.
        rot: Factor handed to `tf.keras.layers.RandomRotation`.
        trans_w: Width factor handed to `tf.keras.layers.RandomTranslation`.
        trans_h: Height factor handed to `tf.keras.layers.RandomTranslation`.
        fill_mode: Fill mode shared by the zoom, rotation and translation
            layers.
        seed: Random seed shared by every random layer.
        batch_size: Batch size used by `eval` when the incoming dataset is
            not batched yet.
        channels: Size of the last (channel) axis of the model input.
    """

    def __init__(
            self,
            noise_stddev=0.0,
            min_max=(0., 1.),
            zoom=(0., 0.),
            rot=(0., 0.),
            trans_w=(0., 0.),
            trans_h=(0., 0.),
            fill_mode='constant',
            seed=42,
            batch_size=32,
            channels=3):
        # Noise
        self.noise_stddev = noise_stddev
        self.min_max = min_max
        # Scaling
        self.zoom = zoom
        # Rotation
        self.rot = rot
        # Translation
        self.trans_w = trans_w
        self.trans_h = trans_h
        # Seed
        self.seed = seed
        self.fill_mode = fill_mode
        self.batch_size = batch_size
        self.channels = channels

    def compute_prepare_imp(self):
        """Build the keras augmentation model and store it on `self.augmenter`."""
        # Height and width are left unspecified; only the channel count is
        # fixed.
        inp = tf.keras.layers.Input((None, None, self.channels))
        last_layer = inp

        # Noise
        last_layer = tf.keras.layers.GaussianNoise(
            self.noise_stddev,
            seed=self.seed,
        )(last_layer)

        # Scaling
        last_layer = tf.keras.layers.RandomZoom(
            self.zoom,
            seed=self.seed,
            fill_mode=self.fill_mode,
        )(last_layer)
        # Rotation
        last_layer = tf.keras.layers.RandomRotation(
            self.rot,
            seed=self.seed,
            fill_mode=self.fill_mode,
        )(last_layer)
        # Translation
        # The seed is forwarded here as well, so that every random layer
        # honours the `seed` constructor argument.
        last_layer = tf.keras.layers.RandomTranslation(
            self.trans_h,
            self.trans_w,
            seed=self.seed,
            fill_mode=self.fill_mode,
        )(last_layer)
        # Clip value
        last_layer = tf.clip_by_value(
            last_layer, self.min_max[0], self.min_max[1])

        self.augmenter = tf.keras.Model(
            inputs=inp,
            outputs=last_layer
        )

    def compute_cleanup_imp(self):
        """Drop the keras augmentation model built by `compute_prepare_imp`."""
        del self.augmenter

    def eval(self, data: dryml.data.dataset.Dataset, *args, **kwargs):
        """Return `data` with the augmentation applied to its X component.

        The dataset is batched with `self.batch_size` first if it is not
        batched already. Extra positional and keyword arguments are accepted
        but not used.
        """
        if not data.batched:
            data = data.batch(batch_size=self.batch_size)
        # training=True: the keras noise / random preprocessing layers only
        # perturb their input in training mode.
        return data.apply_X(
            lambda X: self.augmenter(X, training=True)
        )

Overwriting image_augmentation.py


In [21]:
from image_augmentation import ImageAugmentation

In [22]:
# Create the image augmenter: no added noise, with zoom, rotation and
# translation factors given as (lower, upper) ranges.
# NOTE(review): `rot` is handed to `tf.keras.layers.RandomRotation`, whose
# factor is documented as a fraction of 2*pi, so (-0.2, 0.2) would allow
# rotations of up to roughly +/-72 degrees -- confirm this is intended for
# digit images.
# channels=1 matches the single-channel (32, 32, 1) input declared by
# TFBetterLenet5 later in this notebook.
img_augmenter = ImageAugmentation(
    noise_stddev=0.0,
    zoom=(-0.1, 0.1),
    rot=(-0.2, 0.2),
    trans_w=(-0.1, 0.1),
    trans_h=(-0.1, 0.1),
    channels=1,
)

We need to create a new training method which can apply the image augmentation to the training set.

In [23]:
@dryml.context.compute_context(ctx_update_objs=True)
def train_method_aug(trainable, augmenter):
    """Train `trainable` on augmented MNIST digits and score it on the test set.

    The MNIST digits workshop is created and prepared inside the function.
    Only the training split is passed through `augmenter.eval`; the test
    split is used as-is.

    Args:
        trainable: Object exposing `prep_train()` and `train(dataset)`.
        augmenter: Object whose `eval(dataset)` returns the augmented
            training dataset.

    Returns:
        The categorical accuracy of `trainable` on the workshop's test set.
    """
    # Imports live inside the function body, as in the notebook's other
    # training helpers.
    import dryml.examples.mnist_digits as dry_digits
    import dryml.metrics

    workshop = dry_digits.MNISTDigitsWorkshop()
    workshop.data_prep()

    # Augment the training split only.
    augmented_train_ds = augmenter.eval(workshop.train_ds)

    trainable.prep_train()
    trainable.train(augmented_train_ds)

    test_accuracy = dryml.metrics.scalar.categorical_accuracy(
        trainable, workshop.test_ds)
    return test_accuracy

## Adjusting the Learning Rate

We can now add a callback, `tf.keras.callbacks.ReduceLROnPlateau`, to our training method. It lowers the learning rate once training stops improving.

In [24]:
# Let's define the callback. Per the Keras documentation, ReduceLROnPlateau
# multiplies the learning rate by `factor` once the monitored metric has gone
# `patience` epochs without improving.
# NOTE(review): the model definition further down builds its own
# ReduceLROnPlateau wrapper (with patience=2) instead of reusing
# `lr_callback` -- confirm which setting is intended.
lr_callback = dryml.models.tf.Wrapper(
    tf.keras.callbacks.ReduceLROnPlateau,
    factor=0.2,
    patience=5,
)

## Bigger Model

We'll now improve the Lenet-5 model with additional techniques such as `Dropout`, batch normalization, and L2 regularization.

In [25]:
%%writefile tf_better_lenet5.py

import tensorflow as tf

class TFBetterLenet5(tf.keras.Model):
    """Lenet-5 style CNN extended with batch norm, dropout and L2 penalties.

    The network is assembled with the keras functional API inside
    `__init__`. It takes inputs of shape (32, 32, 1) and ends in a 10-unit
    softmax layer.
    """

    def __init__(self):
        layers = tf.keras.layers

        def conv_relu_l2(tensor, n_filters, size):
            # Convolution with a built-in relu and an L2 kernel penalty.
            return layers.Conv2D(
                filters=n_filters,
                kernel_size=size,
                strides=1,
                activation='relu',
                kernel_regularizer=tf.keras.regularizers.l2(0.0005),
            )(tensor)

        def bn_relu(tensor):
            # Batch normalization followed by a relu activation.
            tensor = layers.BatchNormalization()(tensor)
            return layers.Activation('relu')(tensor)

        def conv_bn_relu(tensor, n_filters, size):
            # Bias-free convolution, then batch norm + relu.
            tensor = layers.Conv2D(
                filters=n_filters,
                kernel_size=size,
                strides=1,
                use_bias=False,
            )(tensor)
            return bn_relu(tensor)

        def pool_dropout(tensor):
            # 2x2 max pooling with stride 2, then 25% dropout.
            tensor = layers.MaxPooling2D(pool_size=2, strides=2)(tensor)
            return layers.Dropout(0.25)(tensor)

        def dense_bn_relu(tensor, n_units):
            # Bias-free dense layer, then batch norm + relu.
            tensor = layers.Dense(units=n_units, use_bias=False)(tensor)
            return bn_relu(tensor)

        image = layers.Input((32, 32, 1))

        # Layers 1-3: first convolutional stage (32 filters, 5x5 kernels)
        net = conv_relu_l2(image, 32, 5)
        net = conv_bn_relu(net, 32, 5)
        net = pool_dropout(net)

        # Layers 4-6: second convolutional stage (64 filters, 3x3 kernels)
        net = conv_relu_l2(net, 64, 3)
        net = conv_bn_relu(net, 64, 3)
        net = pool_dropout(net)
        net = layers.Flatten()(net)

        # Layers 7-9: dense stack, followed by a single dropout
        for n_units in (256, 128, 84):
            net = dense_bn_relu(net, n_units)
        net = layers.Dropout(0.25)(net)

        # Layer 10: class probabilities
        probabilities = layers.Dense(units=10, activation='softmax')(net)

        super().__init__(inputs=image, outputs=probabilities)

Overwriting tf_better_lenet5.py


In [26]:
from tf_better_lenet5 import TFBetterLenet5

Now we have access to all the improvements mentioned before. Let's create a model which we hope will train better.

In [28]:
# Definition of the improved Lenet-5 network, wrapped in DRYML's keras
# ModelWrapper.
tflenet5_2_mdl_def = dryml.ObjectDef(
    dryml.models.tf.keras.ModelWrapper,
    TFBetterLenet5,
)

# Trainable definition combining the model with an Adam optimizer, a sparse
# categorical cross-entropy loss and the training procedure.
tflenet5_mdl_comp_def = dryml.ObjectDef(
    dryml.models.tf.keras.Trainable,
    model=tflenet5_2_mdl_def,
    optimizer=dryml.models.tf.Wrapper(
        tf.keras.optimizers.Adam
    ),
    loss=dryml.models.tf.Wrapper(
        tf.keras.losses.SparseCategoricalCrossentropy
    ),
    train_fn=dryml.models.tf.keras.BasicTraining(
        epochs=30,
        callbacks=[
            # Learning-rate scheduling. Note this is a fresh wrapper with
            # patience=2, not the `lr_callback` (patience=5) defined earlier.
            dryml.models.tf.Wrapper(
                tf.keras.callbacks.ReduceLROnPlateau,
                factor=0.2,
                patience=2,
            )
        ],
        # Data augmentation for the training data, using the same settings
        # as `img_augmenter` above.
        # NOTE(review): the recorded run of this notebook ends in
        # "AttributeError: 'PrefetchDataset' object has no attribute
        # 'batched'"; ImageAugmentation.eval reads `data.batched`, so the
        # transform is presumably being handed a raw tf.data dataset here --
        # confirm against BasicTraining.
        train_transform=ImageAugmentation(
            noise_stddev=0.0,
            zoom=(-0.1, 0.1),
            rot=(-0.2, 0.2),
            trans_w=(-0.1, 0.1),
            trans_h=(-0.1, 0.1),
            channels=1,
        )
    )
)

In [29]:
# Assemble the full pipeline: the shared preprocessing steps defined earlier
# in the notebook, a freshly built instance of the improved Lenet-5
# trainable, and the final `best_cat_step`.
# NOTE(review): `resize_step` presumably resizes the images to the 32x32
# input expected by TFBetterLenet5, and `best_cat_step` presumably picks the
# most likely class -- confirm against their definitions.
tflenet_5_pipe_norm = dryml.models.Pipe(
    img_prep_step,
    resize_step,
    tflenet5_mdl_comp_def.build(),
    best_cat_step,
)

In [30]:
# Train the pipeline through `train_method`, passing `gpu_req` as the context
# requirement for tensorflow; on success the call returns the categorical
# accuracy on the test set.
# NOTE(review): the recorded output of this cell is an AttributeError raised
# inside the context process, so the cells below were not run.
train_method(tflenet_5_pipe_norm, call_context_reqs={'tf': gpu_req})

2022-12-09 17:42:05.875060: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6292 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1
2022-12-09 17:42:05.875480: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7368 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1


Exception encountered in context thread! pid: 2324434
Traceback (most recent call last):
  File "/data0/matthew/Software/NCSA/DRYML/src/dryml/context/process.py", line 34, in run
    super().run()
  File "/data0/matthew/Software/NCSA/DRYML/venv_dryml_dev/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/data0/matthew/Software/NCSA/DRYML/src/dryml/context/process.py", line 186, in __call__
    self.final_call(f, ctx_send_q, ctx_ret_q, *args, **kwargs)
  File "/data0/matthew/Software/NCSA/DRYML/src/dryml/context/process.py", line 129, in final_call
    res = f(*args, **kwargs)
  File "/tmp/ipykernel_2323505/602169870.py", line 9, in train_method
  File "/data0/matthew/Software/NCSA/DRYML/src/dryml/context/process.py", line 268, in wrapped_func
    res = f(*args, **kwargs)
  File "/data0/matthew/Software/NCSA/DRYML/src/dryml/models/pipe.py", line 37, in train
    step.train(
  File "/data0/matthew/Software/NCSA/DRYML/src/dry

AttributeError: 'PrefetchDataset' object has no attribute 'batched'

In [None]:
# Register the improved Lenet-5 pipeline, with its plot label, alongside the
# other models in `model_dict`.
model_dict['tflenet5_better_norm'] = {
    'model': tflenet_5_pipe_norm,
    'label': "Lenet-5 Improved (With Norm) (Tensorflow)"
}

In [None]:
# Presumably fills in the 'acc' entry that `plot_model_errors` reads for each
# model -- `compute_accuracies` is defined earlier in the notebook; confirm.
compute_accuracies(model_dict)

In [None]:
# Redraw the error-rate bar chart, now including the improved Lenet-5 entry.
plot_model_errors(model_dict)