# MNIST image classification with Keras only

In [1]:
import os
import shutil
from datetime import datetime
import sys
import json
import argparse

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
print(tf.__version__)

2.1.0


In [2]:
PROJECT = "ml-practice-260405" # REPLACE WITH YOUR PROJECT ID
BUCKET = "bucket-ml-practice-260405" # REPLACE WITH YOUR BUCKET NAME
REGION = "us-central1" # REPLACE WITH YOUR BUCKET REGION e.g. us-central1
MODEL_TYPE = "dnn"  # "linear", "dnn", "dnn_dropout", or "cnn"

In [13]:
# Do not change 
os.environ["ACCOUNT"] = "sandcorp2014@gmail.com"
os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION
os.environ["MODEL_TYPE"] = MODEL_TYPE
os.environ["TFVERSION"] = "2.1.0"  # Tensorflow version
os.environ["IMAGE_URI"] = os.path.join("gcr.io", PROJECT, "mnistmodel")

In [4]:
%%bash
gcloud config set account $ACCOUNT

Updated property [core/account].


In [5]:
%%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


In [None]:
%%bash
mkdir mnistmodel_keras_only
mkdir mnistmodel_keras_only/trainer/

In [6]:
%%writefile mnistmodel_keras_only/trainer/__init__.py
# Empty file

Overwriting mnistmodel_keras_only/trainer/__init__.py


In [7]:
%%writefile mnistmodel_keras_only/trainer/task.py
import argparse
import json
import os
import sys

from . import model

def _parse_arguments(argv):
    """
    Parse command line arguments
    """
    parser = argparse.ArgumentParser()
    
    parser.add_argument(
        '--model_type',
        help="Which model type to use.",
        type=str,
        default='dnn'
    )
    
    parser.add_argument(
        '--epochs',
        help='The number of epochs to train.',
        type=int,
        default=10
    )
    
    parser.add_argument(
        '--steps_per_epoch',
        help='The number of steps per epoch to train.',
        type=int,
        default=100
    )
    
    parser.add_argument(
        '--job-dir',
        help="Directory where to save the model.",
        type=str,
        default='mnistmodel_keras_only/'
    )
    
    return parser.parse_known_args(argv)

def main():
    """
    Parse command line arguments and kicks off model training.
    """
    args = _parse_arguments(sys.argv[1:])[0]
    
    model_layers = model.get_layers(args.model_type)
    
    image_model = model.build_model(model_layers, args.job_dir)
    
    model_history = model.train_and_evaluate(
        image_model, args.epochs, args.steps_per_epoch, args.job_dir
    )
    
if __name__ == '__main__':
    main()

Overwriting mnistmodel_keras_only/trainer/task.py


Next, let's group non-model functions into a util file to keep the model file simple. We'll copy over the `scale` and `load_dataset` functions from the previous lab.

In [8]:
%%writefile mnistmodel_keras_only/trainer/util.py
import tensorflow as tf

def scale(image, label):
    """
    Scale image from 0 to 255 int range to a 0.0 to 1.0 float range
    """
    image = tf.cast(x=image, dtype=tf.float32)
    image /= 255
    image = tf.expand_dims(input=image, axis=-1)
    return image, label

def load_dataset(data, training=True, buffer_size=5000, batch_size=100, nclasses=10):
    """
    Loads MNIST dataset into a tf.data.Dataset
    """
    (x_train, y_train), (x_test, y_test) = data
    
    x = x_train if training else x_test
    y = y_train if training else y_test
    
    # One-hot encode the class
    y = tf.keras.utils.to_categorical(y = y, num_classes=nclasses)
    
    # Convert our data into tf.data
    dataset = tf.data.Dataset.from_tensor_slices((x,y))
    dataset = dataset.map(scale).batch(batch_size)
    
    # During training shuffle our dataset
    if training:
        dataset = dataset.shuffle(buffer_size).repeat()
        
    return dataset

Overwriting mnistmodel_keras_only/trainer/util.py


Finally, let's code the models! The [tf.keras API](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras) accepts an array of [layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers) into a [model object](https://www.tensorflow.org/api_docs/python/tf/keras/Model), so we can create a dictionary of layers based on the different model types we want to use. The below file has two functions: `get_layers` and `create_and_train_model`. We will build the structure of our model in `get_layers`. Last but not least, we'll copy over the training code from the previous lab into `train_and_evaluate`.

These models progressively build on each other. Look at the imported `tensorflow.keras.layers` modules and the default values for the variables defined in `get_layers` for guidance.

In [9]:
%%writefile mnistmodel_keras_only/trainer/model.py
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv2D, Dense, Dropout, Flatten,
                                     MaxPooling2D, Softmax)

from . import util

# Image Variables
WIDTH = 28
HEIGHT = 28

def get_layers(model_type, nclasses = 10, hidden_layer_1_neurons=400,
              hidden_layer_2_neurons=100):
    """
    Construct layers for keras model based on a dict of model types.
    """
    model_layers = {
        'linear':[
            Flatten(),
            Dense(nclasses),
            Softmax()
        ],
        'dnn':[
            Flatten(),
            Dense(hidden_layer_1_neurons, activation='relu'),
            Dense(hidden_layer_2_neurons, activation='relu'),
            Dense(nclasses),
            Softmax()
        ]
    }
    
    return model_layers[model_type]

def build_model(layers, output_dir):
    """
    Compiles keras model for image classification.
    """
    model = Sequential(layers)
    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

def train_and_evaluate(model, num_epochs, steps_per_epoch, output_dir):
    """
    Compiles keras model and loads data into it for training.
    """
    # Load MNIST dataset
    mnist = tf.keras.datasets.mnist.load_data()
    
    # Spilt dataset into train and validation.
    train_data = util.load_dataset(mnist)
    validation_data = util.load_dataset(mnist, training=False)
    
    # Create TensorBoard callback
    callbacks = []
    if output_dir:
        tensorboard_callback = TensorBoard(log_dir=output_dir)
        callbacks = [tensorboard_callback]
    
    # Train the model
    history = model.fit(
        train_data,
        validation_data=validation_data,
        epochs=num_epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=callbacks
    )
    
    # Now save the trained model
    if output_dir:
        export_path = os.path.join(output_dir, 'keras_export')
        model.save(export_path, save_format='tf')
    
    return history

Overwriting mnistmodel_keras_only/trainer/model.py


## Run as a Python module

Since we want to run our code on Cloud ML Engine, we've packaged it as a python module.

The `model.py` and `task.py` containing the model code is in <a href="mnistmodel_keras_only/trainer">mnistmodel_keras_only/trainer</a>

**Let's first run it locally for a few steps to test the code.** 

In [10]:
%%bash
MODEL_TYPE='dnn'
JOB_DIR='mnistmodel_keras_only'
rm -rf mnistmodel_keras_only.tar.gz mnist_keras_only_trained
python3 -m mnistmodel_keras_only.trainer.task \
    --job-dir=${JOB_DIR}\
    --epochs=5 \
    --steps_per_epoch=50 \
    --model_type=${MODEL_TYPE}

Train for 50 steps, validate for 100 steps
Epoch 1/5
50/50 - 3s - loss: 0.8095 - accuracy: 0.7768 - val_loss: 0.3892 - val_accuracy: 0.8829
Epoch 2/5
50/50 - 1s - loss: 0.3459 - accuracy: 0.8936 - val_loss: 0.2922 - val_accuracy: 0.9121
Epoch 3/5
50/50 - 1s - loss: 0.2650 - accuracy: 0.9208 - val_loss: 0.2329 - val_accuracy: 0.9291
Epoch 4/5
50/50 - 1s - loss: 0.2520 - accuracy: 0.9286 - val_loss: 0.2080 - val_accuracy: 0.9399
Epoch 5/5
50/50 - 1s - loss: 0.2035 - accuracy: 0.9396 - val_loss: 0.1881 - val_accuracy: 0.9438


2020-06-25 12:23:58.912648: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-06-25 12:23:58.912713: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-06-25 12:23:58.912720: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-06-25 12:23:59.857152: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-06-25 12:23:59.857177: E tensorflow/stream_executor/cuda

In [11]:
%%bash
MODEL_TYPE='dnn'
JOB_DIR='mnistmodel_keras_only'
rm -rf mnistmodel_keras_only.tar.gz mnist_keras_only_trained
JOB_DIR=./tmp
gcloud ml-engine local train \
    --module-name=trainer.task \
    --package-path=./mnistmodel_keras_only/trainer \
    -- \
    --job-dir=${JOB_DIR}\
    --epochs=10 \
    --steps_per_epoch=50 \
    --model_type=${MODEL_TYPE}

Train for 50 steps, validate for 100 steps
Epoch 1/10
50/50 - 3s - loss: 0.7703 - accuracy: 0.7766 - val_loss: 0.3322 - val_accuracy: 0.9076
Epoch 2/10
50/50 - 1s - loss: 0.3577 - accuracy: 0.8954 - val_loss: 0.2801 - val_accuracy: 0.9182
Epoch 3/10
50/50 - 1s - loss: 0.2713 - accuracy: 0.9204 - val_loss: 0.2451 - val_accuracy: 0.9252
Epoch 4/10
50/50 - 1s - loss: 0.2280 - accuracy: 0.9340 - val_loss: 0.2121 - val_accuracy: 0.9362
Epoch 5/10
50/50 - 1s - loss: 0.2141 - accuracy: 0.9396 - val_loss: 0.1982 - val_accuracy: 0.9380
Epoch 6/10
50/50 - 1s - loss: 0.2022 - accuracy: 0.9380 - val_loss: 0.1827 - val_accuracy: 0.9430
Epoch 7/10
50/50 - 1s - loss: 0.1573 - accuracy: 0.9528 - val_loss: 0.1673 - val_accuracy: 0.9491
Epoch 8/10
50/50 - 1s - loss: 0.1991 - accuracy: 0.9370 - val_loss: 0.1591 - val_accuracy: 0.9502
Epoch 9/10
50/50 - 1s - loss: 0.1604 - accuracy: 0.9502 - val_loss: 0.1386 - val_accuracy: 0.9586
Epoch 10/10
50/50 - 1s - loss: 0.1437 - accuracy: 0.9560 - val_loss: 0.1241

2020-06-25 12:24:24.059087: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-06-25 12:24:24.059175: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-06-25 12:24:24.059186: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-06-25 12:24:24.944040: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-06-25 12:24:24.944063: E tensorflow/stream_executor/cuda

**Now, let's do it on Cloud ML Engine so we can train on GPU (`--scale-tier=BASIC_GPU`)**

Note the GPU speed up depends on the model type. You'll notice the more complex CNN model trains significantly faster on GPU, however the speed up on the simpler models is not as pronounced.

In [14]:
%%bash
OUTDIR=gs://${BUCKET}/mnist/trained_dnn
JOBNAME=mnist_dnn_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
    --region=$REGION \
    --module-name=trainer.task \
    --package-path=./mnistmodel_keras_only/trainer \
    --job-dir=$OUTDIR \
    --staging-bucket=gs://$BUCKET \
    --scale-tier=BASIC_GPU \
    --runtime-version=$TFVERSION \
    -- \
    --epochs=50 \
    --steps_per_epoch=50 \
    --model_type=$MODEL_TYPE

gs://bucket-ml-practice-260405/mnist/trained_dnn us-central1 mnist_dnn_200625_062605


CommandException: 1 files/objects could not be removed.
ERROR: (gcloud.ml-engine.jobs.submit.training) INVALID_ARGUMENT: Field: runtime_version Error: The specified runtime version '2.1.0' with the Python version '' is not supported or is deprecated.  Please specify a different runtime version. See https://cloud.google.com/ml-engine/docs/runtime-version-list for a list of supported versions
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: The specified runtime version '2.1.0' with the Python version ''
      is not supported or is deprecated.  Please specify a different runtime version.
      See https://cloud.google.com/ml-engine/docs/runtime-version-list for a list
      of supported versions
    field: runtime_version


CalledProcessError: Command 'b'OUTDIR=gs://${BUCKET}/mnist/trained_dnn\nJOBNAME=mnist_dnn_$(date -u +%y%m%d_%H%M%S)\necho $OUTDIR $REGION $JOBNAME\ngsutil -m rm -rf $OUTDIR\ngcloud ml-engine jobs submit training $JOBNAME \\\n    --region=$REGION \\\n    --module-name=trainer.task \\\n    --package-path=./mnistmodel_keras_only/trainer \\\n    --job-dir=$OUTDIR \\\n    --staging-bucket=gs://$BUCKET \\\n    --scale-tier=BASIC_GPU \\\n    --runtime-version=$TFVERSION \\\n    -- \\\n    --epochs=50 \\\n    --steps_per_epoch=50 \\\n    --model_type=$MODEL_TYPE\n'' returned non-zero exit status 1.