# MNIST image classification with Keras only

In [1]:
import os
import shutil
from datetime import datetime
import sys
import json
import argparse

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
print(tf.__version__)

2.1.0


In [3]:
PROJECT = "ml-practice-260405" # REPLACE WITH YOUR PROJECT ID
BUCKET = "bucket-ml-practice-260405" # REPLACE WITH YOUR BUCKET NAME
REGION = "us-central1" # REPLACE WITH YOUR BUCKET REGION e.g. us-central1
MODEL_TYPE = "cnn"  # "linear", "dnn", "dnn_dropout", or "cnn"
SAC = 'jupyter-notebook-sac-f'
SAC_KEY_DESTINATION = '/media/mujahid7292/Data/Gcloud_Tem_SAC'

In [4]:
# Do not change 
os.environ["ACCOUNT"] = "sandcorp2014@gmail.com"
os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION
os.environ['SAC'] = SAC
os.environ['SAC_KEY_DESTINATION'] = SAC_KEY_DESTINATION
os.environ["MODEL_TYPE"] = MODEL_TYPE
os.environ["TFVERSION"] = "2.1.0"  # Tensorflow version
os.environ["IMAGE_URI"] = os.path.join("gcr.io", PROJECT, "mnistmodel")

# Activate the service account with above key

In [5]:
%%bash
gcloud auth activate-service-account \
--key-file=${SAC_KEY_DESTINATION}/${SAC}.json

Activated service account credentials for: [jupyter-notebook-sac-f@ml-practice-260405.iam.gserviceaccount.com]


# Set Google Application Credentials

In [6]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='{}/{}.json'.format(SAC_KEY_DESTINATION,SAC)

Check Whether Google Application Credential Was Set Successfully Outside Virtual Environment

In [7]:
%%bash
set | grep GOOGLE_APPLICATION_CREDENTIALS 

GOOGLE_APPLICATION_CREDENTIALS=/media/mujahid7292/Data/Gcloud_Tem_SAC/jupyter-notebook-sac-f.json


# Set Default Project And Region

In [8]:
%%bash
gcloud config set account $ACCOUNT

Updated property [core/account].


In [9]:
%%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


# Give GCS Access Permision To This Service Account

In [10]:
%%bash
gcloud projects add-iam-policy-binding $PROJECT \
    --member serviceAccount:$SAC@$PROJECT.iam.gserviceaccount.com \
    --role roles/storage.admin

bindings:
- members:
  - serviceAccount:jupyter-notebook-sac-f@ml-practice-260405.iam.gserviceaccount.com
  role: roles/bigquery.admin
- members:
  - serviceAccount:229327834475@cloudbuild.gserviceaccount.com
  role: roles/cloudbuild.builds.builder
- members:
  - serviceAccount:service-229327834475@gcp-sa-cloudbuild.iam.gserviceaccount.com
  role: roles/cloudbuild.serviceAgent
- members:
  - serviceAccount:service-229327834475@compute-system.iam.gserviceaccount.com
  role: roles/compute.serviceAgent
- members:
  - serviceAccount:service-229327834475@container-engine-robot.iam.gserviceaccount.com
  role: roles/container.serviceAgent
- members:
  - serviceAccount:service-229327834475@dataflow-service-producer-prod.iam.gserviceaccount.com
  role: roles/dataflow.serviceAgent
- members:
  - serviceAccount:service-229327834475@trifacta-gcloud-prod.iam.gserviceaccount.com
  role: roles/dataprep.serviceAgent
- members:
  - serviceAccount:229327834475-compute@developer.gserviceaccount.com
  - s

Updated IAM policy for project [ml-practice-260405].


In [11]:
%%bash
mkdir mnistmodel_keras_only
mkdir mnistmodel_keras_only/trainer/

In [12]:
%%writefile mnistmodel_keras_only/trainer/__init__.py
# Empty file

Writing mnistmodel_keras_only/trainer/__init__.py


In [13]:
%%writefile mnistmodel_keras_only/trainer/task.py
import argparse
import json
import os
import sys

from . import model

def _parse_arguments(argv):
    """
    Parse command line arguments
    """
    parser = argparse.ArgumentParser()
    
    parser.add_argument(
        '--model_type',
        help="Which model type to use.",
        type=str,
        default='cnn'
    )
    
    parser.add_argument(
        '--epochs',
        help='The number of epochs to train.',
        type=int,
        default=10
    )
    
    parser.add_argument(
        '--steps_per_epoch',
        help='The number of steps per epoch to train.',
        type=int,
        default=100
    )
    
    parser.add_argument(
        '--job-dir',
        help="Directory where to save the model.",
        type=str,
        default='mnistmodel_keras_only/'
    )
    
    return parser.parse_known_args(argv)

def main():
    """
    Parse command line arguments and kicks off model training.
    """
    args = _parse_arguments(sys.argv[1:])[0]
    
    model_layers = model.get_layers(args.model_type)
    
    image_model = model.build_model(model_layers, args.job_dir)
    
    model_history = model.train_and_evaluate(
        image_model, args.epochs, args.steps_per_epoch, args.job_dir
    )
    
if __name__ == '__main__':
    main()

Writing mnistmodel_keras_only/trainer/task.py


Next, let's group non-model functions into a util file to keep the model file simple. We'll copy over the `scale` and `load_dataset` functions from the previous lab.

In [14]:
%%writefile mnistmodel_keras_only/trainer/util.py
import tensorflow as tf

def scale(image, label):
    """
    Scale image from 0 to 255 int range to a 0.0 to 1.0 float range
    """
    image = tf.cast(x=image, dtype=tf.float32)
    image /= 255
    image = tf.expand_dims(input=image, axis=-1)
    return image, label

def load_dataset(data, training=True, buffer_size=5000, batch_size=100, nclasses=10):
    """
    Loads MNIST dataset into a tf.data.Dataset
    """
    (x_train, y_train), (x_test, y_test) = data
    
    x = x_train if training else x_test
    y = y_train if training else y_test
    
    # One-hot encode the class
    y = tf.keras.utils.to_categorical(y = y, num_classes=nclasses)
    
    # Convert our data into tf.data
    dataset = tf.data.Dataset.from_tensor_slices((x,y))
    dataset = dataset.map(scale).batch(batch_size)
    
    # During training shuffle our dataset
    if training:
        dataset = dataset.shuffle(buffer_size).repeat()
        
    return dataset

Writing mnistmodel_keras_only/trainer/util.py


Finally, let's code the models! The [tf.keras API](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras) accepts an array of [layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers) into a [model object](https://www.tensorflow.org/api_docs/python/tf/keras/Model), so we can create a dictionary of layers based on the different model types we want to use. The below file has two functions: `get_layers` and `create_and_train_model`. We will build the structure of our model in `get_layers`. Last but not least, we'll copy over the training code from the previous lab into `train_and_evaluate`.

These models progressively build on each other. Look at the imported `tensorflow.keras.layers` modules and the default values for the variables defined in `get_layers` for guidance.

In [15]:
%%writefile mnistmodel_keras_only/trainer/model.py
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (Conv2D, Dense, Dropout, Flatten,
                                     MaxPooling2D, Softmax)

from . import util

# Image Variables
WIDTH = 28
HEIGHT = 28

def get_layers(
            model_type, 
            nclasses = 10, 
            hidden_layer_1_neurons=400,
            hidden_layer_2_neurons=100, 
            dropout_rate=0.25,
            num_filters_1=64,
            kernel_size_1=3,
            pooling_size_1=2,
            num_filters_2=32,
            kernel_size_2=3,
            pooling_size_2=2
    ):
    """
    Construct layers for keras model based on a dict of model types.
    """
    model_layers = {
        'linear':[
            Flatten(),
            Dense(nclasses),
            Softmax()
        ],
        'dnn':[
            Flatten(),
            Dense(hidden_layer_1_neurons, activation='relu'),
            Dense(hidden_layer_2_neurons, activation='relu'),
            Dense(nclasses),
            Softmax()
        ],
        'dnn_dropout': [
            Flatten(),
            Dense(hidden_layer_1_neurons, activation='relu'),
            Dense(hidden_layer_2_neurons, activation = 'relu'),
            Dropout(dropout_rate),
            Dense(nclasses),
            Softmax()
        ],
        'cnn': [
            Conv2D(
                num_filters_1, kernel_size=kernel_size_1,
                activation='relu', input_shape=(WIDTH, HEIGHT, 1)
            ),
            MaxPooling2D(
                pooling_size_1
            ),
            Conv2D(
                num_filters_2, kernel_size=kernel_size_2,
                activation='relu'
            ),
            MaxPooling2D(
                pooling_size_2
            ),
            Flatten(),
            Dense(
                hidden_layer_1_neurons, activation='relu'
            ),
            Dense(
                hidden_layer_2_neurons, activation='relu'
            ),
            Dropout(
                dropout_rate
            ),
            Dense(
                nclasses
            ),
            Softmax()
        ]
    }
    
    return model_layers[model_type]

def build_model(layers, output_dir):
    """
    Compiles keras model for image classification.
    """
    model = Sequential(layers)
    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

def train_and_evaluate(model, num_epochs, steps_per_epoch, output_dir):
    """
    Compiles keras model and loads data into it for training.
    """
    # Load MNIST dataset
    mnist = tf.keras.datasets.mnist.load_data()
    
    # Spilt dataset into train and validation.
    train_data = util.load_dataset(mnist)
    validation_data = util.load_dataset(mnist, training=False)
    
    # Create TensorBoard callback
    callbacks = []
    if output_dir:
        tensorboard_callback = TensorBoard(log_dir=output_dir)
        callbacks = [tensorboard_callback]
    
    # Train the model
    history = model.fit(
        train_data,
        validation_data=validation_data,
        epochs=num_epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=2,
        callbacks=callbacks
    )
    
    # Now save the trained model
    if output_dir:
        export_path = os.path.join(output_dir, 'keras_export')
        model.save(export_path, save_format='tf')
    
    return history

Writing mnistmodel_keras_only/trainer/model.py


## Run as a Python module

Since we want to run our code on Cloud ML Engine, we've packaged it as a python module.

The `model.py` and `task.py` containing the model code is in <a href="mnistmodel_keras_only/trainer">mnistmodel_keras_only/trainer</a>

**Let's first run it locally for a few steps to test the code.** 

In [16]:
%%bash
JOB_DIR='mnistmodel_keras_only'
rm -rf mnistmodel_keras_only.tar.gz mnist_keras_only_trained
python3 -m mnistmodel_keras_only.trainer.task \
    --job-dir=${JOB_DIR}\
    --epochs=5 \
    --steps_per_epoch=50 \
    --model_type=$MODEL_TYPE

Train for 50 steps, validate for 100 steps
Epoch 1/5
50/50 - 6s - loss: 0.9674 - accuracy: 0.6948 - val_loss: 0.2748 - val_accuracy: 0.9189
Epoch 2/5
50/50 - 4s - loss: 0.3193 - accuracy: 0.9036 - val_loss: 0.1642 - val_accuracy: 0.9504
Epoch 3/5
50/50 - 4s - loss: 0.1864 - accuracy: 0.9424 - val_loss: 0.1382 - val_accuracy: 0.9585
Epoch 4/5
50/50 - 4s - loss: 0.1673 - accuracy: 0.9474 - val_loss: 0.1103 - val_accuracy: 0.9670
Epoch 5/5
50/50 - 4s - loss: 0.1281 - accuracy: 0.9616 - val_loss: 0.0744 - val_accuracy: 0.9770


2020-08-05 08:22:36.314279: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-08-05 08:22:36.314342: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-08-05 08:22:36.314349: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-08-05 08:22:37.033258: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-08-05 08:22:37.033279: E tensorflow/stream_executor/cuda

In [17]:
%%bash
JOB_DIR='mnistmodel_keras_only'
rm -rf mnistmodel_keras_only.tar.gz mnist_keras_only_trained
JOB_DIR=./tmp
gcloud ml-engine local train \
    --module-name=trainer.task \
    --package-path=./mnistmodel_keras_only/trainer \
    -- \
    --job-dir=${JOB_DIR}\
    --epochs=10 \
    --steps_per_epoch=50 \
    --model_type=$MODEL_TYPE

Train for 50 steps, validate for 100 steps
Epoch 1/10
50/50 - 6s - loss: 1.0072 - accuracy: 0.6782 - val_loss: 0.3012 - val_accuracy: 0.9073
Epoch 2/10
50/50 - 4s - loss: 0.2937 - accuracy: 0.9080 - val_loss: 0.1520 - val_accuracy: 0.9549
Epoch 3/10
50/50 - 4s - loss: 0.1902 - accuracy: 0.9444 - val_loss: 0.1434 - val_accuracy: 0.9549
Epoch 4/10
50/50 - 4s - loss: 0.1392 - accuracy: 0.9570 - val_loss: 0.1009 - val_accuracy: 0.9689
Epoch 5/10
50/50 - 4s - loss: 0.1474 - accuracy: 0.9572 - val_loss: 0.0826 - val_accuracy: 0.9742
Epoch 6/10
50/50 - 4s - loss: 0.1240 - accuracy: 0.9618 - val_loss: 0.0914 - val_accuracy: 0.9712
Epoch 7/10
50/50 - 4s - loss: 0.1050 - accuracy: 0.9666 - val_loss: 0.0646 - val_accuracy: 0.9805
Epoch 8/10
50/50 - 4s - loss: 0.0939 - accuracy: 0.9714 - val_loss: 0.0637 - val_accuracy: 0.9793
Epoch 9/10
50/50 - 4s - loss: 0.0838 - accuracy: 0.9724 - val_loss: 0.0542 - val_accuracy: 0.9823
Epoch 10/10
50/50 - 4s - loss: 0.0842 - accuracy: 0.9744 - val_loss: 0.0597

2020-08-05 08:24:29.479004: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-08-05 08:24:29.479084: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-08-05 08:24:29.479094: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-08-05 08:24:30.140618: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-08-05 08:24:30.140642: E tensorflow/stream_executor/cuda

**Now, let's do it on Cloud ML Engine so we can train on GPU (`--scale-tier=BASIC_GPU`)**

Note the GPU speed up depends on the model type. You'll notice the more complex CNN model trains significantly faster on GPU, however the speed up on the simpler models is not as pronounced.

In [18]:
%%bash
OUTDIR=gs://${BUCKET}/mnist/trained_dnn
JOBNAME=mnist_dnn_$(date -u +%y%m%d_%H%M%S)
echo $OUTDIR $REGION $JOBNAME
gsutil -m rm -rf $OUTDIR
gcloud ml-engine jobs submit training $JOBNAME \
    --region=$REGION \
    --module-name=trainer.task \
    --package-path=./mnistmodel_keras_only/trainer \
    --job-dir=$OUTDIR \
    --staging-bucket=gs://$BUCKET \
    --scale-tier=BASIC_GPU \
    --runtime-version=$TFVERSION \
    -- \
    --epochs=50 \
    --steps_per_epoch=50 \
    --model_type=$MODEL_TYPE

gs://bucket-ml-practice-260405/mnist/trained_dnn us-central1 mnist_dnn_200805_022615


CommandException: 1 files/objects could not be removed.
ERROR: (gcloud.ml-engine.jobs.submit.training) INVALID_ARGUMENT: Field: runtime_version Error: The specified runtime version '2.1.0' with the Python version '' is not supported or is deprecated.  Please specify a different runtime version. See https://cloud.google.com/ml-engine/docs/runtime-version-list for a list of supported versions
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: The specified runtime version '2.1.0' with the Python version ''
      is not supported or is deprecated.  Please specify a different runtime version.
      See https://cloud.google.com/ml-engine/docs/runtime-version-list for a list
      of supported versions
    field: runtime_version


CalledProcessError: Command 'b'OUTDIR=gs://${BUCKET}/mnist/trained_dnn\nJOBNAME=mnist_dnn_$(date -u +%y%m%d_%H%M%S)\necho $OUTDIR $REGION $JOBNAME\ngsutil -m rm -rf $OUTDIR\ngcloud ml-engine jobs submit training $JOBNAME \\\n    --region=$REGION \\\n    --module-name=trainer.task \\\n    --package-path=./mnistmodel_keras_only/trainer \\\n    --job-dir=$OUTDIR \\\n    --staging-bucket=gs://$BUCKET \\\n    --scale-tier=BASIC_GPU \\\n    --runtime-version=$TFVERSION \\\n    -- \\\n    --epochs=50 \\\n    --steps_per_epoch=50 \\\n    --model_type=$MODEL_TYPE\n'' returned non-zero exit status 1.

## Local Training

Now that we know that our models are working as expected, let's run it on the [Google Cloud AI Platform](https://cloud.google.com/ml-engine/docs/). We can run it as a python module locally first using the command line.

The below cell transfers some of our variables to the command line as well as create a job directory including a timestamp.

You can change the model_type to try out different models.

In [None]:
current_time = datetime.now().strftime('%y%m%d_%H%M%S')
model_type='dnn_dropout'

os.environ['MODEL_TYPE']=model_type
os.environ['JOB_DIR']="gs://{}/mnist_{}_{}".format(BUCKET,model_type,current_time)
os.environ['JOB_NAME']="mnist_{}_{}".format(model_type, current_time)

The cell below runs the local version of the code. The epochs and steps_per_epoch flag can be changed to run for longer or shorther, as defined in our `mnistmodel_keras_only/trainer/task.py` file.

In [None]:
%%bash
python3 -m mnistmodel_keras_only.trainer.task \
    --job-dir=$JOB_DIR \
    --epochs=5 \
    --steps_per_epoch=50 \
    --model_type=$MODEL_TYPE

## Training on the cloud

Since we're using an unreleased version of TensorFlow on AI Platform, we can instead use a [Deep Learning Container](https://cloud.google.com/ai-platform/deep-learning-containers/docs/overview) in order to take advantage of libraries and applications not normally packaged with AI Platform. Below is a simple [Dockerlife](https://docs.docker.com/engine/reference/builder/) which copies our code to be used in a TF2 environment.

In [None]:
%%writefile mnistmodel_keras_only/Dockerfile
FROM gcr.io/deeplearning-platform-release/tf2-cpu
COPY mnistmodel_keras_only/trainer mnistmodel/trainer
ENTRYPOINT ["python3", "-m", "mnistmodel.trainer.task"]

The below command builds the image and ships it off to Google Cloud so it can be used for AI Platform. When built, it will show up [here](http://console.cloud.google.com/gcr) with the name `mnistmodel`. ([Click here](https://console.cloud.google.com/cloud-build) to enable Cloud Build)

<b><p style='color:red'>This below `docker buid` and `docker push` command will not run in this laptop. It will only run in `AI Platmorm` notebook. So stop here.</p></b>

In [None]:
%%bash
docker build -f mnistmodel_keras_only/Dockerfile -t $IMAGE_URI ./

In [None]:
%%bash
docker push $IMAGE_URI

Finally, we can kickoff the [AI Platform training job](https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training). We can pass in our docker image using the `master-image-uri` flag.

In [None]:
%%bash
echo $JOB_DIR $REGION $JOB_NAME
gcloud ai-platform jobs submit training $JOB_NAME \
    --staging-bucket=gs://$BUCKET \
    --region=$REGION \
    --master-image-uri=$IMAGE_URI \
    --scale-tier=BASIC_GPU \
    --job-dir=$JOB_DIR \
    -- \
    --model_type=$MODEL_TYPE

Can't wait to see the results? Run the code below and copy the output into the [Google Cloud Shell](https://console.cloud.google.com/home/dashboard?cloudshell=true) to follow.

## Deploying and predicting with model

Once you have a model you're proud of, let's deploy it! All we need to do is give AI Platform the location of the model. Below uses the keras export path of the previous job, but `${JOB_DIR}keras_export/` can always be changed to a different path.

Uncomment the delete commands below if you are getting an "already exists error" and want to deploy a new model.

In [None]:
%%bash
MODEL_NAME="mnist"
MODEL_VERSION=${MODEL_TYPE}
MODEL_LOCATION=${JOB_DIR}keras_export/
echo "Deleting and deploying $MODEL_NAME $MODEL_VERSION from $MODEL_LOCATION ... this will take a few minutes"
#yes | gcloud ai-platform versions delete ${MODEL_VERSION} --model ${MODEL_NAME}
#yes | gcloud ai-platform models delete ${MODEL_NAME}
gcloud ai-platform models create ${MODEL_NAME} --regions $REGION
gcloud ai-platform versions create ${MODEL_VERSION} \
    --model ${MODEL_NAME} \
    --origin ${MODEL_LOCATION} \
    --framework tensorflow \
    --runtime-version=2.1