# Exercise04 : Train on Remote GPU Virtual Machine

Now we run our previous sample (see "[Exercise03 : Just Train in Your Working Machine](./exercise03_train_simple.ipynb)") on a remote virtual machine that uses a GPU.<br>
You can also run remote training with your favorite Docker images.

*back to [index](https://github.com/tsmatz/azureml-tutorial-tensorflow-v1/)*

## Variable's Setting

Replace the bracketed placeholder strings below to set the required variables.

In [1]:
# Name of the Azure resource group and of the AML workspace.
# Later cells pass these to the `az ml` commands as
# $my_resource_group and $my_workspace.
my_resource_group = "{AML-RESOURCE-GROUP-NAME}"
my_workspace = "{AML-WORSPACE-NAME}"

## Save your training script as file (train.py)

Create the ```script``` directory and save the Python script as ```./script/train.py```.

In [2]:
import os
# Folder that will hold train.py (written by the next cell).
script_folder = './script'
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(script_folder, exist_ok=True)

In [3]:
%%writefile script/train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

FLAGS = None
batch_size = 100

#
# define functions for Estimator
#

def _my_input_fn(filepath, num_epochs):
    """Build the queue-based input pipeline for a single TFRecord file.

    Args:
        filepath: Path of the TFRecord file to read.
        num_epochs: How many times the file is read; forwarded to
            ``tf.train.string_input_producer``.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is
        ``{'inputs': <image batch>}`` and ``labels`` is the label batch.
        Both are batched with the module-level ``batch_size``.
    """
    # Each record holds:
    #   image_raw - 784 (=28 x 28) grey-scale bytes, rescaled below to [0, 1]
    #   label     - digit (0, 1, ..., 9)
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    # The file name is repeated num_epochs times; once the data is
    # exhausted the queue raises OutOfRange.
    filename_queue = tf.train.string_input_producer(
        [filepath], num_epochs=num_epochs)
    _, serialized = tf.TFRecordReader().read(filename_queue)
    example = tf.parse_single_example(serialized, features=feature_spec)

    # Decode the raw bytes and scale the pixel values into [0, 1]
    pixels = tf.decode_raw(example['image_raw'], tf.uint8)
    pixels.set_shape([784])
    pixels = tf.cast(pixels, tf.float32) * (1. / 255)
    digit = tf.cast(example['label'], tf.int32)

    image_batch, label_batch = tf.train.batch(
        [pixels, digit], batch_size=batch_size)
    return {'inputs': image_batch}, label_batch

def _get_input_fn(filepath, num_epochs):
    """Return a zero-argument input function bound to the given arguments.

    The returned callable simply invokes ``_my_input_fn(filepath, num_epochs)``
    when it is called.
    """
    def _bound_input_fn():
        return _my_input_fn(filepath, num_epochs)

    return _bound_input_fn

def _my_model_fn(features, labels, mode):
    """Model function for the Estimator: a fully connected digit classifier.

    The network is 784 -> FLAGS.first_layer -> FLAGS.second_layer -> 10,
    with ReLU on both hidden layers and a linear output layer (logits).

    Args:
        features: Dict whose 'inputs' entry is multiplied by a
            [784, FLAGS.first_layer] weight matrix, i.e. a batch of
            784-element rows.
        labels: Class labels; only read when mode is not PREDICT.
        mode: One of the tf.estimator.ModeKeys values.

    Returns:
        A tf.estimator.EstimatorSpec for TRAIN, EVAL or PREDICT.
        For any other mode no branch matches and None is returned implicitly.
    """
    # with tf.device(...): # You can set device if using GPUs

    # define network and inference
    # (2 fully connected hidden layers : 784->128->64->10 with the default
    #  --first_layer / --second_layer values)
    with tf.name_scope('hidden1'):
        # weights are initialized with stddev = 1 / sqrt(fan-in)
        weights = tf.Variable(
            tf.truncated_normal(
                [784, FLAGS.first_layer],
                stddev=1.0 / math.sqrt(float(784))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.first_layer]),
            name='biases')
        hidden1 = tf.nn.relu(tf.matmul(features['inputs'], weights) + biases)
    with tf.name_scope('hidden2'):
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.first_layer, FLAGS.second_layer],
                stddev=1.0 / math.sqrt(float(FLAGS.first_layer))),
            name='weights')
        biases = tf.Variable(
            tf.zeros([FLAGS.second_layer]),
            name='biases')
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
    with tf.name_scope('softmax_linear'):
        # output layer : one logit per class (10 classes), no activation here
        weights = tf.Variable(
            tf.truncated_normal(
                [FLAGS.second_layer, 10],
                stddev=1.0 / math.sqrt(float(FLAGS.second_layer))),
        name='weights')
        biases = tf.Variable(
            tf.zeros([10]),
            name='biases')
        logits = tf.matmul(hidden2, weights) + biases

    # compute evaluation metrics
    predicted_indices = tf.argmax(input=logits, axis=1)
    # accuracy and loss read `labels`, so they are skipped in PREDICT mode
    if mode != tf.estimator.ModeKeys.PREDICT:
        label_indices = tf.cast(labels, tf.int32)
        accuracy = tf.metrics.accuracy(label_indices, predicted_indices)
        tf.summary.scalar('accuracy', accuracy[1]) # output to TensorBoard

        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=logits)

    # define operations
    if mode == tf.estimator.ModeKeys.TRAIN:
        # plain gradient descent; the step counter is passed to minimize()
        #global_step = tf.train.create_global_step()
        #global_step = tf.contrib.framework.get_or_create_global_step()
        global_step = tf.train.get_or_create_global_step()        
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        # report the accuracy metric computed above
        eval_metric_ops = {
            'accuracy': accuracy
        }
        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        # expose both the predicted class index and the softmax probabilities
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs=export_outputs)

def _my_serving_input_fn():
    """Describe the serving-time input of the exported model.

    Returns:
        A ``ServingInputReceiver`` whose features and receiver tensors are the
        same dict: a float32 placeholder of shape [None, 784] under 'inputs'.
    """
    image_placeholder = tf.placeholder(tf.float32, [None, 784])
    receiver_tensors = {'inputs': image_placeholder}
    return tf.estimator.export.ServingInputReceiver(
        features=receiver_tensors,
        receiver_tensors=receiver_tensors)

#
# Main
#

# Command-line options of the training script.
# Numeric options use native numeric defaults (not strings) so that the
# declared `type` and the default value agree without relying on argparse
# re-parsing a string default.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_folder',
    type=str,
    default='./data',
    help='Folder path for input data')
parser.add_argument(
    '--chkpoint_folder',
    type=str,
    default='./logs',  # AML experiments logs folder
    help='Folder path for checkpoint files')
parser.add_argument(
    '--model_folder',
    type=str,
    default='./outputs',  # AML experiments outputs folder
    help='Folder path for model output')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=0.07,
    help='Learning Rate')
parser.add_argument(
    '--first_layer',
    type=int,
    default=128,
    help='Neuron number for the first hidden layer')
parser.add_argument(
    '--second_layer',
    type=int,
    default=64,
    help='Neuron number for the second hidden layer')
# parse_known_args() tolerates extra arguments: recognized options go to
# FLAGS, everything else is collected in `unparsed`.
FLAGS, unparsed = parser.parse_known_args()

def _clear_folder(folder):
    """Delete everything inside ``folder`` while keeping the folder itself.

    Nothing happens when ``folder`` is not an existing directory.
    Symbolic links are unlinked instead of being followed, because
    ``shutil.rmtree`` refuses to operate on a symlink to a directory.

    Args:
        folder: Path of the directory to empty.
    """
    if not os.path.isdir(folder):
        return
    for entry_name in os.listdir(folder):
        entry_path = os.path.join(folder, entry_name)
        if os.path.islink(entry_path) or os.path.isfile(entry_path):
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)

# clean checkpoint and model folder if exists
# (so that a new run does not pick up files left by a previous one)
for _stale_folder in (FLAGS.chkpoint_folder, FLAGS.model_folder):
    _clear_folder(_stale_folder)

# read TF_CONFIG
run_config = tf.estimator.RunConfig()

# create Estimator (checkpoints are written to the checkpoint folder)
mnist_fullyconnected_classifier = tf.estimator.Estimator(
    model_fn=_my_model_fn,
    model_dir=FLAGS.chkpoint_folder,
    config=run_config)

# Step counts are computed as (number of records * epochs) // batch_size.
# Floor division is required: with `from __future__ import division` the
# `/` operator yields a float, while max_steps / steps are integer counts.
# NOTE(review): 60000 and 10000 are presumably the record counts of
# train.tfrecords and test.tfrecords - confirm against the dataset.
train_epochs = 2
eval_epochs = 1
train_spec = tf.estimator.TrainSpec(
    input_fn=_get_input_fn(
        os.path.join(FLAGS.data_folder, 'train.tfrecords'), train_epochs),
    max_steps=60000 * train_epochs // batch_size)
eval_spec = tf.estimator.EvalSpec(
    input_fn=_get_input_fn(
        os.path.join(FLAGS.data_folder, 'test.tfrecords'), eval_epochs),
    steps=10000 * eval_epochs // batch_size,
    start_delay_secs=0)

# run !
tf.estimator.train_and_evaluate(
    mnist_fullyconnected_classifier,
    train_spec,
    eval_spec
)

# Export the trained model (graph and variables) as a SavedModel below the
# model folder; the call returns the location of the export.
model_dir = mnist_fullyconnected_classifier.export_savedmodel(
    FLAGS.model_folder,
    _my_serving_input_fn)
# Report where the script ran and where the model was written
print('current working directory is ', os.getcwd())
print('model is saved ', model_dir)

Writing script/train.py


## Train on remote VM

Now let's integrate with AML and automate training on a remote virtual machine.

### Step 1 : Create new remote virtual machine

Create a new remote virtual machine with **GPU**.<br>
Before starting, **please check the following**.

- Make sure that the VM size used in the following script (```Standard_NC4as_T4_v3```) is supported in the location where your AML workspace resides.
- You need quota for ML GPU VMs in your Azure subscription. If you don't have it, please request quota in the Azure Portal.

By setting ```--min-instances``` to 0, the node is terminated when it is inactive. (This saves money.)

> Note : You can also attach an existing virtual machine (bring your own compute resource) as a compute target.

In [4]:
!az ml compute create --name myvm01 \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace \
  --type amlcompute \
  --min-instances 0 \
  --max-instances 1 \
  --size Standard_NC4as_T4_v3 # or Standard_NC6

[36mCommand group 'ml compute' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
{
  "id": "/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/computes/myvm01",
  "idle_time_before_scale_down": 120,
  "location": "eastus",
  "max_instances": 1,
  "min_instances": 0,
  "name": "myvm01",
  "network_settings": {},
  "provisioning_state": "Succeeded",
  "resourceGroup": "AzureML-rg",
  "size": "STANDARD_NC6",
  "ssh_public_access_enabled": true,
  "tier": "dedicated",
  "type": "amlcompute"
}
[0m

### Step 2 : Create environment

Here we create a new Docker environment for running scripts. The first time, AML builds our own container image from the following settings, so the first experiment takes a long time to complete.
However, once the generated environment is registered, later runs are faster because they reuse it.

In this example, I create my own environment manually, but **you can also use existing environments (called curated environments) for a variety of purposes**. (In Exercise 05, we will use a curated environment, which includes TensorFlow 1.x.)

First I create a conda dependencies YAML file and save it as ```04_conda_pydata.yml```.<br>
To run TensorFlow 1.x, I use Python version 3.6 here.

In [5]:
%%writefile 04_conda_pydata.yml
name: project_environment
dependencies:
- python=3.6
- pip:
  - azureml-defaults
- tensorflow-gpu==1.15
channels:
- anaconda
- conda-forge

Writing 04_conda_pydata.yml


Register my environment (named ```test-remote-gpu-env```) as an AML custom environment.

In [6]:
%%writefile 04_env_register.yml
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: test-remote-gpu-env
image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
conda_file: 04_conda_pydata.yml
description: This is example

Writing 04_env_register.yml


In [7]:
!az ml environment create --file 04_env_register.yml \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace

[36mCommand group 'ml environment' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
{
  "conda_file": {
    "channels": [
      "anaconda",
      "conda-forge"
    ],
    "dependencies": [
      "python=3.6",
      {
        "pip": [
          "azureml-defaults"
        ]
      },
      "tensorflow-gpu==1.15"
    ],
    "name": "azureml_cab8db7ae173656990dc701dc27a6df0"
  },
  "creation_context": {
    "created_at": "2022-02-28T05:57:35.096757+00:00",
    "created_by": "Tsuyoshi Matsuzaki",
    "created_by_type": "User",
    "last_modified_at": "2022-02-28T05:57:35.096757+00:00",
    "last_modified_by": "Tsuyoshi Matsuzaki",
    "last_modified_by_type": "User"
  },
  "description": "This is example",
  "id": "azureml:/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/environments/test-remote-gpu-env/versions/1",
  "image": "mcr.microsoft.com/azureml

### Step 3 : Submit training job

Submit a training job with above compute and environment.

In this example, I use the registered dataset (train.tfrecords, test.tfrecords) named ```mnist_tfrecords_dataset```, which is mounted on your compute target. (Run "[Exercise02 : Prepare Data](./exercise02_prepare_data.ipynb)" for dataset preparation.) To use a dataset in AML, set ```azureml:{DATASET_NAME}:{DATASET_VERSION}``` in the ```inputs``` section as follows.

In [8]:
%%writefile 04_mnist_train_job.yml
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
code: 
  local_path: script
command: >-
  python train.py 
  --data_folder ${{inputs.mnist_tf}}
inputs:
  mnist_tf: 
    dataset: azureml:mnist_tfrecords_dataset:1
environment: azureml:test-remote-gpu-env:1
compute: azureml:myvm01
display_name: tf_remote_experiment
experiment_name: tf_remote_experiment
description: This is example

Writing 04_mnist_train_job.yml


Now let's submit a job with AML CLI.<br>
See the progress and results in [AML Studio](https://ml.azure.com/) experiments.

In [10]:
!az ml job create --file 04_mnist_train_job.yml \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace

[36mCommand group 'ml job' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
{
  "code": "azureml:81719496-1f42-4834-8f83-366c7a44ffc2:1",
  "command": "python train.py  --data_folder ${{inputs.mnist_tf}}",
  "compute": "azureml:myvm01",
  "creation_context": {
    "created_at": "2022-02-28T06:00:07.015031+00:00",
    "created_by": "Tsuyoshi Matsuzaki",
    "created_by_type": "User"
  },
  "description": "This is example",
  "display_name": "tf_remote_experiment",
  "environment": "azureml:test-remote-gpu-env:1",
  "environment_variables": {},
  "experiment_name": "tf_remote_experiment",
  "id": "azureml:/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/jobs/87b7f190-d807-456e-ab02-0cf3c2239913",
  "inputs": {
    "mnist_tf": {
      "dataset": "azureml:mnist_tfrecords_dataset:1",
      "mode": "ro_mount"
    }
  },
  "name": "87b7f190-d807-456e-ab0

You can show the progress and result with the following CLI command.<br>
(**Replace ```87b7f190-d807-456e-ab02-0cf3c2239913``` with your generated job name**.)

In [12]:
!az ml job show --name 87b7f190-d807-456e-ab02-0cf3c2239913 \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace

[36mCommand group 'ml job' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
{
  "code": "azureml:81719496-1f42-4834-8f83-366c7a44ffc2:1",
  "command": "python train.py  --data_folder ${{inputs.mnist_tf}}",
  "compute": "azureml:myvm01",
  "creation_context": {
    "created_at": "2022-02-28T06:00:07.015031+00:00",
    "created_by": "Tsuyoshi Matsuzaki",
    "created_by_type": "User"
  },
  "description": "This is example",
  "display_name": "tf_remote_experiment",
  "environment": "azureml:test-remote-gpu-env:1",
  "environment_variables": {},
  "experiment_name": "tf_remote_experiment",
  "id": "azureml:/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/jobs/87b7f190-d807-456e-ab02-0cf3c2239913",
  "inputs": {
    "mnist_tf": {
      "dataset": "azureml:mnist_tfrecords_dataset:1",
      "mode": "ro_mount"
    }
  },
  "name": "87b7f190-d807-456e-ab0

### Step 4 : Download results and evaluate

Now let's check the generated model on your local computer.

Go to [Azure ML studio UI](https://ml.azure.com/).<br>
You can then see the saved model in outputs directory.

![Saved Outputs](https://tsmatz.github.io/images/github/azure-ml-tensorflow-complete-sample/20220225_Experiment_Outputs.jpg)

By running the following ```az ml job download``` command, the logs and outputs are downloaded to your local computer.<br>
The logs are saved in ```{JOB-NAME}/logs``` and the outputs in ```{JOB-NAME}/outputs```.<br>
(**Replace ```87b7f190-d807-456e-ab02-0cf3c2239913``` with your job name**.)

In [13]:
!az ml job download --name 87b7f190-d807-456e-ab02-0cf3c2239913 \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace

[36mCommand group 'ml job' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
Downloading the job logs ExperimentRun/dcid.87b7f190-d807-456e-ab02-0cf3c2239913/ at /home/tsmatsuz/cli_yaml/87b7f190-d807-456e-ab02-0cf3c2239913



Now check the downloaded result by predicting a few test images with the saved model.<br>
(**Replace ```87b7f190-d807-456e-ab02-0cf3c2239913``` and ```1646029360``` with your job name and model name**.)

In [14]:
import tensorflow as tf

JOB_NAME = "87b7f190-d807-456e-ab02-0cf3c2239913"
MODEL_NAME = "1646029360"

# Build tensors that yield one decoded (image, label) pair per run
test_records = tf.data.TFRecordDataset('./data/test.tfrecords')
record_iterator = tf.compat.v1.data.make_one_shot_iterator(test_records)
feature_spec = {
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64)
}
parsed_example = tf.parse_single_example(
    record_iterator.get_next(),
    features=feature_spec)
pixel_tensor = tf.decode_raw(parsed_example['image_raw'], tf.uint8)
pixel_tensor.set_shape([784])
pixel_tensor = tf.cast(pixel_tensor, tf.float32) * (1. / 255)
label_tensor = tf.cast(parsed_example['label'], tf.int32)

# Evaluate the tensors three times to take the first three test records
with tf.Session() as sess:
    samples = [sess.run([pixel_tensor, label_tensor]) for _ in range(3)]
image_arr = [image for image, _ in samples]
label_arr = [label for _, label in samples]

# Load the downloaded SavedModel and predict the sampled images
saved_model_path = './{}/outputs/{}'.format(JOB_NAME, MODEL_NAME)
pred_fn = tf.contrib.predictor.from_saved_model(saved_model_path)
pred = pred_fn({'inputs': image_arr})

print('Predicted: ', pred['classes'].tolist())
print('Actual   : ', label_arr)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./87b7f190-d807-456e-ab02-0cf3c2239913/outputs/1646029360/variables/variables
Predicted:  [7, 2, 1]
Actual   :  [7, 2, 1]


### Step 5 : Register Model

Now upload (register) the downloaded model into AML model management.<br>
(**Replace the following ```87b7f190-d807-456e-ab02-0cf3c2239913``` and ```1646029360``` with your job name and model name**.)

In [15]:
!mv ./87b7f190-d807-456e-ab02-0cf3c2239913/outputs/1646029360 ./generated_model

In [16]:
!az ml model create --name mnist_model_test \
  --version 1 \
  --local-path ./generated_model \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace

[36mCommand group 'ml model' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
[32mUploading generated_model (0.45 MBs): 100%|█| 453424/453424 [00:00<00:00, 133666[0m
[39m

{
  "creation_context": {
    "created_at": "2022-02-28T06:50:55.339180+00:00",
    "created_by": "Tsuyoshi Matsuzaki",
    "created_by_type": "User",
    "last_modified_at": "2022-02-28T06:50:55.339180+00:00",
    "last_modified_by": "Tsuyoshi Matsuzaki",
    "last_modified_by_type": "User"
  },
  "id": "azureml:/subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/providers/Microsoft.MachineLearningServices/workspaces/ws01/models/mnist_model_test/versions/1",
  "model_format": "custom",
  "model_uri": "azureml://subscriptions/b3ae1c15-4fef-4362-8c3a-5d804cdeb18d/resourceGroups/AzureML-rg/workspaces/ws01/datastores/workspaceblobstore/paths/LocalUpload/a047dda6b249c2540aa30636b0743162/generated_model",
  "name": "mnist_model_test",
  "prop

### Step 6 : Remove AML compute

**You don't need to remove your AML compute** to save money, because the nodes are automatically terminated when they are inactive.<br>
But if you want to clean up, run the following.

In [17]:
!az ml compute delete --name myvm01 \
  --resource-group $my_resource_group \
  --workspace-name $my_workspace \
  --yes

[36mCommand group 'ml compute' is in preview and under development. Reference and support levels: https://aka.ms/CLI_refstatus[0m
Deleting compute myvm01 
........Done.
(0m 42s)

[0m