In [1]:
# Walkthrough based on the Azure ML how-to: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-tensorflow

## Import packages

In [2]:
import os
import urllib
import shutil
import azureml

from azureml.core import Experiment
from azureml.core import Workspace, Run

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow

## Initialize a workspace

In [3]:
# Build the workspace handle that every later cell passes to the Azure ML SDK.
# NOTE(review): presumably this reads a local workspace config file (e.g. config.json) —
# confirm one is discoverable from the notebook's working directory before running.
ws = Workspace.from_config()

In [5]:
# Show the workspace metadata as this cell's output (bare last expression).
# NOTE(review): the rendered output includes the subscription ID and full resource IDs
# (container registry, key vault, application insights); clear this cell's output
# before sharing or committing the notebook.
ws.get_details()

{'id': '/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourceGroups/SQLServerKonferenz2020/providers/Microsoft.MachineLearningServices/workspaces/MLDemo',
 'name': 'MLDemo',
 'location': 'northeurope',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'tags': {},
 'sku': 'Enterprise',
 'workspaceid': '33f5cdb9-d04e-4f11-bb7a-263b2736aab0',
 'description': '',
 'friendlyName': '',
 'creationTime': '2020-02-18T12:30:56.1352089+00:00',
 'containerRegistry': '/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourceGroups/SQLServerKonferenz2020/providers/Microsoft.ContainerRegistry/registries/mldemof2348786',
 'keyVault': '/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/providers/microsoft.keyvault/vaults/mldemo9819592044',
 'applicationInsights': '/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/providers/microsoft.insights/components/mldemo2790914701',
 'identityPrincipalId': 'c7f84daa-3a

## Create a deep learning experiment

In [6]:
# Local folder that is later handed to the estimators as `source_directory`.
script_folder = './tf-mnist'
os.makedirs(name=script_folder, exist_ok=True)  # exist_ok: re-running the cell is harmless

# Experiment object used to submit the training runs in this notebook.
exp = Experiment(name='tf-mnist', workspace=ws)

## Create a file dataset

In [7]:
from azureml.core.dataset import Dataset

# Gzipped MNIST image/label archives, grouped by split.
train_files = [
    'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
]
test_files = [
    'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
]

# Same four URLs, in the same order (train images, train labels, test images, test labels).
web_paths = train_files + test_files

# File dataset that references the remote archives by URL.
dataset = Dataset.File.from_files(path=web_paths)

In [8]:
# Registration settings, kept together so they are easy to tweak in one place.
registration_options = dict(
    workspace=ws,
    name='mnist dataset',
    description='training and test dataset',
    create_new_version=True,
)

# Register the file dataset in the workspace and keep working with the registered object.
dataset = dataset.register(**registration_options)

# list the files referenced by dataset
dataset.to_path()

array(['/http/yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'],
      dtype=object)

## Create a compute target

In [9]:
cluster_name = "gpucluster"

# Reuse the cluster when it already exists in the workspace; otherwise provision it.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed: create a new AmlCompute cluster and block until provisioning ends.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


## Create a TensorFlow estimator

In [10]:
# Command-line arguments passed to the entry script.
# '--data-folder' receives the registered dataset, mounted on the compute target
# under the input name 'mnist'; the remaining entries are training hyperparameters.
script_params = {
    '--data-folder': dataset.as_named_input('mnist').as_mount(),
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01
}

# Single-node GPU estimator for tf_mnist.py.
# framework_version is pinned explicitly so the training environment matches the
# version recorded when the model is registered (model_framework_version='1.13.0')
# and the version used by the distributed estimator later in this notebook,
# instead of depending on whatever default the installed SDK ships with.
est = TensorFlow(source_directory=script_folder,
                 entry_script='tf_mnist.py',
                 script_params=script_params,
                 compute_target=compute_target,
                 framework_version='1.13',
                 use_gpu=True,
                 pip_packages=['azureml-dataprep[pandas,fuse]'])



## Submit a run

In [11]:
# Queue the single-node estimator as a new run of the experiment.
run = exp.submit(config=est)

# Block until the run finishes, streaming its logs into the cell output;
# the returned run details are displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: tf-mnist_1583168288_485f9fd8
Web View: https://ml.azure.com/experiments/tf-mnist/runs/tf-mnist_1583168288_485f9fd8?wsid=/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/workspaces/mldemo

Streaming azureml-logs/55_azureml-execution-tvmps_ea36fda09dbae355eb997b39512516fa8949d6833e06ffe56359d4a0fe1a0299_d.txt

2020-03-02T17:00:56Z Starting output-watcher...
2020-03-02T17:00:56Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_1d5e282518217ede461ec1726f1f4557
f7277927d38a: Pulling fs layer
8d3eac894db4: Pulling fs layer
edf72af6d627: Pulling fs layer
3e4f86211d23: Pulling fs layer
d6e9603ff777: Pulling fs layer
5cad422780e2: Pulling fs layer
8130687c8acb: Pulling fs layer
c11e9246d621: Pulling fs layer
0dfae24cbbd9: Pulling fs layer
0bb049a6d391: Pulling fs layer
22a53069998a: Pulling fs layer
db550b9db251: Pulling fs layer
10a826755d7e: Pulling fs la

(60000, 784)
(60000,)
(10000, 784)
(10000,)
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
2020-03-02 17:03:16.770169: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-03-02 17:03:17.010751: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x4b07120 executing computations on platform CUDA. Devices:
2020-03-02 17:03:17.010836: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): Tesla K80, Compute Capability 3.7
2020-03-02 17:03:17.013644: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2596990000 Hz
2020-03-02 17:03:17.014032: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x4c221d0 executing computations on platform Host. Devices:
2020-03-02 17:03:17.014090: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor 

{'runId': 'tf-mnist_1583168288_485f9fd8',
 'target': 'gpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-03-02T17:00:57.507434Z',
 'endTimeUtc': '2020-03-02T17:04:15.179828Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'fe26bcec-e9a8-496a-a0e2-093c947ecb55',
  'azureml.git.repository_uri': 'https://github.com/marcelfranke/sqlserverkonferenz.git',
  'mlflow.source.git.repoURL': 'https://github.com/marcelfranke/sqlserverkonferenz.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'cd62b64c4786bfdafcce452be272413cf5cc82d7',
  'mlflow.source.git.commit': 'cd62b64c4786bfdafcce452be272413cf5cc82d7',
  'azureml.git.dirty': 'False',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'c1dc2de4-45ab-4877-add2-fa43e8cd8db5'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'mnist', 'mecha

## Register or download a model

In [12]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Resource request stored alongside the registered model.
model_resources = ResourceConfiguration(cpu=1, memory_in_gb=0.5)

# Register the files the run wrote under outputs/model as model 'tf-dnn-mnist'.
model = run.register_model(
    model_name='tf-dnn-mnist',
    model_path='outputs/model',
    model_framework=Model.Framework.TENSORFLOW,
    model_framework_version='1.13.0',
    resource_configuration=model_resources,
)

## Download the model and run distributed training

In [13]:
# Create a model folder in the current directory
model_dir = './model'
os.makedirs(model_dir, exist_ok=True)

# Select the run's files whose name starts with 'outputs/model' ...
model_files = [name for name in run.get_file_names() if name.startswith('outputs/model')]

# ... and download each one into the local folder under its last path component.
for remote_name in model_files:
    output_file_path = os.path.join(model_dir, remote_name.rsplit('/', 1)[-1])
    print('Downloading from {} to {} ...'.format(remote_name, output_file_path))
    run.download_file(name=remote_name, output_file_path=output_file_path)

Downloading from outputs/model/checkpoint to ./model/checkpoint ...
Downloading from outputs/model/mnist-tf.model.data-00000-of-00001 to ./model/mnist-tf.model.data-00000-of-00001 ...
Downloading from outputs/model/mnist-tf.model.index to ./model/mnist-tf.model.index ...
Downloading from outputs/model/mnist-tf.model.meta to ./model/mnist-tf.model.meta ...


In [14]:
from azureml.core.runconfig import MpiConfiguration
from azureml.train.dnn import TensorFlow

# Settings that make this a multi-node MPI job: two nodes, one process per node.
distributed_options = dict(
    node_count=2,
    process_count_per_node=1,
    distributed_training=MpiConfiguration(),
)

# Tensorflow constructor: same script, parameters and compute target as the
# single-node estimator, with the framework version pinned to 1.13.
estimator = TensorFlow(
    source_directory=script_folder,
    compute_target=compute_target,
    script_params=script_params,
    entry_script='tf_mnist.py',
    framework_version='1.13',
    use_gpu=True,
    pip_packages=['azureml-dataprep[pandas,fuse]'],
    **distributed_options,
)

In [15]:
# Queue the distributed estimator as a new run; note that this rebinds `run`,
# so later cells refer to the distributed run rather than the single-node one.
run = exp.submit(config=estimator)

# Block until the run finishes, streaming its logs into the cell output;
# the returned run details are displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: tf-mnist_1583168659_e8ea87f6
Web View: https://ml.azure.com/experiments/tf-mnist/runs/tf-mnist_1583168659_e8ea87f6?wsid=/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/workspaces/mldemo

Streaming azureml-logs/55_azureml-execution-tvmps_8b84303de576a570cdf02cfc0d7b5eeebe6e10edb473ed2d0c756e344633687e_d.txt

2020-03-02T17:07:19Z Starting output-watcher...
2020-03-02T17:07:19Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_1d5e282518217ede461ec1726f1f4557
f7277927d38a: Pulling fs layer
8d3eac894db4: Pulling fs layer
edf72af6d627: Pulling fs layer
3e4f86211d23: Pulling fs layer
d6e9603ff777: Pulling fs layer
5cad422780e2: Pulling fs layer
8130687c8acb: Pulling fs layer
c11e9246d621: Pulling fs layer
0dfae24cbbd9: Pulling fs layer
5cad422780e2: Waiting
8130687c8acb: Waiting
3e4f86211d23: Waiting
0bb049a6d391: Pulling fs layer
22a53069998a: Pulling f

3 -- Training accuracy: 0.98 Validation accuracy: 0.9384
4 -- Training accuracy: 0.98 Validation accuracy: 0.9437
5 -- Training accuracy: 0.94 Validation accuracy: 0.9475
6 -- Training accuracy: 0.98 Validation accuracy: 0.9507
7 -- Training accuracy: 0.92 Validation accuracy: 0.9524
8 -- Training accuracy: 1.0 Validation accuracy: 0.9565
9 -- Training accuracy: 1.0 Validation accuracy: 0.9584
10 -- Training accuracy: 0.94 Validation accuracy: 0.9606
11 -- Training accuracy: 0.98 Validation accuracy: 0.9622
12 -- Training accuracy: 1.0 Validation accuracy: 0.9632
13 -- Training accuracy: 0.98 Validation accuracy: 0.9662
14 -- Training accuracy: 0.96 Validation accuracy: 0.9661
15 -- Training accuracy: 1.0 Validation accuracy: 0.9683
16 -- Training accuracy: 0.98 Validation accuracy: 0.9689
17 -- Training accuracy: 1.0 Validation accuracy: 0.9703
18 -- Training accuracy: 1.0 Validation accuracy: 0.971
19 -- Training accuracy: 0.98 Validation accuracy: 0.972


The experiment completed su

{'runId': 'tf-mnist_1583168659_e8ea87f6',
 'target': 'gpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-03-02T17:07:15.554792Z',
 'endTimeUtc': '2020-03-02T17:10:35.794201Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'fe26bcec-e9a8-496a-a0e2-093c947ecb55',
  'azureml.git.repository_uri': 'https://github.com/marcelfranke/sqlserverkonferenz.git',
  'mlflow.source.git.repoURL': 'https://github.com/marcelfranke/sqlserverkonferenz.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'cd62b64c4786bfdafcce452be272413cf5cc82d7',
  'mlflow.source.git.commit': 'cd62b64c4786bfdafcce452be272413cf5cc82d7',
  'azureml.git.dirty': 'False',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'c1dc2de4-45ab-4877-add2-fa43e8cd8db5'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'mnist', 'mecha