In [None]:
# Reference (Azure Machine Learning docs, "how-to-train-tensorflow"):
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-tensorflow

In [18]:
import os
import urllib
import shutil
import azureml

from azureml.core import Experiment
from azureml.core import Workspace, Run

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow

In [19]:
# Workspace handle reused by the later cells: the experiment, the dataset
# registration and the compute-target lookup/creation all receive `ws`.
# NOTE(review): from_config() is called without arguments, so the workspace
# details presumably come from a local configuration file -- confirm one is
# available before running this cell.
ws = Workspace.from_config()

In [20]:
# Experiment under which every run submitted from this notebook is recorded.
exp = Experiment(workspace=ws, name='tf-mnist')

# Local folder that is later passed to the TensorFlow estimators as
# `source_directory`. It is created here if it does not exist yet.
# NOTE(review): the estimators use 'tf_mnist.py' as entry script, but no
# visible cell places that file into this folder -- it has to be there already.
script_folder = './tf-mnist'
os.makedirs(script_folder, exist_ok=True)

In [21]:
from azureml.core.dataset import Dataset

# Gzip-compressed MNIST files, grouped by split. The combined list keeps the
# order: train images, train labels, test images, test labels.
mnist_train_urls = [
    'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
]
mnist_test_urls = [
    'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
]
web_paths = mnist_train_urls + mnist_test_urls

# File dataset that references the four web locations above.
dataset = Dataset.File.from_files(path=web_paths)

In [22]:
# Register the file dataset in the workspace. `dataset` is rebound to the
# object returned by register(), which is what the later cells consume.
registration_options = {
    'workspace': ws,
    'name': 'mnist dataset',
    'description': 'training and test dataset',
    'create_new_version': True,
}
dataset = dataset.register(**registration_options)

# Last expression of the cell: display the files referenced by the dataset.
dataset.to_path()

array(['/http/yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       '/http/yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'],
      dtype=object)

In [23]:
cluster_name = "gpucluster"

# Look the cluster up by name first; only when the lookup raises
# ComputeTargetException is a new cluster provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, provisioning_config)

    # Wait for provisioning to finish, for at most 20 minutes.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


In [24]:
# Command-line arguments handed to the entry script. The registered dataset is
# passed as a mounted input named 'mnist'; the remaining entries are plain
# hyperparameter values.
script_params = {
    '--data-folder': dataset.as_named_input('mnist').as_mount(),
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01
}

# Single-node GPU training job.
# framework_version is pinned explicitly so this run does not depend on the
# SDK's default TensorFlow release: it matches the '1.13' requested by the
# distributed estimator in this notebook and the 1.13.x version recorded when
# the model is registered.
est = TensorFlow(source_directory=script_folder,
                 entry_script='tf_mnist.py',
                 script_params=script_params,
                 compute_target=compute_target,
                 framework_version='1.13',
                 use_gpu=True,
                 pip_packages=['azureml-dataprep[pandas,fuse]'])



In [25]:
# Submit the single-node estimator to the 'tf-mnist' experiment; `run` is the
# handle used later to register the model and download its files.
run = exp.submit(est)
# Wait for the run to finish while streaming its logs into the cell output.
# As the last expression of the cell, the value returned by
# wait_for_completion() is displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: tf-mnist_1583139699_b3afeca3
Web View: https://ml.azure.com/experiments/tf-mnist/runs/tf-mnist_1583139699_b3afeca3?wsid=/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/workspaces/mldemo

Streaming azureml-logs/55_azureml-execution-tvmps_ed452866c7848d62e6290c230e72d0bdd55b43f149b18317ea6ae17891ee623c_d.txt

2020-03-02T09:06:32Z Starting output-watcher...
2020-03-02T09:06:32Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_1d5e282518217ede461ec1726f1f4557
f7277927d38a: Pulling fs layer
8d3eac894db4: Pulling fs layer
edf72af6d627: Pulling fs layer
3e4f86211d23: Pulling fs layer
d6e9603ff777: Pulling fs layer
5cad422780e2: Pulling fs layer
8130687c8acb: Pulling fs layer
c11e9246d621: Pulling fs layer
0dfae24cbbd9: Pulling fs layer
0bb049a6d391: Pulling fs layer
22a53069998a: Pulling fs layer
db550b9db251: Pulling fs layer
10a826755d7e: Pulling fs la

TensorFlow version: 1.13.1
Data folder: /tmp/tmpohasxqb9
(60000, 784)
(60000,)
(10000, 784)
(10000,)
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
2020-03-02 09:08:55.140859: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-03-02 09:08:55.440206: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x58ff260 executing computations on platform CUDA. Devices:
2020-03-02 09:08:55.440328: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): Tesla K80, Compute Capability 3.7
2020-03-02 09:08:55.442491: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2596990000 Hz
2020-03-02 09:08:55.443287: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x5a1a310 executing computations on platform Host. Devices:
2020-03-02 09:08:55.443360: I tensorfl

{'runId': 'tf-mnist_1583139699_b3afeca3',
 'target': 'gpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-03-02T09:06:28.240312Z',
 'endTimeUtc': '2020-03-02T09:09:57.984617Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'fe26bcec-e9a8-496a-a0e2-093c947ecb55',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'c1dc2de4-45ab-4877-add2-fa43e8cd8db5'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'mnist', 'mechanism': 'Mount'}}],
 'runDefinition': {'script': 'tf_mnist.py',
  'useAbsolutePath': False,
  'arguments': ['--data-folder',
   'DatasetConsumptionConfig:mnist',
   '--batch-size',
   '50',
   '--first-layer-neurons',
   '300',
   '--second-layer-neurons',
   '100',
   '--learning-rate',
   '0.01'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'gpucluster',
  'dataReferen

In [26]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Resources recorded alongside the registered model: 1 CPU and 0.5 GB of memory.
model_resources = ResourceConfiguration(cpu=1, memory_in_gb=0.5)

# Register the contents of the run's 'outputs/model' path as 'tf-dnn-mnist'.
# NOTE(review): the framework version is recorded as '1.13.0' while the
# training log of the run reports "TensorFlow version: 1.13.1" -- confirm the
# recorded value is the intended one.
model = run.register_model(
    model_name='tf-dnn-mnist',
    model_path='outputs/model',
    model_framework=Model.Framework.TENSORFLOW,
    model_framework_version='1.13.0',
    resource_configuration=model_resources,
)

In [27]:
# Copy the run's model artifacts from 'outputs/model' into a local ./model
# folder, keeping any sub-folder layout below 'outputs/model'.
remote_model_root = 'outputs/model'
local_model_dir = './model'

# Create a model folder in the current directory
os.makedirs(local_model_dir, exist_ok=True)

for f in run.get_file_names():
    if f == remote_model_root:
        # The artifact is a single file called exactly 'outputs/model'.
        relative_parts = [f.split('/')[-1]]
    elif f.startswith(remote_model_root + '/'):
        # Match on the folder boundary so look-alike siblings such as
        # 'outputs/model_notes.txt' or 'outputs/models/...' are not picked up.
        relative_parts = [part for part in f[len(remote_model_root) + 1:].split('/') if part]
    else:
        continue

    if not relative_parts:
        # Nothing left after the prefix (e.g. a bare 'outputs/model/'); skip it.
        continue

    # Keep the relative path instead of only the basename, so files with the
    # same name in different sub-folders do not overwrite each other.
    output_file_path = os.path.join(local_model_dir, *relative_parts)
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    print('Downloading from {} to {} ...'.format(f, output_file_path))
    run.download_file(name=f, output_file_path=output_file_path)

Downloading from outputs/model/checkpoint to ./model/checkpoint ...
Downloading from outputs/model/mnist-tf.model.data-00000-of-00001 to ./model/mnist-tf.model.data-00000-of-00001 ...
Downloading from outputs/model/mnist-tf.model.index to ./model/mnist-tf.model.index ...
Downloading from outputs/model/mnist-tf.model.meta to ./model/mnist-tf.model.meta ...


In [37]:
from azureml.core.runconfig import MpiConfiguration

# Distributed variant of the training job. It reuses the TensorFlow estimator
# class already imported in the imports cell, and the same script folder,
# compute target and script parameters as the single-node estimator.
# The job requests 2 nodes with 1 process per node, launched through MPI.
# NOTE(review): tf_mnist.py is not visible in this notebook -- confirm that the
# script actually coordinates training across the MPI processes; otherwise
# each node presumably just repeats the same single-node training.
estimator = TensorFlow(source_directory=script_folder,
                       compute_target=compute_target,
                       script_params=script_params,
                       entry_script='tf_mnist.py',
                       node_count=2,
                       process_count_per_node=1,
                       distributed_training=MpiConfiguration(),
                       framework_version='1.13',
                       use_gpu=True,
                       pip_packages=['azureml-dataprep[pandas,fuse]'])

In [38]:
# Submit the distributed estimator to the same experiment.
# NOTE: this rebinds `run`, replacing the handle to the earlier single-node
# run. Cells that use `run` (model registration, model download) will act on
# this distributed run if they are executed again after this cell.
run = exp.submit(estimator)
# Wait for the run to finish while streaming its logs into the cell output.
# As the last expression of the cell, the value returned by
# wait_for_completion() is displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: tf-mnist_1583143014_58b7d5a6
Web View: https://ml.azure.com/experiments/tf-mnist/runs/tf-mnist_1583143014_58b7d5a6?wsid=/subscriptions/b208dd3b-2592-4e14-a626-cd6941369193/resourcegroups/sqlserverkonferenz2020/workspaces/mldemo

Streaming azureml-logs/55_azureml-execution-tvmps_6dd2acea89579d6ca6ffd465426fe3ef8b5dbdf753c993c0092ed45e4524895c_d.txt

2020-03-02T09:59:46Z Starting output-watcher...
2020-03-02T09:59:46Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_1d5e282518217ede461ec1726f1f4557
f7277927d38a: Pulling fs layer
8d3eac894db4: Pulling fs layer
edf72af6d627: Pulling fs layer
3e4f86211d23: Pulling fs layer
d6e9603ff777: Pulling fs layer
5cad422780e2: Pulling fs layer
8130687c8acb: Pulling fs layer
c11e9246d621: Pulling fs layer
0dfae24cbbd9: Pulling fs layer
0bb049a6d391: Pulling fs layer
22a53069998a: Pulling fs layer
db550b9db251: Pulling fs layer
3e4f86211d23: Waiting
10a82

(60000, 784)
(60000,)
(10000, 784)
(10000,)
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
2020-03-02 10:02:37.399252: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-03-02 10:02:37.794660: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x58b5530 executing computations on platform CUDA. Devices:
2020-03-02 10:02:37.794812: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): Tesla K80, Compute Capability 3.7
2020-03-02 10:02:37.798436: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2596990000 Hz
2020-03-02 10:02:37.799127: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x59d05e0 executing computations on platform Host. Devices:
2020-03-02 10:02:37.799228: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor 

{'runId': 'tf-mnist_1583143014_58b7d5a6',
 'target': 'gpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-03-02T09:59:42.259127Z',
 'endTimeUtc': '2020-03-02T10:03:51.286391Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'fe26bcec-e9a8-496a-a0e2-093c947ecb55',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'c1dc2de4-45ab-4877-add2-fa43e8cd8db5'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'mnist', 'mechanism': 'Mount'}}],
 'runDefinition': {'script': 'tf_mnist.py',
  'useAbsolutePath': False,
  'arguments': ['--data-folder',
   'DatasetConsumptionConfig:mnist',
   '--batch-size',
   '50',
   '--first-layer-neurons',
   '300',
   '--second-layer-neurons',
   '100',
   '--learning-rate',
   '0.01'],
  'sourceDirectoryData