In [2]:
from azureml.core import Workspace

# Load the workspace described by the local Azure ML config file.
ws = Workspace.from_config()
print(ws)

# Make sure the workspace exists. exist_ok=True keeps this cell re-runnable:
# when the workspace is already there it is returned instead of the call failing.
Workspace.create(name='aml-workspace',
                 subscription_id='3571f8dc-3527-4993-9d2b-ac0812d807fd',
                 resource_group='aml-resources',
                 exist_ok=True)


# Preparing data for experiment

In [12]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath

default_ds = ws.get_default_datastore()

if 'diabetes dataset' in ws.datasets:
    print("Dataset is already registered")
else:
    # Push the local CSV folder into the workspace's default datastore
    upload_target = DataPath(default_ds, 'diabetes-data/')
    Dataset.File.upload_directory(src_dir='practice-arena/data', target=upload_target)

    # Build a tabular dataset from every CSV file that was uploaded
    csv_location = (default_ds, 'diabetes-data/*.csv')
    tabular_dataset = Dataset.Tabular.from_delimited_files(path=csv_location)

    # Register the dataset under a fixed name; a failure is reported, not raised
    try:
        tabular_dataset = tabular_dataset.register(
            workspace=ws,
            name='diabetes dataset',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print("Dataset registered.")
    except Exception as ex:
        print(ex)


Validating arguments.
Arguments validated.
Uploading file to diabetes-data/
Uploading an estimated of 4 files
Uploading practice-arena/data/.amlignore
Uploaded practice-arena/data/.amlignore, 1 files out of an estimated total of 4
Uploading practice-arena/data/.amlignore.amltmp
Uploaded practice-arena/data/.amlignore.amltmp, 2 files out of an estimated total of 4
Uploading practice-arena/data/diabetes.csv
Uploaded practice-arena/data/diabetes.csv, 3 files out of an estimated total of 4
Uploading practice-arena/data/diabetes2.csv
Uploaded practice-arena/data/diabetes2.csv, 4 files out of an estimated total of 4
Uploaded 4 files
Creating new dataset
Dataset registered.


# Creating a training script

In [18]:
import os

# Folder that holds every file belonging to the logistic-regression experiment
experiment_folder = 'practice-arena/diabetes_training_logistic'

# exist_ok lets the cell be re-run when the folder is already present
os.makedirs(name=experiment_folder, exist_ok=True)
print(experiment_folder, ": Folder is created")

practice-arena/diabetes_training_logistic : Folder is created


In [33]:
%%writefile $experiment_folder/diabetes_training.py

import argparse

# Getting the scripts
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type = float, dest = 'reg_rate', default = 0.01, help = 'Enter regularization rate')
parser.add_argument('--input-data', type = str, dest = 'training_dataset_id', help = 'training dataset')
args = parser.parse_args()

# Set regularization rate
reg = args.reg_rate

# Get the experiment run context
from azureml.core import Run 

run = Run.get_context()

# Load the diabetes dataset
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Seperate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Train a logistic regression model
from sklearn.linear_model import LogisticRegression
import numpy as np

print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
from sklearn.metrics import roc_auc_score

y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))

# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')

# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')

run.log_image(name = "ROC", plot = fig)
plt.show()

# Creating output folder
os.makedirs('practice-arena/outputs', exist_ok = True)

# Save the model
import joblib

joblib.dump(value = model, filename = 'practice-arena/outputs/diabetes_model.pkl')

run.complete()

Overwriting practice-arena/diabetes_training_logistic/diabetes_training.py


# Define an Environment

In [21]:
%%writefile $experiment_folder/experiment_env.yml

# Conda environment specification used by the diabetes training experiments.
name: experiment_env

# Conda packages; the training scripts import scikit-learn, matplotlib and pandas.
dependencies: 
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed with pip inside the conda environment.
- pip:
  - azureml-defaults
  - pyarrow

Writing practice-arena/diabetes_training_logistic/experiment_env.yml


# Creating Environment

In [24]:
from azureml.core import Environment

# Build the experiment's Python environment from the conda .yml written above
env_spec_path = f"{experiment_folder}/experiment_env.yml"
experiment_env = Environment.from_conda_specification("experiment_env", env_spec_path)

# False => Azure ML manages the dependencies
experiment_env.python.user_managed_dependencies = False

# Show the environment name followed by its conda dependencies
print(experiment_env.name)
conda_spec_text = experiment_env.python.conda_dependencies.serialize_to_string()
print(conda_spec_text)

experiment_env
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow



In [34]:
from azureml.core import Experiment, ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails

# Registered dataset that the script consumes under the input name 'training_data'
diabetes_ds = ws.datasets.get("diabetes dataset")

# Assemble the pieces of the run configuration one at a time
script_arguments = [
    '--regularization', 0.1,
    '--input-data', diabetes_ds.as_named_input('training_data'),
]
docker_settings = DockerConfiguration(use_docker=True)
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=script_arguments,
    environment=experiment_env,
    docker_runtime_config=docker_settings,
)

# Submit the run under the experiment name
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)

# Show the run widget, then block until the run has finished
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'mslearn-train-diabetes_1659680048_62c603ce',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2022-08-05T06:14:09.810995Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '2569f13d-c727-47ab-90e5-882c5517be14'},
 'inputDatasets': [{'dataset': {'id': '8e0a27a9-8448-4975-9847-1b6c07c2af0c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'diabetes_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--regularization',
   '0.1',
   '--input-data',
   'DatasetConsumptionConfig:training_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'training_data': {'dataLocation': {'dataset': {'id': '8e0a27a9-8448-4975-9847-1b6c07c2af0c',
      'name': 'diabetes dataset',
      'version': '1'},
     'dataPa

# Getting Metrics

In [35]:
# Print every metric logged by the run as "name value"
metrics = run.get_metrics()

for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value)
print('\n')

# Then list the files stored with the run
for file in run.get_file_names():
    print(file)

Regularization Rate 0.1
Accuracy 0.7893333333333333
AUC 0.8568650620553335
ROC aml://artifactId/ExperimentRun/dcid.mslearn-train-diabetes_1659680048_62c603ce/ROC_1659680057.png


ROC_1659680057.png
azureml-logs/60_control_log.txt
azureml-logs/70_driver_log.txt
logs/azureml/8_azureml.log
logs/azureml/dataprep/backgroundProcess.log
logs/azureml/dataprep/backgroundProcess_Telemetry.log


# Registering the environment for reuse purposes

In [36]:
# Register the environment in the workspace so later cells can fetch it by name;
# the returned value is displayed as the cell output.
experiment_env.register(workspace = ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20220504.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "experiment_env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "dependencies": [
                "python=3.6.2",
                "scik

# Creating another classification model using the registered env

In [44]:
import os

# From here on experiment_folder points at the decision-tree experiment
experiment_folder = 'practice-arena/diabetes_training_tree'

# exist_ok lets the cell be re-run when the folder is already present
os.makedirs(name=experiment_folder, exist_ok=True)
print(experiment_folder, ': Folder is created.')

practice-arena/diabetes_training_tree : Folder is created.


In [47]:
%%writefile $experiment_folder/diabetes_training.py

import argparse
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get script arguments
parser = argparse.ArgumentParser()
parser.add_argument('--input-data', type = str, dest = 'training_dataset_id', help = 'Input: Training dataset')


# Get the experiment run context
run = Run.get_context()

# load the diabetes data (passed as an input dataset)
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# Calculating accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)

print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# Calculating AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])

print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))

# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')

# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')

run.log_image(name = "ROC", plot = fig)
plt.show()

os.makedirs('practice-arena/outputs', exist_ok = True)
joblib.dump(value = model, filename = 'practice-arena/outputs/diabetes_model.pkl')

run.complete()

Overwriting practice-arena/diabetes_training_tree/diabetes_training.py


# Getting the registered environment and running the experiment

In [48]:
# Fetch the environment registered earlier, by name
registered_env = Environment.get(workspace=ws, name='experiment_env')

# Registered dataset that the script consumes under the input name 'training_data'
diabetes_ds = ws.datasets.get("diabetes dataset")

# Assemble the pieces of the run configuration one at a time
script_arguments = ['--input-data', diabetes_ds.as_named_input('training_data')]
docker_settings = DockerConfiguration(use_docker=True)
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=script_arguments,
    environment=registered_env,
    docker_runtime_config=docker_settings,
)

# Submit the run under the experiment name and show the run widget
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)

run = experiment.submit(config=script_config)
RunDetails(run).show()

# Stream the logs while waiting for the run to finish
run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

RunId: mslearn-train-diabetes_1659681633_c14bdf06
Web View: https://ml.azure.com/runs/mslearn-train-diabetes_1659681633_c14bdf06?wsid=/subscriptions/3571f8dc-3527-4993-9d2b-ac0812d807fd/resourcegroups/aml-resources/workspaces/aml-workspace&tid=78c76086-2fb7-4f6a-b684-c129ba0ea713

Streaming azureml-logs/60_control_log.txt

[2022-08-05T06:40:34.325567] Using urllib.request Python 3.0 or later
Streaming log file azureml-logs/60_control_log.txt
Starting the daemon thread to refresh tokens in background for process with pid = 6130
Running: ['/bin/bash', '/tmp/azureml_runs/mslearn-train-diabetes_1659681633_c14bdf06/azureml-environment-setup/docker_env_checker.sh']

Found materialized image on target: azureml/azureml_db9ade57a69607a986c554e8d2229381


[2022-08-05T06:40:35.607572] Logging experiment running status in history service.
Running: ['docker', 'run', '--name', 'mslearn-train-diabetes_1659681633_c14bdf06', '--rm', '-v', '/tmp/azureml_runs/mslearn-train-diabetes_1659681633_c14bdf06:/a

{'runId': 'mslearn-train-diabetes_1659681633_c14bdf06',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2022-08-05T06:40:35.607634Z',
 'endTimeUtc': '2022-08-05T06:40:56.136792Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '257ed47a-2bc9-43f0-be47-3a07a100fcf4'},
 'inputDatasets': [{'dataset': {'id': '8e0a27a9-8448-4975-9847-1b6c07c2af0c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'diabetes_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data', 'DatasetConsumptionConfig:training_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'training_data': {'dataLocation': {'dataset': {'id': '8e0a27a9-8448-4975-9847-1b6c07c2af0c',
      'name': 'diabetes dataset',
      'version': '1'},
   

In [49]:
# Print every metric logged by the run as "name value"
metrics = run.get_metrics()
for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value)

print('\n')

# Then list the files stored with the run
for file in run.get_file_names():
    print(file)

Accuracy 0.8984444444444445
AUC 0.8834267090544071
ROC aml://artifactId/ExperimentRun/dcid.mslearn-train-diabetes_1659681633_c14bdf06/ROC_1659681645.png


ROC_1659681645.png
azureml-logs/60_control_log.txt
azureml-logs/70_driver_log.txt
logs/azureml/8_azureml.log
logs/azureml/dataprep/backgroundProcess.log
logs/azureml/dataprep/backgroundProcess_Telemetry.log


# View registered environments

In [50]:
from azureml.core import Environment

# Fetch the environments available in the workspace; the printed result is keyed
# by environment name.
envs = Environment.list(workspace = ws)

print(envs)

{'experiment_env': {
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20220504.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "experiment_env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "dependencies": [
                "python=3.6.2",
  

# Create Compute Cluster

In [51]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'your-compute-name'

try:
    # Reuse the compute target when one with this name already exists
    training_cluster = ComputeTarget(ws, cluster_name)

    print("Found existing cluster, use it")

except ComputeTargetException:
    # Only a failed lookup should trigger provisioning; any other error
    # (including KeyboardInterrupt) is no longer swallowed by a bare except.
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_DS11_V2', max_nodes = 2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output = True)

    except Exception as ex:
        # Best effort: report the provisioning failure instead of stopping the notebook
        print(ex)




InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Run an experiment on Remote Compute

In [52]:
# Same script and environment as the previous run, but targeting the compute cluster
script_arguments = ['--input-data', diabetes_ds.as_named_input('training_data')]
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=script_arguments,
    environment=registered_env,
    compute_target=cluster_name,
)

# Submit under a separate experiment name and show the run widget
experiment_name = 'mslearn-train-diabetes-on-remote-compute'
experiment = Experiment(ws, experiment_name)
run = experiment.submit(script_config)

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

# Checking the status of compute target

In [53]:
# Print the cluster's allocation state followed by its current node count
cluster_state = training_cluster.get_status()
allocation_state, node_count = cluster_state.allocation_state, cluster_state.current_node_count
print(allocation_state, node_count)

Steady 0


**It will take a while before the status changes from "Steady" to "Resizing".**

In [54]:
# Wait for the submitted run; the returned run details are displayed as the cell output.
run.wait_for_completion()

{'runId': 'mslearn-train-diabetes-on-remote-compute_1659682337_b69e9657',
 'target': 'your-compute-name',
 'status': 'Finalizing',
 'startTimeUtc': '2022-08-05T07:04:19.607956Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': '257ed47a-2bc9-43f0-be47-3a07a100fcf4',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '8e0a27a9-8448-4975-9847-1b6c07c2af0c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'diabetes_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data', 'DatasetConsumptionConfig:training_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'your-compute-name',
  'dataReferences': {},
  'data': {'training_data': {'dataLocation': {'da

In [55]:
# Print every metric logged by the run as "name value"
metrics = run.get_metrics()

for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value)

print('\n')

# Then list the files stored with the run
for file in run.get_file_names():
    print(file)


Accuracy 0.8984444444444445
AUC 0.884077920653052
ROC aml://artifactId/ExperimentRun/dcid.mslearn-train-diabetes-on-remote-compute_1659682337_b69e9657/ROC_1659683118.png


ROC_1659683118.png
azureml-logs/20_image_build_log.txt
logs/azureml/dataprep/0/backgroundProcess.log
logs/azureml/dataprep/0/backgroundProcess_Telemetry.log
logs/azureml/dataprep/0/rslex.log.2022-08-05-07
system_logs/cs_capability/cs-capability.log
system_logs/hosttools_capability/hosttools-capability.log
system_logs/lifecycler/execution-wrapper.log
system_logs/lifecycler/lifecycler.log
system_logs/metrics_capability/metrics-capability.log
user_logs/std_log.txt


In [64]:
from azureml.core import Model

# Fetch the logged metrics once and reuse them, instead of querying the run twice
run_metrics = run.get_metrics()

# Register the model file produced by the run, recording its metrics as properties
run.register_model(model_path='practice-arena/outputs/diabetes_model.pkl',
                   model_name='diabetes_model',
                   tags={'Training context': 'Compute cluster'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')
    print('\n')