## Define Excecution Context
1. The python environment for the script, including all python packages
2. The compute target on which the script will be run.It can be a local workstation or a remote compute target. 

## Connect to your workspace

In [1]:
import azureml.core
from azureml.core import Workspace

#Load the workspace from the saved config file
ws=Workspace.from_config()
print('Ready to use azureml {} to work with {}.format(azure.core.VERSION,ws.name))

SyntaxError: EOL while scanning string literal (<ipython-input-1-2ea0d6341b39>, line 6)

## Prepare data

In [2]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/diabetes.csv', './data/diabetes2.csv'], # Upload the diabetes csv files in /data
                        target_path='diabetes-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='diabetes dataset',
                                description='diabetes data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception unknown locale: UTF-8.


NameError: name 'ws' is not defined

## Create a training script
1. A folder for the new experiment
2. A training script whcih uses sklearn to train a model and matplotlib to plot the ROC curve.

In [3]:
import os 
#Createa a folder
experiement_folder='diabetes_training_logistic'
os.makedirs(experiment_folder, exits_ok=True)
print(experiment_folder,'folder created')

NameError: name 'experiment_folder' is not defined

In [4]:
%%writefile $experiment_folder/diabetes_training.py
#import libraries
import os 
import argparse
from azurecore import Run
import pandas as pd 
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['diabetes'].to_pandas_dataframe()

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Writing $experiment_folder/diabetes_training.py


FileNotFoundError: [Errno 2] No such file or directory: '$experiment_folder/diabetes_training.py'

## Define an environment
When you run a python script as an experiment in azureml, a conda environment is created to define the execution context for the script. Azureml provides a default environment which contains common packages like azureml-defaults package, pandas, and numpy. 
You can also define your own environment using conda or pip. 

In [5]:
from azureml.core import Environment
from azureml.core.conda_dependencies import conda_dependencies

#Create a python environment for the experiment
diabetes_env=Environment('diabetes-experiment-env')
diabetes_env.python.user_managed_dependencies=False #Let azureml manage dependencies
diabetes_env.docker.enabled=True  #Use a docker container

#Create a set of package dependencies
diabetes_packages=CondaDependencies.create(conda_packages=['scikit-learn'],pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])

#Add the dependencies to the environment
diabetes_env.python.conda_dependencies=diabetes_packages

ImportError: cannot import name 'conda_dependencies' from 'azureml.core.conda_dependencies' (/Users/xiaoyingliu/anaconda3/lib/python3.7/site-packages/azureml/core/conda_dependencies.py)

Now you can use the environment for the experiment by assigning it to an estimator

In [6]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Set the script parameters
script_params = {
    '--regularization': 0.1
}

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create an estimator
estimator = Estimator(source_directory=experiment_folder,
                      inputs=[diabetes_ds.as_named_input('diabetes')],
                      script_params=script_params,
                      compute_target = 'local',
                      environment_definition = diabetes_env,
                      entry_script='diabetes_training.py')

# Create an experiment
experiment = Experiment(workspace = ws, name = 'diabetes-training')

# Run the experiment
run = experiment.submit(config=estimator)
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

ModuleNotFoundError: No module named 'azureml.widgets'

The experiment successfully used the environment, which inludes all the packges required. 
You can also register the environment into the workspace. 

In [7]:
#Register the environment
diabetes_env.register(workspace=ws)

NameError: name 'diabetes_env' is not defined

## Run an experiment on a remote compute target
In many cases, your local compute resources may not be sufficient to process a complex or long-running experiment that needs to process a large volume of data; and you may want to take advantage of the ability to dynamically create and use compute resources in the cloud

In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import computeTargetException
cluster_name=your_compute_cluster
try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

ImportError: cannot import name 'computeTargetException' from 'azureml.core.compute_target' (/Users/xiaoyingliu/anaconda3/lib/python3.7/site-packages/azureml/core/compute_target.py)

Reuse the registered environment

In [9]:
from azureml.train.estimator import Estimator
from azureml.core import Environment, Experiment
from azureml.widgets import RunDetails

# Get the environment
registered_env = Environment.get(ws, 'diabetes-experiment-env')

# Set the script parameters
script_params = {
    '--regularization': 0.1
}

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create an estimator
estimator = Estimator(source_directory=experiment_folder,
                      inputs=[diabetes_ds.as_named_input('diabetes')],
                      script_params=script_params,
                      compute_target = cluster_name, # Run the experiment on the remote compute target
                      environment_definition = registered_env,
                      entry_script='diabetes_training.py')

# Create an experiment
experiment = Experiment(workspace = ws, name = 'diabetes-training')

# Run the experiment
run = experiment.submit(config=estimator)
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

ModuleNotFoundError: No module named 'azureml.widgets'

The experiment will take quite a lot longer because a container image must be built with the conda environment, and then the cluster nodes must be started and the image deployed before the script can be run. For a simple experiment like the diabetes training script, this may seem inefficient; but imagine you needed to run a more complex experiment with a large volume of data that would take several hours on your local workstation - dynamically creating more scalable compute may reduce the overall time significantly.
After the experiment has finished, you can get the metrics and files generated by the experiment run. The files will include logs for building the image and managing the compute.

In [None]:
# Get logged metrics
metrics = run.get_metrics()
for key in metrics.keys():
        print(key, metrics.get(key))
print('\n')
for file in run.get_file_names():
    print(file)