In [None]:
import os
import shutil

from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform

In [1]:
# Connect to the workspace described by the local config file, then write
# the workspace config back out (path='.azureml').
ws = Workspace.from_config()
ws.write_config(path='.azureml')

# The HyperDrive runs below are submitted to this experiment.
experiment_name = 'udacity-project'
exp = Experiment(ws, experiment_name)

# Show which workspace we are connected to, one property per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# Open an interactive run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-134242
Azure region: southcentralus
Subscription id: 81cefad3-d2c9-4f77-a466-99a7f541c7bb
Resource group: aml-quickstarts-134242


In [2]:
# Reuse the compute cluster if it already exists, otherwise provision it.
# Project constraints: vm_size "Standard_D2_V2" and no more than 4 nodes.
cpu_cluster_name = "OptimizePipe"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # No compute target with this name yet: create one.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        identity_type="SystemAssigned",
    )
    cpu_cluster = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name')

# Block until provisioning settles, then dump the cluster's detailed status.
cpu_cluster.wait_for_completion(show_output=True)
print(cpu_cluster.get_status().serialize())

A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-10T18:10:58.418000+00:00', 'errors': None, 'creationTime': '2021-01-10T16:28:28.000380+00:00', 'modifiedTime': '2021-01-10T16:28:43.605725+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [None]:
# Print name, type and provisioning state for each compute target in the workspace.
compute_targets = ws.compute_targets
for name in compute_targets:
    ct = compute_targets[name]
    print(name, ct.type, ct.provisioning_state)

In [3]:
# Parameter sampler: random draws from a discrete set of values for the
# --C and --max_iter arguments passed to train.py.
ps = RandomParameterSampling(
    {
        '--C': choice(0.01, 0.1, 1, 10, 100),
        '--max_iter': choice(25, 50, 100, 150),
    }
)

# Early-termination policy.
# With slack_factor=0.2 a run is cancelled when its best metric falls below
# best_run_metric / (1 + 0.2), i.e. roughly 83% of the leading run's metric.
# The policy is applied every 2 intervals, and the first evaluation is
# delayed by 5 intervals.
policy = BanditPolicy(slack_factor=0.2, evaluation_interval=2, delay_evaluation=5)

# Stage the training script in its own folder so only that folder is
# used as the run's source directory. makedirs(exist_ok=True) is idempotent,
# so no separate existence check is needed.
script_folder = './training/'
os.makedirs(script_folder, exist_ok=True)
shutil.copy('train.py', script_folder)

# Create a SKLearn estimator for use with train.py.
# NOTE: the SDK reports 'SKLearn' as deprecated in favour of 'ScriptRunConfig'
# (azureml.core.script_run_config) with an explicit environment; migrating
# requires defining that environment, so the estimator is kept here.
est = SKLearn(source_directory=script_folder, compute_target=cpu_cluster, entry_script='train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    hyperparameter_sampling=ps,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=30,
    max_concurrent_runs=3,
    policy=policy,
    estimator=est,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Launch the HyperDrive sweep on the experiment, attach the notebook widget
# to follow progress, then block until the sweep finishes (streaming logs).
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6?wsid=/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourcegroups/aml-quickstarts-134242/workspaces/quick-starts-ws-134242

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-10T19:25:27.357153][API][INFO]Experiment created<END>\n""<START>[2021-01-10T19:25:27.933560][GENERATOR][INFO]Trying to sample '3' jobs from the hyperparameter space<END>\n""<START>[2021-01-10T19:25:28.564962][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-10T19:25:28.8153507Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6?wsid=/subscriptions/81cef

{'runId': 'HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6',
 'target': 'OptimizePipe',
 'status': 'Completed',
 'startTimeUtc': '2021-01-10T19:25:27.095037Z',
 'endTimeUtc': '2021-01-10T19:32:08.47441Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'c621cc3a-9fc7-44a3-9ca2-eafbaa9755e3',
  'score': '0.9072837632776934',
  'best_child_run_id': 'HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg134242.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=WAlKxedhVYqtIT7UQ8xWbxcCBMn%2B%2F8Oy23urUs%2B0InQ%3D&st=2021-01-10T19%3A22%3A21Z&se=2021-01-11T03%3A32%3A21Z&sp=r'}}

In [5]:
import joblib

# Select the best child run by the primary metric and register the model
# file it stored at outputs/model.joblib.
best_run_sklearn = hyperdrive_run.get_best_run_by_primary_metric()
SKLearn_Model = best_run_sklearn.register_model(
    model_name="sklearnBankmarketingModel",
    model_path='outputs/model.joblib',
)

# Report the accuracy logged by that run.
best_run_metrics = best_run_sklearn.get_metrics()
print(f"Best Accuracy: {best_run_metrics['Accuracy']}")

# Last expression: show the run summary.
best_run_sklearn

Run(Experiment: udacity-project,
Id: HD_f3acbfa3-c322-48f3-9695-78d31b8b99d6_0,
Type: azureml.scriptrun,
Status: Completed)
Best accuracy: 0.9072837632776934


# AutoML Part

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Source CSV for the bank-marketing training data (public blob storage).
url_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Build a TabularDataset straight from the remote comma-separated file;
# the first row is the header and column types are inferred.
ds = TabularDatasetFactory.from_delimited_files(
    path=url_path,
    infer_column_types=True,
    separator=',',
    header=True,
    encoding='utf8',
)

NameError: name 'clean_data' is not defined

In [None]:
# Preview the first 10 rows of the dataset as a pandas DataFrame.
ds.take(10).to_pandas_dataframe()

In [None]:
import pandas as pd

from train import clean_data

# Clean the raw dataset with the same helper train.py uses, then stitch the
# features and label back together into one frame for AutoML.
x, y = clean_data(ds)
data = pd.concat([x, y], axis=1)
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned rows for validation; shuffling with a fixed
# seed keeps the split reproducible.
training_data, validation_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
from azureml.train.automl import AutoMLConfig

# Write the training split to CSV under the training folder.
# index=False keeps the pandas row index out of the file so it is not read
# back as an extra feature column.
training_data.to_csv('training/training_data.csv', index=False)

# AutoML runs are tracked under their own experiment; this rebinds `exp`,
# which the submit cell below relies on.
exp = Experiment(workspace=ws, name="AutoML-ModelTesting")
datastore = ws.get_default_datastore()

# Local 'data' folder (kept from the original cell); the upload below targets
# the 'data/' path on the datastore, not this directory.
os.makedirs("data", exist_ok=True)

# Upload the training folder to the datastore. overwrite=True ensures a
# re-run replaces a previously uploaded CSV instead of silently keeping it.
datastore.upload(src_dir='training/', target_path='data/', overwrite=True)

# Read the uploaded CSV back as a TabularDataset for AutoML.
training_dataset = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/training_data.csv'))])

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=training_dataset,
    # Name of the label Series returned by clean_data (the column appended
    # last to `data`); presumably 'y' for this dataset - verify against train.py.
    label_column_name=y.name,
    n_cross_validations=3,
    iterations=40,
    # The cluster is provisioned with max_nodes=4, so concurrency is capped
    # at the number of nodes available.
    max_concurrent_iterations=4,
    compute_target=cpu_cluster)

In [None]:
# Submit the AutoML experiment with an identifying tag, show the progress
# widget, and wait for the run to finish while streaming its output.
tag = {"Test": "Udacity_project_automl_model_testing"}
automl_run = exp.submit(
    config=automl_config,
    tags=tag,
    show_output=True,
)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

In [None]:
# Retrieve and save your best automl model.
# get_output() without parameters returns the best run according to the
# primary metric together with its fitted model.
best_automl_run, model = automl_run.get_output()

# Register the model file stored by the best run. The path is relative to the
# run's artifacts (no leading './'); AutoML child runs are expected to store
# the fitted pipeline at outputs/model.pkl - confirm in the run's Outputs tab.
best_automl_run.register_model(model_name="automl_bankmarketing_model", model_path='outputs/model.pkl')

# Show the final step of the fitted pipeline (the estimator lives on the
# fitted model, not on the Run object).
print(model.steps[-1])

# Get the metrics of the best run and show its accuracy.
# AutoML logs the metric under the lowercase name used for primary_metric.
best_run_metrics = best_automl_run.get_metrics()
print('Best accuracy: {}'.format(best_run_metrics['accuracy']))

# Delete the Azure Machine Learning compute cluster.
cpu_cluster.delete()