In [1]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the local config and bind the
# experiment under which the runs in this notebook are submitted.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project1")

# Echo the workspace coordinates, one per line, as a quick sanity check.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on the experiment.
# NOTE(review): `run` is not referenced again in the visible cells -- confirm
# whether it is completed elsewhere or can be dropped.
run = exp.start_logging()

Workspace name: quick-starts-ws-131626
Azure region: southcentralus
Subscription id: 9e65f93e-bdd8-437b-b1e8-0647cd6098f7
Resource group: aml-quickstarts-131626


In [2]:
# Print each compute target attached to the workspace together with its
# type and provisioning state.
compute_targets = ws.compute_targets
for target_name in compute_targets:
    target = compute_targets[target_name]
    print(target_name, target.type, target.provisioning_state)

my-compute ComputeInstance Succeeded
aml-compute AmlCompute Succeeded


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
from azureml.core.compute_target import ComputeTargetException

compute_cluster_name = "aml-compute"

# Look the cluster up by name first; a ComputeTargetException from the lookup
# is treated as "not found" and triggers provisioning of a new cluster.
try:
    compute_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    print("Creating new cluster...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_cluster = ComputeTarget.create(
        workspace=ws,
        name=compute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing cluster, use it...")

# Wait (with progress output) in both the reuse and the create path.
compute_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it...
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil

# Specify parameter sampler: every child run receives a randomly drawn
# "--C" (sampled via uniform(0.001, 10)) and one of the listed "--max_iter"
# values as command-line arguments.
ps = RandomParameterSampling(
    {
        "--C": uniform(0.001, 10),
        "--max_iter": choice(50, 75, 100, 125, 150)
    }
)

# Specify a Policy (evaluated at every interval, with a slack factor of 0.1).
policy = BanditPolicy(
    evaluation_interval=1,
    slack_factor=0.1
)

# Stage train.py in a dedicated folder that is handed to the estimator as its
# source directory. makedirs(exist_ok=True) already covers the "folder is
# missing" case, so no separate listdir()/mkdir() check is needed.
script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
shutil.copy('./train.py', script_folder)

# Create a SKLearn estimator for use with train.py
# NOTE(review): vm_size / vm_priority are passed alongside an existing
# compute_target; they presumably only matter when the estimator has to create
# its own compute -- confirm against the SDK docs before removing them.
est = SKLearn(
    source_directory=script_folder,
    compute_target=compute_cluster,
    entry_script="train.py",
    vm_size="Standard_D2_V2",
    vm_priority="lowpriority"
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
# The sweep maximizes the "Accuracy" metric, runs at most 20 child runs in
# total and at most 3 of them concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=3
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Render the progress widget for the run that was just submitted.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [6]:
# Wait for the HyperDrive run to finish, with log output shown in the notebook;
# the call's return value is displayed as the cell output.
hyperdrive_run.wait_for_completion(show_output= True)

RunId: HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422
Web View: https://ml.azure.com/experiments/udacity-project1/runs/HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422?wsid=/subscriptions/b968fb36-f06a-4c76-a15f-afab68ae7667/resourcegroups/aml-quickstarts-131570/workspaces/quick-starts-ws-131570

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-12-24T07:33:32.801140][API][INFO]Experiment created<END>\n""<START>[2020-12-24T07:33:33.409264][GENERATOR][INFO]Trying to sample '3' jobs from the hyperparameter space<END>\n""<START>[2020-12-24T07:33:33.701038][GENERATOR][INFO]Successfully sampled '3' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-12-24T07:33:34.2830488Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422
Web View: https://ml.azure.com/experiments/udacity-project1/runs/HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422?wsid=/subscriptions/b96

{'runId': 'HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422',
 'target': 'aml-compute',
 'status': 'Completed',
 'startTimeUtc': '2020-12-24T07:33:32.544884Z',
 'endTimeUtc': '2020-12-24T07:49:58.069627Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'd27993cd-ba1f-4fa5-805e-5ce23adb8768',
  'score': '0.9179059180576631',
  'best_child_run_id': 'HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422_11',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg131570.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_8c39e93e-3393-416c-b1e2-fb7ac3e9a422/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=mQBFsUQZyZmon8PO1bCGq%2FUbKGOSef9a4tG%2FSJWl8%2Fw%3D&st=2020-12-24T07%3A40%3A28Z&se=2020-12-24T15%3A50%3A28Z&sp=r'}}

In [7]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

# Show the arguments the best child run was launched with, the files it
# produced, and the accuracy it logged.
run_definition = best_run.get_details()['runDefinition']
print(run_definition['arguments'])
print(best_run.get_file_names())
print('Best Run Accuracy:', best_run_metrics['Accuracy'])

# Register the model file written by the best run in the workspace.
model = best_run.register_model(
    model_name='bankmarketing-sklearn',
    model_path='outputs/model.joblib',
)

['--C', '5.30813457038507', '--max_iter', '125']
['azureml-logs/55_azureml-execution-tvmps_8c8e988e4c9d416a1e968532871673b60a84c738610cae68495068daa53e939f_d.txt', 'azureml-logs/65_job_prep-tvmps_8c8e988e4c9d416a1e968532871673b60a84c738610cae68495068daa53e939f_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_8c8e988e4c9d416a1e968532871673b60a84c738610cae68495068daa53e939f_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/104_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']
Best Run Accuracy: 0.9179059180576631


In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
path_to_data = (
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
)

# Build the dataset directly from the remote delimited file.
data = TabularDatasetFactory.from_delimited_files(path=path_to_data)

In [14]:
from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd

# Use the clean_data function to clean your data.
x, y = clean_data(data)

# Split into train and test datasets (80% / 20%).
# The seed is pinned so that a fresh "Restart & Run All" reproduces the same
# partition -- and therefore the same CSV files that are later uploaded and
# used as AutoML training data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Concatenate features and label column-wise to form the train and test frames.
train_df = pd.concat([x_train, y_train], axis=1)
test_df = pd.concat([x_test, y_test], axis=1)

train_df

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
5146,45,1,0,1,0,8,4,152,1,999,...,0,0,1,0,0,0,0,0,0,0
12245,39,1,0,0,0,5,4,31,2,999,...,0,0,0,0,0,0,1,0,0,0
12926,48,1,0,0,0,11,5,218,1,999,...,0,0,0,0,0,0,0,1,0,0
8829,40,1,0,0,0,4,1,144,2,999,...,0,0,0,1,0,0,0,0,0,0
1853,46,1,0,0,0,7,1,76,3,999,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,58,0,0,1,0,6,5,152,3,999,...,1,0,0,0,0,0,0,1,0,0
4470,37,1,0,0,0,5,5,78,1,999,...,0,0,1,0,0,0,0,0,0,0
20176,32,1,0,1,0,11,4,88,2,999,...,0,0,0,0,0,0,1,0,0,0
3608,53,0,0,0,0,5,1,460,1,999,...,1,1,0,0,0,0,0,0,0,0


In [18]:
# Save the train/test dataframes as .csv files and upload them to the datastore.
# makedirs(exist_ok=True) replaces the separate isdir()/mkdir() check.
os.makedirs('data', exist_ok=True)

# train_df / test_df are already DataFrames (built with pd.concat), so they are
# written directly; the row index is not written to the files.
train_df.to_csv("data/train_data.csv", index=False)
test_df.to_csv("data/test_data.csv", index=False)

# Upload the whole ./data folder to the workspace's default datastore under
# the 'bankmarketing' path, overwriting files from earlier runs.
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='bankmarketing', overwrite=True, show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 1 files out of an estimated total of 2
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_4464da48939f4a3280efe9d0a3d1a617

In [20]:
from azureml.core.dataset import Dataset
# Load the uploaded training CSV from the datastore as a TabularDataset.
# The path matches the 'bankmarketing' target_path used for the upload.
train_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))

In [22]:
from azureml.train.automl.utilities import get_primary_metrics
# Show the primary metrics that can be chosen for a classification task.
get_primary_metrics("classification")

['accuracy',
 'AUC_weighted',
 'norm_macro_recall',
 'precision_score_weighted',
 'average_precision_score_weighted']

In [24]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    # What to learn, and from which data.
    task="classification",
    training_data=train_data,
    label_column_name='y',
    # How candidate models are scored.
    primary_metric="accuracy",
    n_cross_validations=4,
    # Where the experiment runs and when it stops.
    compute_target=compute_cluster,
    experiment_timeout_minutes=30,
    experiment_exit_score=0.99,
)

In [25]:
# Submit your automl run

### YOUR CODE HERE ###
# Output streaming is switched off here; a later cell waits for completion.
remote_run = exp.submit(
    config=automl_config,
    show_output=False,
)

Running on remote.


In [28]:
# Display the AutoML run object as the cell output.
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project1,AutoML_6c34739a-8e83-4cb7-8336-cc5dea122ca7,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [27]:
# Wait for the AutoML run to finish; the call's return value is displayed as
# the cell output.
remote_run.wait_for_completion()

{'runId': 'AutoML_6c34739a-8e83-4cb7-8336-cc5dea122ca7',
 'target': 'aml-compute',
 'status': 'Completed',
 'startTimeUtc': '2020-12-24T15:00:59.43494Z',
 'endTimeUtc': '2020-12-24T15:44:33.062737Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'aml-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"a43f1cc0-b85a-470d-8004-d9873f00c63c\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"bankmarketing/train_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-131626\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"9e65f93e-bdd8-437b-b1e8-0647cd6098f7\\\\\\", \\\

In [29]:
from azureml.widgets import RunDetails
# Render the run-details widget for the AutoML run.
RunDetails(remote_run).show() 

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [36]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = remote_run.get_output()

# Print every metric recorded for the best AutoML run, one "name value" pair
# per line.
best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)


average_precision_score_weighted 0.956036448798656
recall_score_micro 0.9174506828528073
matthews_correlation 0.5671655095707122
AUC_macro 0.9480915189001689
precision_score_weighted 0.9144300370563748
weighted_accuracy 0.9534107820230536
AUC_weighted 0.9480915189001689
average_precision_score_macro 0.8262963313994565
AUC_micro 0.9810333574344722
f1_score_weighted 0.9157390581665614
norm_macro_recall 0.5438973067833254
precision_score_micro 0.9174506828528073
precision_score_macro 0.7957642079824525
balanced_accuracy 0.7719486533916626
f1_score_macro 0.7830786607804303
average_precision_score_micro 0.9817572259267604
recall_score_macro 0.7719486533916626
f1_score_micro 0.9174506828528073
log_loss 0.18887089143646768
recall_score_weighted 0.9174506828528073
accuracy 0.9174506828528073
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_6c34739a-8e83-4cb7-8336-cc5dea122ca7_26/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_6c34739a-8e83-4cb7-8336-cc5dea

In [37]:
# Register the best AutoML run's output under the model name
# 'automl_best_model.pkl'; the returned Model is displayed as the cell output.
# NOTE(review): model_path points at the './outputs/' folder rather than a
# single model file -- confirm that registering the whole folder is intended.
best_run.register_model(model_name = 'automl_best_model.pkl', model_path = './outputs/')

Model(workspace=Workspace.create(name='quick-starts-ws-131626', subscription_id='9e65f93e-bdd8-437b-b1e8-0647cd6098f7', resource_group='aml-quickstarts-131626'), name=automl_best_model.pkl, id=automl_best_model.pkl:2, version=2, tags={}, properties={})