In [15]:
from azureml.core import Workspace, Experiment

# Connect to the workspace via Workspace.from_config() and open the experiment
# that every run in this notebook is submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project-lancia")

# Report which workspace we are attached to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive run on the experiment.
# NOTE(review): no matching run.complete() is visible in this part of the notebook - confirm it is closed later.
run = exp.start_logging()

Workspace name: aml-wwe-ictx-dsplay
Azure region: westeurope
Subscription id: ec5ba19e-6205-418f-a52d-d0943090ca16
Resource group: rg-wwe-ictx-dsplayground


In [61]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create (or reuse) the compute cluster that the HyperDrive and AutoML runs target.
# The provisioning configuration uses vm_size = "Standard_D2_V2" and max_nodes = 4
# (the project brief asks for no more than 4 nodes).

cluster_name = "alpha"

try:
    # Look the cluster up first so that an existing one is reused on purpose,
    # instead of relying on a failed create call to signal that it exists.
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print("using exisiting cluster")
except ComputeTargetException:
    # Only the lookup failure leads to provisioning; unrelated errors
    # (authentication, quota, network, ...) are no longer swallowed.
    provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute_cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
    # Block until provisioning has finished so later cells can submit to the cluster.
    compute_cluster.wait_for_completion(show_output=True)


using exisiting cluster


In [17]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Hyperparameter search space: random sampling over the two command-line
# arguments that are handed to train.py.
ps = RandomParameterSampling(
    {
        "--C": uniform(0.1, 0.99),
        "--max_iter": choice(25, 50, 100, 200),
    }
)

# Early-termination policy for the sweep (see the BanditPolicy documentation
# for the exact meaning of slack_factor / evaluation_interval / delay_evaluation).
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# Make sure the local "training" folder exists. exist_ok=True keeps the cell
# re-runnable, and a regular file with that name now raises instead of being
# silently accepted as if it were the folder.
os.makedirs("./training", exist_ok=True)

# SKLearn estimator that runs train.py from the current directory on the cluster.
est = SKLearn(compute_target=compute_cluster, entry_script="train.py", source_directory=".")

# HyperDrive sweep: maximise the "Accuracy" metric over at most 20 runs,
# with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)



In [18]:
# Launch the HyperDrive sweep, attach the notebook widget to it, then block
# until the sweep has finished (the final expression displays the run details).

hdr = exp.submit(config=hyperdrive_config)

hyperdrive_widget = RunDetails(hdr)
hyperdrive_widget.show()

hdr.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_1102f1ae-c57f-442e-b348-e03d4b716dc6
Web View: https://ml.azure.com/runs/HD_1102f1ae-c57f-442e-b348-e03d4b716dc6?wsid=/subscriptions/ec5ba19e-6205-418f-a52d-d0943090ca16/resourcegroups/rg-wwe-ictx-dsplayground/workspaces/aml-wwe-ictx-dsplay&tid=c16e514b-893e-4a01-9a30-b8fef514a650

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-08-17T10:15:36.361428][API][INFO]Experiment created<END>\n""<START>[2021-08-17T10:15:37.119835][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-08-17T10:15:37.334793][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_1102f1ae-c57f-442e-b348-e03d4b716dc6
Web View: https://ml.azure.com/runs/HD_1102f1ae-c57f-442e-b348-e03d4b716dc6?wsid=/subscriptions/ec5ba19e-6205-418f-a52d-d0943090ca16/resourcegroups/rg-wwe-ictx-dsplayground/workspaces/aml-wwe-ictx-dsplay&tid=c16e514b-893e-4a01-9a30-b8fef514a650



{'runId': 'HD_1102f1ae-c57f-442e-b348-e03d4b716dc6',
 'target': 'alpha',
 'status': 'Completed',
 'startTimeUtc': '2021-08-17T10:15:36.132833Z',
 'endTimeUtc': '2021-08-17T10:35:57.869309Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'cbc610a2-45a4-4a08-b632-25d0049e2c04',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1047-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0',
  'score': '0.9110722825087364',
  'best_child_run_id': 'HD_1102f1ae-c57f-442e-b348-e03d4b716dc6_17',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://sawweictddsplayaml.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_1102f1ae-c57f-442e-b348-e03d4b716dc6/azureml-logs/hyperdrive.txt?sv=2019-07-07&sr=b&sig=vsbOu9GFuNOBcg3w20vKNjo

In [27]:
import joblib
# Pick the child run with the best primary metric, then collect its logged
# metrics and the arguments it was launched with.

best_run_hdr = hdr.get_best_run_by_primary_metric()
best_run_metrics_hdr = best_run_hdr.get_metrics()
best_run_details_hdr = best_run_hdr.get_details()
best_params_hdr = best_run_details_hdr['runDefinition']['arguments']

print("\n------------")
print(
    'Best run ID: ', best_run_hdr.id,
    '\nBest run Accuracy: ', best_run_metrics_hdr['Accuracy'],
    '\n---------\nMetrics: ', best_run_metrics_hdr,
    '\nparams: ', best_params_hdr,
)
print("------------\n")

# Register the model file from the best run's outputs
# (presumably written by train.py - verify against that script).
model = best_run_hdr.register_model(
    model_name='azmle-project-lancia',
    model_path='outputs/model.joblib',
)


------------
Best run ID:  HD_1102f1ae-c57f-442e-b348-e03d4b716dc6_17 
Best run Accuracy:  0.9110722825087364 
---------
Metrics:  {'Regularization Strength:': 0.19176212672425075, 'Max iterations:': 200, 'Accuracy': 0.9110722825087364} 
params:  ['--C', '0.19176212672425075', '--max_iter', '200']
------------



In [28]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset straight from the public bank-marketing training CSV.
data_path_ = (
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
)

ds = TabularDatasetFactory.from_delimited_files(
    path=data_path_,
)


In [34]:
import os

from train import clean_data

from sklearn.model_selection import train_test_split
from azureml.core import Dataset
from pandas import concat

# Clean the raw dataset into features (x) and target (y) with the helper from train.py.
x, y = clean_data(ds)

# Fixed random_state so the 80/20 split is reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Training split with the target column appended, as AutoML expects a single table.
train_data = concat([x_train, y_train], axis=1)

# to_csv does not create missing folders, so make sure the output folder exists
# first; exist_ok=True keeps this cell safe to re-run.
os.makedirs("./training/data", exist_ok=True)
train_data.to_csv("./training/data/prepared_data.csv", index=False)
target_label = y_test.name

# Get the workspace's default datastore and upload the prepared data folder to it.
datastore = ws.get_default_datastore()
datastore.upload(src_dir='training/data/', target_path='udacity-lancia', overwrite=True)

# Create a dataset referencing the uploaded CSV in the datastore.
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('udacity-lancia/prepared_data.csv'))])


Uploading an estimated of 2 files
Uploading training/data/.amlignore.amltmp
Uploaded training/data/.amlignore.amltmp, 1 files out of an estimated total of 2
Uploading training/data/prepared_data.csv
Uploaded training/data/prepared_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [47]:
# Display the compute target's repr to confirm which cluster the AutoML run will use.
compute_cluster

AmlCompute(workspace=Workspace.create(name='aml-wwe-ictx-dsplay', subscription_id='ec5ba19e-6205-418f-a52d-d0943090ca16', resource_group='rg-wwe-ictx-dsplayground'), name=alpha, id=/subscriptions/ec5ba19e-6205-418f-a52d-d0943090ca16/resourceGroups/rg-wwe-ictx-dsplayground/providers/Microsoft.MachineLearningServices/workspaces/aml-wwe-ictx-dsplay/computes/alpha, type=AmlCompute, provisioning_state=Succeeded, location=westeurope, tags=None)

In [48]:
from azureml.train.automl import AutoMLConfig

# AutoML settings.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes VALUE OR YOUR INSTANCE WILL TIME OUT.
# Running the experiment for longer requires your own Azure tenant,
# which will incur personal costs.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": 'classification',
    "primary_metric": 'accuracy',
    "n_cross_validations": 3,
}

# Data, label column and compute target are passed explicitly; the scalar
# settings above are unpacked as keyword arguments.
automl_config = AutoMLConfig(
    training_data=dataset,
    label_column_name=target_label,
    compute_target=compute_cluster,
    **automl_settings,
)

In [49]:
# Launch the AutoML run, attach the notebook widget to it, then block until
# it has finished (the final expression displays the run details).

automl = exp.submit(config=automl_config)

automl_widget = RunDetails(automl)
automl_widget.show()

automl.wait_for_completion(show_output=True)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-lancia,AutoML_f6ec413f-d15d-48ba-88a2-253e16b11867,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-lancia,AutoML_f6ec413f-d15d-48ba-88a2-253e16b11867,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the sm

{'runId': 'AutoML_f6ec413f-d15d-48ba-88a2-253e16b11867',
 'target': 'alpha',
 'status': 'Completed',
 'startTimeUtc': '2021-08-17T13:26:57.20582Z',
 'endTimeUtc': '2021-08-17T14:14:36.075184Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'alpha',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"9e411132-f23a-4a9a-ac92-7a4fa9bd16f3\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.0", "azureml-train-automl-runtime": "1.28.0", "azureml-train-automl-client": "1.28.0", "azureml-tensorbo

In [59]:
# Fetch the best AutoML child run with its fitted model, report its metrics,
# and register the model.

import joblib

# get_output() yields a (best run, fitted model) pair.
automl_output = automl.get_output()
best_run_aml, best_model_aml = automl_output
best_run_metrics = best_run_aml.get_metrics()

print("\n------------\n")
print(
    'Best run ID: ', best_run_aml.id,
    '\n Best run Accuracy: ', best_run_metrics['accuracy'],
    '\n ----- \n Metrics: ', best_run_metrics,
)
print("\n------------\n")


# Register the model file from the best run's outputs.
model = best_run_aml.register_model(
    model_name='azmle-project-lancia-automl',
    model_path='outputs/model.pkl',
)

Package:azureml-automl-runtime, training version:1.33.0, current version:1.28.0.post2
Package:azureml-core, training version:1.33.0, current version:1.28.0
Package:azureml-dataprep, training version:2.20.1, current version:2.15.0
Package:azureml-dataprep-native, training version:38.0.0, current version:33.0.0
Package:azureml-dataprep-rslex, training version:1.18.0, current version:1.13.0
Package:azureml-dataset-runtime, training version:1.33.0, current version:1.28.0
Package:azureml-defaults, training version:1.33.0, current version:1.28.0
Package:azureml-interpret, training version:1.33.0, current version:1.28.0
Package:azureml-mlflow, training version:1.33.0, current version:1.28.0
Package:azureml-pipeline-core, training version:1.33.0, current version:1.28.0
Package:azureml-telemetry, training version:1.33.0, current version:1.28.0
Package:azureml-train-automl-client, training version:1.33.0, current version:1.28.0
Package:azureml-train-automl-runtime, training version:1.33.0, curre


------------

Best run ID:  AutoML_f6ec413f-d15d-48ba-88a2-253e16b11867_15 
 Best run Accuracy:  0.917261083314545 
 ----- 
 Metrics:  {'precision_score_macro': 0.7967583912474542, 'balanced_accuracy': 0.7689281392551202, 'average_precision_score_macro': 0.8236265727767828, 'accuracy': 0.917261083314545, 'AUC_micro': 0.9807896829491943, 'AUC_macro': 0.9475251647697848, 'AUC_weighted': 0.9475251647697848, 'recall_score_macro': 0.7689281392551202, 'precision_score_weighted': 0.9137068727595045, 'f1_score_macro': 0.7817680465405993, 'recall_score_weighted': 0.917261083314545, 'average_precision_score_micro': 0.9815609134104101, 'log_loss': 0.24254054359398702, 'f1_score_weighted': 0.9152054276617378, 'precision_score_micro': 0.917261083314545, 'f1_score_micro': 0.917261083314545, 'matthews_correlation': 0.564930386548181, 'weighted_accuracy': 0.9541068751651135, 'recall_score_micro': 0.917261083314545, 'average_precision_score_weighted': 0.9552065810345137, 'norm_macro_recall': 0.5378562