In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-253977
Azure region: eastus2
Subscription id: 81cefad3-d2c9-4f77-a466-99a7f541c7bb
Resource group: aml-quickstarts-253977


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "proj-compute-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Cluster already created.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Cluster already created.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = RandomParameterSampling(
    {
        '--C': choice(0.01, 0.1, 0.5, 1),
        '--max_iter': choice(20, 40, 80, 120, 160, 200)
    }
)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='./conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory="./training",
                      script='./train.py',
                      compute_target=cpu_cluster,
                      environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
# run hyperdrive experiment
hyperdrive_run = exp.submit(config=hyperdrive_config)

In [6]:
# get run details
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()
print('Best Run Id: ', best_run.id)
print('\n')
print('Best Run Metrics:', best_run_metrics)

joblib.dump(value=best_run.id, filename='outputs/hd-best-sdk.joblib')
# model = best_run.register_model(model_name='hd-best-sdk', model_path='./outputs/hd-best-sdk.joblib')

Best Run Id:  HD_9f082c20-013a-49dd-9f77-4f8a8661bd19_15


Best Run Metrics: {'Max iterations:': 200, 'Regularization Strength:': 1.0, 'Accuracy': 0.909686817188638}


['outputs/hd-best-sdk.joblib']

In [17]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
url_path = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
dataset = Dataset.Tabular.from_delimited_files(path=url_path)
azure_ml_df = dataset.to_pandas_dataframe()
print(azure_ml_df.head())

   age          job  marital    education  default housing loan    contact  \
0   57   technician  married  high.school       no      no  yes   cellular   
1   55      unknown  married      unknown  unknown     yes   no  telephone   
2   33  blue-collar  married     basic.9y       no      no   no   cellular   
3   36       admin.  married  high.school       no      no   no  telephone   
4   27    housemaid  married  high.school       no     yes   no   cellular   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         1      failure         -1.8   
1   may         thu  ...         2    999         0  nonexistent          1.1   
2   may         fri  ...         1    999         1      failure         -1.8   
3   jun         fri  ...         4    999         0  nonexistent          1.4   
4   jul         fri  ...         2    999         0  nonexistent          1.4   

   cons.price.idx  cons.conf.idx  euribor3m 

In [18]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)
ml_data = x.join(y)

In [19]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=ml_data,
    label_column_name='y',
    n_cross_validations=5,
    enable_early_stopping = True,
    enable_onnx_compatible_models = True
    )

In [20]:
# Submit your automl run
experiment = Experiment(ws, "automl_project_experiment")
run = experiment.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
automl_project_experiment,AutoML_21624172-8f95-42f3-9c30-42b9899bff0b,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.





********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|3692                          |1                               |32950                                 |
+------------------------------+--------------------------------+--------------------------------------+

*********************************************************************



SparseNormalizer XGBoostClassifier             0:01:27             0.9141    0.9142
    4   MaxAbsScaler LightGBM                          0:00:58             0.9140    0.9142
    5   MaxAbsScaler LightGBM                          0:00:59             0.8880    0.9142
    6   StandardScalerWrapper XGBoostClassifier        0:01:22             0.9091    0.9142
    7   MaxAbsScaler LogisticRegression                0:01:07             0.9085    0.9142
    8   StandardScalerWrapper ExtremeRandomTrees       0:00:58             0.8880    0.9142
    9   StandardScalerWrapper XGBoostClassifier        0:01:18             0.9140    0.9142
   10   SparseNormalizer LightGBM                      0:01:00             0.9044    0.9142
   11   StandardScalerWrapper XGBoostClassifier        0:01:18             0.9151    0.9151
   12   MaxAbsScaler LogisticRegression                0:01:05             0.9082    0.9151
   13   VotingEnsemble                                 0:00:31             0.9165    0.9

2024-02-23:21:45:30,497 INFO     [explanation_client.py:334] Using default datastore for uploads


Current status: EngineeredFeatureExplanations. Computation of engineered features completed
Current status: RawFeaturesExplanations. Computation of raw features started
Current status: RawFeaturesExplanations. Computation of raw features completed
Current status: BestRunExplainModel. Best run model explanations completed
********************************************************************************************


In [25]:
# Retrieve and save your best automl model.
from azureml.automl.runtime.onnx_convert import OnnxConverter

# getting best model and saving it out with onnx
automl_run, automl_best_model = run.get_output(return_onnx_model=True)

# convert and save the model
OnnxConverter.save_onnx_model(automl_best_model, file_path="./outputs/automl_best_model.onnx")

In [None]:
# delete resources after use
AmlCompute.delete(cpu_cluster)