## Setup

In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-235168
Azure region: eastus2
Subscription id: 9a7511b8-150f-4a58-8528-3e7d50216c31
Resource group: aml-quickstarts-235168


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "eduProject1"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
try:
    cluster = ComputeTarget(ws, cluster_name)
    print("Found existing cluster!")
except ComputeTargetException:
    print("No cluster found, creating one...")
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2", min_nodes=0, max_nodes=4)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    cluster.wait_for_completion(show_output=True)


Found existing cluster!


## HyperDrive

In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    "--C": uniform(0.01, 1.0),
    "--max-iter": choice(10, 20, 50, 100, 200)
})

# Specify a Policy
policy = BanditPolicy(evaluation_interval=5, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
# Using SKLearn because ScriptRunConfig doesn't seem to work
# src = ### YOUR CODE HERE ###
compute_target = ws.compute_targets[cluster_name]
estimator = SKLearn(source_directory=".", compute_target=compute_target, entry_script="train.py")

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
# hyperdrive_config = ### YOUR CODE HERE ###
hyperdrive_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=5
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(config=hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_996f83d1-87a6-47f0-bbcf-e14004fb928c
Web View: https://ml.azure.com/runs/HD_996f83d1-87a6-47f0-bbcf-e14004fb928c?wsid=/subscriptions/9a7511b8-150f-4a58-8528-3e7d50216c31/resourcegroups/aml-quickstarts-235168/workspaces/quick-starts-ws-235168&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

[2023-06-06T15:36:51.554930][GENERATOR][INFO]Trying to sample '5' jobs from the hyperparameter space
[2023-06-06T15:36:51.9448943Z][SCHEDULER][INFO]Scheduling job, id='HD_996f83d1-87a6-47f0-bbcf-e14004fb928c_0' 
[2023-06-06T15:36:52.0821051Z][SCHEDULER][INFO]Scheduling job, id='HD_996f83d1-87a6-47f0-bbcf-e14004fb928c_1' 
[2023-06-06T15:36:52.2208024Z][SCHEDULER][INFO]Scheduling job, id='HD_996f83d1-87a6-47f0-bbcf-e14004fb928c_2' 
[2023-06-06T15:36:52.4147176Z][SCHEDULER][INFO]Scheduling job, id='HD_996f83d1-87a6-47f0-bbcf-e14004fb928c_4' 
[2023-06-06T15:36:52.3349755Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_996f83d1-87a6-47f0-bbcf-e14004fb928c_0

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process 'python' exited with status code 2. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\n  File \"train.py\", line 77, in <module>\n    main()\n  File \"train.py\", line 47, in main\n    args = parser.parse_args()\n  File \"/opt/miniconda/lib/python3.6/argparse.py\", line 1733, in parse_args\n    self.error(msg % ' '.join(argv))\n  File \"/opt/miniconda/lib/python3.6/argparse.py\", line 2389, in error\n    self.exit(2, _('%(prog)s: error: %(message)s\\n') % args)\n  File \"/opt/miniconda/lib/python3.6/argparse.py\", line 2376, in exit\n    _sys.exit(status)\nSystemExit: 2\n\n",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Execution failed. User process 'python' exited with status code 2. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\\n  File \\\"train.py\\\", line 77, in <module>\\n    main()\\n  File \\\"train.py\\\", line 47, in main\\n    args = parser.parse_args()\\n  File \\\"/opt/miniconda/lib/python3.6/argparse.py\\\", line 1733, in parse_args\\n    self.error(msg % ' '.join(argv))\\n  File \\\"/opt/miniconda/lib/python3.6/argparse.py\\\", line 2389, in error\\n    self.exit(2, _('%(prog)s: error: %(message)s\\\\n') % args)\\n  File \\\"/opt/miniconda/lib/python3.6/argparse.py\\\", line 2376, in exit\\n    _sys.exit(status)\\nSystemExit: 2\\n\\n\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [5]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print(f"Best run id: {best_run.id}")
print(f"Best run accuracy: {best_run_metrics['Accuracy']}")

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [None]:
best_run.get_file_names()

In [None]:
best_run.download_files()

In [None]:
model = best_run.register_model(
    model_name="model.joblib",
    model_path="outputs/model.joblib"
)

## AutoML

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dataset = TabularDatasetFactory.from_delimited_files(path=data_url)

In [7]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)

In [9]:
# Add target to x dataframe
x["y"] = y

In [12]:
datastore = ws.get_default_datastore()
train_dataset = TabularDatasetFactory.register_pandas_dataframe(
    dataframe=x,
    target=datastore,
    name="train_dataset"
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/f3002977-e2c0-423c-a04e-f7e820051c56/
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'job_admin.' -> 'job_admin_'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'education_basic.4y'

In [13]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=train_dataset,
    label_column_name="y",
    n_cross_validations=5,
    compute_target=compute_target
)

In [2]:
# Submit your automl run

### YOUR CODE HERE ###
automl_run = exp.submit(config=automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

In [None]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_automl_run, best_automl_model = automl_run.get_output()

In [None]:
best_automl_run.register_model(
    model_name="model.pkl",
    model_path="./outputs"
)

In [None]:
# Clean up
compute_target.delete()