# Hyperparameter Tuning using HyperDrive

Import all the dependencies and define the notebook-wide configuration values.

In [1]:
import os
import joblib
import pandas as pd
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from sklearn.metrics import confusion_matrix
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig
from azureml.core.model import Model
from azureml.core.environment import Environment
from train import clean_data

# The `from azureml.core import ...` statements above do not bind the name
# `azureml` itself, so import the package explicitly before reading VERSION;
# otherwise this cell raises NameError on a fresh kernel.
import azureml.core

print("SDK version:", azureml.core.VERSION)

# Notebook-wide configuration: names used for the experiment, the compute
# cluster and the registered dataset throughout the cells below.
experiment_name = 'Mushroom'
amlcompute_cluster_name = "comp"
ds_key = "Mushroom"
ds_description_text = "This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms"
project_folder = './mushroom-project'



SDK version: 1.20.0


## Workspace
Load the workspace from its config file, open the experiment, and get (or create) the compute target.

In [2]:
# Connect to the workspace described by the local config file and open the
# experiment that all runs in this notebook are attached to.
ws = Workspace.from_config()
experiment = Experiment(ws, experiment_name)

print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

# NOTE(review): this starts an interactive run that none of the visible cells
# use or complete -- confirm it is needed, and call run.complete() when done.
run = experiment.start_logging()

# Reuse the compute cluster when it already exists; otherwise provision it.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
    # No min_nodes is configured above (the SDK documents a default of 0), so
    # waiting for a node count of 1 could only end by hitting the timeout.
    # Wait for the cluster's own minimum instead.
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=10)
    # For a more detailed view of current AmlCompute status, use get_status().

quick-starts-ws-139396
aml-quickstarts-139396
southcentralus
81cefad3-d2c9-4f77-a466-99a7f541c7bb
Found existing cluster, use it.


## Dataset

In [3]:
# Reuse the dataset registered under `ds_key` when the workspace already has
# it; otherwise build it from the public CSV and register it.
registered_datasets = ws.datasets  # read the workspace listing once
found = ds_key in registered_datasets

if found:
    dataset = registered_datasets[ds_key]
else:
    # Create AML Dataset and register it into Workspace
    example_data = 'https://raw.githubusercontent.com/mixmasteru/MLEND-capstone/main/data/mushrooms.csv'
    dataset = Dataset.Tabular.from_delimited_files(example_data)
    # Register Dataset in Workspace
    dataset = dataset.register(workspace=ws,
                               name=ds_key,
                               description=ds_description_text)

df = dataset.to_pandas_dataframe()

# Only the last expression of a cell is rendered automatically, so the summary
# has to be displayed explicitly (display() is provided by the IPython kernel).
display(df.describe())
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,True,p,False,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,True,a,False,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,True,l,False,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,True,p,False,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,False,n,False,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Environment
Create the scikit-learn environment from the conda specification file `sklearn_dependencies.yml`.

In [4]:
from azureml.core import Environment

# Build the training environment from the conda specification file that sits
# next to this notebook.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='sklearn_dependencies.yml',
)


## Hyperdrive Configuration



In [5]:
from azureml.core import ScriptRunConfig

# Make sure the local ./training folder exists (no error when it already does).
os.makedirs("./training", exist_ok=True)

# Run configuration for a single training job: train.py from this directory,
# executed on the compute cluster inside the sklearn environment.
src = ScriptRunConfig(source_directory='./',
                      script='train.py',
                      compute_target=compute_target,
                      environment=sklearn_env)

# Early termination policy (Bandit): evaluated at every interval after an
# initial delay of 5 intervals, with a slack factor of 0.1.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# Random sampling over the two hyperparameters passed to train.py:
# --C is drawn uniformly from [0.1, 2.0], --max_iter is one of 50/100/250.
ps = RandomParameterSampling(
    {
        "C": uniform(0.1, 2.0),
        "max_iter": choice(50, 100, 250),
    }
)

# Create a HyperDriveConfig from the run configuration, the hyperparameter
# sampler and the early termination policy. The policy must be passed
# explicitly; without it the BanditPolicy above is never applied.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=30,
                                     max_duration_minutes=30,
                                     max_concurrent_runs=4)


## Run Details

The `RunDetails` widget shows the progress and results of the individual HyperDrive child runs.

In [6]:
# Submit the HyperDrive configuration to the experiment.
hyperdrive_run = experiment.submit(hyperdrive_config)

# Show the live monitoring widget for the sweep.
details_widget = RunDetails(hyperdrive_run)
details_widget.show()

# Block until the sweep has finished, streaming its log output; the returned
# run details are rendered as the cell result.
hyperdrive_run.wait_for_completion(show_output=True)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_b1ca1680-5e10-484b-9e90-454175bac2b7
Web View: https://ml.azure.com/experiments/Mushroom/runs/HD_b1ca1680-5e10-484b-9e90-454175bac2b7?wsid=/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourcegroups/aml-quickstarts-139396/workspaces/quick-starts-ws-139396

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-23T20:07:02.873433][API][INFO]Experiment created<END>\n""<START>[2021-02-23T20:07:03.695374][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-23T20:07:03.976950][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-23T20:07:04.8213867Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_b1ca1680-5e10-484b-9e90-454175bac2b7
Web View: https://ml.azure.com/experiments/Mushroom/runs/HD_b1ca1680-5e10-484b-9e90-454175bac2b7?wsid=/subscriptions/81cefad3-d2c9-4f77-

{'runId': 'HD_b1ca1680-5e10-484b-9e90-454175bac2b7',
 'target': 'comp',
 'status': 'Completed',
 'startTimeUtc': '2021-02-23T20:07:02.514988Z',
 'endTimeUtc': '2021-02-23T20:38:44.033528Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'bb59235f-b117-441f-b695-83fbb6fd5457',
  'score': '1.0',
  'best_child_run_id': 'HD_b1ca1680-5e10-484b-9e90-454175bac2b7_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg139396.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_b1ca1680-5e10-484b-9e90-454175bac2b7/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=w3OBQ8FCKoxPc7MgUatjkItvLTUZJ4EtHQYDlomMtig%3D&st=2021-02-23T20%3A28%3A59Z&se=2021-02-24T04%3A38%3A59Z&sp=r'},
 'submittedBy': 'ODL_User 139396'}

## Best Model

Get the best model from the HyperDrive runs, display its hyperparameters and metrics, and evaluate it with a confusion matrix.

In [8]:
# Retrieve the child run with the best primary metric and show how it was
# configured, what it logged and which files it produced.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError("HyperDrive did not return a best run; check that the child runs logged 'Accuracy'.")

print(best_run.get_details()['runDefinition']['arguments'])
print(best_run.get_metrics())
print(best_run.get_file_names())

# Save the best model locally. Pass an explicit file path (not a bare
# directory) and make sure the target folder exists on a fresh checkout.
os.makedirs("./outputs", exist_ok=True)
best_run.download_file(name="outputs/model.joblib", output_file_path="./outputs/model.joblib")

# joblib.load unpickles the file: only do this for artifacts produced by your
# own runs, and keep the local scikit-learn version in line with the training
# environment to avoid version-mismatch warnings on load.
best_model = joblib.load("./outputs/model.joblib")

# NOTE(review): this evaluates on the full registered dataset, which presumably
# includes the rows train.py trained on -- confirm against train.py and use a
# held-out split if an unbiased estimate is needed.
df_test = dataset.to_pandas_dataframe()
X_test, y_test = clean_data(df_test)

ypred = best_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)

# Visualize the confusion matrix
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

['--C', '0.6023091332838605', '--max_iter', '100']
{'Regularization Strength:': 0.6023091332838605, 'Max iterations:': 100, 'Accuracy': 1.0}


Trying to unpickle estimator LogisticRegression from version 0.23.2 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.


Unnamed: 0,0,1
0,3896,2
1,0,4016
