# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [4]:
import os
import shutil

import joblib
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

## Initialize Workspace

Load the existing workspace from the local configuration file using the AzureML SDK.

In [5]:
# Connect to the workspace via Workspace.from_config().
ws = Workspace.from_config()

# Query the workspace details; the returned value is not used in this cell.
ws.get_details()

# Summarise the workspace identity, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: wsptest
Azure region: eastus2
Subscription id: c04b3d3f-4994-454d-96ff-aa3f2050b57f
Resource group: testingmlfunctionnalities


## Cluster

Get the cluster if it already exists; otherwise, create one.

In [7]:
# Reuse the compute cluster with this name when the lookup succeeds;
# provision a new one when the lookup raises ComputeTargetException.
cpu_cluster_name = "Covid19Cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        identity_type="SystemAssigned",
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name')

# Block until the cluster reports completion, then show its detailed status.
cpu_cluster.wait_for_completion(show_output=True)
cluster_status = cpu_cluster.get_status()
print(cluster_status.serialize())

A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-24T14:11:51.667000+00:00', 'errors': None, 'creationTime': '2021-03-23T18:45:47.925166+00:00', 'modifiedTime': '2021-03-23T18:46:05.507627+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


## Dataset

In [8]:
# Use the dataset registered in the workspace under `key` when present;
# otherwise build it from the public CSV and register it.
key = "Covid19InfectionsDataset"
description_text = "Covid19 Vaccination DataSet from Github"

# Push the local CSV copy to the workspace's default datastore.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./github/owid-covid-data.csv'],
    target_path='train-dataset/tabular/',
    overwrite=True,
    show_progress=True,
)

found = key in ws.datasets.keys()
if found:
    dataset = ws.datasets[key]
else:
    original_path = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
    ds = TabularDatasetFactory.from_delimited_files(
        original_path,
        infer_column_types=True,
        separator=',',
        header=True,
    )

    # Register the dataset in the workspace so later runs can reuse it.
    dataset = ds.register(
        workspace=ws,
        name=key,
        description=description_text,
    )

# Missing values are replaced with 0 before summarising, so the statistics
# below include those zeros.
df = dataset.to_pandas_dataframe().fillna(0)
df.describe()

Uploading an estimated of 1 files
Uploading ./github/owid-covid-data.csv
Uploaded ./github/owid-covid-data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,median_age,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
count,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,...,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0,76832.0
mean,674568.8,5147.986737,5085.558725,17281.97,114.519862,113.40752,8010.50717,66.915829,65.905319,163.119118,...,27.691658,7.876215,5.020545,17414.898298,236.145449,7.25385,23.487756,2.546513,69.505478,0.667259
std,4764448.0,32244.967634,31709.509318,109493.8,674.980293,653.026183,15910.302831,168.806053,141.874786,319.547557,...,12.422652,6.481276,4.36924,19628.435803,133.793149,4.297155,33.327043,2.519171,17.53344,0.246902
min,0.0,-74347.0,-6223.0,0.0,-1918.0,-232.143,0.0,-2153.437,-276.825,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,686.0,1.0,4.429,10.0,0.0,0.0,125.407,0.061,0.748,1.67725,...,19.6,3.008,1.756,2896.913,140.448,4.61,0.0,0.7,66.47,0.555
50%,7946.5,56.0,63.714,131.0,1.0,0.857,1052.2965,5.9185,7.618,19.914,...,29.0,5.344,3.212,10727.146,233.07,6.93,0.0,2.0,74.16,0.737
75%,88402.25,665.25,672.4645,1738.0,11.0,11.571,7332.148,56.556,62.49675,142.83975,...,38.0,13.26,8.353,25063.846,318.949,9.75,47.782,3.6,78.49,0.828
max,124202100.0,880902.0,739564.429,2734098.0,17903.0,14431.429,150016.178,8652.658,2648.773,2357.24,...,48.2,27.049,18.493,116935.6,724.417,30.53,98.999,13.8,86.75,0.957


In [9]:
# Experiment under which the HyperDrive runs are submitted.
experiment_name = 'Covid19HyperDriveExperiment'
experiment = Experiment(workspace=ws, name=experiment_name)

## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for choosing the different hyperparameters, termination policy and config settings.

In [11]:
# Early termination policy (not required when using Bayesian sampling).
early_termination_policy = BanditPolicy(slack_factor=0.2, evaluation_interval=2, delay_evaluation=5)

# Hyperparameter grid passed to the training script as command-line arguments.
# The grid holds 5 x 4 = 20 combinations in total.
param_sampling = GridParameterSampling(
    {
        '--C': choice(0.01, 0.1, 1, 10, 100),
        '--max_iter': choice(25, 50, 100, 150)
    }
)

# Stage the training script and its data in a dedicated source directory.
# makedirs(..., exist_ok=True) creates missing parents and is a no-op when the
# folders already exist, so no separate existence check is needed.
training_folder = './HyperDrive_training/'
githubDataset_folder = './HyperDrive_training/github/'
os.makedirs(training_folder, exist_ok=True)
os.makedirs(githubDataset_folder, exist_ok=True)
shutil.copy('TrainCovid19Infections.py', training_folder)
shutil.copy('github/owid-covid-data.csv', githubDataset_folder)

# Estimator that runs the entry script on the compute cluster.
# NOTE(review): the SDK reports that 'SKLearn' is deprecated in favour of
# ScriptRunConfig with an explicit environment; consider migrating.
estimator = SKLearn(
    source_directory=training_folder,
    compute_target=cpu_cluster,
    entry_script='TrainCovid19Infections.py',
)

# NOTE(review): primary_metric_name presumably has to match the metric name
# logged by TrainCovid19Infections.py -- confirm against the script.
# max_total_runs (100) is larger than the 20 combinations in the grid above.
hyperdrive_run_config = HyperDriveConfig(
    hyperparameter_sampling=param_sampling,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=3,
    policy=early_termination_policy,
    estimator=estimator,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [12]:
# Submit the HyperDrive configuration to the experiment, tagged for later lookup.
tag = {"Covid19Infections": "Capstone project: Covid19 HyperDrive Experiment"}
remote_run = experiment.submit(
    config=hyperdrive_run_config,
    tags=tag,
    show_output=True,
)




## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

In [13]:
# Show the live run widget, then block until the HyperDrive run finishes.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()
remote_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_82c52506-96af-4781-9cd9-9a16e927ad99
Web View: https://ml.azure.com/experiments/Covid19HyperDriveExperiment/runs/HD_82c52506-96af-4781-9cd9-9a16e927ad99?wsid=/subscriptions/c04b3d3f-4994-454d-96ff-aa3f2050b57f/resourcegroups/testingmlfunctionnalities/workspaces/wsptest

Execution Summary
RunId: HD_82c52506-96af-4781-9cd9-9a16e927ad99
Web View: https://ml.azure.com/experiments/Covid19HyperDriveExperiment/runs/HD_82c52506-96af-4781-9cd9-9a16e927ad99?wsid=/subscriptions/c04b3d3f-4994-454d-96ff-aa3f2050b57f/resourcegroups/testingmlfunctionnalities/workspaces/wsptest



{'runId': 'HD_82c52506-96af-4781-9cd9-9a16e927ad99',
 'target': 'Covid19Cluster',
 'status': 'Canceled',
 'startTimeUtc': '2021-03-24T15:01:49.190732Z',
 'endTimeUtc': '2021-03-24T15:14:30.91303Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '2be996af-d266-4a92-9066-1ae191b2fea1'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://wspteststorage3a9c0a49cc.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_82c52506-96af-4781-9cd9-9a16e927ad99/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=kyoTjlyUQTOXlW3tFv6omUSkOJqnL062yh03%2FNhorGI%3D&st=2021-03-24T15%3A07%3

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Best Model

In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [None]:
# Get the best child run according to the primary metric.
best_run_HyperDr = remote_run.get_best_run_by_primary_metric()

# No best run is available when none of the child runs logged the primary
# metric (for example when the child runs failed); stop with a clear message
# instead of failing on the attribute access below.
if best_run_HyperDr is None:
    raise RuntimeError(
        "No child run logged the primary metric 'Accuracy'; "
        "check the HyperDrive run status and the child-run logs."
    )

# Get the metrics of the best selected run
best_run_metrics = best_run_HyperDr.get_metrics()
# Show the Accuracy of that run
print('Best Accuracy: {}'.format(best_run_metrics['Accuracy']))
best_run_HyperDr

In [None]:
# Command-line arguments the best run was launched with.
parameter_values = best_run_HyperDr.get_details()['runDefinition']['arguments']

# Pair each flag with the value that follows it so the hyperparameters can be
# looked up by name rather than by position in the argument list.
# NOTE(review): assumes the arguments alternate flag, value -- confirm.
hyperparameters = dict(zip(parameter_values[::2], parameter_values[1::2]))

print('Best Run Id: ', best_run_HyperDr.id)
print('\n Accuracy:', best_run_metrics['Accuracy'])
# Fall back to the original positions if a flag is not found by name.
print('\n C:', hyperparameters.get('--C', parameter_values[1]))
print('\n max_iter:', hyperparameters.get('--max_iter', parameter_values[3]))


In [None]:
# Register the model file produced by the best run in the workspace.
Hyp_DrCovid19_Model = best_run_HyperDr.register_model(
    model_name="HyperDrCovid19Model",
    model_path='outputs/model.joblib',
)

# Report the registered model's name and version, tab-separated.
registration_fields = [Hyp_DrCovid19_Model.name, ": Version Number", Hyp_DrCovid19_Model.version]
print('\t'.join(str(field) for field in registration_fields))