# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
from azureml.core import Workspace, Experiment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice

## Dataset

### Overview
I am using credit card transaction data from Kaggle (more details are in the README file). The problem is to predict whether a transaction is fraudulent (1) or not (0). This is a supervised binary classification task. 

In [2]:
# Connect to the workspace via Workspace.from_config() and open the experiment by name.
workspace = Workspace.from_config()
experiment_name = 'creditcardfraud'
experiment = Experiment(workspace, experiment_name)

# Print the workspace details, one field per line.
print(
    *[
        'Workspace name: ' + workspace.name,
        'Azure region: ' + workspace.location,
        'Subscription id: ' + workspace.subscription_id,
        'Resource group: ' + workspace.resource_group,
    ],
    sep='\n',
)

# Start an interactive logging run on the experiment.
run = experiment.start_logging()

Workspace name: AzureML_Nirmal_Test
Azure region: westus2
Subscription id: 10c5d508-c599-42ff-85c4-c15b92f298b5
Resource group: nirmal-test


In [3]:
from azureml.core import Workspace, Dataset

# Look up the dataset registered as 'creditcard' in the workspace and
# convert it into a pandas DataFrame for inspection.
dataset = Dataset.get_by_name(workspace=workspace, name='creditcard')
df = dataset.to_pandas_dataframe()

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Hyperdrive Configuration


In [5]:
# Make sure the compute cluster is set up: reuse the named cluster when the
# lookup succeeds, otherwise provision a new one.
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "automl-compute"
try:
    cpu_cluster = ComputeTarget(workspace=workspace, name=cpu_cluster_name)
except ComputeTargetException:
    # The lookup failed, so create the cluster with up to 4 STANDARD_D2_V2 nodes.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    # The workspace handle in this notebook is `workspace`; the previous `ws`
    # was never defined and raised NameError on this branch.
    cpu_cluster = ComputeTarget.create(workspace, cpu_cluster_name, compute_config)

# Block until the cluster reports that provisioning has finished.
cpu_cluster.wait_for_completion(show_output=True)

Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Explanation of the termination policy and parameter sampling

I selected a Bandit policy for early termination. It defines an early-termination rule based on a slack criterion, together with a frequency and a delay interval for evaluation. The main reason for choosing this policy is to improve performance and save resources: any run whose primary metric does not fall within the slack factor (or slack amount) of the best-performing run is terminated, which frees the compute resource. Concretely, the configuration used in my HyperDrive config evaluates runs at every interval (`evaluation_interval=1`), starting after the first 5 intervals (`delay_evaluation=5`), and terminates runs that are not within 10 percent slack (`slack_factor=0.1`) of the best-performing run at that interval. On larger models, this strategy typically saves significant compute time with no impact on the performance of the best model trained. For parameter sampling, I use random sampling: the regularization strength `--C` is drawn uniformly between 0.1 and 1, and `--max_iter` is chosen from 50, 100, 150, or 200.

In [6]:
# HyperDrive search setup for the logistic regression training script.

import os

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling

# Parameter sampler: random sampling over the two script arguments.
# --C is drawn with uniform(0.1, 1); --max_iter is picked from four fixed values.
ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200),
})

# Early-termination policy: Bandit with a slack factor of 0.1, evaluated at
# every interval, with evaluation delayed by 5 intervals.
policy = BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=5)

# Create a "training" folder in the current directory if it is not listed there.
# NOTE(review): the estimator below uses source_directory='.', not this folder —
# confirm whether the folder is still needed.
if "training" not in os.listdir():
    os.mkdir("./training")

# SKLearn estimator that runs training1.py from the current directory on the cluster.
est = SKLearn(
    source_directory='.',
    entry_script='training1.py',
    compute_target=cpu_cluster,
)

# HyperDriveConfig tying together the estimator, sampler, and policy:
# maximize the 'Accuracy' metric over at most 24 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=24,
    max_concurrent_runs=4,
)

In [7]:
# Submit the HyperDrive configuration to the experiment, show the run-details
# widget, and then wait for the whole sweep to complete while streaming output.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_855411d3-c568-4296-b9bf-5f2993eb9a3d
Web View: https://ml.azure.com/experiments/creditcardfraud/runs/HD_855411d3-c568-4296-b9bf-5f2993eb9a3d?wsid=/subscriptions/10c5d508-c599-42ff-85c4-c15b92f298b5/resourcegroups/nirmal-test/workspaces/AzureML_Nirmal_Test

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-15T00:32:22.318834][API][INFO]Experiment created<END>\n""<START>[2021-02-15T00:32:23.030582][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-15T00:32:23.409593][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_855411d3-c568-4296-b9bf-5f2993eb9a3d
Web View: https://ml.azure.com/experiments/creditcardfraud/runs/HD_855411d3-c568-4296-b9bf-5f2993eb9a3d?wsid=/subscriptions/10c5d508-c599-42ff-85c4-c15b92f298b5/resourcegroups/nirmal-test/workspaces/AzureML_Nirmal_Test



{'runId': 'HD_855411d3-c568-4296-b9bf-5f2993eb9a3d',
 'target': 'automl-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-15T00:32:22.141031Z',
 'endTimeUtc': '2021-02-15T01:10:28.916091Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '7e1380a8-af43-4b05-8e4b-cbdb88e7ce51',
  'score': '0.999180730788137',
  'best_child_run_id': 'HD_855411d3-c568-4296-b9bf-5f2993eb9a3d_3',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://azuremlnirmalt0578080815.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_855411d3-c568-4296-b9bf-5f2993eb9a3d/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=JQybu5esSJPnrhvrObr2%2Fi%2BngGc%2BUH%2BfPNp3WOjOhBI%3D&st=2021-02-15T01%3A00%3A29Z&se=2021-02-15T09%3A10%3A29Z&sp=r'}}

## Best Model



In [8]:
# Retrieve the best child run (by the primary metric), its logged metrics,
# and the command-line arguments it was launched with.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

# Report the run id, all metrics, the sampled arguments, and the accuracy.
print('Best Run ID : ', best_run.id)
print('\nMetrics: ', best_run_metrics)
print('\nParameters:', parameter_values)
print('\nAccuracy: ', best_run_metrics['Accuracy'])

Best Run ID :  HD_855411d3-c568-4296-b9bf-5f2993eb9a3d_3

Metrics:  {'Regularization Strength:': 0.11220515418007793, 'Max iterations:': 200, 'Accuracy': 0.999180730788137}

Parameters: ['--C', '0.11220515418007793', '--max_iter', '200']

Accuracy:  0.999180730788137


In [10]:
# NOTE(review): this prints the repr of the bound method `log_confusion_matrix`
# itself (the method is not called), so no confusion matrix is shown here.
print(best_run.log_confusion_matrix)

<bound method Run.log_confusion_matrix of Run(Experiment: creditcardfraud,
Id: HD_855411d3-c568-4296-b9bf-5f2993eb9a3d_3,
Type: azureml.scriptrun,
Status: Completed)>


In [9]:
# Register the best run's model file in the workspace as a scikit-learn model.
import joblib
from azureml.core.model import Model

model = best_run.register_model(
    model_framework=Model.Framework.SCIKITLEARN,
    model_path='./outputs/model.pkl',
    model_name='hyperdrive_best_model',
)



###  The AutoML model was selected for deployment, so the deployment code is in the automl.ipynb file