# Hyperparameter Tuning using HyperDrive

Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pkg_resources

import sklearn  

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core import Environment

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import shutil


# Report which azureml-core SDK version this kernel is running
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.26.0


## Dataset

### Overview
The dataset used is the Credit Card Fraud Detection dataset available on Kaggle (https://www.kaggle.com/mlg-ulb/creditcardfraud); the goal is to predict whether a banking transaction is fraudulent based on several features. The dataset was uploaded to Azure Blob Storage in the Notebooks section and can be consumed as follows:

In [2]:
# Obtain the workspace handle through Workspace.from_config()
ws = Workspace.from_config()

# Experiment under which every run of this notebook is recorded
experiment_name = 'fraud_detection_hypervisor'
experiment = Experiment(ws, experiment_name)

# One line per workspace property
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# NOTE(review): this interactive run is not used or completed in any cell
# visible here - confirm it is needed, otherwise it stays open in the workspace.
run = experiment.start_logging()
print(experiment)

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code FL2ANNQ7W to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-143132
Azure region: southcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-143132
Experiment(Name: fraud_detection_hypervisor,
Workspace: quick-starts-ws-143132)


A pandas DataFrame is created to explore the dataset.

In [3]:
# Look up the dataset registered in the workspace under this name
dataset = Dataset.get_by_name(ws, 'Banking-Transactions')
# Materialise it as a pandas DataFrame for local exploration
transactions_df = dataset.to_pandas_dataframe()
# Preview the first rows (rendered as the cell output)
transactions_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


This function is defined to explore the dataset and see the proportions of fraudulent and genuine transactions.

In [4]:
# Calculate the fraction of data points that are fraudulent
def fraudulent_percentage(transaction_df):
    '''Calculate the fraction of data points whose 'Class' label is 1 (fraudulent).

       :param transaction_df: Dataframe of transaction data points; has a column 'Class'.
           An object exposing ``to_pandas_dataframe()`` is also accepted and is
           converted to a dataframe first.
       :return: Fraction (0.0 to 1.0) of fraudulent points among all points
           labelled 0 or 1; 0.0 when there are no such points.
    '''
    # Accept a dataset handle as well as a dataframe so either kind of caller works.
    if hasattr(transaction_df, 'to_pandas_dataframe'):
        transaction_df = transaction_df.to_pandas_dataframe()

    # counts for all classes, taken from the argument rather than a notebook global
    counts = transaction_df['Class'].value_counts()

    # a class that never occurs is absent from value_counts(), so default to 0
    fraud_cnts = counts.get(1, 0)
    valid_cnts = counts.get(0, 0)

    # guard against dividing by zero when neither class is present
    total_cnts = fraud_cnts + valid_cnts
    if total_cnts == 0:
        return 0.0

    # fraction of fraudulent data
    return fraud_cnts / total_cnts

Here you can see the proportions of fraudulent and genuine transactions.

In [5]:
# call the function to calculate the fraud percentage;
# it documents a dataframe argument, so pass the dataframe rather than the dataset handle
fraud_percentage = fraudulent_percentage(transactions_df)

print('Fraudulent percentage = ', fraud_percentage)
# fraction * number of rows gives back the fraudulent row count (as a float)
print('Total # of fraudulent pts: ', fraud_percentage*transactions_df.shape[0])
print('Out of (total) pts: ', transactions_df.shape[0])

Fraudulent percentage =  0.001727485630620034
Total # of fraudulent pts:  492.0
Out of (total) pts:  284807


In [6]:
# Features: every column except the last one, which the preview shows to be 'Class'.
# NOTE(review): this keeps the 'Unnamed: 0' index column as a feature - confirm that is intended.
x = transactions_df.iloc[:, :-1]
# Label: the 'Class' column
y = transactions_df['Class']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=50,
    shuffle=True,
)


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster to reuse or create
amlcompute_cluster_name = "compute-cluster"

try:
    # Try to attach to a cluster with that name in the workspace
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # This branch treats the exception as "no such cluster" and provisions one
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('The cluster already exists')

# Block until the cluster reports completion, streaming progress to the output
aml_compute.wait_for_completion(show_output=True)

Creating...
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for choosing the different hyperparameters, termination policy and config settings.

In [8]:
from sklearn.ensemble import RandomForestClassifier
# Create an Azure environment
curated_env = Environment.get(workspace=ws, name='AzureML-Tutorial')
# Create an early termination policy. This is not required if you are using Bayesian sampling.
early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

# Hyperparameter search space: each child run draws one value per argument
# at random from the listed choices and receives it as a command-line argument.
param_sampling = RandomParameterSampling({
    "--n_estimators": choice(1, 2, 4, 8, 16, 32, 64, 100, 150, 200),
    "--max_depth": choice(1, 2, 4, 8, 16, 32),
    "--min_samples_split": choice(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
    "--max_features": choice(1, 5, 10 , 15, 20, 25, 30)
})

# Folder that is handed to the run configuration as its source directory.
script_folder = "./training"

# Create the folder if needed; exist_ok makes re-running this cell safe.
os.makedirs(script_folder, exist_ok=True)

# Copy the training script into the script_folder
shutil.copy("./train.py", script_folder)

# Create a ScriptRunConfig for use with train.py, pointing at the same folder
# the script was just copied into.
est = ScriptRunConfig(source_directory=script_folder,
        script="train.py",
        compute_target=aml_compute,
        environment=curated_env
)

# HyperDrive sweep: maximise 'AUC_weighted' over at most 20 child runs,
# with up to 4 running at the same time.
hyperdrive_run_config = HyperDriveConfig(run_config=est,
                                     hyperparameter_sampling=param_sampling,
                                     policy=early_termination_policy,
                                     primary_metric_name='AUC_weighted',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [9]:
# Launch the HyperDrive sweep under the experiment and keep the parent run handle
hyperdrive_run = experiment.submit(hyperdrive_run_config)


## Run Details

Use the `RunDetails` widget to show the different experiments.

In [10]:
# Build the RunDetails widget for the HyperDrive run, then render it
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
# Block until the HyperDrive run finishes, streaming its log while waiting.
# As the last expression of the cell, the call's return value is shown as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_baa80ab3-bbd8-49c0-b965-903519dac4fa
Web View: https://ml.azure.com/runs/HD_baa80ab3-bbd8-49c0-b965-903519dac4fa?wsid=/subscriptions/d4ad7261-832d-46b2-b093-22156001df5b/resourcegroups/aml-quickstarts-143132/workspaces/quick-starts-ws-143132&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-04-19T09:42:23.261402][API][INFO]Experiment created<END>\n""<START>[2021-04-19T09:42:23.789565][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-04-19T09:42:23.947946][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-04-19T09:42:24.1334101Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_baa80ab3-bbd8-49c0-b965-903519dac4fa
Web View: https://ml.azure.com/runs/HD_baa80ab3-bbd8-49c0-b965-903519dac4fa?wsid=/subscriptions/d4ad7261-832d-46b2-b

{'runId': 'HD_baa80ab3-bbd8-49c0-b965-903519dac4fa',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-04-19T09:42:22.967539Z',
 'endTimeUtc': '2021-04-19T10:23:09.893907Z',
 'properties': {'primary_metric_config': '{"name": "AUC_weighted", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'b1221ea8-26c0-40f9-9156-90af2d6e46a6',
  'score': '0.9995505744220669',
  'best_child_run_id': 'HD_baa80ab3-bbd8-49c0-b965-903519dac4fa_4',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg143132.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_baa80ab3-bbd8-49c0-b965-903519dac4fa/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=G4QC6niJpVHG86%2BdW4tAuMNXtHRwIgdIbMHFZbrLtG8%3D&st=2021-04-19T10%3A13%3A28Z&se=2021-04-19T18%3A23%3A28Z&sp=r'},
 'submittedBy': 'ODL_User 143

## Best Model

In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [19]:
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()

# No best run is returned when no child run logged the primary metric;
# fail with a clear message instead of an AttributeError on None.
if best_run is None:
    raise RuntimeError("No child run logged the primary metric 'AUC_weighted'; "
                       "there is no best run to report.")

# Fetch details and metrics once each and reuse them for the report.
best_run_details = best_run.get_details()
best_run_metrics = best_run.get_metrics()

print(('Best model runId:{}\n\n' \
       'Best model hyperparameters:{}\n\n' \
       'Best model AUC weighted:{}').format(str(best_run_details['runId']),
                                            str(best_run_details['runDefinition']['arguments']),
                                            str(best_run_metrics['AUC_weighted'])))

Best model runId:HD_baa80ab3-bbd8-49c0-b965-903519dac4fa_4

Best model hyperparameters:['--max_depth', '16', '--max_features', '15', '--min_samples_split', '20', '--n_estimators', '100']

Best model AUC weighted:0.9995505744220669


In [20]:
# Show the names of all files the best run uploaded (logs and model outputs)
best_run_file_names = best_run.get_file_names()
print(best_run_file_names)

['azureml-logs/55_azureml-execution-tvmps_0a569660b6fc2ab87605ba46e6f188994533097b2cba449e294231d1b6424129_d.txt', 'azureml-logs/65_job_prep-tvmps_0a569660b6fc2ab87605ba46e6f188994533097b2cba449e294231d1b6424129_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_0a569660b6fc2ab87605ba46e6f188994533097b2cba449e294231d1b6424129_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/104_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


In [21]:
# Register the best run's 'outputs/model.joblib' file as a workspace model
model = best_run.register_model(
    model_name='best_hyperdrive_model',
    model_path='outputs/model.joblib',
)
# Download the registered model locally; the call's return value is the cell output
model.download(target_dir='outputs_hyperdrive', exist_ok=True)

'outputs_hyperdrive/model.joblib'