# Azure ML Hyperdrive avec Scikit-Learn

<img src='https://github.com/retkowsky/images/blob/master/AzureMLservicebanniere.png?raw=true'>

**Efficiently tune hyperparameters** for your model using Azure Machine Learning.<br>
**Hyperparameter tuning** includes the following steps:
<br>
- Define the parameter search space<br>
- Specify a primary metric to optimize<br>
- Specify early termination criteria for poorly performing runs<br>
- Allocate resources for hyperparameter tuning<br>
- Launch an experiment with the above configuration<br>
- Visualize the training runs<br>
- Select the best performing configuration for your model<br>

Documentation Hyperdrive avec Azure ML :
https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

## 1. Introduction

In [None]:
# Show the version string of the Python interpreter running this notebook.
import sys
sys.version

In [None]:
# Print the current date and time (timestamp of the notebook execution).
import datetime
now = datetime.datetime.now()
print(now)

In [None]:
# Print the version of the installed azureml.core package.
import azureml.core
print("Version Azure ML service : ", azureml.core.VERSION)

In [None]:
from azureml.core import Workspace

# Load the workspace from the saved config file and print its name.
# `ws` is the workspace handle used by the cells below.
ws = Workspace.from_config()
print('Workspace Azure ML :', ws.name)

In [None]:
from azureml.core import ComputeTarget, Datastore, Dataset

# Inventory of the workspace: compute targets, datastores and datasets.

# Read `ws.compute_targets` once and iterate over its items, instead of reading
# the property again for every entry inside the loop.
print("Compute Targets:")
for compute_name, compute in ws.compute_targets.items():
    print("\t", compute.name, ':', compute.type)

print("Datastores:")
for datastore_name in ws.datastores:
    datastore = Datastore.get(ws, datastore_name)
    print("\t", datastore.name, ':', datastore.datastore_type)

# Iterate the dataset names directly; no intermediate list copy is needed.
print("Datasets:")
for dataset_name in ws.datasets:
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name)

## 2. Hyperdrive pour trouver la meilleure combinaison

The remote compute used in this notebook is a multi-node cluster (it can scale up to 10 nodes), and you can take advantage of this to execute multiple experiment runs in parallel. One key reason to do this is to try training a model with a range of different hyperparameter values.

Azure ML includes a feature called *hyperdrive* that enables you to try different values for one or more hyperparameters (using random, grid or Bayesian sampling), and find the best performing trained model based on a metric that you specify - such as *Accuracy* or *Area Under the Curve (AUC)*.

> **More Information**: For more information about Hyperdrive, see the [Azure ML documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters).

Let's run a Hyperdrive experiment on the remote compute you have provisioned. First, we'll create the experiment and its associated folder.

In [None]:
import os
from azureml.core import Experiment

In [None]:
# Name of the experiment
hyperdrive_experiment_name = 'Exemple14-Scikit-Learn-HyperDrive'

In [None]:
# Get a handle on the experiment in the workspace.
hyperdrive_experiment = Experiment(
    workspace=ws,
    name=hyperdrive_experiment_name,
)

# Local folder named after the experiment; the training script is written there.
hyperdrive_experiment_folder = f'./{hyperdrive_experiment_name}'
os.makedirs(hyperdrive_experiment_folder, exist_ok=True)

print("Expérimentation :", hyperdrive_experiment.name)

In [None]:
%%writefile $hyperdrive_experiment_folder/diabetes_training.py

"""Train a logistic regression classifier on the diabetes dataset within an Azure ML run.

The regularization rate is read from the ``--regularization`` command-line
argument (default 0.01). The script logs the regularization rate, the accuracy
and the AUC to the run, logs a ROC curve image, and saves the fitted model to
``outputs/diabetes.pkl``.
"""
import argparse
import os  # needed for os.makedirs below
import joblib
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Hyperparameter supplied on the command line
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Context of the current run, used for logging metrics and reading input datasets
run = Run.get_context()

print("Chargement des données...")
diabetes = run.input_datasets['diabetes'].to_pandas_dataframe()

# Feature matrix and label vector
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
                   'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Train/test split (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Model fitting. The builtin float() replaces np.float, an alias that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
print('Régression logistique avec taux de régularisation', reg)
run.log('Taux de régularisation', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy =', acc)
run.log('Accuracy', float(acc))

# AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC =' + str(auc))
run.log('AUC', float(auc))

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
fig = plt.figure(figsize=(6, 4))

plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Courbe de ROC')
run.log_image(name="ROC", plot=fig)
plt.show()

# Save the fitted model under outputs/
os.makedirs('outputs', exist_ok=True)

joblib.dump(value=model, filename='outputs/diabetes.pkl')

run.complete()

In [None]:
!ls Exemple14-Scikit-Learn-HyperDrive/diabetes_training.py -l

In [None]:
# Display the generated training script.
# The path is built from the experiment folder rather than hard-coded, and the
# file is read as UTF-8 because the script contains accented characters.
script_path = os.path.join(hyperdrive_experiment_folder, 'diabetes_training.py')
with open(script_path, 'r', encoding='utf-8') as f:
    print(f.read())

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-standardd4"

# Reuse the compute target with that name when the lookup succeeds;
# otherwise provision a new cluster.
try:
    compute1 = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed: create a cluster with min_nodes=1 and max_nodes=10.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D4',
        min_nodes=1,
        max_nodes=10,
    )
    compute1 = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the compute target operation to complete, showing progress output.
compute1.wait_for_completion(show_output=True)

In [None]:
# List the compute targets of the workspace.
# The loop variable has its own name so that it does not overwrite `liste`
# (the collection being iterated) on every iteration.
liste = ws.compute_targets
for target_name in liste:
    print("Ressources compute du workspace :", target_name)

In [None]:
# Tags attached to the run when it is submitted
tagsdurun = {"Type": "test" , "Langage" : "Python" , "Framework" : "Scikit-Learn" , "Hyperdrive" : "Gridsearch"}

In [None]:
from azureml.train.hyperdrive import GridParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

# Grid search over the hyperparameter space
params = GridParameterSampling(
    {
        # Values of the regularization parameter to test
        '--regularization': choice(0.0005, 0.005, 0.01, 0.1)
    }
)

# Policy Bandit is an early termination policy based on slack factor/slack amount and evaluation interval.
# The policy early terminates any runs where the primary metric is not within the specified slack factor/slack amount
# with respect to the best performing training run.

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Input data
diabetes_ds = ws.datasets.get("diabetes dataset")
if diabetes_ds is None:
    # Fail early with an explicit message instead of an AttributeError
    # on `None.as_named_input` below.
    raise ValueError("Dataset 'diabetes dataset' was not found in the workspace.")

# Estimator definition
hyper_estimator = SKLearn(source_directory=hyperdrive_experiment_folder,
                          inputs=[diabetes_ds.as_named_input('diabetes')],  # input data
                          compute_target=compute1,  # compute cluster
                          conda_packages=['pandas', 'ipykernel', 'matplotlib'],  # dependencies
                          pip_packages=['azureml-sdk', 'argparse', 'pyarrow'],
                          entry_script='diabetes_training.py')  # Python training script

# HyperDrive configuration
hyperdrive = HyperDriveConfig(estimator=hyper_estimator,
                              hyperparameter_sampling=params,  # hyperparameter space
                              policy=policy,  # early termination policy
                              primary_metric_name='Accuracy',  # metric logged by the training script
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,  # optimisation goal
                              max_total_runs=10,
                              max_concurrent_runs=8)


Documentation BanditPolicy :
https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.banditpolicy?view=azure-ml-py
        

In [None]:
# Submit the HyperDrive configuration to the experiment with the run tags,
# then display the run details widget.
hyperdrive_run = hyperdrive_experiment.submit(config=hyperdrive, tags=tagsdurun)
RunDetails(hyperdrive_run).show()

> Temps de traitement : autour de **10 minutes**

In [None]:
# Progress of the run: display the details of the HyperDrive run
hyperdrive_run.get_details()

### On récupère le best run :

In [None]:
# Make sure the HyperDrive run has finished before asking for its best child run:
# the run was only submitted above, so child runs may still be in progress.
hyperdrive_run.wait_for_completion(show_output=False)

best_hyperdrive_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_hyperdrive_run is None:
    # Fail with an explicit message rather than an AttributeError on None below.
    raise RuntimeError("No best run available: no child run reported the primary metric.")

# Metrics logged by the best run and the script arguments it was launched with
best_hyperdrive_run_metrics = best_hyperdrive_run.get_metrics()
hyperdrive_parameter_values = best_hyperdrive_run.get_details()['runDefinition']['arguments']

In [None]:
# Summary of the best run: its id, the script arguments it used and its metrics.
# end='\n\n' emits the blank separator line after each entry.
print("Résultats du best run de l'hyperparameter Tuning :", end='\n\n')
print('Best Run ID =', best_hyperdrive_run.id, end='\n\n')
print('Regularization Rate optimal =', hyperdrive_parameter_values, end='\n\n')

print('Métriques :')
print(' - AUC =', best_hyperdrive_run_metrics['AUC'])
print(' - Accuracy =', best_hyperdrive_run_metrics['Accuracy'])

### On référence le meilleur modèle :

In [None]:
from azureml.core import Model

# Register the model file saved by the best run, tagged with its training
# context and annotated with the AUC and accuracy of that run.
model_tags = {'Training context': 'Hyperdrive'}
model_properties = {
    'AUC': best_hyperdrive_run_metrics['AUC'],
    'Accuracy': best_hyperdrive_run_metrics['Accuracy'],
}

best_hyperdrive_run.register_model(
    model_path='outputs/diabetes.pkl',
    model_name='Diabetes',
    tags=model_tags,
    properties=model_properties,
)

> Le modèle est disponible dans la section **Models** d'Azure ML Studio

### Liste des modèles référencés dans le workspace Azure ML

In [None]:
# List the models registered in the workspace with their tags and properties.
def _print_entries(entries):
    """Print each key of *entries* with its value, one tab-indented line per key."""
    for key in entries:
        print ('\t', key, ':', entries[key])


for model in Model.list(ws):
    print(model.name, '- version =', model.version)
    _print_entries(model.tags)
    _print_entries(model.properties)
    print('\n')

### Suppression du compute cluster

In [None]:
# Show each compute target of the workspace with its type and provisioning state.
compute_targets = ws.compute_targets
for target_name in compute_targets:
    target = compute_targets[target_name]
    print(target_name, "(" , target.type, ") :", target.provisioning_state)

In [None]:
# Delete the cluster (left commented out; uncomment to actually delete it)
#compute1.delete()

In [None]:
# Show each compute target of the workspace with its type and provisioning state.
compute_targets = ws.compute_targets
for target_name in compute_targets:
    target = compute_targets[target_name]
    print(target_name, "(" , target.type, ") :", target.provisioning_state)

<img src="https://github.com/retkowsky/images/blob/master/Powered-by-MS-Azure-logo-v2.png?raw=true" height="300" width="300">