# Automated ML

Import of all the dependencies that we need to complete the project.

In [1]:
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

import azureml.core
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.model_selection import train_test_split
from azureml.train.automl import AutoMLConfig

import os
import joblib
import pandas as pd
import numpy as np

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


## Dataset

### Overview

Dataset from Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020)

Heart failure is a common event caused by Cardiovascular diseases (CVDs) and this dataset contains 12 features that can be used to predict mortality by heart failure.

01- age : Age of the patient (years)
02- anaemia : Decrease of red blood cells or hemoglobin (boolean)
03- creatinine_phosphokinase : Level of the CPK enzyme in the blood (mcg/L)
04- diabetes : If the patient has diabetes (boolean)
05- ejection_fraction : Percentage of blood leaving the heart at each contraction (percentage)
06- high_blood_pressure : If the patient has hypertension (boolean)
07- platelets : Platelets in the blood (kiloplatelets/mL)
08- serum_creatinine : Level of serum creatinine in the blood (mg/dL)
09- serum_sodium :Level of serum sodium in the blood (mEq/L)
10- sex : Woman or man (binary)
11- smoking : If the patient smokes or not (boolean)
12- time : Follow-up period (days)

### Task

An Azure Auto ML will be performed to predict if the patient deceased during the follow-up period (DEATH_EVENT : boolean), based on the 12 clinical features. 

### Connect to a workspace

In [2]:
ws = Workspace.from_config()

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: quick-starts-ws-138652
Azure region: southcentralus
Subscription id: 976ee174-3882-4721-b90a-b5fef6b72f24
Resource group: aml-quickstarts-138652


### Create an Azure ML experiment

In [3]:
# choose a name for experiment
experiment_name = 'automl_heart_failure_experiment'
project_folder = './automl-model'
experiment=Experiment(ws, experiment_name)
experiment
run = experiment.start_logging()

### Create and explore dataset

In [4]:
# Data set was downloaded as a csv file and registered as data set in the workspace
dataset=Dataset.get_by_name(ws,name="heart_failure_clinical_records_dataset")
df = dataset.to_pandas_dataframe()
df.describe()
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
currDir=os.getcwd()
print(currDir)
os.listdir(currDir)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/notebook138652/code


['.config',
 '.ipynb_checkpoints',
 'automl-model',
 'automl.ipynb',
 'automl_errors.log',
 'azureml_automl.log',
 'heart_failure_clinical_records_dataset.csv',
 'hyperdrive_model.joblib',
 'hyperparameter_tuning.ipynb',
 'outputs',
 'train.py',
 'training',
 'Users']

### Create or Attach a Compute Resource

In [6]:
# Create compute cluster
# Use vm_size = "STANDARD_D3_V2" in provisioning configuration.
# max_nodes 5.

# Choose a name for CPU cluster
cluster_name = "my-cpu-cluster"

# Check if the compute target exists
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target, use it')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D3_V2', 
                                                           max_nodes=5)
    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# get a detailed status for the current cluster
print(compute_target.get_status().serialize())

Found existing compute target, use it
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 1, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-02-12T02:41:33.717000+00:00', 'errors': None, 'creationTime': '2021-02-12T02:01:27.724229+00:00', 'modifiedTime': '2021-02-12T02:01:44.155997+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D3_V2'}


## AutoML Configuration

We didn't explicitly specified either a validation_data or n_cross_validation parameter, automated ML applies default techniques depending on the number of rows provided in the single training_data=dataset. Dataset is less than 1,000 rows, 10 folds are used.

In [7]:
# AutoML settings
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
}

# AutoML config
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="DEATH_EVENT",   
                             path = project_folder,
                             enable_early_stopping= True,
                             debug_log = "automl_errors.log",
                             **automl_settings
)

In [8]:
# Submit the experiment
remote_run = experiment.submit(automl_config)

Running on remote.


## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

Using the `RunDetails` widget to show the different experiments.

In [9]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl_heart_failure_experiment,AutoML_415e87b7-c932-44e2-9303-71d0e67f8df7,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


In [10]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Beginning model selection.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
              
DETAILS:      
+---------------------------------+
|Number of folds                  |
|10                               |
+---------------------------------+

****************************************************************************************************

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

*************

{'runId': 'AutoML_415e87b7-c932-44e2-9303-71d0e67f8df7',
 'target': 'my-cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-12T02:42:56.837842Z',
 'endTimeUtc': '2021-02-12T03:05:21.070692Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'my-cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"45e12751-b944-494f-9a2d-1bfb2673fb6e\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/02-12-2021_023204_UTC/heart_failure_clinical_records_dataset.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-138652\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"9

## Best Model

Getting the best model from the automl experiments and displaying all the properties of the model.

In [11]:
# Retrieve and save the best automl model.
automl_best_run, fitted_automl_best_model = remote_run.get_output()
best_run_metrics = automl_best_run.get_metrics()


Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


In [13]:
print('=========================== Best Run ID ===========================')
print(automl_best_run.id)
print('=========================== Best Run ===========================')
print(automl_best_run)
print('=========================== Best Model ===========================')
print(fitted_automl_best_model)
print('=========================== Best Run File Names ===========================')
print(automl_best_run.get_file_names())
print('=========================== Best Run Metrics ===========================')
print(best_run_metrics)

AutoML_415e87b7-c932-44e2-9303-71d0e67f8df7_43
Run(Experiment: automl_heart_failure_experiment,
Id: AutoML_415e87b7-c932-44e2-9303-71d0e67f8df7_43,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                objective=None,
                                                                                        

In [36]:
# Metric in best run

for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
    
print('\nAccuracy of Best Run',best_run_metrics['accuracy'],sep='\n')
print(automl_best_run)

average_precision_score_micro 0.9252470516459855
AUC_weighted 0.9136457500800752
precision_score_macro 0.8915326605761388
norm_macro_recall 0.6951883950653974
precision_score_micro 0.8796551724137931
f1_score_weighted 0.8745402351846987
AUC_micro 0.922386180472982
AUC_macro 0.9136457500800752
matthews_correlation 0.7351420424664441
accuracy 0.8796551724137931
average_precision_score_macro 0.9029492426074833
recall_score_macro 0.8475941975326986
f1_score_micro 0.8796551724137931
recall_score_micro 0.8796551724137931
precision_score_weighted 0.8939134261540558
recall_score_weighted 0.8796551724137931
f1_score_macro 0.8544847640857043
log_loss 0.37523434416524704
average_precision_score_weighted 0.9271023106244887
weighted_accuracy 0.8997600271088567
balanced_accuracy 0.8475941975326986
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_415e87b7-c932-44e2-9303-71d0e67f8df7_43/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_415e87b7-c932-44e2-9303-71d0e67f

In [37]:
print('=========================== Best Run Properties ===========================')
model_name = automl_best_run.properties['model_name']
model_name



'AutoML415e87b7c43'

In [38]:
#TODO: Save the best model

os.makedirs('./outputs', exist_ok=True)
joblib.dump(fitted_automl_best_model, filename='outputs/automl_best_model.joblib')

['outputs/automl_best_model.joblib']

## Model Deployment

We have to deploy only one of the two models we trained.

In the cell below, register the model, create an inference config and deploy the model as a web service.

In [39]:
#Register the model

description = 'AutoML Model trained on heart failure dataset to predict if the patient deceased during the follow-up period'
tags = None
model = remote_run.register_model(model_name=model_name, description=description, tags=tags)
print(model.name, model.id, model.version, sep='\t')

AutoML415e87b7c43	AutoML415e87b7c43:4	4


In [40]:
#create an inference config and deploy the model as a web service on Azure Container Instance

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment


environment = Environment.get(ws, "AzureML-AutoML")

inference_config = InferenceConfig(entry_script='score.py',
                                   environment=environment)
service_name = 'heart-failure-automl-deploy'
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True
                      )
service.wait_for_deployment(show_output=True)

scoring_uri = service.scoring_uri
print(scoring_uri)

WebserviceException: WebserviceException:
	Message: Conflict of operation, another operation on same entity is already running in workspace quick-starts-ws-138652.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Conflict of operation, another operation on same entity is already running in workspace quick-starts-ws-138652."
    }
}

In [31]:
service.update(enable_app_insights=True)

Sending a request to the web service we deployed to test it.

In [32]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    # bypass the server certificate verification on client side
    if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

data = {
    "data":
    [
        {
            'age': "75",
            'anaemia': "0",
            'creatinine_phosphokinase': "582",
            'diabetes': "0",
            'ejection_fraction': "20",
            'high_blood_pressure': "1",
            'platelets': "265000",
            'serum_creatinine': "1.9",
            'serum_sodium': "130",
            'sex': "1",
            'smoking': "0",
            'time': "4",
        },
        {
            'age': "42",
            'anaemia': "0",
            'creatinine_phosphokinase': "320",
            'diabetes': "0",
            'ejection_fraction': "31",
            'high_blood_pressure': "0",
            'platelets': "221000",
            'serum_creatinine': "1",
            'serum_sodium': "130",
            'sex': "1",
            'smoking': "0",
            'time': "144",
        },
    ],
}

body = str.encode(json.dumps(data))

url = 'http://d31eb118-db76-45d2-a8bf-6054fa981909.southcentralus.azurecontainer.io/score'
api_key = '' # Replace this with the API key for the web service
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)

    result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(json.loads(error.read().decode("utf8", 'ignore')))

URLError: <urlopen error [Errno -2] Name or service not known>

Print the logs of the web service and delete the service

In [28]:
service.get_logs()

'2021-02-12T03:39:06,077507700+00:00 - rsyslog/run \n2021-02-12T03:39:06,086125200+00:00 - gunicorn/run \n2021-02-12T03:39:06,080827200+00:00 - iot-server/run \n2021-02-12T03:39:06,179104200+00:00 - nginx/run \n/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)

In [29]:
service.delete()