# Automated ML

In [1]:
import logging
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

from azureml.automl.core.shared import constants
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.utilities import get_primary_metrics
from azureml.widgets import RunDetails

## Dataset

### Overview

The dataset used for this project is the [Heart Failure Prediction](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data) dataset taken from Kaggle.

This dataset contains 12 features that can be used to predict mortality by heart failure:

- age: Age of the patient
- anaemia: Decrease of red blood cells or hemoglobin
- creatinine_phosphokinase: Level of the CPK enzyme in the blood (mcg/L)
- diabetes: If the patient has diabetes
- ejection_fraction: Percentage of blood leaving the heart at each contraction
- high_blood_pressure: If the patient has hypertension
- platelets: Platelets in the blood (kiloplatelets/mL)
- serum_creatinine: Level of serum creatinine in the blood (mg/dL)
- serum_sodium: Level of serum sodium in the blood (mEq/L)
- sex: Woman or man
- smoking: If the patient smokes or not
- time: Follow-up period (days)

The target column is `DEATH_EVENT`, which indicates whether the patient died during the follow-up period. The task in this project is to predict whether or not a death event occurs.

In [2]:
## Creating a new Experiment
# Connect to the workspace described by the local config file and open the experiment.
ws = Workspace.from_config()
experiment_name = 'heart-failure-automl'
experiment = Experiment(ws, experiment_name)

# Echo the workspace coordinates, one entry per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive run on the experiment.
# NOTE(review): `run` is not used or completed in any visible cell — confirm it is still needed.
run = experiment.start_logging()

Workspace name: quick-starts-ws-138271
Azure region: southcentralus
Subscription id: cdbe0b43-92a0-4715-838a-f2648cc7ad21
Resource group: aml-quickstarts-138271


In [3]:
# List every compute target in the workspace together with its type and provisioning state.
compute_targets = ws.compute_targets
for target_name, target in compute_targets.items():
    print(target_name, target.type, target.provisioning_state)

notebook138271 ComputeInstance Succeeded
aml-compute AmlCompute Succeeded


In [4]:
# Name of the compute cluster used for training.
compute_cluster_name = "cpu-compute"

# Reuse the cluster if the lookup succeeds; otherwise (ComputeTargetException)
# provision a new one with up to 4 STANDARD_D2_V2 nodes.
try:
    compute_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    print("Creating new cluster...")
    compute_cluster = ComputeTarget.create(
        ws,
        compute_cluster_name,
        AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    )
else:
    print("Found existing cluster, use it...")

# Wait for provisioning to finish, streaming progress to the cell output.
compute_cluster.wait_for_completion(show_output=True)

Creating new cluster...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
# Get data
# Build a TabularDataset directly from the CSV hosted on GitHub.
# TabularDatasetFactory is already imported in the notebook's import cell,
# so it is not re-imported here.
path_to_data = "https://raw.githubusercontent.com/neha7598/azure-ml-capstone/main/data/heart_failure_clinical_records_dataset.csv"
data = TabularDatasetFactory.from_delimited_files(path=path_to_data)

In [6]:
# Materialise the dataset under a new name: rebinding `data` to the DataFrame
# would make this cell fail on a second run (a DataFrame has no
# to_pandas_dataframe method).
heart_df = data.to_pandas_dataframe()

# Separate the features from the DEATH_EVENT target column.
x = heart_df.drop('DEATH_EVENT', axis=1)
y = heart_df['DEATH_EVENT']

# Split into train and test sets (80/20); the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Re-attach the target column so each split is a single table.
train_df = pd.concat([x_train, y_train], axis=1)
test_df = pd.concat([x_test, y_test], axis=1)

train_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
120,60.0,1,737,0,60,1,210000.00,1.50,135,1,1,95,0
109,45.0,0,292,1,35,0,850000.00,1.30,142,1,1,88,0
143,61.0,1,84,0,40,1,229000.00,0.90,141,0,0,110,0
63,45.0,0,582,0,35,0,385000.00,1.00,145,1,0,61,1
155,60.0,1,231,1,25,0,194000.00,1.70,140,1,0,120,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,65.0,0,94,1,50,1,188000.00,1.00,140,1,0,29,1
65,60.0,0,68,0,20,0,119000.00,2.90,127,1,1,64,1
135,75.0,0,582,0,40,0,263358.03,1.18,137,1,0,107,0
247,64.0,0,143,0,25,0,246000.00,2.40,135,1,0,214,0


In [7]:
# Save the train/test DataFrames as CSV files and upload the folder to the default datastore.
# `os` comes from the notebook's import cell; makedirs(exist_ok=True) is a no-op
# when the folder is already there.
os.makedirs('data', exist_ok=True)
train_df.to_csv("data/train_data.csv", index=False)
test_df.to_csv("data/test_data.csv", index=False)

# Upload everything under ./data to the 'heart-failure' path, replacing earlier copies.
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='heart-failure', overwrite=True, show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 1 files out of an estimated total of 2
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_0e32e7b53a8945c3a4fa96ca64b39a3c

In [8]:
# Point at the uploaded training CSV in the datastore and load it as a TabularDataset.
train_csv_reference = ds.path('heart-failure/train_data.csv')
train_data = Dataset.Tabular.from_delimited_files(path=train_csv_reference)

In [9]:
# Show the primary metrics available for a "classification" task
# (one of these is chosen as `primary_metric` in the AutoML settings).
get_primary_metrics("classification")

['average_precision_score_weighted',
 'norm_macro_recall',
 'accuracy',
 'AUC_weighted',
 'precision_score_weighted']

## AutoML Configuration
The AutoML Config class is a way of leveraging the AutoML SDK to automate machine learning. 

The AutoML settings are a dictionary that specifies the parameters controlling the experiment, such as `experiment_timeout_minutes`, whether or not to enable early stopping, the number of cross-validations, the primary metric, etc.

In the `AutoMLConfig`, we specify the task, the training data, the target column name, and the compute target, and we pass in the AutoML settings dictionary.

In [10]:
# AutoML settings: forwarded to AutoMLConfig as keyword arguments.
# `logging` comes from the notebook's import cell.
automl_settings = {
    "enable_early_stopping": True,
    "experiment_timeout_minutes": 30,
    "n_cross_validations": 4,
    "featurization": 'auto',
    "primary_metric": 'accuracy',
    "verbosity": logging.INFO,
}

# AutoML config: classification on the DEATH_EVENT column of the uploaded
# training data, executed on the compute cluster.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    training_data=train_data,
    label_column_name='DEATH_EVENT',
    compute_target=compute_cluster,
    **automl_settings
)

In [11]:
# Submit the AutoML configuration to the experiment and keep the returned run
# handle in `remote_run` for the monitoring and retrieval cells below.
remote_run = experiment.submit(automl_config)

Running on remote.


## Run Details

In [12]:
# Display the interactive widget that tracks the AutoML run.
# RunDetails is already imported in the notebook's import cell, so it is not re-imported here.
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [13]:
# Wait for the remote AutoML run to complete; the returned run details
# are displayed as the cell output.
remote_run.wait_for_completion()

{'runId': 'AutoML_8460f68a-b31a-4570-b383-18f00fbb3873',
 'target': 'cpu-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-09T16:40:56.037052Z',
 'endTimeUtc': '2021-02-09T17:23:14.503627Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'cpu-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"ab475806-eb9e-4e28-b3f9-0d0ce12d3aeb\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"heart-failure/train_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-138271\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"cdbe0b43-92a0-4715-838a-f2648cc7ad21\\\\\\", \\

## Best Model

In [14]:
# Retrieve the best run together with its fitted model, then print every metric it logged.
best_run, fitted_model = remote_run.get_output()

best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


AUC_macro 0.92239835700362
f1_score_micro 0.8869350282485876
f1_score_weighted 0.8837461572203347
AUC_micro 0.9328057271218361
precision_score_macro 0.887601530503087
average_precision_score_weighted 0.930592954643481
accuracy 0.8869350282485876
average_precision_score_micro 0.9347014518807041
log_loss 0.3505280354352506
recall_score_micro 0.8869350282485876
precision_score_micro 0.8869350282485876
recall_score_weighted 0.8869350282485876
AUC_weighted 0.9223983570036202
matthews_correlation 0.7384355460607587
balanced_accuracy 0.8525323725981622
recall_score_macro 0.8525323725981622
f1_score_macro 0.8649791170651248
weighted_accuracy 0.913339964633068
norm_macro_recall 0.7050647451963242
precision_score_weighted 0.8878058876823796
average_precision_score_macro 0.9148365559723477
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_8460f68a-b31a-4570-b383-18f00fbb3873_24/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_8460f68a-b31a-4570-b383-18f00fbb387

In [None]:
# Print the best run, then the fitted model with the parameters that were selected.
for described in (best_run, fitted_model):
    print(described)

In [15]:
# Save the best model
# Register the best run's ./outputs/ path as a model in the workspace.
best_run.register_model(model_name='automl_best_model.pkl', model_path='./outputs/')

# joblib.dump does not create missing directories, so make sure the local
# target folder exists before writing (`os` comes from the import cell).
os.makedirs('outputs', exist_ok=True)
joblib.dump(fitted_model, filename="outputs/automl_model.pkl")

Model(workspace=Workspace.create(name='quick-starts-ws-138271', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-138271'), name=automl_best_model.pkl, id=automl_best_model.pkl:1, version=1, tags={}, properties={})

## Model Deployment

In [18]:
# Fetch the scoring script and the conda environment file from the best run.
# Each pair is (file name in the run, local destination).
run_files_to_fetch = (
    ('outputs/scoring_file_v_1_0_0.py', 'inference/score.py'),
    (constants.CONDA_ENV_FILE_PATH, 'automl_env.yml'),
)
for remote_name, local_path in run_files_to_fetch:
    best_run.download_file(remote_name, local_path)

In [19]:
# Register the model under the name recorded in the best run's properties.
model_name = best_run.properties['model_name']
description = 'AutoML Model trained on heart failure data to predict if death event occurs or not'
tags = None

registration_kwargs = {
    'model_name': model_name,
    'description': description,
    'tags': tags,
}
model = remote_run.register_model(**registration_kwargs)

# This will be written to the script file later in the notebook.
print(remote_run.model_id)

AutoML8460f68ab24


In [20]:
# Create inference config
# Path of the scoring script downloaded from the best run. It was previously
# referenced without ever being defined, which raised a NameError.
script_file_name = 'inference/score.py'
# NOTE(review): no environment is passed here although automl_env.yml is downloaded
# earlier in the notebook — confirm the default environment is sufficient.
inference_config = InferenceConfig(entry_script=script_file_name)

# Azure Container Instance sizing and metadata for the web service.
aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 1, 
                                               tags = {'area': "hfData", 'type': "automl_classification"}, 
                                               description = 'Heart Failure Prediction')

# Deploy the registered model as an ACI web service, wait for the deployment
# to finish (with output), and report the resulting service state.
aci_service_name = 'automl-heart-failure'
print(aci_service_name)
aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

automl-heart-failure
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [26]:
# Enable Application Insights on the deployed ACI web service by updating it in place.
aci_service.update(enable_app_insights=True)

In [27]:
# Report the service state followed by its Swagger and scoring endpoints.
# Each pair is (printed label, attribute of aci_service to read).
service_report = (
    ("State ", "state"),
    ("Swagger URI ", "swagger_uri"),
    ("Scoring URI ", "scoring_uri"),
)
for label, attribute in service_report:
    print(label + getattr(aci_service, attribute))

State Healthy
Swagger URI http://7a93d622-7028-4861-8b9a-7f3dfb46df1c.southcentralus.azurecontainer.io/swagger.json
Scoring URI http://7a93d622-7028-4861-8b9a-7f3dfb46df1c.southcentralus.azurecontainer.io/score


In the cell below, a test request is sent to the deployed web service.

In [29]:
import requests
import json

# Endpoint of the deployed web service; it should look similar to:
# 'http://8530a665-66f3-49c8-a953-b82a2d312917.eastus.azurecontainer.io/score'
scoring_uri = aci_service.scoring_uri
# (If the service is authenticated, the key or token would be set here.)

# Two patient records to score, so two results come back.
first_patient = {
    "age": 70.0,
    "anaemia": 1,
    "creatinine_phosphokinase": 4020,
    "diabetes": 1,
    "ejection_fraction": 32,
    "high_blood_pressure": 1,
    "platelets": 234558.23,
    "serum_creatinine": 1.4,
    "serum_sodium": 125,
    "sex": 0,
    "smoking": 1,
    "time": 12,
}
second_patient = {
    "age": 75.0,
    "anaemia": 0,
    "creatinine_phosphokinase": 4221,
    "diabetes": 0,
    "ejection_fraction": 22,
    "high_blood_pressure": 0,
    "platelets": 404567.23,
    "serum_creatinine": 1.1,
    "serum_sodium": 115,
    "sex": 1,
    "smoking": 0,
    "time": 7,
}
data = {"data": [first_patient, second_patient]}

# Serialise the payload to a JSON string and keep a copy on disk as data.json.
input_data = json.dumps(data)
with open("data.json", "w") as payload_file:
    payload_file.write(input_data)

# Declare the JSON content type.
# (If authentication is enabled, the authorization header would be added here.)
headers = {'Content-Type': 'application/json'}

# POST the payload to the scoring endpoint and display the decoded response.
resp = requests.post(scoring_uri, data=input_data, headers=headers)
print(resp.json())

{"result": [1, 1]}


In [30]:
# Fetch the logs of the deployed service; as the last expression in the cell,
# the returned log text is shown as the cell output.
aci_service.get_logs()

'2021-02-09T17:42:45,189026127+00:00 - iot-server/run \n2021-02-09T17:42:45,190888064+00:00 - rsyslog/run \n2021-02-09T17:42:45,192040387+00:00 - gunicorn/run \nrsyslogd: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libuuid.so.1: no version information available (required by rsyslogd)\n/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6

In [None]:
# Delete the deployed web service once testing is finished.
# NOTE(review): the registered models and the compute cluster are not removed by any visible cell — confirm whether they should be cleaned up too.
aci_service.delete()