# Automated ML

## Import libraries for Azure Machine Learning SDK

In [None]:
import os
import csv
import joblib
import logging
import sklearn
import pkg_resources

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn import datasets

import azureml.core
from azureml.core import Workspace, Experiment, Model

from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.webservice import LocalWebservice
from azureml.core.conda_dependencies import CondaDependencies

from azureml.train.automl import AutoMLConfig

from azureml.pipeline.steps import AutoMLStep
from azureml.contrib.pipeline.steps import ParallelRunStep
from azureml.contrib.pipeline.steps import ParallelRunConfig

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

## Initialize Workspace
Initialize a workspace object from persisted configuration. Make sure the config file is present at .\config.json

In [None]:
ws = Workspace.from_config()

print('Workspace name:\t'  + ws.name,
      'Resource group:\t'  + ws.resource_group,
      'Azure region:\t'    + ws.location,
      'Subscription id:\t' + ws.subscription_id, sep='\n')

## Create an Azure ML experiment

In [None]:
# Name for experiment
experiment_name = 'automl-heart-failure-experiment'

experiment=Experiment(ws, experiment_name)
run = experiment.start_logging()

experiment

### Create or Attach an AmlCompute Target
We will need to create a compute target for our AutoML run. We will use ***vm_size = Standard_DS3_v2*** in our provisioning configuration and select ***max_nodes*** to be no greater than 4.

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name for the CPU cluster
amlcompute_cluster_name = "automl-cpu-compute-cluster"

# Verify that cluster does not exist already
try:
    amlcompute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    amlcompute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2', max_nodes=4)
    amlcompute_target = ComputeTarget.create(ws, amlcompute_cluster_name, amlcompute_config)

amlcompute_target.wait_for_completion(show_output=True)

In [None]:
compute_targets = ws.compute_targets

for i, key in enumerate(compute_targets):
    print(f"{i+1}. Compute target\n\tname: {compute_targets[key].name}\n\tType: {compute_targets[key].type}")

In [None]:
# For a more detailed view of current AmlCompute status, use get_status().
print(amlcompute_target.get_status().serialize())

## Dataset

### Overview
The dataset used for this project is the ***Heart Failure Clinical Records*** dataset, which can be found [here](https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records) in the UCI Machine Learning Repository. 

This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.

The task we are concerned with is to predict whether the patient died during the follow-up period. We will target the DEATH_EVENT column and since it is a boolean variable, the task is binary classification.

In [None]:
from azureml.core.dataset import Dataset

# Try to load the dataset from the Workspace. Otherwise, create it from the file
description_text = "Health Failure dataset from UCI ML-Repository for mortality prediction for the Capstone Project."
key = "HealthFailure Dataset"      # the key to match the dataset name

dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv"

if key in ws.datasets.keys():
    dataset = ws.datasets[key]
    print("The Dataset was found!")
else:
    dataset = Dataset.Tabular.from_delimited_files(dataset_url) # Create AML Dataset and register it into Workspace
    dataset = dataset.register(workspace=ws, name=key, description=description_text) # Register Dataset in Workspace

df = dataset.to_pandas_dataframe()

In [None]:
df.describe()

In [None]:
df.head()

### Prepare the datasets for the Automation

In [None]:
from sklearn.model_selection import train_test_split
from azureml.data.dataset_factory import TabularDatasetFactory

# Split the dataset into training and testing datasets
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True)

# Save training data to csv file
train_df.to_csv("./data/train_data.csv", index=False)

# Read saved training data and create a dataset in Azure ML
data_store = ws.get_default_datastore()
data_store.upload(src_dir="./data", target_path="training_data")
train_ds = TabularDatasetFactory.from_delimited_files(path=[(data_store, 'training_data/train_data.csv')])

### Review the Training Dataset Result

In [None]:
train_ds.take(5).to_pandas_dataframe()

## AutoML Configuration

- ***experiment_timeout_minutes = 20***: Specifies how long (in minutes) our experiment should run. In previous projects we could not set more than 30 minutes. We could use more in this project, but it's not needed for such a small training set. To reduce the time taken to train, experiment_timeout_minutes of 20 was chosen.

- ***max_concurrent_iterations = 4***: The maximum number of iterations that could be run in parallel. It is recommended to create a dedicated cluster per experiment and adjust the number of max_concurrent_iterations of your experiment to the number of nodes in the cluster. In this way you use all nodes of the cluster at the same time with the desired number of concurrent child runs/iterations. So I set the value to 4.

- ***primary_metric = 'accuracy'***: The metric that is optimized by automated machine learning for model selection. We have set the "accuracy"/"AUC_weighted".

- ***compute_target = amlcompute_target*** : The compute target with specific vm_size and max_nodes used to run the experiment. The local compute was chosen as this may be slower but generally provides better results.

- ***task = 'classification'*** : We have a classification task to do, we want to predict whether the person will have heart failure or not. In other words, we're trying to predict the DEATH_EVENT.

- ***training_data = train_ds*** : The data (80% of the total dataset) on which used in the experiment to train the algorithm.

- ***label_column_name = "DEATH_EVENT"*** : The target variable to predict.

- ***path = project_folder*** : The full path to the Azure ML folder of the project './capstone-project'.

- ***enable_early_stopping = True*** : Early stopping is enabled so if a run is not performing well, it can stop early, again to save time and if not performing well continuing seems uncessary.

- ***featurization = 'auto'*** : indicator of whether the featurization step should be performed automatically or not, or whether a custom featurization should be used. I used "Auto" so the featurization step should be automatic.

- ***debug_log = "automl_errors.log"*** : The debug information are written to the automl_errors.log.

- ***enable_onnx_compatible_models = False*** : Whether to enable or disable enforcing the ONNX-compatible models.

- ***blocked_models = ['XGBoostClassifier']*** : What algorithm we want from AutoML to not run. I selected XGBoostClassifier, the answer could be found in the forum (Link). For those who don't have access, I have to say that it is for compatibility issues. So the lack of time to make the  XGBoostClassifier to run make me to enforce the AutoML to not run this specific algorithm.


In [None]:
import xgboost

project_folder = './capstone-project'

# Define automl settings
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'AUC_weighted'
}

# Define automl configuration settings
automl_config = AutoMLConfig(compute_target = amlcompute_target,
                             task = "classification",
                             training_data = train_ds,
                             label_column_name = "DEATH_EVENT",   
                             path = project_folder,
                             enable_early_stopping = True,
                             featurization = 'auto',
                             debug_log = "automl_errors.log",
                             enable_onnx_compatible_models = False,    # --> Addition
                             blocked_models = ['XGBoostClassifier'], # --> Addition
                             **automl_settings
                            )

In [None]:
# Submit the experiment to the compute target 
remote_run = experiment.submit(automl_config, show_output=True)

In [None]:
remote_run.wait_for_completion(show_output=True)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

Use the `RunDetails` widget to show the different experiments.

In [None]:
from azureml.widgets import RunDetails

RunDetails(remote_run).show()
for children_run in remote_run.get_children():
    print('-----------------------------------')
    print(children_run)

## Best Model

Get the best model from the automl experiments and display all the properties of the model.

In [None]:
# get the best model
best_run, best_model = remote_run.get_output()

In [None]:
best_model

In [None]:
from pprint import pprint

# parameter details of the best model
def print_model(model, prefix=""):
    for step in model.steps:
        print(prefix + step[0])
        if hasattr(step[1], 'estimators') and hasattr(step[1], 'weights'):
            pprint({'estimators': list(
                e[0] for e in step[1].estimators), 'weights': step[1].weights})
            print()
            for estimator in step[1].estimators:
                print_model(estimator[1], estimator[0] + ' - ')
        else:
            pprint(step[1].get_params())
            print()

print_model(best_model)

In [None]:
best_run

In [None]:
best_run.get_tags()

In [None]:
best_run.get_metrics()

In [None]:
for primary_metric in best_run.get_metrics():
    metric=best_run_metrics[primary_metric]
    print(primary_metric,metric)

In [None]:
best_run.get_metrics(name='AUC_weighted')

In [None]:
best_run.get_details()

In [None]:
best_run.get_properties()

### Test the best model

In [None]:
# Split into x and y tests
y_test = test_df['DEATH_EVENT']
x_test = test_df.drop(['DEATH_EVENT'],axis=1)

In [None]:
from sklearn.metrics import confusion_matrix

# Test the best model and create a confusion matrix
ypred = best_model.predict(x_test)
cmatrix = confusion_matrix(y_test, ypred)

# Visualize the confusion matrix
##pd.DataFrame(cmatrix)
pd.DataFrame(cmatrix).style.background_gradient(cmap='Blues', low=0, high=0.9)

### Save the best model

In [None]:
best_run.get_file_names()

In [None]:
best_run.register_model(model_name='best_run_automl', model_path='./outputs/')

In [1]:
from azureml.automl.core.shared import constants

# create inference folder
inference_folder = 'inference'
if inference_folder not in os.listdir():
    os.mkdir(inference_folder)

# Save the best model
##joblib.dump(best_model, filename = inference_folder + '/best_automl_model.joblib')
##joblib.dump(value=best_model, filename='output/best-automl.pkl')

# Save the best model, scoring script, and conda env files in inference folder
best_run.download_file('outputs/scoring_file_v_1_0_0.py', inference_folder + '/best_automl_score.py')
best_run.download_file('outputs/model.pkl', inference_folder + '/best_automl_model.pkl')

best_run.download_file('outputs/conda_env_v_1_0_0.yml', inference_folder + 'automl_conda_env.yml')
##best_run.download_file(constants.CONDA_ENV_FILE_PATH, inference_folder + 'automl_conda_env.yml')

NameError: name 'os' is not defined

### Save the environment

In [None]:
# get the list of environments
Environment.list(workspace=ws).keys()

In [None]:
# save the environment
my_env = Environment.get(workspace=ws, name="AzureML-AutoML")
my_env.save_to_directory('env', overwrite=True)

my_env

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

Register the model, create an inference config and deploy the model as a web service.

In [None]:
# Register the model
from azureml.core.resource_configuration import ResourceConfiguration

model_name = best_run.properties['model_name']
local_file = inference_folder + '/best_automl_model.pkl'

run_id = best_run.id
experiment_name = best_run.experiment.name

model = Model.register(workspace = ws,
                       model_name = model_name,                        # Name of the registered model in your workspace.
                       model_path = local_file,                        # Local file to upload and register as a model.
                       model_framework = Model.Framework.SCIKITLEARN,  # Framework used to create the model.
                       model_framework_version = sklearn.__version__,  # Version of scikit-learn used to create the model.
                       description = 'Best autoML model to predict motality caused by heart failure.',
                       tags={'area': 'heart-failure', 'type': 'classification'})

print('Model name:', model.name)
print('Model id:', model.id)
print('Model version:', model.version)

In [None]:
# create inference configuration
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

env = Environment.from_conda_specification(name="my_env", file_path=inference_folder + 'automl_conda_env.yml')
env
#### Siehe Oben my_env --> Ist es gleich?

inference_config = InferenceConfig(entry_script=inference_folder + '/best_automl_score.py', environment=env)

# display the environment file
with open(file_path=inference_folder + 'automl_conda_env.yml', 'r') as file:
    env_file = file.read()
    print(env_file)

In [None]:
# Model Deployment
from azureml.core.webservice import AciWebservice

# define deployment configuration
aci_deployment_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                           memory_gb=1,
                                                           tags={'area': "heart-failure", 'type': "classification"},
                                                           description="Predict heart failure mortality using classification model",
                                                           auth_enabled=True,
                                                           enable_app_insights=True)

# deploy model as webservice using Azure Container Instance(ACI)
aci_service = Model.deploy(workspace = ws, 
                           name = "aci-heart-failure-deploy", 
                           models = [model], 
                           inference_config = inference_config, 
                           deployment_config = aci_deployment_config, 
                           overwrite=True)

aci_service.wait_for_deployment(show_output=True)

In [None]:
# get the active api endpoint for scoring
print(f"Service State: {aci_service.state}\n")
print(f"Scoring URI:   {aci_service.scoring_uri}\n")
print(f"Swagger URI:   {aci_service.swagger_uri}\n")

## Consuming the model
Send a request to the web service you deployed to test it.

In [None]:
# Send a request to the web service
import json
import requests

'''
input_data = json.dumps({
    "data": [
            [75.0, 0.0, 582.0, 0.0, 20.0, 1.0, 265000.0, 1.9, 130.0, 1.0, 0.0, 4.0],
            [80.0, 1.0, 123.0, 0.0, 35.0, 1.0, 388000.0, 9.4, 133.0, 1.0, 1.0, 10.0],
            [62.0, 0.0, 61.0, 1.0, 38.0, 1.0, 155000.0, 1.1, 143.0, 1.0, 1.0, 270.0],
            [50.0, 1.0, 111.0, 0.0, 20.0, 0.0, 210000.0, 1.9, 137.0, 1.0, 0.0, 7.0]
        ]
    })
'''

# 4 sets of data to score, so we get two results back
test_sample = test_df.sample(n=4)
labels = test_sample.pop('DEATH_EVENT')


# Convert to JSON string
input_data = json.dumps({"data": test_sample.to_dict(orient='records')})
with open("input_data.json", 'w') as _f:
    _f.write(input_data)

print(input_data)

response = requests.post(aci_service.scoring_uri, data=input_data, headers={'Content-Type':'application/json'})

In [None]:
print(f"Predictions from Service: {response.json()}\n")
print(f"Data Labels: {labels.tolist()}")

## Print the logs of the web service and delete the service

In [None]:
# Print the log of the webservice
print(aci_service.get_logs())

In [None]:
# Delete the webservice, model, and shut down the compute cluster
aci_service.delete()
model.delete()
amlcompute_target.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
