# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv
import pkg_resources
import json
import requests

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

import sklearn
from sklearn import datasets
from sklearn.metrics import confusion_matrix

import azureml.core
from azureml.core import Model
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.experiment import Experiment
from azureml.core.model import InferenceConfig
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.webservice import AciWebservice
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.59.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.


TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [2]:
ws = Workspace.from_config()
print(
    'Workspace name: ' + ws.name, 
    'Azure region: ' + ws.location, 
    'Subscription id: ' + ws.subscription_id, 
    'Resource group: ' + ws.resource_group, sep = '\n'
    )

# Choose a name for the run history container in the workspace
experiment_name = 'heart-failure-capstone-aml-experiment'
project_folder = './Capstone-Project'

experiment = Experiment(ws, experiment_name)
print(experiment)

Workspace name: eunmldevamlwstrdigi
Azure region: northeurope
Subscription id: cf11c61d-e6ca-4f6b-b8df-d2a77e8a4d04
Resource group: seq00963-nprd-eun-mldev-aml-trdigi
Experiment(Name: heart-failure-capstone-aml-experiment,
Workspace: eunmldevamlwstrdigi)


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [3]:
# Choose a name for your CPU cluster
compute_cluster_name = "atul-trdigi-compute"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
        #vm_priority = 'lowpriority', # optional
        min_nodes=0,
        max_nodes=5)
    compute_target = ComputeTarget.create(ws, compute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# For a more detailed view of current AmlCompute status, use get_status()
print(compute_target.get_status().serialize())

Found existing cluster, use it.

Running
{'errors': [], 'creationTime': '2025-01-28T10:09:36.982856+00:00', 'createdBy': {'userObjectId': '38452539-2a0d-4246-a767-79ca0fafda78', 'userTenantId': 'db1e96a8-a3da-442a-930b-235cac24cd5c', 'userName': None}, 'modifiedTime': '2025-03-17T06:09:15.360977+00:00', 'state': 'Running', 'vmSize': 'Standard_E4s_v3'}


In [4]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
found = False
key = "heart-failure-ds"
description_text = "Heart failure clinical records dataset from the UCI repository (https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records)"
data_path = 'https://github.com/mishra-atul5001/MLE-with-Azure-ML/blob/main/Captsone-Project/dataset/heart_failure_clinical_records_dataset.csv'

if key in ws.datasets.keys(): 
    found = True
    dataset = ws.datasets[key] 

if not found:
    # Create AML Dataset and register it into Workspace
    dataset = Dataset.Tabular.from_delimited_files(data_path)        
    # Register Dataset in Workspace
    dataset = dataset.register(
        workspace=ws,
        name=key,
        description=description_text)


df = dataset.to_pandas_dataframe()
df.info()
df.describe()

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Columns: 4081 entries, <!DOCTYPE html> to Column4081
dtypes: object(4081)
memory usage: 54.5+ MB


Unnamed: 0,<!DOCTYPE html>,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column4072,Column4073,Column4074,Column4075,Column4076,Column4077,Column4078,Column4079,Column4080,Column4081
count,1752,180,140,92,76,25,8,7,7,7,...,1,1,1,1,1,1,1,1,1,1
unique,1063,128,71,27,63,16,7,6,6,6,...,1,1,1,1,1,1,1,1,1,1
top,</svg>,var(--color-fg-muted,&quot;context&quot;:&quot;solutions&quot;,&quot;tag&quot;:&quot;link&quot;,&quot;originating_url&quot;:&quot;https://gith...,#656d76)));}/*!sc*/,sans-serif,Apple Color Emoji,Segoe UI Emoji;-webkit-font-smoothing:subpixel...,var(--fgColor-onEmphasis,...,"appPayload:{""helpUrl"":""https://docs.github.com""","findFileWorkerPath:""/assets-cdn/worker/find-fi...","findInFileWorkerPath:""/assets-cdn/worker/find-...",githubDevUrl:null,"enabled_features:{""code_nav_ui_events"":false",overview_shared_code_dropdown_button:false,react_blob_overlay:false,copilot_smell_icebreaker_ux:true,accessible_code_button:true,github_models_repo_tab:false}}}</script>
freq,90,11,13,48,6,4,2,2,2,2,...,1,1,1,1,1,1,1,1,1,1


In [5]:
# TODO: Put your automl settings here
automl_settings = {
    "experiment_timeout_minutes": 60,
    "max_concurrent_iterations": 5,
    "n_cross_validations": 5,
    "primary_metric" : 'accuracy'
}

# TODO: Put your automl config here
automl_config = AutoMLConfig(
    compute_target=compute_cluster_name,
    task="classification",
    training_data=dataset,
    label_column_name="DEATH_EVENT",   
    path=project_folder,
    enable_early_stopping=True,
    featurization='auto',
    debug_log='automl_errors.log',
    **automl_settings
    )

In [7]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output=True)

ConfigException: ConfigException:
	Message: Conflicting or duplicate values are provided for arguments: [{
    "script": null,
    "arguments": [],
    "target": "atul-trdigi-compute",
    "framework": "Python",
    "communicator": "None",
    "maxRunDurationSeconds": null,
    "nodeCount": 1,
    "priority": null,
    "environment": {
        "name": "default-environment",
        "version": null,
        "environmentVariables": {
            "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
        },
        "python": {
            "userManagedDependencies": false,
            "interpreterPath": "python",
            "condaDependenciesFile": null,
            "baseCondaEnvironment": null,
            "condaDependencies": {
                "name": "project_environment",
                "dependencies": [
                    "python=3.8.13",
                    {
                        "pip": [
                            "azureml-defaults"
                        ]
                    }
                ],
                "channels": [
                    "anaconda",
                    "conda-forge"
                ]
            }
        },
        "docker": {
            "enabled": false,
            "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20250121.v1",
            "baseDockerfile": null,
            "buildContext": null,
            "sharedVolumes": true,
            "shmSize": "2g",
            "arguments": [],
            "baseImageRegistry": {
                "address": null,
                "username": null,
                "password": null,
                "registryIdentity": null
            },
            "platform": {
                "os": "Linux",
                "architecture": "amd64"
            }
        },
        "spark": {
            "repositories": [],
            "packages": [],
            "precachePackages": true
        },
        "databricks": {
            "mavenLibraries": [],
            "pypiLibraries": [],
            "rcranLibraries": [],
            "jarLibraries": [],
            "eggLibraries": []
        },
        "r": null,
        "inferencingStackVersion": null,
        "assetId": null
    },
    "history": {
        "outputCollection": true,
        "snapshotProject": true,
        "directoriesToWatch": [
            "logs"
        ]
    },
    "spark": {
        "configuration": {
            "spark.app.name": "Azure ML Experiment",
            "spark.yarn.maxAppAttempts": 1
        }
    },
    "docker": {
        "useDocker": true,
        "sharedVolumes": true,
        "arguments": [],
        "shmSize": "2g"
    },
    "hdi": {
        "yarnDeployMode": "cluster"
    },
    "tensorflow": {
        "workerCount": 1,
        "parameterServerCount": 1
    },
    "mpi": {
        "processCountPerNode": 1,
        "nodeCount": 1
    },
    "pytorch": {
        "communicationBackend": "nccl",
        "processCount": null,
        "nodeCount": 1
    },
    "paralleltask": {
        "maxRetriesPerWorker": 0,
        "workerCountPerNode": 1,
        "terminalExitCodes": null
    },
    "dataReferences": {},
    "data": {},
    "datacaches": [],
    "outputData": {},
    "sourceDirectoryDataStore": null,
    "amlcompute": {
        "vmSize": null,
        "vmPriority": null,
        "retainCluster": false,
        "name": null,
        "clusterMaxNodeCount": null
    },
    "autoClusterComputeSpecification": null,
    "kubernetescompute": {
        "instanceType": null
    },
    "credentialPassthrough": false,
    "command": "",
    "environmentVariables": {},
    "applicationEndpoints": {}
}]
	InnerException: None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Conflicting or duplicate values are provided for arguments: [{\n    \"script\": null,\n    \"arguments\": [],\n    \"target\": \"atul-trdigi-compute\",\n    \"framework\": \"Python\",\n    \"communicator\": \"None\",\n    \"maxRunDurationSeconds\": null,\n    \"nodeCount\": 1,\n    \"priority\": null,\n    \"environment\": {\n        \"name\": \"default-environment\",\n        \"version\": null,\n        \"environmentVariables\": {\n            \"EXAMPLE_ENV_VAR\": \"EXAMPLE_VALUE\"\n        },\n        \"python\": {\n            \"userManagedDependencies\": false,\n            \"interpreterPath\": \"python\",\n            \"condaDependenciesFile\": null,\n            \"baseCondaEnvironment\": null,\n            \"condaDependencies\": {\n                \"name\": \"project_environment\",\n                \"dependencies\": [\n                    \"python=3.8.13\",\n                    {\n                        \"pip\": [\n                            \"azureml-defaults\"\n                        ]\n                    }\n                ],\n                \"channels\": [\n                    \"anaconda\",\n                    \"conda-forge\"\n                ]\n            }\n        },\n        \"docker\": {\n            \"enabled\": false,\n            \"baseImage\": \"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20250121.v1\",\n            \"baseDockerfile\": null,\n            \"buildContext\": null,\n            \"sharedVolumes\": true,\n            \"shmSize\": \"2g\",\n            \"arguments\": [],\n            \"baseImageRegistry\": {\n                \"address\": null,\n                \"username\": null,\n                \"password\": null,\n                \"registryIdentity\": null\n            },\n            \"platform\": {\n                \"os\": \"Linux\",\n                \"architecture\": \"amd64\"\n            }\n        },\n        \"spark\": {\n            \"repositories\": [],\n            \"packages\": [],\n            \"precachePackages\": true\n        },\n        \"databricks\": {\n            \"mavenLibraries\": [],\n            \"pypiLibraries\": [],\n            \"rcranLibraries\": [],\n            \"jarLibraries\": [],\n            \"eggLibraries\": []\n        },\n        \"r\": null,\n        \"inferencingStackVersion\": null,\n        \"assetId\": null\n    },\n    \"history\": {\n        \"outputCollection\": true,\n        \"snapshotProject\": true,\n        \"directoriesToWatch\": [\n            \"logs\"\n        ]\n    },\n    \"spark\": {\n        \"configuration\": {\n            \"spark.app.name\": \"Azure ML Experiment\",\n            \"spark.yarn.maxAppAttempts\": 1\n        }\n    },\n    \"docker\": {\n        \"useDocker\": true,\n        \"sharedVolumes\": true,\n        \"arguments\": [],\n        \"shmSize\": \"2g\"\n    },\n    \"hdi\": {\n        \"yarnDeployMode\": \"cluster\"\n    },\n    \"tensorflow\": {\n        \"workerCount\": 1,\n        \"parameterServerCount\": 1\n    },\n    \"mpi\": {\n        \"processCountPerNode\": 1,\n        \"nodeCount\": 1\n    },\n    \"pytorch\": {\n        \"communicationBackend\": \"nccl\",\n        \"processCount\": null,\n        \"nodeCount\": 1\n    },\n    \"paralleltask\": {\n        \"maxRetriesPerWorker\": 0,\n        \"workerCountPerNode\": 1,\n        \"terminalExitCodes\": null\n    },\n    \"dataReferences\": {},\n    \"data\": {},\n    \"datacaches\": [],\n    \"outputData\": {},\n    \"sourceDirectoryDataStore\": null,\n    \"amlcompute\": {\n        \"vmSize\": null,\n        \"vmPriority\": null,\n        \"retainCluster\": false,\n        \"name\": null,\n        \"clusterMaxNodeCount\": null\n    },\n    \"autoClusterComputeSpecification\": null,\n    \"kubernetescompute\": {\n        \"instanceType\": null\n    },\n    \"credentialPassthrough\": false,\n    \"command\": \"\",\n    \"environmentVariables\": {},\n    \"applicationEndpoints\": {}\n}]",
        "details_uri": "https://aka.ms/AutoMLConfig",
        "inner_error": {
            "code": "BadArgument",
            "inner_error": {
                "code": "ArgumentMismatch",
                "inner_error": {
                    "code": "ConflictingValueForArguments"
                }
            }
        }
    }
}

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [None]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)
remote_run

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [None]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)
remote_run

In [None]:
#TODO: Save the best model
best_run, fitted_model = remote_run.get_output()
remote_run.get_metrics()

In [None]:
print(best_run)
print(fitted_model)

In [None]:
dataset_test = Dataset.Tabular.from_delimited_files(path=data_path)
df_test = dataset_test.to_pandas_dataframe()
df_test = df_test[pd.notnull(df_test['DEATH_EVENT'])]

y_test = df_test['DEATH_EVENT']
X_test = df_test.drop(['DEATH_EVENT'], axis=1)

ypred = fitted_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)

# Visualize the confusion matrix
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [None]:
# Save the model, scoring script and conda environment of the best run
inference_folder_name = 'inference'
automl_model = os.path.join(inference_folder_name, 'model.pkl')
score_script = os.path.join(inference_folder_name, 'score.py')
conda_env = os.path.join(inference_folder_name, 'conda_env.yml')
best_run.download_file('outputs/model.pkl', automl_model)
best_run.download_file('outputs/scoring_file_v_1_0_0.py', score_script)
best_run.download_file('outputs/conda_env_v_1_0_0.yml', conda_env)

In [None]:
# Register the model
model = Model.register(
    workspace=ws,
    model_name=best_run.properties['model_name'],
    model_path=automl_model,
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version=sklearn.__version__,
    description='Auto ML model predicting deaths caused by heart failure'
    )

print('Name:', model.name)
print('Version:', model.version)

In [None]:
# Create inference configuration
env = Environment.from_conda_specification(name="env", file_path=conda_env)
inference_conf = InferenceConfig(entry_script=score_script, environment=env)

# Display the environment file
with open(conda_env, 'r') as file:
    env_file = file.read()
    print(env_file)

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
# Define deployment configuration
deployment_conf = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='Predicting deaths caused by heart failure',
    enable_app_insights=True)

# Deploy model as webservice using Azure Container Instance (ACI)
service_name = "aci-heart-failure-web"

service = Model.deploy(
    workspace=ws,
    name=service_name, 
    models=[model], 
    inference_config=inference_conf, 
    deployment_config=deployment_conf, 
    overwrite=True)

service.wait_for_deployment(show_output=True)

print(service.state)

In [None]:
import json
import requests

test_data = json.dumps({
    "data": [
        [55, 0, 1820, 0, 38, 0, 270000, 1.2, 139, 0, 0, 271],
        [55, 0, 1199, 0, 20, 0, 263358.03, 1.83,134, 1, 1, 241],
        [65, 1, 258, 1, 25, 0, 198000, 1.4, 129, 1, 0, 235],
        [50, 0, 196, 0, 45, 0, 395000, 1.6, 136, 1, 1, 285]
    ]})

response = requests.post(
    service.scoring_uri, 
    data=test_data, 
    headers={'Content-Type':'application/json'})

print("Results:", response.json())

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
print(service.get_logs())

In [None]:
# Delete the web service and the model
service.delete()
model.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
