# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.pipeline.core import Pipeline

from azureml.pipeline.steps import AutoMLStep

# Report the installed azureml-core version so the captured output documents
# which SDK release this notebook was executed with.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.51.0


In [2]:
# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# Every run submitted from this notebook is grouped under this experiment.
experiment_name = 'automl-experiment'
experiment = Experiment(workspace=ws, name=experiment_name)

In [3]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: set this to the name of an existing cluster to reuse it; otherwise a
# new CPU cluster with this name is provisioned below.
amlcompute_cluster_name = "my-cluster"

try:
    # Look the cluster up by name; ComputeTargetException is handled below by
    # provisioning a new cluster.
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster is ready in either case.
# For a more detailed view of current AmlCompute status, use get_status().
compute_target.wait_for_completion(show_output=True)

InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.


TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [4]:
project_name = 'azureml-spaceship-titanic'


def find_project_root(start_dir, name):
    """Return the leading part of ``start_dir`` up to and including ``name``.

    Args:
        start_dir: Path string to search (normally the current working directory).
        name: Project folder name expected to appear somewhere in ``start_dir``.

    Returns:
        The prefix of ``start_dir`` ending at the first occurrence of ``name``,
        or ``None`` when ``name`` does not occur in ``start_dir`` at all.
    """
    prefix, found, _ = start_dir.partition(name)
    return prefix + found if found else None


project_folder = find_project_root(os.getcwd(), project_name)
if project_folder is None:
    # Previously this case silently produced "<cwd><project_name>" glued
    # together without a separator. Fall back to the working directory and
    # say so, so a later import failure is easy to diagnose.
    print(f"Warning: '{project_name}' not found in working directory "
          f"{os.getcwd()!r}; using the working directory as project folder.")
    project_folder = os.getcwd()

In [11]:
import sys
# Make the project's `src` package importable from this notebook.
sys.path.append(project_folder)

from src.pipelines.preprocess import preprocess_data

# Read the raw training data, run the project's preprocessing, then drop the
# PassengerId column so it is not part of the data registered for training.
raw_csv_path = '../data/01_raw/train.csv'
df = pd.read_csv(raw_csv_path)
processed_df = preprocess_data(df)
processed_df = processed_df.drop(columns=["PassengerId"])


In [12]:
dataset_name = "Spaceship_Dataset"
description_text = "Data to predict which passengers are transported to an alternate dimension"

# Upload the preprocessed frame to the default datastore and register it as a
# tabular dataset in one step. register_pandas_dataframe already registers the
# dataset, so the description is passed here instead of calling
# dataset.register(...) afterwards, which created a redundant second version
# of the same dataset on every run.
dataset = Dataset.Tabular.register_pandas_dataframe(
    dataframe=processed_df,
    target=ws.get_default_datastore(),
    name=dataset_name,
    description=description_text,
)

# Read the registered dataset back and summarise its numeric columns as a
# sanity check. Note that this rebinds `df` (previously the raw CSV frame).
df = dataset.to_pandas_dataframe()
df.describe()

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/a4b6936c-442b-4bac-864c-198d247c09b0/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,People_in_Cabin_Num,People_in_Cabin_Deck,Family_Size,Group_Size
count,8512.0,8510.0,8485.0,8510.0,8505.0,8494.0,8494.0,8493.0,8693.0
mean,224.687617,458.077203,173.729169,311.138778,304.854791,8.037203,1952.105957,5.428117,2.035546
std,666.717663,1611.48924,604.696458,1136.705535,1145.717189,5.214669,963.741893,2.891901,1.596347
min,0.0,0.0,0.0,0.0,0.0,1.0,5.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,4.0,779.0,3.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,7.0,2559.0,5.0,1.0
75%,47.0,76.0,27.0,59.0,46.0,12.0,2794.0,7.0,3.0
max,14327.0,29813.0,23492.0,22408.0,24133.0,28.0,2794.0,18.0,8.0


## AutoML Configuration

TODO: Explain why you chose the AutoML settings and configuration you used below.

In [13]:
# Run-level AutoML settings, kept separate so they are easy to tune.
# NOTE(review): max_concurrent_iterations is 5 while the cluster created in
# this notebook requests max_nodes=4 — confirm the target cluster can actually
# run five iterations in parallel.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='AUC_weighted',
)

# Classification on the registered dataset, predicting the "Transported" label.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="Transported",
    path=project_folder + '/automl-pipeline',
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings
)

In [14]:
from azureml.pipeline.core import PipelineData, TrainingOutput
from azureml.widgets import RunDetails

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'


def _automl_output(name, pipeline_output_name, output_type):
    """Build a PipelineData on the default datastore for one AutoML training output."""
    return PipelineData(
        name=name,
        datastore=ds,
        pipeline_output_name=pipeline_output_name,
        training_output=TrainingOutput(type=output_type),
    )


# The AutoML step exposes two outputs: the metrics of the child runs and the
# best model.
metrics_data = _automl_output('metrics_data', metrics_output_name, 'Metrics')
model_data = _automl_output('model_data', best_model_output_name, 'Model')

automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

pipeline = Pipeline(
    workspace=ws,
    description="pipeline_with_automlstep",
    steps=[automl_step],
)

# Submit the pipeline, show the progress widget, and block until it finishes.
pipeline_run = experiment.submit(pipeline)
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()


Created step automl_module [e3f1265b][ac048c96-3ccc-4b76-bb16-0680f8ec9077], (This step will run and generate new outputs)
Submitted PipelineRun 95e42c09-466d-4a3f-9818-118135873222
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/95e42c09-466d-4a3f-9818-118135873222?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-250931/workspaces/quick-starts-ws-250931&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 95e42c09-466d-4a3f-9818-118135873222
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/95e42c09-466d-4a3f-9818-118135873222?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-250931/workspaces/quick-starts-ws-250931&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 6bb99c33-2d80-47ab-aee0-157c61c494b2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/6bb99c33-2d80-47ab-aee0-157c61c494b2?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-250931/workspaces/quick-starts-ws-250931&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
StepRun( automl_module ) Status: NotStarted
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished

No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by sett

'Finished'

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [15]:
import json

# Download the metrics artifact produced by the AutoML step into the current
# directory.
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

# The downloaded file is JSON; it is opened at the relative path held in
# `_path_on_datastore`, mirroring where download('.') placed it.
with open(metrics_output._path_on_datastore) as metrics_file:
    deserialized_metrics_output = json.load(metrics_file)

# One column per child run, one row per metric.
df = pd.DataFrame(deserialized_metrics_output)
df

Downloading azureml/6bb99c33-2d80-47ab-aee0-157c61c494b2/metrics_data
Downloaded azureml/6bb99c33-2d80-47ab-aee0-157c61c494b2/metrics_data, 1 files out of an estimated total of 1


Unnamed: 0,6bb99c33-2d80-47ab-aee0-157c61c494b2_9,6bb99c33-2d80-47ab-aee0-157c61c494b2_2,6bb99c33-2d80-47ab-aee0-157c61c494b2_12,6bb99c33-2d80-47ab-aee0-157c61c494b2_6,6bb99c33-2d80-47ab-aee0-157c61c494b2_10,6bb99c33-2d80-47ab-aee0-157c61c494b2_15,6bb99c33-2d80-47ab-aee0-157c61c494b2_23,6bb99c33-2d80-47ab-aee0-157c61c494b2_18,6bb99c33-2d80-47ab-aee0-157c61c494b2_19,6bb99c33-2d80-47ab-aee0-157c61c494b2_31,...,6bb99c33-2d80-47ab-aee0-157c61c494b2_41,6bb99c33-2d80-47ab-aee0-157c61c494b2_16,6bb99c33-2d80-47ab-aee0-157c61c494b2_26,6bb99c33-2d80-47ab-aee0-157c61c494b2_36,6bb99c33-2d80-47ab-aee0-157c61c494b2_30,6bb99c33-2d80-47ab-aee0-157c61c494b2_44,6bb99c33-2d80-47ab-aee0-157c61c494b2_40,6bb99c33-2d80-47ab-aee0-157c61c494b2_49,6bb99c33-2d80-47ab-aee0-157c61c494b2_46,6bb99c33-2d80-47ab-aee0-157c61c494b2_45
weighted_accuracy,[0.7368115160608307],[0.737882179505414],[0.8030715042964353],[0.7828932985223007],[0.7877042351700548],[0.8018613535856437],[0.7470062642301012],[0.7973311620526973],[0.7291270966656457],[0.8004164858843837],...,[0.7282364991827636],[0.7923867247549156],[0.7934046167403004],[0.7971789214281614],[0.7947992993963725],[0.7323488315962584],[0.7656484549808833],[0.772470645900893],[0.7322331284684699],[0.7899963722689289]
AUC_micro,[0.8086904158169231],[0.8292465103643746],[0.8928054315265345],[0.8789662073058645],[0.8714667612293412],[0.8925552238663639],[0.8442426220742126],[0.8921250412367568],[0.8069097749337889],[0.8917124896015359],...,[0.8082080328313986],[0.8822438407341032],[0.8812140550455535],[0.8880184663503652],[0.8835601030335266],[0.7950402564300162],[0.8510566942927832],[0.8460725716445577],[0.8130594180165026],[0.8733170446316739]
precision_score_micro,[0.7376047772860067],[0.7388706926459626],[0.8030599942397755],[0.7830426182769686],[0.7876447629640587],[0.8019094183642217],[0.7480737909067066],[0.7975381908686227],[0.729551579936536],[0.8004140151488984],...,[0.7290925248182382],[0.7922454783150256],[0.7933958556716733],[0.7970772696726082],[0.7947768325895623],[0.7332333512715016],[0.766248752606454],[0.7720002423518805],[0.7328884445241696],[0.7900605395315065]
balanced_accuracy,[0.7383988256755011],[0.7398601167171766],[0.803048501893762],[0.7831921442217945],[0.7875852661002153],[0.801957531983566],[0.7491422560596902],[0.7977454360144622],[0.7299764721126634],[0.8004115688910537],...,[0.7299493935552158],[0.7921041122052997],[0.7933871240694247],[0.7969755389074202],[0.7947543767542355],[0.7341187438180206],[0.7668496599010597],[0.7715294619449778],[0.7335444252275739],[0.790124788907252]
log_loss,[0.5751922936880788],[0.5070073230419038],[0.42135740405926175],[0.4528796477588033],[0.4678639988297668],[0.47130626504763984],[0.5012676157240782],[0.41354960566619625],[0.6089782495564333],[0.41270191818065055],...,[0.5479245784084461],[0.4288554251628653],[0.436456985622304],[0.4205547251156317],[0.42809651214175376],[0.5982530063529926],[0.5209839743444338],[0.48575971127218237],[0.5364954928778355],[0.4834546381649763]
recall_score_macro,[0.7383988256755011],[0.7398601167171766],[0.803048501893762],[0.7831921442217945],[0.7875852661002153],[0.801957531983566],[0.7491422560596902],[0.7977454360144622],[0.7299764721126634],[0.8004115688910537],...,[0.7299493935552158],[0.7921041122052997],[0.7933871240694247],[0.7969755389074202],[0.7947543767542355],[0.7341187438180206],[0.7668496599010597],[0.7715294619449778],[0.7335444252275739],[0.790124788907252]
AUC_weighted,[0.8077551141351792],[0.8330764219736091],[0.8900658520042888],[0.8776166874170152],[0.8690534000172933],[0.8891821736237734],[0.8467935061760787],[0.8887991272400236],[0.799062914202138],[0.8889028874378818],...,[0.8270685462872542],[0.882416322735753],[0.8780861457554576],[0.8870809381500334],[0.8811952244932245],[0.7965371028048315],[0.8516857725851209],[0.8410531923157823],[0.8261207554127862],[0.8713168688095129]
precision_score_macro,[0.7482531927109872],[0.750554023107146],[0.80349918601929],[0.7841341185393839],[0.7883796742418658],[0.8020207275114268],[0.7565853430985022],[0.7979994484018159],[0.7312085471793769],[0.8004273293280432],...,[0.742193248357526],[0.7924360042288732],[0.7936100820469839],[0.7976161977576738],[0.7949253278620652],[0.747125791091932],[0.7704949063106353],[0.7730983884155523],[0.7446613725027258],[0.7902525303535753]
recall_score_weighted,[0.7376047772860067],[0.7388706926459626],[0.8030599942397755],[0.7830426182769686],[0.7876447629640587],[0.8019094183642217],[0.7480737909067066],[0.7975381908686227],[0.729551579936536],[0.8004140151488984],...,[0.7290925248182382],[0.7922454783150256],[0.7933958556716733],[0.7970772696726082],[0.7947768325895623],[0.7332333512715016],[0.766248752606454],[0.7720002423518805],[0.7328884445241696],[0.7900605395315065]
average_precision_score_micro,[0.8073004870647242],[0.8319711373223694],[0.8945374407752608],[0.881409298858285],[0.8724520854844696],[0.8939997137752297],[0.8447517748617254],[0.8942834443163378],[0.8077028510007133],[0.8936381042274445],...,[0.7864711071072542],[0.8836788731951574],[0.8819416058248194],[0.8885979176385289],[0.8839560271296473],[0.7909475786168892],[0.8520591095468392],[0.8422291341110917],[0.7984351517272875],[0.8731912474252271]


## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [16]:
import pickle

# Download the best-model artifact produced by the pipeline run.
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

# NOTE: pickle.load executes code embedded in the file; only unpickle
# artifacts that come from a workspace you trust.
model_path = best_model_output._path_on_datastore
with open(model_path, "rb") as model_file:
    best_model = pickle.load(model_file)
best_model

Downloading azureml/6bb99c33-2d80-47ab-aee0-157c61c494b2/model_data
Downloaded azureml/6bb99c33-2d80-47ab-aee0-157c61c494b2/model_data, 1 files out of an estimated total of 1


PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mn...
                                                  PreFittedSoftVotingClassifier(classification_labels=array([0, 1]), estimators=[('35', Pipeline(memory=None, steps=[('sparsenormalizer', Normalizer(copy=True, norm='l1')), ('lightgbmclassifier', LightGBMClassifier(boosting_type='gbdt', colsample_bytree=0.3966666666666666, learning_rate=0.06842421052631578, max_bin=360, max_depth=2, min_child_weight=8, min_data_in_leaf=1e-05, min_split_gain=0.9473684210526315, n_estimators=600, n_jobs=1, num_leaves=200, problem_info=ProblemInfo(gpu_training

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

Deployed from Studio



TODO: In the cell below, send a request to the web service you deployed to test it.

In [29]:
import requests
import json

# URL for the web service, should be similar to:
# 'http://8530a665-66f3-49c8-a953-b82a2d312917.eastus.azurecontainer.io/score'
scoring_uri = 'http://091674f3-7d52-4713-a563-29fdf9673dcd.westeurope.azurecontainer.io/score'

# If the service is authenticated, supply the key or token through the
# AML_SCORING_KEY environment variable. Never commit the key to the notebook;
# any key that was previously hard-coded here should be regenerated.
key = os.environ.get('AML_SCORING_KEY', '')

# One record to score, so we get one result back
data = {
    "Inputs": {
        "data": [
            {
                "HomePlanet": "Europa",
                "CryoSleep": "False",
                "Destination": "TRAPPIST-1e",
                "VIP": "False",
                "RoomService": 109.00,
                "FoodCourt": 1000,
                "ShoppingMall": 25.0,
                "Spa": 200.0,
                "VRDeck": 2.0,
                "Cabin_Deck": "B",
                "Cabin_Side": "P",
                "Cabin_Region": "A",
                "People_in_Cabin_Num": 14,
                "People_in_Cabin_Deck": 700,
                "Family_Size": 4,
                "Group_Size": 2,
                "Age_Cat": "Pre_Adult"
            }
        ]
    },
    "GlobalParameters": {
        "method": "predict"
    }
}

# Convert to a JSON string and keep a copy on disk for reuse
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# Only send an Authorization header when a key was actually provided
if key:
    headers['Authorization'] = f'Bearer {key}'

# Make the request; bound the wait so the cell cannot hang indefinitely, and
# surface HTTP errors (e.g. 401/500) instead of failing later on JSON decoding.
resp = requests.post(scoring_uri, input_data, headers=headers, timeout=60)
resp.raise_for_status()
print(resp.json())


{'Results': [True]}


TODO: In the cell below, print the logs of the web service and delete the service

In [30]:
%%writefile logs.py
from azureml.core import Workspace, Webservice
from azureml.exceptions import WebserviceException
import time

# Load the Azure ML workspace
ws = Workspace.from_config()

# Name of the web service
service_name = 'web-service-automl'

# Function to attempt to delete the service
def try_delete_service(service):
    """Make a single attempt to delete ``service`` and report the outcome.

    Args:
        service: Object exposing a ``delete()`` method (the deployed web service).

    Returns:
        True when ``delete()`` completed, False when it raised
        ``WebserviceException`` (the error is printed, not re-raised).
    """
    deleted = False
    try:
        service.delete()
    except WebserviceException as e:
        print(f"Error deleting web service: {str(e)}")
    else:
        print("Web service deleted successfully.")
        deleted = True
    return deleted

# Get a reference to the web service
service = Webservice(workspace=ws, name=service_name)

# Print logs
print("Web service logs:")
print(service.get_logs())

# Wait for any in-flight deployment operation to complete before deleting.
# These two messages indicate there is no operation to wait for, so they are
# treated as benign; anything else is re-raised.
ignorable_errors = (
    "No operation endpoint",
    "Long running operation information not known",
)
try:
    service.wait_for_deployment()
except WebserviceException as e:
    if not any(fragment in str(e) for fragment in ignorable_errors):
        raise

# Attempt to delete the deployed web service, retrying on failure.
max_retries = 3
retry_delay_seconds = 10

for attempt in range(1, max_retries + 1):
    if try_delete_service(service):
        break
    # Only announce a retry and wait when another attempt will actually follow;
    # previously the final failure still printed "Retrying" and slept.
    if attempt < max_retries:
        print(f"Retrying deletion (attempt {attempt + 1}/{max_retries})...")
        time.sleep(retry_delay_seconds)
else:
    # Reached only when no attempt succeeded (the loop never hit `break`).
    print("Max retries reached. Unable to delete the web service.")


Overwriting logs.py


In [31]:
!python logs.py

Web service logs:
2024-01-29T02:19:59,463994100+00:00 - rsyslog/run 
2024-01-29T02:19:59,473020200+00:00 - gunicorn/run 
2024-01-29T02:19:59,481847500+00:00 | gunicorn/run | 
2024-01-29T02:19:59,486636100+00:00 | gunicorn/run | ###############################################
2024-01-29T02:19:59,493631400+00:00 | gunicorn/run | AzureML Container Runtime Information
2024-01-29T02:19:59,499912100+00:00 | gunicorn/run | ###############################################
2024-01-29T02:19:59,504231000+00:00 | gunicorn/run | 
2024-01-29T02:19:59,520512700+00:00 | gunicorn/run | 
2024-01-29T02:19:59,538671200+00:00 - nginx/run 
2024-01-29T02:19:59,545640900+00:00 | gunicorn/run | AzureML image information: openmpi4.1.0-ubuntu20.04, Materializaton Build:20231216.v3
2024-01-29T02:19:59,547958300+00:00 | gunicorn/run | 
2024-01-29T02:19:59,550264400+00:00 | gunicorn/run | 
2024-01-29T02:19:59,556724100+00:00 | gunicorn/run | PATH environment variable: /azureml-envs/azureml-automl/bin:/opt/miniconda/

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shut down all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
