# Automated ML


In [1]:
!pip install xgboost==0.90



In [2]:
import json
import sys
import os
import numpy as np
import pandas as pd
import shutil
import joblib
import requests

from sklearn.model_selection import train_test_split

#from TrainCovid19Infections import clean_data

from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.run import Run
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails

from azureml.train.automl import AutoMLConfig

from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.train.automl import constants
from azureml.automl.core.shared import constants

## Initialize Workspace

In [3]:
# Get current workspace from config
ws = Workspace.from_config()
    
ws.get_details()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

project_folder = './capstone-project'
experiment_name = 'Covid19AutoMlExp'
experiment=Experiment(ws, experiment_name)
experiment

Workspace name: OptimizePipeline
Azure region: eastus2
Subscription id: c04b3d3f-4994-454d-96ff-aa3f2050b57f
Resource group: testingMLFunctionnalities


Name,Workspace,Report Page,Docs Page
Covid19AutoMlExp,OptimizePipeline,Link to Azure Machine Learning studio,Link to Documentation


## Create Cluster

Get cluster if it exists else create one

In [4]:
# Create compute cluster
cpu_cluster_name = "CovidClusLatest"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS12_v2',max_nodes=4,identity_type="SystemAssigned")
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)
# Get a detailed status for the current cluster. 
print(cpu_cluster.get_status().serialize())

A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-26T14:26:28.455000+00:00', 'errors': None, 'creationTime': '2021-03-26T11:54:17.136535+00:00', 'modifiedTime': '2021-03-26T11:54:34.832874+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS12_V2'}


## Dataset

### Overview

I Chose a COVID-19 World Wide infections Dataset that holds a track of Covid19's propagation rate, new cases, patients with chronic deseases and death rate per country. 

Since the covid-19 pandemic is among the hottest subjects in the world, and as a member of the society being interested in such statistics calculation can help further scientists or even regular people to better understand the global effect of this deadly virus all over the world.

I used a Covid19 World Dataset provided by *Our World in Data* Github repository containing the historical data about this pandemic.<br> available 

This dataset contains a total number of 59 feature. Below an explanation of some of them:

  * `iso_code` : ISO 3166-1 alpha-3 – three-letter country codes.
  * `continent` : Continent of the geographical location
  * `location` : Geographical location
  * `total_cases` : Total confirmed cases of COVID-19
  * `total_deaths` : Total deaths attributed to COVID-19
  * `new_deaths` : New deaths attributed to COVID-19
  * `reproduction_rate` : Real-time estimate of the effective reproduction rate (R) of COVID-19.
  * `icu_patients` : Number of COVID-19 patients in intensive care units (ICUs) on a given day
  * `hosp_patients` : Number of COVID-19 patients in hospital on a given day.
  * `new_tests` : New tests for COVID-19 (only calculated for consecutive days)
  * `total_tests` : Total tests for COVID-19.
  * `tests_units` : Units used by the location to report its testing data
  * `total_vaccinations` : Total number of COVID-19 vaccination doses administered
  * `people_vaccinated` : Total number of people who received at least one vaccine dose.
  * `population` : Population in 2020.
  * `population_density` : Number of people divided by land area, measured in square kilometers, most recent year available
  * `cardiovasc_death_rate` : Death rate from cardiovascular disease in 2017 (annual number of deaths per 100,000 people)
  * `diabetes_prevalence` : Diabetes prevalence (% of population aged 20 to 79) in 2017
  * `life_expectancy` : Life expectancy at birth in 2019.
  * `new_cases` : New confirmed cases of COVID-19.
Out of a 59 feature, I will be using only 38 including the ones mentioned above, along with my target column: `new_cases`. 


In [5]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
found = False
key = "Covid19InfectionsDataset"
description_text = "Covid19 Vaccination DataSet from Github"
datastore = ws.get_default_datastore()
datastore.upload_files(files = ['./github/owid-covid-data.csv'],
                       target_path ='train-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)

if key in ws.datasets.keys(): 
    found = True
    dataset = ws.datasets[key] 

if not found:
    original_path = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
    ds = TabularDatasetFactory.from_delimited_files(original_path, infer_column_types=True, separator=',', header=True, index=False)
    
    #Register Dataset in Workspace
    dataset = ds.register(workspace=ws,
                          name=key,
                          description=description_text)

df = dataset.to_pandas_dataframe()
df.describe()

Uploading an estimated of 1 files
Uploading ./github/owid-covid-data.csv
Uploaded ./github/owid-covid-data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,median_age,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
count,75908.0,75906.0,74905.0,66569.0,66727.0,74905.0,75495.0,75493.0,74497.0,66169.0,...,70027.0,69223.0,69633.0,70285.0,70889.0,71817.0,35587.0,64836.0,73433.0,70831.0
mean,693358.9,5267.220799,5263.57307,20212.25,132.896039,117.117854,8290.152675,68.93774,68.706857,192.249491,...,30.544706,8.788616,5.569119,19140.773439,257.327124,7.803056,50.978244,3.033377,73.152441,0.727647
std,4850189.0,32669.047339,32264.587983,118679.3,725.851618,663.371238,16214.56048,171.077001,144.35415,340.565288,...,9.114126,6.233355,4.25696,19762.848508,118.590918,3.948874,31.766562,2.466047,7.550791,0.150082
min,1.0,-74347.0,-6223.0,1.0,-1918.0,-232.143,0.001,-2153.437,-276.825,0.001,...,15.1,1.144,0.526,661.24,79.37,0.99,1.188,0.1,53.28,0.394
25%,762.0,2.0,6.143,40.0,0.0,0.0,161.791,0.151,1.04,5.903,...,22.2,3.441,2.043,4466.507,167.295,5.29,20.859,1.3,67.88,0.602
50%,8610.0,62.0,75.286,265.0,2.0,1.143,1137.508,6.71,8.864,33.396,...,29.7,6.378,3.871,12951.839,242.648,7.11,49.839,2.4,74.53,0.748
75%,92381.75,695.0,721.0,2567.0,16.0,12.571,7797.146,59.765,67.241,204.623,...,39.1,14.312,8.678,27216.445,329.635,10.08,83.241,4.0,78.73,0.848
max,125491700.0,880902.0,739564.429,2755210.0,17903.0,14431.429,151258.655,8652.658,2648.773,2416.171,...,48.2,27.049,18.493,116935.6,724.417,30.53,98.999,13.8,86.75,0.957


In [6]:
def clean_data(ds):
    x_df = ds.to_pandas_dataframe().fillna(0)
#     print(len(x_df.columns)) #59
    y_df = x_df.pop("new_cases").apply(lambda s: 1 if s > 1 else 0)
    x_df['tested_units']= np.where(x_df.tests_units == 'tests performed', True, False)
    x_df['testing_units'] = x_df.pop("tested_units").apply(lambda s: 1 if s == True else 0)
#     print(len(x_df.columns)) #59
    iso_codes = pd.get_dummies(x_df.iso_code, prefix="iso_code")
    continent = pd.get_dummies(x_df.continent, prefix="continent")
    x_df = x_df.join([iso_codes,continent])
#     print(len(x_df.columns)) #281
    x_df.drop([
        "tests_units",
        "location",
        "date",
        "iso_code",
        "continent",
        "hosp_patients_per_million",
        "weekly_icu_admissions",
        "weekly_icu_admissions_per_million",
        "weekly_hosp_admissions",
        "weekly_hosp_admissions_per_million",
        "new_tests_smoothed_per_thousand",
        "new_tests_smoothed","new_tests_per_thousand",
        "new_cases_per_million",
        "new_cases_smoothed_per_million",
        "new_cases_smoothed",
        "people_vaccinated",
        "people_fully_vaccinated",
        "new_vaccinations",
        "new_vaccinations_smoothed",
        "total_vaccinations_per_hundred",
        "people_vaccinated_per_hundred",
        "people_fully_vaccinated_per_hundred",
        "new_vaccinations_smoothed_per_million",
        "total_vaccinations"],
         inplace=True, axis=1)
#     print(len(x_df.columns))#256
    return x_df,y_df

In [7]:
# Use the clean_data function fro data cleaning:
x, y = clean_data(dataset)
data = pd.concat([x,y],axis=1)
data.head(10)

Unnamed: 0,total_cases,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,...,iso_code_ZMB,iso_code_ZWE,continent_0,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,new_cases
0,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
1,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
2,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
5,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
6,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
7,1.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
8,2.0,0.0,0.0,0.0,0.051,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,0
9,4.0,0.0,0.0,0.0,0.103,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1


In [8]:
# Split data into train and test sets.
training_data,validation_data = train_test_split(data,test_size = 0.5,random_state = 42,shuffle=True)
validation_data.head(10)

Unnamed: 0,total_cases,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,...,iso_code_ZMB,iso_code_ZWE,continent_0,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,new_cases
23278,57466.0,897.0,17.0,18.143,499.863,7.802,0.148,0.158,0.85,0.0,...,0,0,0,1,0,0,0,0,0,1
17617,9354.0,137.0,1.0,0.143,825.842,12.095,0.088,0.013,1.12,0.0,...,0,0,0,0,0,0,1,0,0,1
52153,8782.0,124.0,0.0,-0.143,205.774,2.905,0.0,-0.003,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1
27455,47005.0,306.0,0.0,0.429,1512.731,9.848,0.0,0.014,1.11,0.0,...,0,0,0,1,0,0,0,0,0,0
18010,18883.0,104.0,8.0,2.714,21558.422,118.735,9.133,3.099,1.14,16.0,...,0,0,0,0,0,1,0,0,0,1
24882,880.0,3.0,2.0,0.429,158.824,0.541,0.361,0.077,1.44,22.0,...,0,0,0,0,0,1,0,0,0,1
5904,86185.0,340.0,0.0,0.429,50649.895,199.814,0.0,0.252,0.95,0.0,...,0,0,0,0,1,0,0,0,0,1
65459,253908.0,28403.0,0.0,2.571,5430.632,607.489,0.0,0.055,1.41,0.0,...,0,0,0,0,0,1,0,0,0,0
40766,2575.0,54.0,0.0,0.0,67519.731,1415.948,0.0,0.0,1.13,0.0,...,0,0,0,0,0,1,0,0,0,1
70360,3702.0,68.0,0.0,0.0,447.17,8.214,0.0,0.0,1.07,0.0,...,0,0,0,1,0,0,0,0,0,1


In [9]:
# Create necessary folders
if "automl_training" not in os.listdir():
    os.mkdir("./automl_training")
if "data" not in os.listdir():
    os.mkdir("./data")
if "outputs" not in os.listdir():
    os.mkdir("./outputs")
if "training" not in os.listdir():
    os.mkdir("./training")
# store training_dataset into it using datastore   
shutil.copy('TrainCovid19Infections.py', './automl_training/')

## AutoML Configuration

The settings used below refers to a classification task within a number of settings chosen based on the existing workspace and cluster configuration restrictions.

The parameters used here are:

* `n_cross_validation = 5` : Since our dataset is small. We apply cross validation with 3 folds instead of train/validation data split.
* `primary_metric = 'accuracy'` : The primary metric parameter determines the metric to be used during model training for optimization. Accuracy primary metric is chosen for binary classification dataset.
* `enable_early_stopping = True` : Whether to enable early termination if the score is not improving in the short term.
* `experiment_timeout_hours = 1.0` : Maximum amount of time in hours that all iterations combined can take before the experiment terminates.
* `max_concurrent_iterations = 4` : To help manage child runs and when they can be performed, we match the number of maximum concurrent iterations of our experiment to the number of nodes in the cluster. So, we get a dedicated cluster per experiment.
* `task = 'classification'` : This specifies the experiment type as classification.
* `compute_target = cpu_cluster` : Azure Machine Learning Managed Compute is a managed service that enables the ability to train machine learning models on clusters of Azure virtual machines. Here compute target is set to cpu_cluster which is already defined with 'STANDARD_D2_V2' and maximum nodes equal to 4.
* `training_data = train_data` : This specifies the training data to be used in this experiment which is set to train_data which is a part of the dataset uploaded to the datastore.
* `label_column_name = 'new_cases'` : The target column here is set to DEATH_EVENT which has values 1 if the patient deceased or 0 if the patient survived.
* `featurization= 'auto'` : This indicates that as part of preprocessing, data guardrails and featurization steps are performed automatically.
* `model_explainability = True`: Whether to enable explaining the best AutoML model at the end of all AutoML training iterations.
* `path=project_folder`:
* `debug_log = "Covid_automl_errors.log"`: The log file to write debug information to. If not specified, 'automl.log' is used.

In [10]:
#convert the training dataset to a CSV file and store it under the training folder
training_data.to_csv('automl_training/training_data.csv', index=False)
datastore.upload(src_dir='automl_training/',target_path='data/',overwrite = True)
# Get the dataset from the data folder
training_dataset = TabularDatasetFactory.from_delimited_files(path=[(datastore,('data/training_data.csv'))])

Uploading an estimated of 2 files
Uploading automl_training/TrainCovid19Infections.py
Uploaded automl_training/TrainCovid19Infections.py, 1 files out of an estimated total of 2
Uploading automl_training/training_data.csv
Uploaded automl_training/training_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [11]:
# Configure Automl settings
automl_settings = {
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "experiment_timeout_hours": 1.0,
    "max_concurrent_iterations": 4,
}
automl_config = AutoMLConfig(task = 'classification',
                             compute_target = cpu_cluster,
                             training_data = training_dataset,
                             label_column_name = 'new_cases',
                             featurization= 'auto',
                             path=project_folder,
                             model_explainability=True,
                             debug_log = "Covid_automl_errors.log",
                             **automl_settings)

In [12]:
# Experiment Submission
tag = {"Covid19Infections": "Capstone project: Covid19 AutoML Experiment"}
remote_run = experiment.submit(automl_config,tags=tag, show_output=True)

Running on remote.
No run_configuration provided, running on CovidClusLatest with default configuration
Running on remote compute: CovidClusLatest
Parent Run ID: AutoML_24c154d4-6295-4f91-861d-28b07fe9b979

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:    

        26    VotingEnsemble                                0:06:35       0.9202    0.9202
        27    StackEnsemble                                 0:06:41       0.9198    0.9202


## Run Details

Based on features, parameters and built-in assumptions: some models' assumptions match the characteristics of the data, so we may get a good fit. However, when a model assumptions do not match the characteristics of the data, we may get a bad fit.

Using Automated Machine Learning, we can clearly deduce that _VotingEnsemble_ holds the Best metric 0.9991 among others like _MaxAbsScaler, LightGBM_ with a slight difference of 0.0001.


In [13]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

        26    VotingEnsemble                                0:06:35       0.9202    0.9202
        27    StackEnsemble                                 0:06:41       0.9198    0.9202


{'runId': 'AutoML_24c154d4-6295-4f91-861d-28b07fe9b979',
 'target': 'CovidClusLatest',
 'status': 'Completed',
 'startTimeUtc': '2021-03-26T21:53:23.529305Z',
 'endTimeUtc': '2021-03-26T23:20:33.575839Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'CovidClusLatest',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"c57b5170-469a-47f7-a991-7a1e0b20f57b\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/training_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"testingMLFunctionnalities\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"c04b3d3f-4994-454d-96ff-aa3f2050b57f\\\\\\

## Best Model

In [14]:
explaining_model_run_id = remote_run.id
print(explaining_model_run_id)
explaining_model_run = Run(experiment=experiment, run_id=explaining_model_run_id)
explaining_model_run.wait_for_completion()

AutoML_24c154d4-6295-4f91-861d-28b07fe9b979


{'runId': 'AutoML_24c154d4-6295-4f91-861d-28b07fe9b979',
 'target': 'CovidClusLatest',
 'status': 'Completed',
 'startTimeUtc': '2021-03-26T21:53:23.529305Z',
 'endTimeUtc': '2021-03-26T23:20:33.575839Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'CovidClusLatest',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"c57b5170-469a-47f7-a991-7a1e0b20f57b\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/training_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"testingMLFunctionnalities\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"c04b3d3f-4994-454d-96ff-aa3f2050b57f\\\\\\

In [15]:
# Retrieve and save best model.
best_automl_run, best_automl_model = remote_run.get_output()

Package:azureml-automl-runtime, training version:1.24.0, current version:1.22.0
Package:azureml-core, training version:1.24.0.post1, current version:1.22.0
Package:azureml-dataprep, training version:2.11.2, current version:2.9.1
Package:azureml-dataprep-native, training version:30.0.0, current version:29.0.0
Package:azureml-dataprep-rslex, training version:1.9.1, current version:1.7.0
Package:azureml-dataset-runtime, training version:1.24.0, current version:1.22.0
Package:azureml-defaults, training version:1.24.0, current version:1.22.0
Package:azureml-interpret, training version:1.24.0, current version:1.22.0
Package:azureml-mlflow, training version:1.24.0, current version:1.22.0
Package:azureml-pipeline-core, training version:1.24.0, current version:1.22.0
Package:azureml-telemetry, training version:1.24.0, current version:1.22.0
Package:azureml-train-automl-client, training version:1.24.0, current version:1.22.0
Package:azureml-train-automl-runtime, training version:1.24.0, current 

In [16]:
# Get the metrics of the best selected run
best_run_metrics = best_automl_run.get_metrics()

# Print all metrics of the best run model
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
    
print(best_automl_model._final_estimator)

log_loss 0.20861757476304632
precision_score_micro 0.9201884482550327
matthews_correlation 0.7933580167024229
AUC_micro 0.978518592106402
balanced_accuracy 0.8979976981580524
average_precision_score_weighted 0.9728992376647035
norm_macro_recall 0.7959953963161046
accuracy 0.9201884482550327
AUC_macro 0.9718314370368437
f1_score_macro 0.896635292292878
average_precision_score_micro 0.9791358236664423
average_precision_score_macro 0.9571161660281767
f1_score_micro 0.9201884482550327
recall_score_macro 0.8979976981580524
weighted_accuracy 0.9340860275437762
f1_score_weighted 0.920326841240939
precision_score_weighted 0.9205326189882964
recall_score_weighted 0.9201884482550327
AUC_weighted 0.9718314370368437
precision_score_macro 0.895378382937453
recall_score_micro 0.9201884482550327
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_24c154d4-6295-4f91-861d-28b07fe9b979_26/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_24c154d4-6295-4f91-861d-28b07fe9b97

In [17]:
best_automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
Covid19AutoMlExp,AutoML_24c154d4-6295-4f91-861d-28b07fe9b979_26,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [18]:
joblib.dump(best_automl_model, filename="./outputs/automl-model.pkl")

['./outputs/automl-model.pkl']

In [19]:
arr = os.listdir('./outputs/')
print(arr)

['automl-model.pkl', 'DataCleaningReport.html', 'model.joblib']


In [43]:
# register the best model
best_model_registered = remote_run.register_model(
                                            model_name = "Covid19-automl-model",
                                            tags={'Area': "Pandemic", 'Type': "Classification",'Method of execution':'Auto ML'},
                                            )
#print(best_model.name, best_model.id, sep='\t')
print(best_model_registered)

Model(workspace=Workspace.create(name='OptimizePipeline', subscription_id='c04b3d3f-4994-454d-96ff-aa3f2050b57f', resource_group='testingMLFunctionnalities'), name=Covid19-automl-model, id=Covid19-automl-model:5, version=5, tags={'Area': 'Pandemic', 'Type': 'Classification', 'Method of execution': 'Auto ML'}, properties={})


## Model Deployment
As the best model coming from AutoML run has better accuracy than the one coming from the HyperDrive run, I will be deploying it.

In [42]:
model = Model(ws, 'Covid19-automl-model')

env = best_automl_run.get_environment()
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')
best_automl_run.download_file(constants.CONDA_ENV_FILE_PATH, 'env.yml')

inference_config = InferenceConfig(entry_script='score.py', environment=env)

deployment_config = AciWebservice.deploy_configuration(
                                                       cpu_cores=1,
                                                       memory_gb=1,
                                                       description='Covid19 new cases prediction',
                                                       auth_enabled=True,
                                                       enable_app_insights= True,
                                                       collect_model_data = True)

service = Model.deploy(workspace=ws,
                       name='aci-covid19-service',
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True)

service.wait_for_deployment(show_output=True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.....................................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [44]:
print("Key " + service.get_keys()[0])
print("Swagger URI : "+service.swagger_uri)
print("Scoring URI : "+service.scoring_uri)

Key 4hU0ujUZdVfVfGqkLfI08Kuu3jDpb8Xb
Swagger URI : http://b9138058-bfe9-4d78-a97b-b1a37ef287a5.eastus2.azurecontainer.io/swagger.json
Scoring URI : http://b9138058-bfe9-4d78-a97b-b1a37ef287a5.eastus2.azurecontainer.io/score


In [45]:
# testing the endpoint
x_test = validation_data.sample(2)
y_test = x_test['new_cases']
x_test.drop(['new_cases'], inplace=True, axis=1)

In [46]:
Covid19DataTesting= json.dumps({'data': x_test.to_dict(orient='records')})
print(Covid19DataTesting)

{"data": [{"total_cases": 39699.0, "total_deaths": 418.0, "new_deaths": 0.0, "new_deaths_smoothed": 9.0, "total_cases_per_million": 7793.112, "total_deaths_per_million": 82.055, "new_deaths_per_million": 0.0, "new_deaths_smoothed_per_million": 1.767, "reproduction_rate": 1.13, "icu_patients": 0, "icu_patients_per_million": 0, "hosp_patients": 0, "new_tests": "2054.0", "total_tests": "130021.0", "total_tests_per_thousand": "25.524", "positive_rate": "0.405", "tests_per_case": "2.5", "stringency_index": 61.11, "population": 5094114.0, "population_density": 96.079, "median_age": 33.6, "aged_65_older": 9.468, "aged_70_older": 5.694, "gdp_per_capita": 15524.995, "extreme_poverty": "1.3", "cardiovasc_death_rate": 137.973, "diabetes_prevalence": 8.78, "female_smokers": "6.4", "male_smokers": "17.4", "handwashing_facilities": 83.841, "hospital_beds_per_thousand": 1.13, "life_expectancy": 80.28, "human_development_index": 0.81, "testing_units": 0, "iso_code_AFG": 0, "iso_code_AGO": 0, "iso_code

In [47]:
headers = {'Content-type': 'application/json'}
headers['Authorization'] = f'Bearer {service.get_keys()[0]}'

# Make the request and display the response
response = requests.post(service.scoring_uri, Covid19DataTesting, headers=headers)
print('Prediction :', response.text)

# Print original labels
print('True Values :', y_test.values)

Prediction : "{\"result\": [0, 0]}"
True Values : [0 0]


In [48]:
print(service.get_logs())

2021-03-27T00:23:25,657974400+00:00 - iot-server/run 
2021-03-27T00:23:25,663403500+00:00 - gunicorn/run 
2021-03-27T00:23:25,678535400+00:00 - rsyslog/run 
2021-03-27T00:23:25,677837300+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [32]:
#service.delete()
#cpu_cluster.delete()