# Automated ML


In [1]:
!pip install xgboost==0.90



In [2]:
import json
import sys
import os
import numpy as np
import pandas as pd
import shutil
import joblib
import requests

from sklearn.model_selection import train_test_split

#from TrainCovid19Infections import clean_data

from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.run import Run
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails

from azureml.train.automl import AutoMLConfig

from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.train.automl import constants

## Initialize Workspace

In [3]:
# Get current workspace from config
ws = Workspace.from_config()
    
ws.get_details()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

project_folder = './capstone-project'
# choose a name for experiment
experiment_name = 'Cov19InfectionAutoMlExperiment'
experiment=Experiment(ws, experiment_name)
experiment

Workspace name: wsptest
Azure region: eastus2
Subscription id: c04b3d3f-4994-454d-96ff-aa3f2050b57f
Resource group: testingmlfunctionnalities


Name,Workspace,Report Page,Docs Page
Cov19InfectionAutoMlExperiment,wsptest,Link to Azure Machine Learning studio,Link to Documentation


## Create Cluster

Get cluster if it exists else create one

In [4]:
# Create compute cluster
cpu_cluster_name = "Covid19Cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=4,identity_type="SystemAssigned")
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)
# Get a detailed status for the current cluster. 
print(cpu_cluster.get_status().serialize())

A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-24T06:29:36.370000+00:00', 'errors': None, 'creationTime': '2021-03-23T18:45:47.925166+00:00', 'modifiedTime': '2021-03-23T18:46:05.507627+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


## Dataset

### Overview

I Chose a COVID-19 World Wide infections Dataset that holds a track of Covid19's propagation rate, new cases, patients with chronic deseases and death rate per country. 

Since the covid-19 pandemic is among the hottest subjects in the world, and as a member of the society being interested in such statistics calculation can help further scientists or even regular people to better understand the global effect of this deadly virus all over the world.

I used Github to download the Dataset.

In [5]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
found = False
key = "Covid19InfectionsDataset"
description_text = "Covid19 Vaccination DataSet from Github"
datastore = ws.get_default_datastore()
datastore.upload_files(files = ['./github/owid-covid-data.csv'],
                       target_path ='train-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)

if key in ws.datasets.keys(): 
    found = True
    dataset = ws.datasets[key] 

if not found:
    original_path = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
    ds = TabularDatasetFactory.from_delimited_files(original_path, infer_column_types=True, separator=',', header=True, index=False)
    
    #Register Dataset in Workspace
    dataset = ds.register(workspace=ws,
                          name=key,
                          description=description_text)

df = dataset.to_pandas_dataframe()
df.describe()

Uploading an estimated of 1 files
Uploading ./github/owid-covid-data.csv
Uploaded ./github/owid-covid-data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,median_age,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
count,75510.0,75508.0,74507.0,66191.0,66349.0,74507.0,75099.0,75097.0,74101.0,65793.0,...,69651.0,68851.0,69259.0,69897.0,70507.0,71425.0,35397.0,64497.0,72994.0,70453.0
mean,686378.9,5238.25445,5244.254204,20060.25,132.613755,116.946415,8195.359285,68.461816,68.334267,190.487865,...,30.544892,8.788681,5.569217,19133.882745,257.31559,7.801589,50.982041,3.033532,73.147567,0.727661
std,4805132.0,32519.174052,32187.539243,117730.8,724.695465,662.824803,16045.694839,170.43472,143.889953,337.683165,...,9.113515,6.233321,4.256902,19752.180623,118.576136,3.947493,31.766762,2.466281,7.55019,0.150079
min,1.0,-74347.0,-6223.0,1.0,-1918.0,-232.143,0.001,-2153.437,-276.825,0.001,...,15.1,1.144,0.526,661.24,79.37,0.99,1.188,0.1,53.28,0.394
25%,755.0,2.0,6.143,40.0,0.0,0.0,159.694,0.148,1.035,5.852,...,22.2,3.441,2.043,4466.507,167.295,5.29,20.859,1.3,67.88,0.602
50%,8527.5,61.5,74.571,264.0,2.0,1.143,1128.698,6.651,8.796,32.973,...,29.7,6.378,3.871,12951.839,242.648,7.11,49.839,2.4,74.53,0.748
75%,91471.25,689.0,717.3575,2531.5,16.0,12.429,7689.39,59.305,66.65,202.633,...,39.1,14.312,8.678,27216.445,329.635,10.08,83.241,4.0,78.73,0.848
max,124202100.0,880902.0,739564.429,2734098.0,17903.0,14431.429,150016.178,8652.658,2648.773,2357.24,...,48.2,27.049,18.493,116935.6,724.417,30.53,98.999,13.8,86.75,0.957


In [6]:
# preview the first 10 rows of the dataset
len(df.columns)

59

In [7]:
if 'tests_units' in df.columns:
    print ('true')
else:
    print('false')

true


In [8]:
def clean_data(ds):
    x_df = ds.to_pandas_dataframe().fillna(0)
    print(len(x_df.columns)) #59
    y_df = x_df.pop("new_cases").apply(lambda s: 1 if s > 1 else 0)
    x_df['tested_units']= np.where(x_df.tests_units == 'tests performed', True, False)
    x_df['testing_units'] = x_df.pop("tested_units").apply(lambda s: 1 if s == True else 0)
    print(len(x_df.columns)) #59
    iso_codes = pd.get_dummies(x_df.iso_code, prefix="iso_code")
    continent = pd.get_dummies(x_df.continent, prefix="continent")
    x_df = x_df.join([iso_codes,continent])
    print(len(x_df.columns)) #281
    x_df.drop([
        "tests_units",
        "location",
        "date",
        "iso_code",
        "continent",
        "hosp_patients_per_million",
        "weekly_icu_admissions",
        "weekly_icu_admissions_per_million",
        "weekly_hosp_admissions",
        "weekly_hosp_admissions_per_million",
        "new_tests_smoothed_per_thousand",
        "new_tests_smoothed","new_tests_per_thousand",
        "people_vaccinated",
        "people_fully_vaccinated",
        "new_vaccinations",
        "new_vaccinations_smoothed",
        "total_vaccinations_per_hundred",
        "people_vaccinated_per_hundred",
        "people_fully_vaccinated_per_hundred",
        "new_vaccinations_smoothed_per_million",
        "total_vaccinations"],
         inplace=True, axis=1)
    print(len(x_df.columns))#259
    return x_df,y_df

In [9]:
# Use the clean_data function to clean your data.
x, y = clean_data(dataset)

59
59
281
259


In [10]:
if 'testing_units' in x.columns:
    print ('true')
else:
    print('false')

true


In [11]:
data = pd.concat([x,y],axis=1)
data.head(10)

Unnamed: 0,total_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,...,iso_code_ZMB,iso_code_ZWE,continent_0,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,new_cases
0,1.0,0.0,0.0,0.0,0.0,0.026,0.026,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,1.0,0.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,1.0,0.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,1.0,0.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
5,1.0,0.143,0.0,0.0,0.0,0.026,0.0,0.004,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
6,1.0,0.143,0.0,0.0,0.0,0.026,0.0,0.004,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
7,1.0,0.0,0.0,0.0,0.0,0.026,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8,2.0,0.143,0.0,0.0,0.0,0.051,0.026,0.004,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
9,4.0,0.429,0.0,0.0,0.0,0.103,0.051,0.011,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1


In [12]:
for col in data.columns:
    print(col)

total_cases
new_cases_smoothed
total_deaths
new_deaths
new_deaths_smoothed
total_cases_per_million
new_cases_per_million
new_cases_smoothed_per_million
total_deaths_per_million
new_deaths_per_million
new_deaths_smoothed_per_million
reproduction_rate
icu_patients
icu_patients_per_million
hosp_patients
new_tests
total_tests
total_tests_per_thousand
positive_rate
tests_per_case
stringency_index
population
population_density
median_age
aged_65_older
aged_70_older
gdp_per_capita
extreme_poverty
cardiovasc_death_rate
diabetes_prevalence
female_smokers
male_smokers
handwashing_facilities
hospital_beds_per_thousand
life_expectancy
human_development_index
testing_units
iso_code_AFG
iso_code_AGO
iso_code_AIA
iso_code_ALB
iso_code_AND
iso_code_ARE
iso_code_ARG
iso_code_ARM
iso_code_ATG
iso_code_AUS
iso_code_AUT
iso_code_AZE
iso_code_BDI
iso_code_BEL
iso_code_BEN
iso_code_BFA
iso_code_BGD
iso_code_BGR
iso_code_BHR
iso_code_BHS
iso_code_BIH
iso_code_BLR
iso_code_BLZ
iso_code_BMU
iso_code_BOL
iso_co

In [13]:
if 'testing_units' in data.columns:
    print ('true')
else:
    print('false')

true


In [14]:
len(data.columns)

260

In [15]:
# Split data into train and test sets.

training_data,validation_data = train_test_split(data,test_size = 0.5,random_state = 42,shuffle=True)
validation_data.head(10)

Unnamed: 0,total_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,...,iso_code_ZMB,iso_code_ZWE,continent_0,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,new_cases
46690,7.0,0.714,1.0,0.0,0.143,0.19,0.027,0.019,0.027,0.0,...,0,0,0,1,0,0,0,0,0,0
8147,2557.0,8.714,41.0,0.0,0.0,210.918,0.0,0.719,3.382,0.0,...,0,0,0,1,0,0,0,0,0,0
55587,1054273.0,11525.714,19861.0,502.0,445.0,27856.475,328.352,304.538,524.776,13.264,...,0,0,0,0,0,1,0,0,0,1
43936,1439.0,93.571,74.0,3.0,5.0,309.485,33.551,20.124,15.915,0.645,...,0,0,0,1,0,0,0,0,0,1
33097,1424596.0,6472.857,58038.0,79.0,79.571,16960.899,78.542,77.064,690.987,0.941,...,0,0,0,0,1,0,0,0,0,1
65146,470973.0,8345.714,29152.0,58.0,32.571,10073.258,173.565,178.5,623.508,1.241,...,0,0,0,0,0,1,0,0,0,1
9093,244380.0,953.429,11508.0,38.0,33.429,20935.44,103.144,81.678,985.862,3.255,...,0,0,0,0,0,0,0,0,1,1
35384,45764.0,1361.429,1042.0,8.0,4.857,361.838,12.627,10.764,8.239,0.063,...,0,0,0,0,1,0,0,0,0,1
40166,142671.0,869.143,2340.0,10.0,15.286,20763.359,156.157,126.489,340.548,1.455,...,0,0,0,1,0,0,0,0,0,1
30730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [16]:
len(validation_data.columns)

260

In [17]:
# Create necessary folders
if "automl_training" not in os.listdir():
    os.mkdir("./automl_training")
if "data" not in os.listdir():
    os.mkdir("./data")
if "outputs" not in os.listdir():
    os.mkdir("./outputs")
if "training" not in os.listdir():
    os.mkdir("./training")
# store training_dataset into it using datastore
script_folder = './automl_training/'    
os.makedirs(script_folder, exist_ok=True)
shutil.copy('TrainCovid19Infections.py', script_folder)
project_folder = './pipeline-project'

## AutoML Configuration
TODO: Explain why you chose the automl settings and cofiguration you used below.
The settings used below refers to a classification task within a number of settings chosen based on the existing workspace and cluster configuration restrictions 

In [18]:
# Removing index column before submitting the run
training_data.reset_index(drop=True, inplace=True)
training_data.head(10)

Unnamed: 0,total_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,...,iso_code_ZMB,iso_code_ZWE,continent_0,continent_Africa,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_South America,new_cases
0,135.0,18.429,0.0,0.0,0.0,38.863,7.197,5.305,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
1,765489.0,18254.143,44057.0,241.0,135.571,11276.095,314.719,268.894,648.985,3.55,...,0,0,0,0,0,1,0,0,0,1
2,10524.0,764.857,266.0,20.0,23.714,1032.098,62.569,75.01,26.087,1.961,...,0,0,0,0,0,1,0,0,0,1
3,12102.0,75.857,252.0,1.0,0.857,1778.504,10.434,11.148,37.034,0.147,...,0,0,0,0,0,1,0,0,0,1
4,676.0,8.714,15.0,1.0,0.143,81.655,0.604,1.053,1.812,0.121,...,0,0,0,1,0,0,0,0,0,1
5,37340.0,889.0,207.0,12.0,8.714,1281.541,30.34,30.511,7.104,0.412,...,0,0,0,0,1,0,0,0,0,1
6,1327.0,2.286,82.0,0.0,0.0,262.373,1.186,0.452,16.213,0.0,...,0,0,0,1,0,0,0,0,0,1
7,117113.0,1382.0,1865.0,1.0,7.143,925.967,13.86,10.927,14.746,0.008,...,0,0,0,0,1,0,0,0,0,1
8,6255.0,185.857,77.0,0.0,0.429,299.235,5.789,8.891,3.684,0.0,...,0,0,0,1,0,0,0,0,0,1
9,481.0,39.714,4.0,0.0,0.571,42.97,17.063,3.548,0.357,0.0,...,0,0,0,1,0,0,0,0,0,1


In [19]:
#convert the training dataset to a CSV file and store it under the training folder
training_data.to_csv('automl_training/training_data.csv', index=False)

# Get the dataset from the data folder
#datastore.upload_files(files = ['automl_training/training_data.csv'],
#                       target_path ='./data/',
#                       overwrite = True,
#                       show_progress = True)

datastore.upload(src_dir='automl_training/',target_path='data/',overwrite = True)
# Get the dataset from the data folder
training_dataset = TabularDatasetFactory.from_delimited_files(path=[(datastore,('data/training_data.csv'))])
td = training_dataset.take(10).to_pandas_dataframe()

Uploading an estimated of 2 files
Uploading automl_training/TrainCovid19Infections.py
Uploaded automl_training/TrainCovid19Infections.py, 1 files out of an estimated total of 2
Uploading automl_training/training_data.csv
Uploaded automl_training/training_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [20]:
len(td.columns)

260

In [21]:
for col in td.columns:
    print(col)

total_cases
new_cases_smoothed
total_deaths
new_deaths
new_deaths_smoothed
total_cases_per_million
new_cases_per_million
new_cases_smoothed_per_million
total_deaths_per_million
new_deaths_per_million
new_deaths_smoothed_per_million
reproduction_rate
icu_patients
icu_patients_per_million
hosp_patients
new_tests
total_tests
total_tests_per_thousand
positive_rate
tests_per_case
stringency_index
population
population_density
median_age
aged_65_older
aged_70_older
gdp_per_capita
extreme_poverty
cardiovasc_death_rate
diabetes_prevalence
female_smokers
male_smokers
handwashing_facilities
hospital_beds_per_thousand
life_expectancy
human_development_index
testing_units
iso_code_AFG
iso_code_AGO
iso_code_AIA
iso_code_ALB
iso_code_AND
iso_code_ARE
iso_code_ARG
iso_code_ARM
iso_code_ATG
iso_code_AUS
iso_code_AUT
iso_code_AZE
iso_code_BDI
iso_code_BEL
iso_code_BEN
iso_code_BFA
iso_code_BGD
iso_code_BGR
iso_code_BHR
iso_code_BHS
iso_code_BIH
iso_code_BLR
iso_code_BLZ
iso_code_BMU
iso_code_BOL
iso_co

In [22]:
automl_settings = {
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "experiment_timeout_hours": 1.0,
    "max_concurrent_iterations": 4,
}
automl_config = AutoMLConfig(task = 'classification',
                             compute_target = cpu_cluster,
                             training_data = training_dataset,
                             label_column_name = 'new_cases',
                             featurization= 'auto',
                             path=project_folder,
                             model_explainability=True,
                             debug_log = "Covid_automl_errors.log",
                             **automl_settings)

In [23]:
# Experiment Submission
tag = {"Covid19Infections": "Capstone project: Covid19 AutoML Experiment"}
remote_run = experiment.submit(automl_config,tags=tag, show_output=True)

Running on remote.
No run_configuration provided, running on Covid19Cluster with default configuration
Running on remote compute: Covid19Cluster
Parent Run ID: AutoML_909f336d-d72b-412a-be6b-48822da21efa

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:      

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?
Supervised Learning in general is based on labled 

In [24]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_909f336d-d72b-412a-be6b-48822da21efa',
 'target': 'Covid19Cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-03-24T09:21:27.901891Z',
 'endTimeUtc': '2021-03-24T10:54:32.371061Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'Covid19Cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"03128a39-5d5e-4d95-8b74-86932e60a1a7\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/training_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"testingmlfunctionnalities\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"c04b3d3f-4994-454d-96ff-aa3f2050b57f\\\\\\",

In [25]:
# List best models of HyperDrive Run and AutoML Run to compare the accuracy of the models and choose the best among them to deploy
for model in Model.list(ws):
    print(model.name)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print('\t',tag_name,':',tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print('\t',prop_name,':',prop)
    print("\n")

## Best Model

explaining_model_run_id = remote_run.id
print(explaining_model_run_id)
explaining_model_run = Run(experiment=experiment, run_id=explaining_model_run_id)
explaining_model_run.wait_for_completion()

In [26]:
# Retrieve and save best model.
best_automl_run, best_automl_model = remote_run.get_output()

In [27]:
# Get the metrics of the best selected run
best_run_metrics = best_automl_run.get_metrics()

# Print all metrics of the best run model
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)
    
print(best_automl_model._final_estimator)

balanced_accuracy 0.9990008846407088
average_precision_score_weighted 0.999969072216558
precision_score_micro 0.9991929185108045
AUC_weighted 0.9999714485969312
f1_score_macro 0.9989551373812627
precision_score_macro 0.9989098170233598
accuracy 0.9991929185108045
f1_score_weighted 0.999192960365184
matthews_correlation 0.9979106235069866
f1_score_micro 0.9991929185108045
AUC_macro 0.9999714485969312
norm_macro_recall 0.9980017692814176
recall_score_macro 0.9990008846407088
precision_score_weighted 0.9991932726316282
log_loss 0.05273565879153332
average_precision_score_macro 0.9999497925240302
recall_score_micro 0.9991929185108045
average_precision_score_micro 0.9999764958325599
weighted_accuracy 0.9993127847418493
AUC_micro 0.9999761307332247
recall_score_weighted 0.9991929185108045
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_909f336d-d72b-412a-be6b-48822da21efa_30/confusion_matrix
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_909f336d-d72b-412a-be6b-48822da

In [28]:
best_automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
Cov19InfectionAutoMlExperiment,AutoML_909f336d-d72b-412a-be6b-48822da21efa_30,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [29]:
joblib.dump(best_automl_model, filename="./outputs/automl-model.pkl")

['./outputs/automl-model.pkl']

In [30]:
arr = os.listdir('./outputs/')
print(arr)

['automl-model.pkl', 'model.joblib']


In [31]:
from azureml.core.model import Model
model = Model.list(ws)
print (model)

[]


In [32]:
# register the best model
best_model_registered = remote_run.register_model(
                                           
                                            model_name = "Covid19-automl-model",
                                            tags={'Area': "Pandemic", 'Type': "Classification",'Method of execution':'Auto ML'},
                                            )
#print(best_model.name, best_model.id, sep='\t')
print(best_model_registered)

Model(workspace=Workspace.create(name='wsptest', subscription_id='c04b3d3f-4994-454d-96ff-aa3f2050b57f', resource_group='testingmlfunctionnalities'), name=Covid19-automl-model, id=Covid19-automl-model:5, version=5, tags={'Area': 'Pandemic', 'Type': 'Classification', 'Method of execution': 'Auto ML'}, properties={})


## Model Deployment

As the best model coming from AutoML run has better accuracy than the one coming from the HyperDrive run, I deploy it in the cell below, register it, create an inference config and deploy the model as a web service.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [33]:
with open('score.py') as f:
    print(f.read())

import json
import numpy as np
import os
import joblib
import pandas as pd

def init():
    #This function initialises the model. The model file is retrieved used within the script.
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'automl-model.pkl') #name of model file (.sav or .pkl)
    print("Found model:", os.path.isfile(model_path)) #To check whether the model is actually present on the location we are looking at
    model = joblib.load(model_path)

#Input the data as json and returns the predictions in json. All preprocessing  steps are specific to this model and usecase
def run(data):
    try:
        #data = np.array(json.loads(data))
        data = json.loads(data)['data'] # raw = pd.read_json(data) 
        data = pd.DataFrame.from_dict(data)

         #prediction steps 
        result = model.predict(data)

        #packaging steps 
        #result = pred.to_json()

        # You can return any data type, as long as it is JSON serializable.
    

In [34]:
from azureml.automl.core.shared import constants

#model = Model(ws, 'Covid19-automl-model')

#environment = best_automl_run.get_environment()
#best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')
#best_automl_run.download_file(constants.CONDA_ENV_FILE_PATH, 'environment.yml')
model = Model.register(model_path="outputs/automl-model.pkl",
                       model_name="best-trained-model",
                       workspace = ws)
env = Environment.get(workspace=ws, name="AzureML-AutoML")
inference_config = InferenceConfig(entry_script='score.py', environment=env)

#inference_config = InferenceConfig(entry_script='score.py', environment=environment)

deployment_config = AciWebservice.deploy_configuration(
                                                       cpu_cores=1,
                                                       memory_gb=1,
                                                       description='Covid19 new cases prediction',
                                                       auth_enabled=True,
                                                       enable_app_insights= True,
                                                       collect_model_data = True)

service = Model.deploy(workspace=ws,
                       name='aci-covid19-new-service',
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True)

service.wait_for_deployment(show_output=True)
print(service.state)

Registering model best-trained-model
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-03-24 11:00:01+00:00 Creating Container Registry if not exists.
2021-03-24 11:00:01+00:00 Registering the environment.
2021-03-24 11:00:02+00:00 Use the existing image.
2021-03-24 11:00:03+00:00 Submitting deployment to compute..
2021-03-24 11:00:12+00:00 Checking the status of deployment aci-covid19-new-service..
2021-03-24 11:04:22+00:00 Checking the status of inference endpoint aci-covid19-new-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [35]:
with open('score.py') as f:
    print(f.read())

import json
import numpy as np
import os
import joblib
import pandas as pd

def init():
    #This function initialises the model. The model file is retrieved used within the script.
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'automl-model.pkl') #name of model file (.sav or .pkl)
    print("Found model:", os.path.isfile(model_path)) #To check whether the model is actually present on the location we are looking at
    model = joblib.load(model_path)

#Input the data as json and returns the predictions in json. All preprocessing  steps are specific to this model and usecase
def run(data):
    try:
        #data = np.array(json.loads(data))
        data = json.loads(data)['data'] # raw = pd.read_json(data) 
        data = pd.DataFrame.from_dict(data)

         #prediction steps 
        result = model.predict(data)

        #packaging steps 
        #result = pred.to_json()

        # You can return any data type, as long as it is JSON serializable.
    

In [36]:
print("Key " + service.get_keys()[0])
print("Swagger URI : "+service.swagger_uri)
print("Scoring URI : "+service.scoring_uri)

Key 7AzxqQdvInt3V54h9Q1hxodbzaax8uUu
Swagger URI : http://e6cceb57-1fff-4e98-8a81-2a4cf33ec61c.eastus2.azurecontainer.io/swagger.json
Scoring URI : http://e6cceb57-1fff-4e98-8a81-2a4cf33ec61c.eastus2.azurecontainer.io/score


In [37]:
# testing the endpoint
x_test = validation_data.sample(2)
y_test = x_test['new_cases']
x_test.drop(['new_cases'], inplace=True, axis=1)

In [38]:
Covid19DataTesting= json.dumps({'data': x_test.to_dict(orient='records')})
print(Covid19DataTesting)

{"data": [{"total_cases": 2162.0, "new_cases_smoothed": 4.857, "total_deaths": 25.0, "new_deaths": 0.0, "new_deaths_smoothed": 0.0, "total_cases_per_million": 448.34, "new_cases_per_million": 0.0, "new_cases_smoothed_per_million": 1.007, "total_deaths_per_million": 5.184, "new_deaths_per_million": 0.0, "new_deaths_smoothed_per_million": 0.0, "reproduction_rate": 0.74, "icu_patients": 0, "icu_patients_per_million": 0, "hosp_patients": 0, "new_tests": 0, "total_tests": 0, "total_tests_per_thousand": 0, "positive_rate": "0.002", "tests_per_case": "603.9", "stringency_index": 22.22, "population": 4822233.0, "population_density": 18.206, "median_age": 37.9, "aged_65_older": 15.322, "aged_70_older": 9.72, "gdp_per_capita": 36085.843, "extreme_poverty": 0, "cardiovasc_death_rate": 128.797, "diabetes_prevalence": 8.08, "female_smokers": "14.8", "male_smokers": "17.2", "handwashing_facilities": 0.0, "hospital_beds_per_thousand": 2.61, "life_expectancy": 82.29, "human_development_index": 0.931, 

In [39]:
headers = {'Content-type': 'application/json'}
headers['Authorization'] = f'Bearer {service.get_keys()[0]}'

# Make the request and display the response
response = requests.post(service.scoring_uri, Covid19DataTesting, headers=headers)
print('Prediction :', response.text)

# Print original labels
print('True Values :', y_test.values)

Prediction : [0, 1]
True Values : [0 1]


In [40]:
print(service.get_logs())

2021-03-24T11:04:38,565826500+00:00 - rsyslog/run 
2021-03-24T11:04:38,577894600+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-03-24T11:04:38,595641100+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_2b14f450572e78de640d54eaabed5e4d/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-03-24T11:04:38,605604100+00:00 - iot-server/run 
rsyslogd

In [41]:
#service.delete()
#cpu_cluster.delete()