# Automated ML


In [1]:
import json
import sys
import os
import numpy as np
import pandas as pd
import shutil
import joblib
import requests
import onnxruntime

from sklearn.model_selection import train_test_split

from TrainCovid19Infections import clean_data

from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice

from azureml.core.run import Run

from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.automl.runtime.onnx_convert import OnnxConverter, OnnxInferenceHelper

from azureml.exceptions import ComputeTargetException
from azureml.widgets import RunDetails

from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.automl import AutoMLConfig
from azureml.train.hyperdrive.parameter_expressions import uniform

from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.train.automl import constants


def apply_constant2(scope, input_names, output_name, container, operator_name=None, value=None):
    assert len(input_names) == 0  # only a placeholder to standardize the argument list.
    return apply_constant(scope, output_name, container, operator_name, value)

  "'{0}', source=\n{1}".format(k, source))
def apply_greater_or_equal(scope, input_names, output_name, container, operator_name=None):
    _convert_compare_equal(scope, input_names, output_name, container, operator_name, 'GreaterEqual', 'Less',
                           'GreaterOrEqual')

  "'{0}', source=\n{1}".format(k, source))
def apply_less_or_equal(scope, input_names, output_name, container, operator_name=None):
    _convert_compare_equal(scope, input_names, output_name, container, operator_name, 'LessEqual', 'Greater',
                           'LessOrEqual')

  "'{0}', source=\n{1}".format(k, source))
def apply_relu_6(scope, input_name, output_name, container, operator_name=None, zero_value=0.0):
    name_relu = _c

ImportError: cannot import name 'calculate_linear_classifier_output_shapes'

## Initialize Workspace

In [None]:
# Load the workspace described by the local Azure ML config file.
ws = Workspace.from_config()
ws.get_details()

# Print one identifying attribute of the workspace per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

project_folder = './capstone-project'

# Experiment under which the runs of this notebook are submitted.
experiment_name = 'Covid19VaccinationExperiment'
experiment = Experiment(workspace=ws, name=experiment_name)
experiment

## Create Cluster

Get cluster if it exists else create one

In [None]:
# Attach to the named compute cluster; provision it when the lookup fails.
cpu_cluster_name = "Covid19Cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        identity_type="SystemAssigned",
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('A cluster with the same name already exists. If you are trying to create a new one please use a new cluster name')

# Block until the cluster is ready, then print its current status.
cpu_cluster.wait_for_completion(show_output=True)
print(cpu_cluster.get_status().serialize())

## Dataset

### Overview

I chose a COVID-19 world vaccination dataset that tracks vaccination progress worldwide, including the name of each country, which vaccines each country has used, and how many people have been vaccinated in each country.

COVID-19 vaccination is among the most discussed subjects in the world. As a member of society, I am interested in these statistics because they can help scientists, and even the general public, better understand the global effect of the vaccine.

I used Kaggle's API to download the Dataset.

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [None]:
# Reuse the dataset registered under `key` when the workspace already has it;
# otherwise build it from the public CSV and register it.
key = "Covid19InfectionsDataset"
description_text = "Covid19 Vaccination DataSet from Github"

# Push the local CSV copy to the workspace's default datastore.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./github/owid-covid-data.csv'],
    target_path='train-dataset/tabular/',
    overwrite=True,
    show_progress=True,
)

found = key in ws.datasets
if found:
    dataset = ws.datasets[key]
else:
    original_path = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
    ds = TabularDatasetFactory.from_delimited_files(
        original_path,
        infer_column_types=True,
        separator=',',
        header=True,
    )
    # Register the freshly created dataset in the workspace.
    dataset = ds.register(workspace=ws, name=key, description=description_text)

# Materialise as pandas, replacing missing values with 0, and summarise.
df = dataset.to_pandas_dataframe().fillna(0)
df.describe()

In [None]:
# Preview the first 10 rows of the loaded dataframe (rendered as the cell output).
df.head(10)

In [None]:
# Run the project's clean_data helper (imported from TrainCovid19Infections) on
# the raw dataframe. It returns two objects, bound here to x and y
# (presumably features and label -- confirm against TrainCovid19Infections).
x, y = clean_data(df)
# Join x and y column-wise into a single frame and preview its first rows.
data = pd.concat([x,y],axis=1)
data.head()

In [None]:
# Shuffle the cleaned frame and hold out 30% of the rows as validation data;
# the fixed seed makes the split reproducible.
training_data, validation_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Create the working folders used by this notebook.
# os.makedirs(..., exist_ok=True) is a no-op when the folder already exists,
# which replaces the separate "is it in os.listdir()?" check before each
# os.mkdir and avoids the race between the check and the creation.
for folder in ("automl_training", "data", "outputs", "training"):
    os.makedirs(folder, exist_ok=True)

# Copy the training script into the AutoML script folder created above.
script_folder = './automl_training/'
shutil.copy('TrainCovid19Infections.py', script_folder)

# Project folder for the AutoML configuration (passed as `path=` to AutoMLConfig).
project_folder = './pipeline-project'

## AutoML Configuration
TODO: Explain why you chose the automl settings and configuration you used below.
The settings used below define a classification task; the remaining settings were chosen based on the restrictions of the existing workspace and cluster configuration.

In [None]:
# Persist the training split as CSV under the training folder.
# index=False keeps the pandas row index out of the file; otherwise it is
# written as an extra unnamed first column and read back by the tabular
# dataset as a meaningless feature.
training_data.to_csv('training/training_data.csv', index=False)

# Experiment handle for the AutoML testing script.
# NOTE(review): the submission cell submits through `experiment`, not `exp`.
exp = Experiment(workspace=ws, name="Covid19AutoMlExperiment")

# Upload the CSV to the datastore and read it back as a tabular dataset.
datastore.upload_files(
    files=['training/training_data.csv'],
    target_path='./data/',
    overwrite=True,
    show_progress=True,
)
training_dataset = TabularDatasetFactory.from_delimited_files(
    path=[(datastore, './data/training_data.csv')]
)

# Run-level AutoML settings: 3-fold cross-validation, accuracy as the metric
# to optimise, early stopping, a 1.5 hour budget and up to 3 concurrent
# iterations.
automl_settings = {
    "n_cross_validations": 3,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "experiment_timeout_hours": 1.5,
    "max_concurrent_iterations": 3,
}

# Classification task on the 'new_cases' label, executed on the compute
# cluster, with ONNX-compatible models enabled so the best model can be
# exported to ONNX later in the notebook.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=cpu_cluster,
    training_data=training_dataset,
    label_column_name='new_cases',
    featurization='auto',
    path=project_folder,
    debug_log="Covid_automl_errors.log",
    enable_onnx_compatible_models=True,
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration as a tagged run and stream its output.
tag = {"Covid19Infections": "Capstone project: Covid19 AutoML Experiment"}
remote_run = experiment.submit(
    automl_config,
    tags=tag,
    show_output=True,
)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?
Supervised learning in general is based on labeled data.

In [None]:
# Show the notebook widget for the submitted AutoML run.
RunDetails(remote_run).show()
# Block until the run has finished, streaming its output into the cell.
remote_run.wait_for_completion(show_output=True)

In [None]:
# Print every model registered in the workspace together with its tags and
# properties, so the registered models (e.g. HyperDrive vs. AutoML) can be
# compared before choosing one to deploy.
for model in Model.list(ws):
    print(model.name)
    for tag_name, tag in model.tags.items():
        print('\t', tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print('\t', prop_name, ':', prop)
    print("\n")

## Best Model

In [None]:
# Take the id of the submitted AutoML run and print it.
explaining_model_run_id = remote_run.id
print(explaining_model_run_id)
# Re-create a Run handle from the experiment and that id, then wait for it
# to complete.
explaining_model_run = Run(experiment=experiment, run_id=explaining_model_run_id)
explaining_model_run.wait_for_completion()

In [None]:
# Retrieve the best run and its fitted model from the AutoML run.
# (Nothing is saved here; the model is written to disk in a later cell.)
best_automl_run, best_automl_model = remote_run.get_output()

In [None]:


# Fetch the metrics recorded for the best run and print each name/value pair.
best_run_metrics = best_automl_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

# Show the final estimator of the best fitted model.
print(best_automl_model._final_estimator)

In [None]:
# Serialise the best fitted AutoML model to the local outputs folder.
joblib.dump(best_automl_model, filename="outputs/automl-model.pkl")

In [None]:
# List the contents of the local outputs folder.
arr = os.listdir('./outputs/')
print(arr)

In [None]:
# Register the best AutoML model from the best child run.
best_model_registered = best_automl_run.register_model(
    model_name="best_automl_covid19_model",
    # AutoML stores the fitted model under outputs/model.pkl in the run.
    # Without an explicit model_path, register_model uses model_name as the
    # artifact path, which does not exist in the run.
    model_path="outputs/model.pkl",
    tags={'Area': "Pandemic", 'Type': "Classification", 'Method of execution': 'Auto ML'},
    # Property values are documented as strings, so the metric is converted.
    properties={'Accuracy': str(best_run_metrics['accuracy'])},
)
print(best_model_registered)

## Model Deployment

Because the best model from the AutoML run has better accuracy than the one from the HyperDrive run, I register it, create an inference config, and deploy it as a web service in the cells below.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [None]:
# Print the local scoring script so it can be inspected in the notebook.
# NOTE(review): score.py is (re)downloaded from the best run in the deployment
# cell; run that cell first if the file is not present locally.
with open('score.py') as f:
    print(f.read())

In [None]:
# Name of the ACI web service. It was never defined in the notebook, so the
# deploy call failed with a NameError. ACI service names use lowercase
# letters, digits and dashes.
service_name = 'covid19-automl-service'

# Look up the registered model to deploy.
model = Model(ws, 'best_automl_covid19_model')

# Reuse the best run's environment and download the scoring script and conda
# specification that AutoML generated for it.
environment = best_automl_run.get_environment()
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')
best_automl_run.download_file(constants.CONDA_ENV_FILE_PATH, 'environment.yml')

inference_config = InferenceConfig(entry_script='score.py', environment=environment)

# ACI deployment: 1 core / 1 GB, key-based auth, Application Insights and
# model data collection enabled.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='Covid19 new cases prediction',
    auth_enabled=True,
    enable_app_insights=True,
    collect_model_data=True,
)
# validate_configuration() is a method of the deployment configuration, not
# of the deployed service, so the settings are checked before deploying.
deployment_config.validate_configuration()

service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)

print(service.state)
# print_deploy_configuration() also belongs to the deployment configuration.
deployment_config.print_deploy_configuration()

In [None]:
# Print the deployed service's state, its first authentication key and its
# Swagger and scoring URIs.
# NOTE(review): this writes the service key into the notebook output; clear
# the output before sharing the notebook.
print("State : "+service.state)
print("Key " + service.get_keys()[0])
print("Swagger URI : "+service.swagger_uri)
print("Scoring URI : "+service.scoring_uri)

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
# Build a two-row scoring payload for the deployed service.
# The rows come from the held-out validation split, not the raw `df`: the
# model was trained on the cleaned frame produced by clean_data, so the raw
# dataframe does not necessarily have the columns the service expects.
held_out_rows = validation_data.sample(2)

# True labels, kept for comparison with the service's predictions.
y_df = held_out_rows["new_cases"]

# Feature columns only; the label must not be sent to the service.
Covid19InfectionsSample = held_out_rows.drop(columns=['new_cases'])
x_df = Covid19InfectionsSample

# The scoring script expects a JSON document with a 'data' list of records.
Covid19DataTesting = json.dumps({'data': x_df.to_dict(orient='records')})

print(Covid19DataTesting)

In [None]:
# JSON request authenticated with the service's first key.
headers = {
    'Content-type': 'application/json',
    'Authorization': f'Bearer {service.get_keys()[0]}',
}

# Post the payload to the scoring endpoint and show the raw response body.
response = requests.post(service.scoring_uri, data=Covid19DataTesting, headers=headers)
print('Prediction :', response.text)

# Show the original labels of the sampled rows for comparison.
print('True Values :', y_df.values)

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
# Print the logs of the deployed web service.
print(service.get_logs())

# Deploy ONNX Best Model

In [None]:
# Retrieve the best run again, this time asking for the ONNX version of the
# best model (return_onnx_model=True).
best_run, onnx_mdl = remote_run.get_output(return_onnx_model=True)
# Write the ONNX model to a local file.
onnx_fl_path = "./best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

# ONNX Prediction

In [None]:
# True when the running interpreter is older than the first Python version
# flagged as incompatible by the AutoML ONNX constants.
python_version_compatible = sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion

def get_onnx_res(run):
    """Download the ONNX resource file of *run* and return its parsed JSON.

    The file is fetched from the run's ``constants.MODEL_RESOURCE_PATH_ONNX``
    artifact and written to ``onnx_resource.json`` in the working directory
    before being loaded.
    """
    local_copy = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=local_copy)
    with open(local_copy) as resource_file:
        return json.load(resource_file)
# Draw two rows for ONNX inference from the held-out validation split, not
# the raw `df`: the model was trained on the cleaned frame produced by
# clean_data, so the raw dataframe does not necessarily match its inputs.
held_out_rows = validation_data.sample(2)

# True labels of the sampled rows.
y_df = held_out_rows["new_cases"]

# Feature columns only.
Covid19InfectionsSample = held_out_rows.drop(columns=['new_cases'])
x_df = Covid19InfectionsSample

# Print the same rows as the JSON payload format used for the web service.
Covid19DataTesting = json.dumps({'data': x_df.to_dict(orient='records')})

print(Covid19DataTesting)

if python_version_compatible:
    # The inference helper takes the serialised ONNX model and the ONNX
    # resource description downloaded from the best run.
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_res = get_onnx_res(best_run)

    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(Covid19InfectionsSample)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')

In [None]:
#service.delete()
#cpu_cluster.delete()