
## Train AutoML Classification with Titanic Dataset

# Connect to your workspace

Let's get started by connecting to the AML workspace leveraging the Azure ML SDK 

In [1]:
import azureml.core
from azureml.core import Workspace
import pandas as pd

from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.train.automl import AutoMLConfig

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with mm-hackathon-prep


# Explore Dataset

You're going to use a Python script to train a machine learning model based on the Titanic datset found in your data folder.  

In [2]:
df1 = pd.read_csv('./Data/Train1.csv')
df2 = pd.read_csv('./Data/Train2.csv')
print(df1.shape)
print(df2.shape)
df = df1.merge(df2, on = 'passenger_id', how = 'inner')

(917, 6)
(917, 8)


## View Data

In [3]:
df.head(5)

Unnamed: 0,passenger_id,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket
0,501,8.05,,S,,0.0,3.0,"Webber, Mr. James",male,,0.0,0.0,SOTON/OQ 3101316
1,588,21.0,,S,"Ilfracombe, Devon",0.0,2.0,"Phillips, Mr. Escott Robert",male,43.0,0.0,1.0,S.O./P.P. 2
2,402,24.15,,S,,0.0,3.0,"Van Impe, Miss. Catharina",female,10.0,0.0,2.0,345773
3,1193,15.5,,Q,,0.0,3.0,"McEvoy, Mr. Michael",male,,0.0,0.0,36568
4,686,211.3375,B3,S,"St Louis, MO",1.0,1.0,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",female,43.0,0.0,1.0,24160


In [4]:
import numpy as np
df.drop(['passenger_id'], axis=1, inplace=True)
df["survived"] = df["survived"].apply(np.int64)

In [5]:
import os
script_folder = os.path.join(os.getcwd(), "automl_train")
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/titantic-dataset-private/Supplemental_Folders/Additional_Notebooks/automl_train


In [6]:
df.to_csv('./automl_train/titanic.csv')
df.head(2)

Unnamed: 0,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket
0,8.05,,S,,0,3.0,"Webber, Mr. James",male,,0.0,0.0,SOTON/OQ 3101316
1,21.0,,S,"Ilfracombe, Devon",0,2.0,"Phillips, Mr. Escott Robert",male,43.0,0.0,1.0,S.O./P.P. 2


In [7]:
df.dtypes

fare         float64
cabin         object
embarked      object
home.dest     object
survived       int64
pclass       float64
name          object
sex           object
age          float64
sibsp        float64
parch        float64
ticket        object
dtype: object

## Split the data into train and test sets

Split the data into a test and training dataset, this will allow us to have data points to test the model on data that the model hasn't see before in order to measure true accuracy.  

In [8]:
from sklearn.model_selection import train_test_split

print(df.shape)
x_train, X_test = train_test_split(df, test_size=0.2, random_state=223)

# Use the same function above for the validation set
X_train, X_val = train_test_split(x_train,  test_size=0.2, random_state= 8)


print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("X_val shape: {}".format(X_val.shape))

label = "survived"

X_train.to_csv('./automl_train/train.csv', index = False)
X_test.to_csv('./automl_train/test.csv', index = False)
X_val.to_csv('./automl_train/validate.csv', index = False)


(917, 12)
X_train shape: (586, 12)
X_test shape: (184, 12)
X_val shape: (147, 12)


In [9]:
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir='./automl_train', target_path='titanic-aml', 
        overwrite=True, show_progress=True)


"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 4 files
Uploading ./automl_train/test.csv
Uploaded ./automl_train/test.csv, 1 files out of an estimated total of 4
Uploading ./automl_train/titanic.csv
Uploaded ./automl_train/titanic.csv, 2 files out of an estimated total of 4
Uploading ./automl_train/train.csv
Uploaded ./automl_train/train.csv, 3 files out of an estimated total of 4
Uploading ./automl_train/validate.csv
Uploaded ./automl_train/validate.csv, 4 files out of an estimated total of 4
Uploaded 4 files


$AZUREML_DATAREFERENCE_9a8ee725f141407ba7a9be1246be29a8

In [10]:
from azureml.core import Dataset

#Create a tabular dataset from the path on the datastore 
train_dataset      = Dataset.Tabular.from_delimited_files(default_ds.path('titanic-aml/train.csv'))
test_dataset       = Dataset.Tabular.from_delimited_files(default_ds.path('titanic-aml/test.csv'))
validation_dataset = Dataset.Tabular.from_delimited_files(default_ds.path('titanic-aml/validate.csv'))



In [11]:
# choose a name for experiment
experiment_name = 'automl-titanic'

experiment=Experiment(ws, experiment_name)

output = {}
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Experiment Name'] = experiment.name
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,5da07161-3770-4a4b-aa43-418cbbb627cf
Workspace,mm-hackathon-prep
Resource Group,mm-hackathon-prep-rg
Location,eastus
Experiment Name,automl-titanic


## Create a compute cluster

Previously, the model was trained on a compute instance, by creating a compute cluster, we can submit the job to the compute cluster

In [12]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "aml-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


## Define Training Settings

Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.

|Property|Description|
|-|-|
|**task**|classification or regression or forecasting|
|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|
|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|
|**blocked_models** | *List* of *strings* indicating machine learning algorithms for AutoML to avoid in this run. <br><br> Allowed values for **Classification**<br><i>LogisticRegression</i><br><i>SGD</i><br><i>MultinomialNaiveBayes</i><br><i>BernoulliNaiveBayes</i><br><i>SVM</i><br><i>LinearSVM</i><br><i>KNN</i><br><i>DecisionTree</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>GradientBoosting</i><br><i>TensorFlowDNN</i><br><i>TensorFlowLinearClassifier</i><br><br>Allowed values for **Regression**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><br>Allowed values for **Forecasting**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><i>Arima</i><br><i>Prophet</i>|
|**allowed_models** |  *List* of *strings* indicating machine learning algorithms for AutoML to use in this run. Same values listed above for **blocked_models** allowed for **allowed_models**.|
|**experiment_exit_score**| Value indicating the target for *primary_metric*. <br>Once the target is surpassed the run terminates.|
|**experiment_timeout_hours**| Maximum amount of time in hours that all iterations combined can take before the experiment terminates.|
|**enable_early_stopping**| Flag to enble early termination if the score is not improving in the short term.|
|**featurization**| 'auto' / 'off'  Indicator for whether featurization step should be done automatically or not. Note: If the input data is sparse, featurization cannot be turned on.|
|**n_cross_validations**|Number of cross validation splits.|
|**training_data**|Input dataset, containing both features and label column.|
|**label_column_name**|The name of the label column.|

**_You can find more information about primary metrics_** [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)

In [13]:
test_dataset.to_pandas_dataframe()

Unnamed: 0,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket
0,11.1333,,S,,1,3.0,"Johnson, Miss. Eleanor Ileen",female,1.0,1.0,1.0,347742
1,9.5000,,S,,1,3.0,"Sheerlinck, Mr. Jan Baptist",male,29.0,0.0,0.0,345779
2,7.8958,,S,Austria,0,3.0,"Cor, Mr. Bartol",male,35.0,0.0,0.0,349230
3,39.0000,F4,S,"Guntur, India / Benton Harbour, MI",1,2.0,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36.0,0.0,3.0,230136
4,60.0000,C31,S,"Huntington, WV",1,1.0,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18.0,1.0,0.0,13695
...,...,...,...,...,...,...,...,...,...,...,...,...
179,7.8792,,Q,"Co Sligo, Ireland New York, NY",0,3.0,"Burns, Miss. Mary Delia",female,18.0,0.0,0.0,330963
180,53.1000,E8,S,"New York, NY / Ithaca, NY",1,1.0,"Chambers, Mr. Norman Campbell",male,27.0,1.0,0.0,113806
181,9.8250,,S,,0,3.0,"Jussila, Miss. Mari Aina",female,21.0,1.0,0.0,4137
182,136.7792,C89,C,"Los Angeles, CA",0,1.0,"Clark, Mr. Walter Miller",male,27.0,1.0,0.0,13508


In [14]:
train_dataset.to_pandas_dataframe().dtypes

fare         float64
cabin         object
embarked      object
home.dest     object
survived       int64
pclass       float64
name          object
sex           object
age          float64
sibsp        float64
parch        float64
ticket        object
dtype: object

In [15]:
validation_dataset.to_pandas_dataframe().dtypes

fare         float64
cabin         object
embarked      object
home.dest     object
survived       int64
pclass       float64
name          object
sex           object
age          float64
sibsp        float64
parch        float64
ticket        object
dtype: object

In [16]:
import logging

print(label)

survived


In [17]:
automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    #"n_cross_validations": 2,
    "primary_metric": 'AUC_weighted',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=training_cluster,
                             experiment_exit_score = 0.9984,
                             blocked_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = train_dataset,
                             label_column_name = label,
                             validation_data = validation_dataset,
                             test_data=test_dataset,
                             **automl_settings
                            )

In [18]:
azureml.train.automl.utilities.get_primary_metrics('classification')

['average_precision_score_weighted',
 'accuracy',
 'norm_macro_recall',
 'precision_score_weighted',
 'AUC_weighted']

## Submit the AutoML Classification Training Run

In [19]:
remote_run = experiment.submit(automl_config, show_output = False)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
automl-titanic,AutoML_cbae1471-002f-48be-85d5-f4a110fbb67a,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


In [20]:
# Wait for the remote run to complete
remote_run.wait_for_completion()

{'runId': 'AutoML_cbae1471-002f-48be-85d5-f4a110fbb67a',
 'target': 'aml-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-03-16T05:37:52.060968Z',
 'endTimeUtc': '2022-03-16T06:04:54.200416Z',
 'services': {},
   'message': 'Experiment timeout reached, hence experiment stopped. Current experiment timeout: 0 hour(s) 18 minute(s)'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'aml-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"0b382e51-36e6-449a-b01f-1c9639297905\\"}, \\"validation_data\\": {\\"datasetId\\": \\"a8409806-011b-480e-842f-d04d54a639c8\\"}, \\"test_data\\": {\\"datasetId\\": \\"1ec58736-56aa-4d92-abfa-024004626b55\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_t

In [21]:
# Retrieve the best Run object
best_run = remote_run.get_best_child()
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-titanic,AutoML_cbae1471-002f-48be-85d5-f4a110fbb67a_35,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


## View Features
You can see the featurization summary for the best model.  This is stored as JSON in the outputs directory for the best run

In [22]:
# Download the featurization summary JSON file locally
import json
best_run.download_file("outputs/featurization_summary.json", "featurization_summary.json")

# Render the JSON as a pandas DataFrame
with open("featurization_summary.json", "r") as f:
    records = json.load(f)

pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
pd.DataFrame.from_records(records)

Unnamed: 0,RawFeatureName,TypeDetected,Dropped,EngineeredFeatureCount,Transformations,TransformationParams
0,fare,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['fare'], 'TransformationFunction': 'Imputer', 'Operator': 'Mean', 'FeatureType': 'Numeric', 'ShouldOutput': True, 'TransformationParams': {'add_indicator': False, 'copy': True, 'fill_value': None, 'missing_values': nan, 'strategy': 'mean', 'verbose': 0}}}"
1,pclass,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['pclass'], 'TransformationFunction': 'Imputer', 'Operator': 'Mean', 'FeatureType': 'Numeric', 'ShouldOutput': True, 'TransformationParams': {'add_indicator': False, 'copy': True, 'fill_value': None, 'missing_values': nan, 'strategy': 'mean', 'verbose': 0}}}"
2,age,Numeric,No,2,"[MeanImputer, ImputationMarker]","{'Transformer1': {'Input': ['age'], 'TransformationFunction': 'Imputer', 'Operator': 'Mean', 'FeatureType': 'Numeric', 'ShouldOutput': True, 'TransformationParams': {'add_indicator': False, 'copy': True, 'fill_value': None, 'missing_values': nan, 'strategy': 'mean', 'verbose': 0}}}"
3,sibsp,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['sibsp'], 'TransformationFunction': 'Imputer', 'Operator': 'Mean', 'FeatureType': 'Numeric', 'ShouldOutput': True, 'TransformationParams': {'add_indicator': False, 'copy': True, 'fill_value': None, 'missing_values': nan, 'strategy': 'mean', 'verbose': 0}}}"
4,parch,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['parch'], 'TransformationFunction': 'Imputer', 'Operator': 'Mean', 'FeatureType': 'Numeric', 'ShouldOutput': True, 'TransformationParams': {'add_indicator': False, 'copy': True, 'fill_value': None, 'missing_values': nan, 'strategy': 'mean', 'verbose': 0}}}"
5,cabin,Ignore,Yes,0,[],"{'Transformer1': {'Input': ['cabin'], 'TransformationFunction': '', 'Operator': None, 'FeatureType': 'Ignore', 'ShouldOutput': True, 'TransformationParams': None}}"
6,ticket,Ignore,Yes,0,[],"{'Transformer1': {'Input': ['ticket'], 'TransformationFunction': '', 'Operator': None, 'FeatureType': 'Ignore', 'ShouldOutput': True, 'TransformationParams': None}}"
7,embarked,Categorical,No,3,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['embarked'], 'TransformationFunction': 'StringCast', 'Operator': None, 'FeatureType': 'Categorical', 'ShouldOutput': False, 'TransformationParams': {}}, 'Transformer2': {'Input': ['Transformer1'], 'TransformationFunction': 'CountVectorizer', 'Operator': 'CharGram', 'FeatureType': None, 'ShouldOutput': True, 'TransformationParams': {'analyzer': 'word', 'binary': True, 'decode_error': 'strict', 'encoding': 'utf-8', 'input': 'content', 'lowercase': False, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': [1, 1], 'stop_words': None, 'strip_accents': None, 'token_pattern': '(?u)\b\w\w+\b', 'vocabulary': None}}}"
8,sex,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['sex'], 'TransformationFunction': 'CatImputer', 'Operator': 'Mode', 'FeatureType': 'Categorical', 'ShouldOutput': True, 'TransformationParams': {'copy': True}}, 'Transformer2': {'Input': ['Transformer1'], 'TransformationFunction': 'StringCast', 'Operator': None, 'FeatureType': None, 'ShouldOutput': False, 'TransformationParams': {}}, 'Transformer3': {'Input': ['Transformer2'], 'TransformationFunction': 'LabelEncoder', 'Operator': None, 'FeatureType': None, 'ShouldOutput': True, 'TransformationParams': {'hashing_seed_val': 314489979}}}"
9,home.dest,CategoricalHash,No,128,[StringCast-HashOneHotEncoder],"{'Transformer1': {'Input': ['home.dest'], 'TransformationFunction': 'StringCast', 'Operator': None, 'FeatureType': 'CategoricalHash', 'ShouldOutput': False, 'TransformationParams': {}}, 'Transformer2': {'Input': ['Transformer1'], 'TransformationFunction': 'HashOneHotEncoder', 'Operator': None, 'FeatureType': None, 'ShouldOutput': True, 'TransformationParams': {'hashing_seed_val': 314489979, 'num_cols': 128}}}"


## View Results

In [23]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Get the explanation for the Best Model

In [24]:
# Wait for the best model explanation run to complete
from azureml.core.run import Run
model_explainability_run_id = remote_run.id + "_" + "ModelExplain"
print(model_explainability_run_id)
model_explainability_run = Run(experiment=experiment, run_id=model_explainability_run_id)
model_explainability_run.wait_for_completion()

# Get the best run object
best_run = remote_run.get_best_child()

AutoML_cbae1471-002f-48be-85d5-f4a110fbb67a_ModelExplain


In [25]:
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: automl-titanic,
Id: AutoML_cbae1471-002f-48be-85d5-f4a110fbb67a_35,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=False, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=False, is_onnx_compatible=True, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/moun...
), random_state=0, reg_alpha=1.5625, reg_lambda=2.1875, subsample=0.7, tree_method='auto'))], verbose=False)), ('16', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticregression', LogisticRegression(C=4714.8663634573895, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='multinomial', n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_star

In [26]:
estimator = fitted_model.steps[-1]
print(type(estimator))
print(estimator)

<class 'tuple'>
('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(
    estimators=[('26', Pipeline(
        memory=None,
        steps=[('standardscalerwrapper', StandardScalerWrapper(
            copy=True,
            with_mean=False,
            with_std=True
        )), ('lightgbmclassifier', LightGBMClassifier(
            boosting_type='goss',
            colsample_bytree=0.6933333333333332,
            learning_rate=0.09473736842105263,
            max_bin=310,
            max_depth=6,
            min_child_weight=6,
            min_data_in_leaf=0.013801724137931036,
            min_split_gain=0.3157894736842105,
            n_estimators=200,
            num_leaves=5,
            reg_alpha=0.894736842105263,
            reg_lambda=0.5263157894736842,
            subsample=1,
            random_state=None,
            n_jobs=-1,
            problem_info=ProblemInfo(
                gpu_training_param_dict={'processing_unit_type': 'cpu'}
            )
        ))],
  

## Test the best model accuracy

In [27]:
X_test.head(5)

Unnamed: 0,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket
419,11.13,,S,,1,3.0,"Johnson, Miss. Eleanor Ileen",female,1.0,1.0,1.0,347742
484,9.5,,S,,1,3.0,"Sheerlinck, Mr. Jan Baptist",male,29.0,0.0,0.0,345779
300,7.9,,S,Austria,0,3.0,"Cor, Mr. Bartol",male,35.0,0.0,0.0,349230
268,39.0,F4,S,"Guntur, India / Benton Harbour, MI",1,2.0,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36.0,0.0,3.0,230136
103,60.0,C31,S,"Huntington, WV",1,1.0,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18.0,1.0,0.0,13695


In [28]:
X_test = pd.read_csv('./automl_train/test.csv')

Y_test = X_test.pop("Survived")

y_predict = fitted_model.predict(X_test)
print(y_predict[:10])

KeyError: 'Survived'

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
import itertools

cf =confusion_matrix(Y_test.values,y_predict)
plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix Test')
plt.xlabel('Predicted')
plt.ylabel('Actual')
class_labels = ['False','True']
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks,class_labels)
plt.yticks([-0.5,0,1,1.5],['','False','True',''])
# plotting text value inside cells
thresh = cf.max() / 2.
for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):
    plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')
plt.show()

In [None]:
## Deploy Model

In [None]:
model_name = best_run.properties['model_name']

print(model_name)

script_file_name = 'inference/score.py'

best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'inference/score.py')

In [None]:
## Register the Fitted Model for Deployment

In [None]:
description = 'AutoML Model trained on data to predict survial'
tags = None
model = remote_run.register_model(model_name = model_name, description = description, tags = tags)

print(remote_run.model_id) # This will be written to the script file later in the notebook.

In [None]:
## Deploy the model as a Web Service to ACI

In [None]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model

inference_config = InferenceConfig(environment = best_run.get_environment(), entry_script=script_file_name)

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 2, 
                                               memory_gb = 2, 
                                               tags = {'type': "automl_classification"}, 
                                               description = 'sample service for Automl Classification')

aci_service_name = 'automl-titanic'
print(aci_service_name)
aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

## Get Logs from ACI Web Service

In [None]:
aci_service.get_logs()

In [None]:
from numpy import array

X_test = pd.read_csv('./automl_train/test.csv')

Y_test = X_test.pop("Survived")

Y_test = pd.DataFrame(Y_test)
y_test = array(Y_test['Survived'])

In [None]:
import requests

X_test_json = X_test.to_json(orient='records')
data = "{\"data\": " + X_test_json +"}"
headers = {'Content-Type': 'application/json'}

resp = requests.post(aci_service.scoring_uri, data, headers=headers)

y_pred = json.loads(json.loads(resp.text))['result']
print(y_pred)

In [None]:
from numpy import array
actual = array(Y_test)
actual = actual[:,0]
print(len(y_pred), " ", len(y_test))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(acc)
print(prec)
print(recall)

In [None]:
## Retrieve best ONNX MOdel

In [None]:
best_run, onnx_mdl = remote_run.get_output(return_onnx_model=True)

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter
onnx_fl_path = "./best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

## Predict with ONNX model - leveraging onnxruntime package

In [None]:
test_df

In [None]:
import sys
import json
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    res_path = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    with open(res_path) as f:
        result = json.load(f)
    return result

if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    test_df = test_dataset.to_pandas_dataframe()
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_result = get_onnx_res(best_run)

    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_result)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(test_df)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')