Copyright (c) Microsoft Corporation. 
Licensed under the MIT license. 
# Azure Machine Learning / AutoML Integration

Capture results of AutoML experiments on the dataset and persist to Data Lake for reporting and analysis.

## Library Imports


In [90]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Run
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.model import Model
from azureml.interpret import ExplanationClient
from pyspark.sql.functions import *
import pprint

## Read Model Metrics from Azure ML

Connect to the Azure ML workspace and extract metrics from the AutoML run.

In [87]:
# Azure ML workspace coordinates -- fill these in before running the notebook.
subscription_id = ''
resource_group = ''
workspace_name = ''

# Build the workspace handle that the experiment/run lookups below rely on.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Optional sanity check -- uncomment to dump the workspace details:
#pp = pprint.PrettyPrinter()
#pp.pprint(ws.get_details())

### Pull Metrics from Run

Gather the following metrics from the run (report label = Azure ML metric key):

* AUC = AUC_weighted
* Accuracy = accuracy
* Precision = precision_score_weighted
* Recall = recall_score_weighted
* F1 = f1_score_weighted


In [88]:
# Identify the AutoML experiment and the specific (best) run to report on --
# fill these in before running the notebook.
experiment_name = ''
run_id = ''

# Look up the experiment in the workspace, attach to the run by id, and
# retrieve every metric logged against that run.
experiment = Experiment(ws, experiment_name)
fetched_run = Run(experiment=experiment, run_id=run_id)
metrics = fetched_run.get_metrics()

# Optional -- uncomment to inspect the full metrics payload:
#pp = pprint.PrettyPrinter()
#pp.pprint(metrics)

In [89]:
# Pick out the weighted classification metrics of interest.
# .get() is used, so a metric that is absent from the run yields None.
auc, accuracy, precision, recall, f1 = (
    metrics.get(metric_key)
    for metric_key in (
        'AUC_weighted',
        'accuracy',
        'precision_score_weighted',
        'recall_score_weighted',
        'f1_score_weighted',
    )
)

# One [label, value] row per metric, in reporting order.
metric_rows = [
    ['AUC', auc],
    ['Accuracy', accuracy],
    ['Precision', precision],
    ['Recall', recall],
    ['F1', f1],
]

# Combine into a single two-column Spark dataframe.
metrics_df = sc.parallelize(metric_rows).toDF(('Metric', 'Value'))

#display(metrics_df)

## Read Feature Importances from AutoML


In [91]:
# Create an explanation client bound to the run fetched earlier in the notebook.
client = ExplanationClient.from_run(fetched_run)
# Download the model explanation with raw=False.
# NOTE(review): presumably raw=False selects the engineered-feature explanation
# (as the variable name suggests) -- confirm against the azureml-interpret docs.
engineered_explanations = client.download_model_explanation(raw=False)
# Dictionary of feature importances; its items are iterated as (feature, value)
# pairs when the features dataframe is built.
features_dict = engineered_explanations.get_feature_importance_dict()

In [92]:
# Build [feature, importance] rows, converting numpy scalars to native Python
# types so Spark can infer the schema.
def _to_native(value):
    """Return value.item() when the object provides it (numpy scalars); otherwise return value unchanged.

    Plain Python str/float/int objects have no .item() method, so calling it
    unconditionally raises AttributeError when the dictionary already holds
    native types. NOTE(review): whether keys/values arrive as numpy or native
    types appears to depend on the installed azureml-interpret version -- verify.
    """
    return value.item() if hasattr(value, 'item') else value


features_list = [
    [_to_native(feature), _to_native(importance)]
    for feature, importance in features_dict.items()
]

# save to dataframe
features_df = spark.createDataFrame(features_list, ['Feature', 'Value'])

#display(features_df)

## Save Results to Data Lake

Persist the model results to CSV files on the Data Lake for reporting.


In [None]:
# Data Lake destination -- fill these in before running the write cell.
# Both values are interpolated into the abfss:// output URI as
# abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/...
data_lake_account_name = ''
file_system_name = ''

In [None]:
# Fail fast when the storage placeholders were left blank: the URI would
# otherwise be malformed ('abfss://@.dfs.core.windows.net/...') and the write
# would fail with a far less obvious error.
if not (file_system_name and data_lake_account_name):
    raise ValueError(
        'Set file_system_name and data_lake_account_name before writing results to the Data Lake.'
    )

# Shared root folder for all reporting outputs.
reporting_root = f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/reporting'

# Write each result set as CSV with a header row into its own sub-folder.
# coalesce(1) collapses the dataframe to one partition before writing, and
# mode('overwrite') replaces any output left by a previous run.
for report_df, folder_name in (
    (metrics_df, 'model_metrics'),
    (features_df, 'feature_importances'),
):
    report_df.coalesce(1).write.option('header', 'true').mode('overwrite').csv(f'{reporting_root}/{folder_name}')