# AppEase Machine Learning

In this notebook, you use automated machine learning in Azure Machine Learning service to create a regression model to predict labels. This process accepts training data and configuration settings, and automatically iterates through combinations of different feature normalization/standardization methods, models, and hyperparameter settings to arrive at the best model.

To run this notebook, you need an Azure subscription. If you don't have one, create a free account before you begin. Try the [free or paid version](https://aka.ms/AMLFree) of Azure Machine Learning service today.

In [2]:
# import packages
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import logging
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails
from sklearn.metrics import mean_squared_error
from math import sqrt

In [1]:
# --- required: the Azure subscription to create resources in ---
# (the variable name is misspelled, but other cells reference it under this spelling)
subsciption_id = "<azure-subscription-id>"

# --- your choice: workspace and compute names ---
workspace_name = "<azure-workspace-name>"
compute_cluster_name = "<azure-cluster-name>"
workspace_loc = "eastus"  # feel free to change this
workspace_resource_group = None  # replace this if you'd like to use a pre-built resource group

# --- input files in the local directory ---
# The data and labels are merged on their indexes, and the dependent-variable (DV)
# column of labels is named 'Label'.
# NOTE: data_file must contain at least 63 records of data for training (with 0.8/0.2 train/test split)
data_file_name = "simulated_health_data.json"
labels_file_name = "random_labels.json"

In [None]:
# create an Azure workspace (or reuse it if it already exists)
# When no resource group name was configured, ask Azure to create one for the workspace.
create_RG = workspace_resource_group is None
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subsciption_id,
    resource_group=workspace_resource_group,
    create_resource_group=create_RG,
    location=workspace_loc,
    exist_ok=True,  # return the existing workspace instead of failing when this cell is re-run
)

In [None]:
# create an Azure compute cluster, reusing one with the configured name if the lookup succeeds
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    # the lookup by name failed, so provision a new cluster
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, compute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# block until the cluster reports completion, streaming progress to the cell output
cpu_cluster.wait_for_completion(show_output=True)

In [11]:
# load data
# The data file is always read; labels are optional. With labels_file_name set to None,
# the data file is used as-is (the later cells expect it to contain a 'Label' column).
data = pd.read_json(data_file_name)
if labels_file_name is None:
    final_df = data
else:
    labels = pd.read_json(labels_file_name, typ='series')
    # Index-on-index merge with pandas' default inner join: only rows whose index
    # appears in both the data and the labels are kept.
    final_df = data.merge(labels.rename('Label'), left_index=True, right_index=True)

In [13]:
# Split the data into train and test sets (80% / 20%) with a fixed seed for a repeatable split.
# The whole frame is split, so both parts keep every column of final_df.
x_train, x_test = train_test_split(
    final_df,
    test_size=0.2,
    random_state=223,
)

In [14]:
# define settings for the experiment run
# (see parameters at https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train)
automl_settings = dict(
    iteration_timeout_minutes=10,
    experiment_timeout_hours=0.3,
    enable_early_stopping=True,
    primary_metric='spearman_correlation',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,
)

# NOTE(review): no compute target is passed here, so cpu_cluster is not referenced by this
# configuration — confirm that a local run is intended.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    training_data=x_train,
    label_column_name="Label",
    **automl_settings,
)

In [16]:
# create the Experiment in the workspace, then submit the AutoML configuration to it
experiment = Experiment(workspace=ws, name="AppEaseML")

# this can take about 20 minutes with the default settings
local_run = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
AppEaseML,AutoML_795ae4f4-fd63-4447-ba64-e05045c0482a,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.


In [None]:
# show the run-details widget, then retrieve and print the best run and its fitted model
RunDetails(local_run).show()

best_run, fitted_model = local_run.get_output()
for result in (best_run, fitted_model):
    print(result)

In [None]:
# calculate the root mean squared error, mean absolute percent error, and accuracy of the best model

# Take the labels and features from x_test without mutating it, so this cell can be
# re-run without a KeyError on the (already removed) "Label" column.
y_test = x_test["Label"]
x_test_features = x_test.drop(columns=["Label"])
y_predict = fitted_model.predict(x_test_features)

y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
print("Model RMSE:")
print(rmse)
print()

# The percent error reported below is the sum of absolute errors divided by the sum of
# the actual values (an aggregate ratio, not a mean of per-row percentages).
sum_errors = sum(
    abs(actual_val - predict_val)
    for actual_val, predict_val in zip(y_actual, y_predict)
)
sum_actuals = sum(y_actual)

# The ratio is undefined when the actual values sum to zero; report NaN instead of
# dividing by zero.
if sum_actuals != 0:
    mean_abs_percent_error = sum_errors / sum_actuals
else:
    mean_abs_percent_error = float("nan")

print("Model MAPE:")
print(mean_abs_percent_error)
print()
print("Model Accuracy:")
print(1 - mean_abs_percent_error)