# Automated ML


## Setup infrastructure

### Workspace and experiment

In [1]:
from azureml.core import Workspace, Experiment

# Connect to the Azure ML workspace through the SDK's from_config() loader
ws = Workspace.from_config()

# All AutoML runs of this notebook are grouped under this experiment
experiment_name = 'autoML-RUL-prediction-2'
experiment = Experiment(workspace=ws, name=experiment_name)

# Report which workspace we are connected to, one field per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))


Workspace name: quick-starts-ws-139442
Azure region: southcentralus
Subscription id: 510b94ba-e453-4417-988b-fbdc37b55ca7
Resource group: aml-quickstarts-139442


### Compute target

Create the compute target if it does not already exist.

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "cluster-8node"

try:
    # Re-use the cluster if one with this name can be looked up in the workspace
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new AmlCompute cluster of up to 8 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2', max_nodes=8
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Executed for both the existing and the freshly created cluster
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset


### Get data from datastore

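The datasets (train and test) have been uploaded to the datastore prior to this step. If one of them is not yet registered in the workspace, the code below registers it from its public CSV URL.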

In [3]:
from azureml.core import Dataset

def dataset_is_registered(dataset_key: str) -> bool:
    """Tell whether a dataset named ``dataset_key`` is registered in the workspace.

    The name is looked up in ``ws.datasets`` of the notebook-level workspace ``ws``.

    Args:
        dataset_key: Registration name of the dataset to look for.

    Returns:
        True if ``dataset_key`` is one of the keys of ``ws.datasets``, False otherwise.
    """
    # The lookup must use the `dataset_key` parameter; the earlier body tested an
    # undefined name `key`, so every call raised NameError.
    return dataset_key in ws.datasets

# Registration name -> download URL of the CSV file backing that dataset
datasets = dict(
    turbofan2020_test="https://www.dropbox.com/s/l9ihi4w0u7mu9h2/test.csv?dl=1",
    turbofan2020_train="https://www.dropbox.com/s/v367cwzhaeagpln/train.csv?dl=1",
)


def get_dataset_from_datastore(dataset_key: str):
    """Return the workspace dataset registered as ``dataset_key``, registering it if needed.

    If ``dataset_key`` is already a key of ``ws.datasets`` that entry is returned.
    Otherwise a tabular dataset is created from the URL stored under
    ``dataset_key`` in the notebook-level ``datasets`` mapping, registered in the
    workspace ``ws`` under that name, and the registered dataset is returned.

    Args:
        dataset_key: Registration name of the dataset; for unregistered datasets
            it must also be a key of ``datasets``.

    Returns:
        The dataset object registered in the workspace under ``dataset_key``.

    Raises:
        KeyError: If the dataset is not registered and ``datasets`` holds no
            source URL for ``dataset_key``.
    """
    # Read the property once instead of once per check and lookup
    # (presumably each access queries the workspace -- TODO confirm).
    registered = ws.datasets
    if dataset_key in registered:
        return registered[dataset_key]

    print("dataset {} not found".format(dataset_key))
    if dataset_key not in datasets:
        # Fail early with a clear message rather than handing path=None to the SDK.
        raise KeyError(
            "no source URL configured for dataset {!r}".format(dataset_key)
        )

    print("registering {}".format(dataset_key))
    dataset = Dataset.Tabular.from_delimited_files(path=datasets[dataset_key])
    # register() hands back the registered dataset, so no second workspace lookup is needed.
    return dataset.register(workspace=ws, name=dataset_key)
        

# Resolve (and register if necessary) both datasets in the workspace
dataset_train = get_dataset_from_datastore('turbofan2020_train')
dataset_test = get_dataset_from_datastore('turbofan2020_test')

# Materialize each dataset as a pandas DataFrame for local inspection
df_train = dataset_train.to_pandas_dataframe()
df_test = dataset_test.to_pandas_dataframe()



In [4]:
# Summary statistics of both splits. display() renders the training summary as a
# rich table (print() would wrap the wide frame into plain-text chunks); the test
# summary is shown as the cell's result.
display(df_train.describe())
df_test.describe()

                alt          Mach           TRA            T2           T24  \
count  17990.000000  17990.000000  17990.000000  17990.000000  17990.000000   
mean   21614.458500      0.619753     68.204505    475.688981    559.542377   
std     6456.433407      0.082107     14.535642     16.901636     18.010382   
min    10001.000000      0.327189     23.834170    422.873510    486.889131   
25%    16167.407500      0.561582     58.714401    461.436045    547.074666   
50%    22773.206667      0.636954     74.428914    473.697652    556.539205   
75%    27635.335000      0.688085     79.455122    490.300305    571.503904   
max    35012.260000      0.738108     87.455820    510.248191    614.186279   

                T30           T48           T50           P15            P2  \
count  17990.000000  17990.000000  17990.000000  17990.000000  17990.000000   
mean    1319.889430   1642.708782   1107.123325     10.916243      8.322230   
std       55.082131     97.693809     51.664114    

Unnamed: 0,alt,Mach,TRA,T2,T24,T30,T48,T50,P15,P2,...,W25,W31,W32,W48,W50,SmFan,SmLPC,SmHPC,phi,RUL
count,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,...,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0,4381.0
mean,18703.079153,0.585746,65.09392,483.191729,565.616347,1328.703685,1648.846191,1120.744664,11.885198,9.138643,...,163.285482,18.796848,11.278109,152.800796,161.781625,19.227164,8.069002,28.021313,38.568113,31.388039
std,6297.11601,0.081464,15.112719,16.381585,18.234413,58.218225,104.325796,53.352943,2.2213,1.824295,...,29.337598,3.465504,2.079302,28.318837,29.93236,1.449629,0.921979,2.098894,2.375526,19.025628
min,10009.25,0.330672,23.839246,443.225516,512.839507,1118.680627,1274.259049,941.904763,7.44669,5.555334,...,90.238279,10.168147,6.100888,82.293572,87.254479,14.706633,4.476232,24.093777,30.105282,0.0
25%,12418.226667,0.525135,53.580087,470.356451,552.167639,1292.087558,1586.844144,1083.940622,10.029028,7.61484,...,141.079604,16.173778,9.704267,131.401297,139.157986,18.139152,7.505072,26.26998,37.041535,15.0
50%,18701.56,0.584181,69.904482,483.56831,562.999947,1326.085813,1655.679304,1112.045291,11.735302,9.022708,...,157.366692,18.097691,10.858614,146.967991,155.631434,19.190287,8.177727,27.730773,38.841946,31.0
75%,23988.466667,0.6509,78.599733,497.89802,578.240512,1367.332798,1718.34856,1156.250371,13.741832,10.883932,...,182.954196,21.120214,12.672129,171.755794,181.808897,19.961255,8.764781,29.416528,40.346144,48.0
max,31015.186667,0.736659,86.307976,510.501427,608.63421,1454.87733,1874.654022,1265.845116,16.329258,12.240255,...,234.943782,27.261484,16.356891,222.130706,235.052587,25.913393,10.003073,36.341097,43.337365,75.0



### Overview

Let's take a quick look at the data: 

In [5]:
# Show the first rows of the training frame as the cell's rich output
df_train.head()



Unnamed: 0,alt,Mach,TRA,T2,T24,T30,T48,T50,P15,P2,...,W25,W31,W32,W48,W50,SmFan,SmLPC,SmHPC,phi,RUL
0,10755.846667,0.482464,77.419957,502.715524,600.842489,1438.543404,1817.226678,1224.041054,15.686092,11.488172,...,226.527619,26.267325,15.760395,213.955572,226.414508,17.07132,9.633927,25.468917,41.855555,74.0
1,13239.803333,0.522359,78.849927,497.183179,595.356518,1426.477576,1802.148702,1207.513483,14.645682,10.699411,...,213.349419,24.71065,14.82639,201.243117,212.977353,17.457839,9.453333,25.562107,41.637166,74.0
2,15451.223333,0.542607,79.364963,490.867389,588.145503,1410.437354,1781.089558,1189.760028,13.603001,9.930772,...,199.744282,23.103543,13.862126,188.110171,199.096147,17.639119,9.415637,25.694706,41.315347,74.0
3,17282.07,0.563765,79.655587,486.092689,582.603059,1397.532616,1763.726386,1175.010565,12.822429,9.360759,...,189.27096,21.866382,13.119829,178.00151,188.411436,17.824628,9.341039,25.844179,41.038986,74.0
4,18929.04,0.581075,80.191713,481.634278,577.668525,1386.604684,1749.818216,1162.702555,12.148701,8.859822,...,180.42038,20.820907,12.492544,169.465076,179.388207,17.922389,9.308881,25.943568,40.834236,74.0


## AutoML Configuration

Setup AutoMLConfig. More details regarding the choices of configurations may be found in the project's [README](README.md).

In [2]:
from azureml.train.automl import AutoMLConfig

# Run-level settings: time budget (minutes), number of parallel iterations,
# and the metric used to rank the candidate models
automl_settings = dict(
    experiment_timeout_minutes=30,
    max_concurrent_iterations=5,
    primary_metric="normalized_root_mean_squared_error",
)

# Regression on the "RUL" column of the registered training dataset,
# evaluated with 5-fold cross-validation on the remote compute target
automl_config = AutoMLConfig(
    task="regression",
    training_data=dataset_train,
    label_column_name="RUL",
    n_cross_validations=5,
    compute_target=compute_target,
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment; the returned run handle
# is used by the following cells (widget, best model, registration)
autoML_remote_run = experiment.submit(automl_config)

## Run Details

Here we use the `RunDetails` widget to show the different experiments.

In [15]:
from azureml.widgets import RunDetails

# Widget that shows the progress of the AutoML run and its child runs
run_details = RunDetails(run_instance=autoML_remote_run)
run_details.show()

# Block until the remote run has finished, so the cells below that fetch the best
# model also work under "Restart Kernel -> Run All"; the widget already shows the
# progress, so the textual log is suppressed.
autoML_remote_run.wait_for_completion(show_output=False)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

Let's get the best model from the automl experiments and display all the properties of the model.


### Get the best model

In [16]:
# get_output() yields a pair, unpacked here as the best run and its fitted model
best_run, fitted_model = autoML_remote_run.get_output()
# Print the fitted model's textual representation
print(fitted_model)



Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

### Check model's details: 

In [18]:
# Mapping of step name -> step object of the fitted model
fitted_model.named_steps

{'datatransformer': DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                 feature_sweeping_config=None, feature_sweeping_timeout=None,
                 featurization_config=None, force_text_dnn=None,
                 is_cross_validation=None, is_onnx_compatible=None, logger=None,
                 observer=None, task=None, working_dir=None),
 'stackensembleregressor': StackEnsembleRegressor(base_learners=[('0',
                                        Pipeline(memory=None,
                                                 steps=[('maxabsscaler',
                                                         MaxAbsScaler(copy=True)),
                                                        ('lightgbmregressor',
                                                         LightGBMRegressor(boosting_type='gbdt',
                                                                           class_weight=None,
                                                                         

In [19]:
# Inspect only the 'stackensembleregressor' step of the fitted model
fitted_model.named_steps['stackensembleregressor']

StackEnsembleRegressor(base_learners=[('0',
                                       Pipeline(memory=None,
                                                steps=[('maxabsscaler',
                                                        MaxAbsScaler(copy=True)),
                                                       ('lightgbmregressor',
                                                        LightGBMRegressor(boosting_type='gbdt',
                                                                          class_weight=None,
                                                                          colsample_bytree=1.0,
                                                                          importance_type='split',
                                                                          learning_rate=0.1,
                                                                          max_depth=-1,
                                                                          min_child_samples=20,
   

### Save model artifact 

In [26]:
import os
import pickle

# Folder and file that receive the serialized best model
model_artifact_folder = "model_artifacts"
model_file_path = os.path.join(model_artifact_folder, "autoML_RUL_prediction_model.pkl")

# Create the folder on first use; an existing folder is left untouched
os.makedirs(model_artifact_folder, exist_ok=True)

# Serialize the fitted model with pickle
with open(model_file_path, "wb") as model_file:
    pickle.dump(fitted_model, model_file)
    

In [27]:
!ls model_artifacts

autoML_RUL_prediction_model.pkl


## Model Deployment


### Register the model

In [None]:
# Register the best model of the AutoML run in the workspace
registered_model = autoML_remote_run.register_model(description="Model that predicts RUL of jet turbofan")
# register_model() returns a Model object, whose identifier attribute is `id`
# (`model_id` is an attribute of the AutoML run, not of the Model)
print(registered_model.id)

### Test the deployed endpoint

Deployment was done inside Azure ML studio.  

In [29]:
!python test_endpoint.py

b'"{\\"result\\": [59.77667311656396, 57.486508472066426]}"'


### Enable Application Insights and get logs

In [31]:
!python logs.py

2021-02-14T14:47:11,455096794+00:00 - iot-server/run 
2021-02-14T14:47:11,470352977+00:00 - rsyslog/run 
2021-02-14T14:47:11,470915980+00:00 - gunicorn/run 
2021-02-14T14:47:11,472791790+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

### Delete webservice

In [33]:
from azureml.core.webservice import Webservice

# Name of the endpoint to look up in the workspace
endpoint_name = "turbofan-rul-automl"

# Handle to the existing web service with that name
service = Webservice(ws, endpoint_name)

In [34]:
# Delete the web service looked up in the previous cell
service.delete()