# Azure Machine Learning - AutoML Experiment using v2 SDK

This sample can be run from an Azure ML compute instance using the "Python 3.10 - SDK v2" kernel.

### Retrieve source data from [GH Repo](https://github.com/ignavinuales/Battery_RUL_Prediction)

In [None]:
# Clone the GitHub repository containing the Battery RUL Prediction project into the
# notebook's working directory; later cells read from ./Battery_RUL_Prediction.
# The leading "!" hands the command to the system shell rather than to Python.
# NOTE(review): re-running this cell once the folder exists will presumably make git
# refuse to clone again - confirm, and skip the cell on re-runs if so.
! git clone https://github.com/ignavinuales/Battery_RUL_Prediction

### Import required packages

In [None]:
# Import necessary packages and modules  
from azure.ai.ml import MLClient, Input, automl
from azure.ai.ml.entities import Data, AmlCompute, Model, ModelPackage, CodeConfiguration, AzureMLOnlineInferencingServer, ManagedOnlineEndpoint, ManagedOnlineDeployment, Environment
from azure.ai.ml.constants import AssetTypes  
from azure.ai.ml import automl
from azure.identity import DefaultAzureCredential  
import mltable  
import pandas as pd    
import numpy as np    
import random

### Instantiate ml_client

In [None]:
# Build the credential first, then create the workspace client from the
# workspace configuration file
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Print the client object
print(ml_client)

### Create compute cluster for AutoML training

In [None]:
# Name of the CPU cluster that is reused if present and created otherwise
cpu_compute_target = "cpu-cluster"

try:
    # Succeeds only when a compute target with this name can be fetched
    ml_client.compute.get(cpu_compute_target)
except Exception:
    # Any failure of the lookup is treated as "cluster missing": create it
    print("Creating a new cpu compute target...")
    # Cluster definition: VM size plus the lower/upper instance bounds
    compute = AmlCompute(
        name=cpu_compute_target,
        size="STANDARD_D2_V2",
        min_instances=0,
        max_instances=4,
    )
    # Start the create/update operation and block until it has finished
    provisioning = ml_client.compute.begin_create_or_update(compute)
    provisioning.result()


### Split data into train/validation subsets

In [None]:
# This cell uses `os`, which the notebook's import cell does not provide;
# import it here so the cell does not fail with a NameError.
import os

# Collect every file in the downloaded HNEI_Processed directory whose name contains
# 'features'. The listing is sorted so the candidate pool does not depend on the
# order in which the filesystem happens to return directory entries.
data_dir = './Battery_RUL_Prediction/Datasets/HNEI_Processed'
files = sorted(os.path.join(data_dir, x) for x in os.listdir(data_dir) if 'features' in x)

# Hold out 2 randomly chosen files for validation and train on the rest.
# No random seed is set, so the split differs from run to run.
validation_files = random.sample(files, 2)
train_files = [x for x in files if x not in validation_files]

# Columns removed from both splits before the data is written out
drop_columns = ['Unnamed: 0', 'Cycle_Index']

# Read each split's files and concatenate them in a single call (instead of growing
# a DataFrame inside a loop), then drop the unneeded columns
validation_df = pd.concat([pd.read_csv(f) for f in validation_files]).drop(columns=drop_columns)
train_df = pd.concat([pd.read_csv(f) for f in train_files]).drop(columns=drop_columns)

# Display the first few rows of the training DataFrame
print(train_df.head())

# Create the output directories for the processed splits if they don't exist
os.makedirs('./validation', exist_ok=True)
os.makedirs('./train', exist_ok=True)

# Save both splits as CSV files without the DataFrame index
validation_df.to_csv('./validation/validation_data.csv', index=False)
train_df.to_csv('./train/train_data.csv', index=False)


### Register train/validation datasets into the Azure ML workspace for use in model training

In [None]:
# Asset metadata for the holdout (validation) split
validation_dataset_name = 'VALIDATION_BatteryCycles'
validation_dataset_description = 'Holdout dataset from battery cycle repo @https://github.com/ignavinuales/Battery_RUL_Prediction'

# Folder that receives the saved MLTable, and the CSV file the table is built from
validation_tbl_path = './validation'
validation_paths = [{'file':'./validation/validation_data.csv'}]

# Build an MLTable from the delimited file and save it into the folder
validation_tbl = mltable.from_delimited_files(paths=validation_paths)
validation_tbl.save(validation_tbl_path)

# Describe the folder as an MLTable data asset and create/update it in the workspace
validation_data = Data(
    name=validation_dataset_name,
    description=validation_dataset_description,
    path=validation_tbl_path,
    type=AssetTypes.MLTABLE,
)
validation_dataset = ml_client.data.create_or_update(validation_data)

# Fetch the asset again by name and by the version returned from the registration
validation_data_asset = ml_client.data.get(
    name=validation_dataset_name,
    version=validation_dataset.version,
)

# Load the registered asset into pandas and show its first rows as the cell output
validation_preview = mltable.load(f'azureml:/{validation_data_asset.id}').to_pandas_dataframe()
validation_preview.head()


In [None]:
# Asset metadata for the training split
train_dataset_name = 'TRAIN_BatteryCycles'
train_dataset_description = 'Training dataset from battery cycle repo @https://github.com/ignavinuales/Battery_RUL_Prediction'

# Folder that receives the saved MLTable, and the CSV file the table is built from
train_tbl_path = './train'
train_paths = [{'file':'./train/train_data.csv'}]

# Build an MLTable from the delimited file and save it into the folder
train_tbl = mltable.from_delimited_files(paths=train_paths)
train_tbl.save(train_tbl_path)

# Describe the folder as an MLTable data asset and create/update it in the workspace
train_data = Data(
    name=train_dataset_name,
    description=train_dataset_description,
    path=train_tbl_path,
    type=AssetTypes.MLTABLE,
)
train_dataset = ml_client.data.create_or_update(train_data)

# Fetch the asset again by name and by the version returned from the registration
train_data_asset = ml_client.data.get(
    name=train_dataset_name,
    version=train_dataset.version,
)

# Load the registered asset into pandas and show its first rows as the cell output
train_preview = mltable.load(f'azureml:/{train_data_asset.id}').to_pandas_dataframe()
train_preview.head()


### Set up and run an AutoML job

Note: You can modify the AutoML task configuration here, including the number of trials, the timeout limits, and which models are allowed or blocked.

In [None]:
# Reference the registered training and validation assets by name and version.
# The asset type is stated explicitly: both assets were registered with
# type=AssetTypes.MLTABLE, and Input would otherwise fall back to its default type.
training_input = Input(
    type=AssetTypes.MLTABLE,
    path=f'azureml:{train_dataset_name}:{train_dataset.version}',
)
validation_input = Input(
    type=AssetTypes.MLTABLE,
    path=f'azureml:{validation_dataset_name}:{validation_dataset.version}',
)

# Configure the AutoML regression job: compute target, experiment name, training and
# validation data, the target column to predict, the primary metric used to rank
# models, and model explainability
regression_job = automl.regression(
    compute=cpu_compute_target,
    experiment_name='Battery_Cycle_RUL_Prediction',
    training_data=training_input,
    validation_data=validation_input,
    target_column_name="RUL",
    primary_metric="r2_score",
    enable_model_explainability=True,
)

# Limits: overall timeout, per-trial timeout, maximum number of trials, early termination
regression_job.set_limits(
    timeout_minutes=600,
    trial_timeout_minutes=20,
    max_trials=25,
    enable_early_termination=True,
)

# Training settings: enable ONNX-compatible models
regression_job.set_training(
    enable_onnx_compatible_models=True
)

# Featurization mode 'auto'
regression_job.set_featurization(
    mode='auto'
)

# Submit the configured job to the workspace and print the returned job details
returned_job = ml_client.jobs.create_or_update(
    regression_job
)

print(f"Created job: {returned_job}")

# Studio endpoint for monitoring the job; as the last expression of the cell it is
# shown as the cell output
returned_job.services["Studio"].endpoint


### Monitor AutoML job progress

In [None]:
import time  # Provides sleep() for the polling interval

# Statuses after which the job makes no further progress. 'Canceled' is included so
# that a cancelled job does not leave this loop polling forever.
terminal_states = {'Completed', 'Failed', 'Canceled'}

# Poll the job until it reaches a terminal status
while True:
    # Jobs are looked up by their name (the identifier), not by the display name,
    # which is a separate, editable label
    status = ml_client.jobs.get(returned_job.name).status
    print(status)  # Print the current status
    if status in terminal_states:
        break  # Stop immediately instead of sleeping once more after the job has ended
    time.sleep(30)  # Wait 30 seconds before checking again

### Retrieve and register best performing model

Note: This is where you could add custom champion vs. challenger logic before registering the model.

In [None]:
import mlflow
from mlflow.tracking.client import MlflowClient  # Client used to query runs on the tracking server
from mlflow.artifacts import download_artifacts  # Used by a later cell to fetch run artifacts

# Look up the current workspace and read its MLflow tracking URI
workspace = ml_client.workspaces.get(name=ml_client.workspace_name)
mlflow_tracking_uri = workspace.mlflow_tracking_uri

# Show the URI that was obtained
print(mlflow_tracking_uri)

# Point MLflow at the workspace's tracking URI
mlflow.set_tracking_uri(mlflow_tracking_uri)

# Read the URI back from MLflow to verify that it has been applied
print("\nCurrent tracking uri: {}".format(mlflow.get_tracking_uri()))

# Create the client used in the following cells to retrieve runs
mlflow_client = MlflowClient()


In [None]:
# Get the parent run.
# The run is fetched by the job's name (its identifier); the display name is a
# separate label and is not guaranteed to match the run id.
mlflow_parent_run = mlflow_client.get_run(returned_job.name)

# Print information about the parent run for verification
print("Parent Run: ")
print(mlflow_parent_run)  # The entire parent run object
print(mlflow_parent_run.data.tags)  # The tags attached to the parent run

# The id of the best child run is read from the parent run's tags
best_child_run_id = mlflow_parent_run.data.tags["automl_best_child_run_id"]
print("Found best child run id: ", best_child_run_id)

# Retrieve the best child run's details using its id
best_run = mlflow_client.get_run(best_child_run_id)

# Print information about the best child run for verification
print("Best child run: ")
print(best_run)  # The entire best child run object


In [None]:
# Name under which the model is registered
model_name = "battery-cycle-rul-prediction"

# Location of the MLflow model artifacts within the best run's outputs
best_model_path = f"azureml://jobs/{best_run.info.run_id}/outputs/artifacts/outputs/mlflow-model/"

# Model definition: name, artifact path, MLflow model type, and a description
model = Model(
    name=model_name,
    path=best_model_path,
    type=AssetTypes.MLFLOW_MODEL,
    description="Regression model for predicting remaining useful life of a battery based on current cycle data",
)

# Create or update the model in the workspace; the returned object is used by the
# deployment cell for the model name and version
registered_model = ml_client.models.create_or_update(model)

### Retrieve a copy of the best performing model and download locally

In [None]:
 # Create local folder  
# This cell uses `os`, which the notebook's import cell does not provide;
# import it here so the cell does not fail with a NameError.
import os

# Local directory that receives the downloaded artifacts
local_dir = "./artifact_downloads"
# Create the directory if needed; exist_ok makes re-running the cell safe and avoids
# the separate "exists?" check (same approach as the train/validation folders)
os.makedirs(local_dir, exist_ok=True)

# Download the "outputs" artifact folder of the best child run into local_dir
local_path = download_artifacts(
    run_id=best_run.info.run_id,  # The run ID of the best child run
    artifact_path="outputs",  # The artifact path within the run to download
    dst_path=local_dir,  # The destination path where artifacts are saved locally
)
# Print the local path the artifacts were downloaded to
print("Artifacts downloaded in: {}".format(local_path))
# List the contents of the downloaded artifacts directory
print("Artifacts: {}".format(os.listdir(local_path)))


### Create a reusable environment based on the conda YAML definition 

In [None]:
# Environment definition for the deployment: a base Docker image plus the conda
# specification found in the downloaded artifacts
environment_spec = Environment(
    name=f"{model_name}-env",  # Environment name derived from the model name
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04",  # Base Docker image
    conda_file="./artifact_downloads/outputs/conda_env_v_1_0_0.yml",  # Conda dependency file
)

# Create or update the environment in the workspace
base_environment = ml_client.environments.create_or_update(environment_spec)


### Create a local deployment of the model (running inside docker) for inferencing

Here we can use our model in a managed online endpoint (running locally on our compute instance). If the model performs as expected, we can optionally update our deployment to target cloud resources.

In [None]:
# Names of the online endpoint and of the deployment used for local model testing
endpoint_name = 'battery-cycle-rul-endpoint'
deployment_name = "local-model-testing"

# Define the endpoint and create/update it locally (local=True)
endpoint = ManagedOnlineEndpoint(name=endpoint_name)
endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint, local=True)

# Scoring code: the downloaded artifacts folder and the scoring script inside it
scoring_code = CodeConfiguration(
    code="./artifact_downloads/outputs",
    scoring_script="scoring_file_v_1_0_0.py",
)

# Reference to the registered model by name and version
registered_model_ref = f'azureml:{registered_model.name}:{registered_model.version}'

# Deployment definition tying together endpoint, model, environment and scoring code
deployment_package = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=endpoint_name,
    model=registered_model_ref,
    environment=base_environment,
    code_configuration=scoring_code,
)

# Create/update the deployment locally (local=True)
ml_client.online_deployments.begin_create_or_update(deployment_package, local=True)

# Fetch the local endpoint again and read its scoring URI, which the request cell
# uses to send scoring requests
endpoint = ml_client.online_endpoints.get(endpoint_name, local=True)
scoring_uri = endpoint.scoring_uri

# Print out the scoring URI
print(scoring_uri)


### Submit a request to local endpoint

In [None]:
import requests  # HTTP client used to call the scoring endpoint
import json  # JSON encoding/decoding of the request and response

# Build the request frame: the validation rows without the 'RUL' target column.
# drop=True discards the old index directly instead of materialising it as an
# 'index' column that then has to be dropped again.
hold_df = validation_df.reset_index(drop=True).drop(columns=['RUL'])

# POST the rows as JSON records under the 'data' key
resp = requests.post(
    scoring_uri,
    data=json.dumps({'data': hold_df.to_dict(orient='records')}),
    headers={'Content-Type': 'application/json'},
)

# Fail with a clear HTTP error when the endpoint rejects the request, rather than
# with a confusing decode/KeyError while parsing an error body below
resp.raise_for_status()

# The response is decoded twice: resp.json() and then json.loads() on its result.
# NOTE(review): this presumably means the scoring script returns a JSON-encoded
# string - confirm against scoring_file_v_1_0_0.py.
result = json.loads(resp.json())['result']

# Attach the predictions to a re-indexed copy of the validation data
# (reset_index returns a new frame, so validation_df itself is not modified)
result_df = validation_df.reset_index()
result_df['Predicted RUL'] = result

# Display the dataframe with predictions as the cell output
result_df


### Plot actual RUL vs. predicted RUL

In [None]:
import matplotlib.pyplot as plt  # Plotting library

# Scatter plot of actual RUL (x-axis) against predicted RUL (y-axis),
# drawn on an explicitly created figure/axes pair
fig, ax = plt.subplots()
ax.scatter(result_df['RUL'], result_df['Predicted RUL'])

# Axis labels
ax.set_xlabel('Actual RUL')
ax.set_ylabel('Predicted RUL')

# Display the plot
plt.show()
