# Azure Machine Learning - AutoML Experiment using v2 SDK

This sample can be run on an Azure ML compute instance using the "Python 3.10 - SDK v2" kernel.

### Retrieve source data from [GH Repo](https://github.com/ignavinuales/Battery_RUL_Prediction)

In [None]:
# Clone the Battery RUL Prediction repository from GitHub.
# The leading `!` is IPython/Jupyter syntax: the rest of the line is handed to the system shell.
# No target directory is given, so git creates ./Battery_RUL_Prediction under the current
# working directory, which is where the data-split cell looks for the processed CSV files.
! git clone https://github.com/ignavinuales/Battery_RUL_Prediction  

### Import required packages

In [None]:
# Import necessary packages and modules  
import os
import random

import mltable
import numpy as np
import pandas as pd
from azure.ai.ml import MLClient, Input, automl
from azure.ai.ml import automl
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data, AmlCompute
from azure.identity import DefaultAzureCredential

### Instantiate ml_client

In [None]:
# Build the credential first, then let MLClient read the workspace details
# from the local configuration file
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Print the client object as a quick confirmation that it was created
print(ml_client)

### Create compute cluster for AutoML training

In [None]:
# Name of the CPU cluster used for AutoML training; it is reused when it
# already exists and created otherwise
cpu_compute_target = "cpu-cluster"

try:
    # A successful lookup means the cluster is already there; nothing else to do
    ml_client.compute.get(cpu_compute_target)
except Exception:
    # NOTE(review): every failure of the lookup (not only "cluster not found")
    # ends up here and triggers a creation attempt.
    print("Creating a new cpu compute target...")
    # Cluster definition: D2 v2 nodes, sized between 0 and 4 instances
    compute = AmlCompute(
        name=cpu_compute_target,
        size="STANDARD_D2_V2",
        min_instances=0,
        max_instances=4,
    )
    # Start provisioning, then block on the poller until the operation finishes
    creation_poller = ml_client.compute.begin_create_or_update(compute)
    creation_poller.result()


### Split data into train/validation subsets

In [None]:
# Split the processed feature CSVs into a training set and a validation set.
# Two whole files are held out for validation; every other file is used for training.
# Each split is written to its own folder as a single CSV.


def _load_split(paths):
    """Read every CSV in *paths*, stack the rows, and drop the unused columns.

    All files are read first and concatenated in one call, which avoids
    repeatedly copying a growing DataFrame (and concatenating onto an empty one).
    """
    combined = pd.concat([pd.read_csv(p) for p in paths])
    # 'Unnamed: 0' is presumably a saved row index; neither column is kept for training
    return combined.drop(columns=['Unnamed: 0', 'Cycle_Index'])


# Collect the files in the cloned repository whose name contains 'features'.
# Sorting keeps the file order (and therefore the row order of the output CSVs)
# independent of the arbitrary order returned by os.listdir.
data_dir = './Battery_RUL_Prediction/Datasets/HNEI_Processed'
files = sorted(os.path.join(data_dir, x) for x in os.listdir(data_dir) if 'features' in x)

# Two files go to validation, so at least one more is required for training
if len(files) < 3:
    raise ValueError(
        f"Expected at least 3 'features' files in {data_dir}, found {len(files)}"
    )

# Randomly select 2 files for validation and use the rest for training
validation_files = random.sample(files, 2)
train_files = [x for x in files if x not in validation_files]

# Load both splits
validation_df = _load_split(validation_files)
train_df = _load_split(train_files)

# Display the first few rows of the training DataFrame
print(train_df.head())

# Create the output folders if they don't exist yet
os.makedirs('./validation', exist_ok=True)
os.makedirs('./train', exist_ok=True)

# Save each split as a CSV without the DataFrame index
validation_df.to_csv('./validation/validation_data.csv', index=False)
train_df.to_csv('./train/train_data.csv', index=False)


In [None]:
# Register the holdout CSV as an MLTable data asset in the workspace.

# Asset name and description
validation_dataset_name = "VALIDATION_BatteryCycles"
validation_dataset_description = (
    "Holdout dataset from battery cycle repo "
    "@https://github.com/ignavinuales/Battery_RUL_Prediction"
)

# Folder the MLTable is saved to, and the CSV file the table is built from
validation_tbl_path = "./validation"
validation_paths = [{"file": "./validation/validation_data.csv"}]

# Build the MLTable from the delimited file and save it to the folder above
validation_tbl = mltable.from_delimited_files(paths=validation_paths)
validation_tbl.save(validation_tbl_path)

# Describe the asset, then create it (or a new version of it) in the workspace
validation_data = Data(
    name=validation_dataset_name,
    description=validation_dataset_description,
    type=AssetTypes.MLTABLE,
    path=validation_tbl_path,
)
validation_dataset = ml_client.data.create_or_update(validation_data)

# Fetch the registered asset by name and by the version that was just returned
validation_data_asset = ml_client.data.get(
    name=validation_dataset_name,
    version=validation_dataset.version,
)

# Load the registered asset back and display its first rows as a sanity check
validation_asset_uri = f"azureml:/{validation_data_asset.id}"
mltable.load(validation_asset_uri).to_pandas_dataframe().head()


In [None]:
# Register the training CSV as an MLTable data asset in the workspace.

# Asset name and description
train_dataset_name = "TRAIN_BatteryCycles"
train_dataset_description = (
    "Training dataset from battery cycle repo "
    "@https://github.com/ignavinuales/Battery_RUL_Prediction"
)

# Folder the MLTable is saved to, and the CSV file the table is built from
train_tbl_path = "./train"
train_paths = [{"file": "./train/train_data.csv"}]

# Build the MLTable from the delimited file and save it to the folder above
train_tbl = mltable.from_delimited_files(paths=train_paths)
train_tbl.save(train_tbl_path)

# Describe the asset, then create it (or a new version of it) in the workspace
train_data = Data(
    name=train_dataset_name,
    description=train_dataset_description,
    type=AssetTypes.MLTABLE,
    path=train_tbl_path,
)
train_dataset = ml_client.data.create_or_update(train_data)

# Fetch the registered asset by name and by the version that was just returned
train_data_asset = ml_client.data.get(
    name=train_dataset_name,
    version=train_dataset.version,
)

# Load the registered asset back and display its first rows as a sanity check
train_asset_uri = f"azureml:/{train_data_asset.id}"
mltable.load(train_asset_uri).to_pandas_dataframe().head()


In [None]:
# Configure and submit an AutoML regression job that predicts the "RUL" column
# from the registered training asset, scored against the registered validation asset.

# Reference both data assets by name and version. The type is stated explicitly
# because the assets are MLTables and Input does not default to that type.
training_input = Input(
    type=AssetTypes.MLTABLE,
    path=f'azureml:{train_dataset_name}:{train_dataset.version}',
)
validation_input = Input(
    type=AssetTypes.MLTABLE,
    path=f'azureml:{validation_dataset_name}:{validation_dataset.version}',
)

# Core job definition: compute target, experiment name, data, target column,
# the metric used to rank models, and model explainability switched on
regression_job = automl.regression(
    compute=cpu_compute_target,
    experiment_name='Battery_Cycle_RUL_Prediction',
    training_data=training_input,
    validation_data=validation_input,
    target_column_name="RUL",
    primary_metric="r2_score",
    enable_model_explainability=True,
)

# Budget: overall timeout, per-trial timeout, maximum number of trials, and early termination
regression_job.set_limits(
    timeout_minutes=600,
    trial_timeout_minutes=20,
    max_trials=15,
    enable_early_termination=True,
)

# Restrict training to models that can be exported to ONNX
regression_job.set_training(
    enable_onnx_compatible_models=True
)

# Let AutoML handle featurization automatically
regression_job.set_featurization(
    mode='auto'
)

# Submit the job to the workspace and print the returned job details
returned_job = ml_client.jobs.create_or_update(
    regression_job
)

print(f"Created job: {returned_job}")

# Studio URL for monitoring the run; as the last expression of the cell it is
# displayed by the notebook
returned_job.services["Studio"].endpoint
