# Step 01 - Load Data from Delta and Train MLflow Model

This notebook loads Delta data by path from an Azure ML-registered ADLS Gen2 datastore. The data is used to train a regression model (`XGBRegressor`) that predicts power consumption from external factors. The model is saved in MLflow format and added to a target MLflow registry (either Databricks or Azure ML).

### Import required packages

In [None]:
import pandas as pd
from mltable import from_delta_lake
import xgboost as xgb
from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
import mlflow.xgboost

### Load data from Delta

In [1]:
# Location of the Delta table (Databricks Delta data) on the ADLS Gen2 data lake
# linked to the Azure ML workspace. Fill in the <...> placeholders before running.
delta_table_uri = "azureml://subscriptions/<SUB>/resourcegroups/<RG>/workspaces/<WORKSPACE>/datastores/<DATASTORE>/paths/<PATH>/"

# Build the MLTable definition from the Delta table, then materialise it in memory as pandas
mltable_ts = from_delta_lake(delta_table_uri=delta_table_uri)
df = mltable_ts.to_pandas_dataframe()

# Last expression of the cell: renders the loaded frame as the cell output
df

Unnamed: 0,DateTime,Temperature,Humidity,Wind-Speed,general-diffuse-flows,diffuse-flows,Zone-1-Power-Consumption,Zone-2--Power-Consumption,Zone-3--Power-Consumption
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.69620,16128.87538,20240.96386
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.070,0.085,29814.68354,19375.07599,20131.08434
2,2017-01-01 00:20:00,6.313,74.5,0.080,0.062,0.100,29128.10127,19006.68693,19668.43373
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.69620,17872.34043,18442.40964
...,...,...,...,...,...,...,...,...,...
39307,2017-09-30 23:10:00,18.420,86.9,4.918,0.084,0.085,35426.54867,20204.15800,16256.26149
39308,2017-09-30 23:20:00,18.400,87.1,4.920,0.062,0.119,34738.40708,19785.03119,15973.85087
39309,2017-09-30 23:30:00,18.440,87.5,4.917,0.055,0.152,34196.81416,19246.15385,15620.83759
39310,2017-09-30 23:40:00,18.360,87.7,4.919,0.062,0.144,33438.58407,18789.60499,15173.68744


### Prep data 
Target a single power consumption zone (Zone 1) for regression; the remaining zone columns and the timestamp are excluded from the features.

In [2]:

# All per-zone power consumption columns. Every one of them is removed from the
# feature set (not only the target), together with the raw timestamp column.
zone_cols = [col for col in df.columns if 'Zone' in col]
non_feature_cols = zone_cols + ['DateTime']

# Target: Zone 1 consumption. Features: whatever remains after the exclusions.
y = df['Zone-1-Power-Consumption']
X = df.drop(columns=non_feature_cols)

# Random 80/20 train/test split; the fixed seed keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Create MLflow experiment, run, and register model

In [None]:
# Name of the MLflow experiment under which the training run below is recorded
experiment_name = "zone-power-regression-training"
mlflow.set_experiment(experiment_name)


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import mlflow
import mlflow.sklearn
import numpy as np
# Train the XGBoost regressor, log evaluation metrics, and register the model.
#
# Autologging: XGBoost's scikit-learn estimators (XGBRegressor) are covered by
# mlflow.xgboost.autolog(); mlflow.sklearn.autolog() only patches scikit-learn's
# own estimators, so it records nothing for this model. log_models=False because
# the model is logged explicitly below, which avoids a duplicate model artifact.
mlflow.xgboost.autolog(log_models=False)

# Bind the run handle once instead of looking it up via mlflow.active_run() later
with mlflow.start_run() as run:

    # Define the model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the held-out test set
    predictions = model.predict(X_test)

    # Calculate test-set metrics (RMSE is derived from MSE)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mse)

    # Log the test-set metrics under explicit, stable names
    test_metrics = {"mse": mse, "mae": mae, "r2": r2, "rmse": rmse}
    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    print("Model trained with the following metrics:")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")
    print(f"RMSE: {rmse}")

    # Save the model in MLflow format under the run's "model" artifact path.
    # The sklearn flavour is kept unchanged so existing consumers of the
    # registered model are not affected.
    mlflow.sklearn.log_model(model, 'model')

    # Register the logged model in the MLflow model registry
    model_uri = f"runs:/{run.info.run_id}/model"
    print(model_uri)
    print(run)
    mlflow.register_model(model_uri, "zone1-power-consumption-xgboost-model")