# Model Training with K-Folds Cross-Validation

In [1]:
%load_ext autoreload
%autoreload 2

import joblib
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import pandas as pd

from pathlib import Path
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, mean_absolute_error
from typing import Dict, Tuple

from config import MLFLOW_TRACKING_URI

## Load Data

More information about the dataset can be found in UCI machine learning repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [2]:
# Download original dataset with: python src/pipelines/load_data.py 

raw_data = pd.read_csv(f"../data/raw_data.csv")

raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


## Define column mapping

In [3]:
target = 'cnt'
prediction = 'prediction'
datetime = 'dteday'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', ]

# Define the comparison windows

In [4]:
start_date_0 = '2011-01-02 00:00:00'
end_date_0 = '2011-01-30 23:00:00'

experiment_batches = [
    
    ('2011-01-31 00:00:00','2011-02-06 23:00:00'),
    ('2011-02-07 23:00:00','2011-02-13 23:00:00'),
    ('2011-02-14 23:00:00','2011-02-20 23:00:00'),
    ('2011-02-21 00:00:00','2011-02-27 23:00:00'),
    ('2011-02-28 00:00:00','2011-03-06 23:00:00'),  
]

## Define the Reference data

In [5]:
# Set datetime index 
raw_data = raw_data.set_index('dteday')

# Define the reference dataset
reference = raw_data.loc[start_date_0:end_date_0]

print(reference.shape)
reference.head()

(617, 16)


Unnamed: 0_level_0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-03,48,1,0,1,0,0,1,1,1,0.22,0.197,0.44,0.3582,0,5,5
2011-01-03,49,1,0,1,1,0,1,1,1,0.2,0.1667,0.44,0.4179,0,2,2
2011-01-03,50,1,0,1,4,0,1,1,1,0.16,0.1364,0.47,0.3881,0,1,1
2011-01-03,51,1,0,1,5,0,1,1,1,0.16,0.1364,0.47,0.2836,0,3,3
2011-01-03,52,1,0,1,6,0,1,1,1,0.14,0.1061,0.5,0.3881,0,30,30


# MLFlow

## MLflow: Log metrics by steps or timestamps

In [6]:
import time
import datetime

# Set up MLFlow Client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Set experiment name
mlflow.set_experiment('Metrics - Steps')

# Set experiment variables
model_path = Path('../models/model.joblib')
ref_end_data = end_date_0

# Start a new MLflow Run
with mlflow.start_run() as run: 

    # Run model train for each batch (K-Fold)
    for k, date in enumerate(experiment_batches):

        # Calculate timestamp
        timestamp = time.mktime(datetime.datetime.strptime(date[1], "%Y-%m-%d %H:%M:%S").timetuple())
    
        print(f"Train period: {start_date_0} - {ref_end_data}") 
        X_train = raw_data.loc[start_date_0:ref_end_data, numerical_features + categorical_features]
        y_train = raw_data.loc[start_date_0:ref_end_data, target]
        print("X_train (reference) dataset shape: ", X_train.shape, y_train.shape)
        
        print(f"Test period: {date[0]} - {date[1]}") 
        current = raw_data.loc[date[0]:date[1]]
        X_test = current.loc[:, numerical_features + categorical_features]
        y_test = current[target]
        print("X_test (current)) dataset shape: ",  X_test.shape, y_test.shape)
        
        # Update reference end date
        ref_end_data = date[1]
        
        # Train model
        regressor = ensemble.RandomForestRegressor(random_state = 0, n_estimators = 50)
        regressor.fit(X_train, y_train)
    
        # Calculate metrucs
        preds = regressor.predict(X_test)
        me = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        # print(me, mae)
        
        
        # Log metrics (use Client)
        # >>> 'timestamp' - Time when this metric was calculated. Defaults to the current system time
        # >>> 'step' -  Integer training step (iteration) at which was the metric calculated. Defaults to 0.
        client.log_metric(run.info.run_id, 'me', round(me, 3), timestamp=int(timestamp)*1000)
        client.log_metric(run.info.run_id, 'mae', round(mae, 3), step=k)

    # Log model 
    mlflow.log_artifact(model_path)

    # Log parameters
    mlflow.log_param("begin", date[0])
    mlflow.log_param("end", date[1])
    

Client tracking uri: http://localhost:5000
Train period: 2011-01-02 00:00:00 - 2011-01-30 23:00:00
X_train (reference) dataset shape:  (617, 10) (617,)
Test period: 2011-01-31 00:00:00 - 2011-02-06 23:00:00
X_test (current)) dataset shape:  (141, 10) (141,)
Train period: 2011-01-02 00:00:00 - 2011-02-06 23:00:00
X_train (reference) dataset shape:  (782, 10) (782,)
Test period: 2011-02-07 23:00:00 - 2011-02-13 23:00:00
X_test (current)) dataset shape:  (139, 10) (139,)
Train period: 2011-01-02 00:00:00 - 2011-02-13 23:00:00
X_train (reference) dataset shape:  (945, 10) (945,)
Test period: 2011-02-14 23:00:00 - 2011-02-20 23:00:00
X_test (current)) dataset shape:  (141, 10) (141,)
Train period: 2011-01-02 00:00:00 - 2011-02-20 23:00:00
X_train (reference) dataset shape:  (1110, 10) (1110,)
Test period: 2011-02-21 00:00:00 - 2011-02-27 23:00:00
X_test (current)) dataset shape:  (134, 10) (134,)
Train period: 2011-01-02 00:00:00 - 2011-02-27 23:00:00
X_train (reference) dataset shape:  (12

## MLFlow <> Optuna integration

Example source: https://www.kaggle.com/code/omarkhd99/bike-sharing-demand#XGBoost-+-Optuna

In [7]:
!pip install optuna
!pip install xgboost



In [8]:
import optuna
from optuna.integration.mlflow import MLflowCallback

In [8]:
from sklearn.metrics import make_scorer
import numpy as np

def RMSLE(y_real, y_pred):
    L1 = np.log1p(np.exp(y_real))
    L2 = np.log1p(np.exp(y_pred))    
    calc = (L1 - L2) ** 2
    return np.sqrt(np.mean(calc))

# RMSLE score 
rmsle_score=make_scorer(RMSLE)

In [9]:
from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler

def objective_func(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth',2, 6),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 20, 100),
    }
    model=XGBRegressor(**params)
    xgb_model = model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, early_stopping_rounds=25)
    score = RMSLE(y_test, xgb_model.predict(X_test))
    return score 

### Not-Nested


In [10]:
# Set up MLFlow Client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Set experiment name
mlflow.set_experiment('XGBoost-Optuna')

mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name="rmsle",
    mlflow_kwargs={
        # "experiment_id": run.info.experiment_id,
        "nested": True,
    }
)

# Optuna Study
sampler = TPESampler(seed=10)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective_func, n_trials=50, callbacks=[mlflc])

# Best results 
trial = study.best_trial
trial_params = trial.params
print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))



2023/09/26 15:51:04 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost-Optuna' does not exist. Creating a new experiment.


Client tracking uri: http://localhost:5000


NameError: name 'MLflowCallback' is not defined

### Nested 

In [None]:
# Set up MLFlow Client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Set experiment name
mlflow.set_experiment('XGBoost-Optuna-Nested')

# Start a new Run (Parent Run)
with mlflow.start_run() as run: 

    # Show newly created run metadata info
    print("Experiment id: {}".format(run.info.experiment_id))
    print("Run id: {}".format(run.info.run_id))
    print("Run name: {}".format(run.info.run_name))
    print('MLFlow tracking uri:', mlflow.get_tracking_uri())
    print('MLFlow artifact uri:', mlflow.get_artifact_uri())

    mlflc = MLflowCallback(
        tracking_uri=MLFLOW_TRACKING_URI,
        metric_name="rmsle",
        mlflow_kwargs={
            "experiment_id": run.info.experiment_id,
            "nested": True,
        }
    )

    # Optuna Study
    sampler = TPESampler(seed=10)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective_func, n_trials=50,  callbacks=[mlflc])

    # Best results
    trial = study.best_trial
    trial_params = trial.params
    print('Best Trial: score {},\nparams {}'.format(trial.value, trial_params))