# Model Training with K-Folds Cross-Validation

In [None]:
%load_ext autoreload
%autoreload 2

import joblib
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import pandas as pd

from pathlib import Path
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, mean_absolute_error
from typing import Dict, Tuple

# Prepare Data

## Load Data

More information about the dataset can be found in the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [None]:
# The raw CSV is produced beforehand by running: python src/load_data.py
raw_data_path = "../data/raw_data.csv"
raw_data = pd.read_csv(raw_data_path)

# Preview the first rows
raw_data.head()

## Define column mapping

In [None]:
# Column roles shared by the cells below
target = "cnt"              # column the regressor is trained to predict
prediction = "prediction"   # name reserved for a predictions column
datetime = "dteday"         # timestamp column

# Model inputs, split by feature type
numerical_features = [
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "mnth",
    "hr",
    "weekday",
]
categorical_features = [
    "season",
    "holiday",
    "workingday",
]

## Define the comparison windows

In [None]:
# Reference (initial training) window
start_date_0 = '2011-01-02 00:00:00'
end_date_0 = '2011-01-30 23:00:00'

# Consecutive one-week evaluation windows as (start, end) pairs.
# Every window runs from 00:00 on its first day to 23:00 on its last day so
# that the windows are contiguous and no hours are dropped between them.
experiment_batches = [
    ('2011-01-31 00:00:00', '2011-02-06 23:00:00'),
    ('2011-02-07 00:00:00', '2011-02-13 23:00:00'),
    ('2011-02-14 00:00:00', '2011-02-20 23:00:00'),
    ('2011-02-21 00:00:00', '2011-02-27 23:00:00'),
    ('2011-02-28 00:00:00', '2011-03-06 23:00:00'),
]

## Define the Reference data

In [None]:
# Index the data by timestamp so that date ranges can be selected with .loc.
# Guarded so the cell can be re-run: once 'dteday' is the index it is no
# longer a column, and a second set_index call would raise a KeyError.
if raw_data.index.name != 'dteday':
    raw_data = raw_data.set_index('dteday')

# Reference dataset: the rows from start_date_0 through end_date_0
reference = raw_data.loc[start_date_0:end_date_0]

print(reference.shape)
reference.head()

# MLflow Client

## Set up MLflow

In [None]:
# Address of the MLflow tracking server used by this notebook
MLFLOW_TRACKING_URI = "http://localhost:5000"

# Register the URI globally, then create a client and report where it points
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print("Client tracking uri: {}".format(client.tracking_uri))

In [None]:
# Show the client object as the cell output
client

## Create a new Experiment

To keep the runs of a particular project or idea together, we can define an Experiment that groups every iteration (run) under one name. 
Choosing a unique name that is relevant to what we're working on helps with organization and reduces the amount of searching needed to find our runs later on. 

In [None]:
# Get an experiment by name if it exists.
# get_experiment_by_name returns an Experiment object (or None when there is
# no experiment with that name), so the id has to be read from the object:
# later cells pass `experiment_id` to client.get_experiment, which expects an id.
existing_experiment = client.get_experiment_by_name('1-Train-K-Fold')
if existing_experiment is None:
    experiment_id = None
else:
    experiment_id = existing_experiment.experiment_id

experiment_id

In [None]:
# Create the experiment if it doesn't exist, otherwise reuse the existing one.
# Either way `experiment_id` ends up holding the experiment's id (not an
# Experiment object), which is what client.get_experiment expects.
experiment_name = '1-Train-K-Fold'
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

experiment_id

In [None]:
# Look the experiment up by id and print its metadata, one field per line
experiment = client.get_experiment(experiment_id)
experiment_summary = (
    ("Name", experiment.name),
    ("Experiment_id", experiment.experiment_id),
    ("Artifact Location", experiment.artifact_location),
    ("Tags", experiment.tags),
    ("Lifecycle_stage", experiment.lifecycle_stage),
)
for label, value in experiment_summary:
    print(f"{label}: {value}")

# Metrics Tracking for K-Fold Experiments

In [None]:
# Set experiment (MLflow creates it if it doesn't exist)
mlflow.set_experiment('1-Train-K-Fold')

# Set experiment variables
model_path = Path('../models/model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)  # joblib.dump needs the folder to exist
feature_columns = numerical_features + categorical_features
ref_end_data = end_date_0

# Train one model per batch: fit on everything from start_date_0 up to the
# current reference end date, evaluate on the batch, then extend the
# reference window to include the batch.
for batch_start, batch_end in experiment_batches:

    print(f"Train period: {start_date_0} - {ref_end_data}")
    X_train = raw_data.loc[start_date_0:ref_end_data, feature_columns]
    y_train = raw_data.loc[start_date_0:ref_end_data, target]
    print("X_train (reference) dataset shape: ", X_train.shape, y_train.shape)

    print(f"Test period: {batch_start} - {batch_end}")
    current = raw_data.loc[batch_start:batch_end]
    X_test = current.loc[:, feature_columns]
    y_test = current[target]
    print("X_test (current)) dataset shape: ",  X_test.shape, y_test.shape)

    # Train model
    regressor = ensemble.RandomForestRegressor(random_state=0, n_estimators=50)
    regressor.fit(X_train, y_train)

    # Calculate metrics.
    # NOTE: 'me' holds the mean squared error; the name is kept unchanged so
    # the logged metric key stays the same.
    preds = regressor.predict(X_test)
    me = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    # Start a new MLflow Run
    with mlflow.start_run() as run:

        # Show newly created run metadata info
        print("Experiment id: {}".format(run.info.experiment_id))
        print("Run id: {}".format(run.info.run_id))
        print("Run name: {}".format(run.info.run_name))
        print('MLFlow tracking uri:', mlflow.get_tracking_uri())
        print('MLFlow artifact uri:', mlflow.get_artifact_uri())

        # Log parameters
        mlflow.log_param("begin", batch_start)
        mlflow.log_param("end", batch_end)

        # Log metrics
        mlflow.log_metric('me', round(me, 3))
        mlflow.log_metric('mae', round(mae, 3))

        # Persist the model trained for this batch before logging it;
        # otherwise the artifact would be a stale or missing file.
        joblib.dump(regressor, model_path)
        mlflow.log_artifact(model_path)

    # Update reference end date
    ref_end_data = batch_end
    

# Nested Runs

In [None]:
# Point MLflow at the tracking server again and build a fresh client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print("Client tracking uri: {}".format(client.tracking_uri))

In [None]:

# Make '2-Nested-Runs' the active experiment for the runs started below
mlflow.set_experiment(experiment_name='2-Nested-Runs')

In [None]:

# Set experiment variables
model_path = Path('../models/model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)  # joblib.dump needs the folder to exist
feature_columns = numerical_features + categorical_features
ref_end_data = end_date_0

# Start a new Run (Parent Run)
with mlflow.start_run() as run:

    # Per-fold metrics, keyed by the end date of the fold's test batch
    metrics = {}

    # Train one model per batch; each batch is logged as a child run
    for batch_start, batch_end in experiment_batches:

        print(f"Train period: {start_date_0} - {ref_end_data}")
        X_train = raw_data.loc[start_date_0:ref_end_data, feature_columns]
        y_train = raw_data.loc[start_date_0:ref_end_data, target]
        print("X_train (reference) dataset shape: ", X_train.shape, y_train.shape)

        print(f"Test period: {batch_start} - {batch_end}")
        current = raw_data.loc[batch_start:batch_end]
        X_test = current.loc[:, feature_columns]
        y_test = current[target]
        print("X_test (current)) dataset shape: ",  X_test.shape, y_test.shape)

        # Train model
        regressor = ensemble.RandomForestRegressor(random_state=0, n_estimators=50)
        regressor.fit(X_train, y_train)

        # Calculate metrics ('me' holds the mean squared error)
        preds = regressor.predict(X_test)
        me = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        metrics[batch_end] = {'me': me, 'mae': mae}

        # Run a Child Run for each Fold
        with mlflow.start_run(run_name=batch_end, nested=True) as child_run:

            # Show the metadata of the child run that was just created
            # (not the parent's: `run` is the enclosing parent run)
            print("Experiment id: {}".format(child_run.info.experiment_id))
            print("Run id: {}".format(child_run.info.run_id))
            print("Run name: {}".format(child_run.info.run_name))
            print('MLFlow tracking uri:', mlflow.get_tracking_uri())
            print('MLFlow artifact uri:', mlflow.get_artifact_uri())

            # Log parameters
            mlflow.log_param("begin", batch_start)
            mlflow.log_param("end", batch_end)

            # Log metrics
            mlflow.log_metric('me', round(me, 3))
            mlflow.log_metric('mae', round(mae, 3))

        # Extend the reference window so the next fold also trains on this batch
        ref_end_data = batch_end

    # Save model
    joblib.dump(regressor, model_path)

    # Log the last batch model as the parent Run model
    mlflow.log_artifact(model_path)

    # Log the per-metric average over all folds on the parent run
    average_run_metrics = (
        pd.DataFrame.from_dict(metrics, orient='index').mean().round(3).to_dict()
    )
    mlflow.log_metrics(average_run_metrics)

# Log metrics by steps or timestamps

In [None]:
import time
# Imported under an alias: this notebook already uses the name `datetime`
# for the timestamp column name, and a plain `import datetime` would
# overwrite that variable.
import datetime as dt

# Set up MLFlow Client
MLFLOW_TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
print(f"Client tracking uri: {client.tracking_uri}")

# Set experiment name
mlflow.set_experiment('3-Metrics-by-steps')

# Set experiment variables
model_path = Path('../models/model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)  # joblib.dump needs the folder to exist
feature_columns = numerical_features + categorical_features
ref_end_data = end_date_0

# Start a new MLflow Run
with mlflow.start_run() as run:

    # Train one model per batch; all batches are logged into this single run
    for k, (batch_start, batch_end) in enumerate(experiment_batches):

        # Batch end as seconds since the epoch (time.mktime interprets the
        # parsed time as local time)
        batch_end_time = dt.datetime.strptime(batch_end, "%Y-%m-%d %H:%M:%S")
        timestamp = time.mktime(batch_end_time.timetuple())

        print(f"Train period: {start_date_0} - {ref_end_data}")
        X_train = raw_data.loc[start_date_0:ref_end_data, feature_columns]
        y_train = raw_data.loc[start_date_0:ref_end_data, target]
        print("X_train (reference) dataset shape: ", X_train.shape, y_train.shape)

        print(f"Test period: {batch_start} - {batch_end}")
        current = raw_data.loc[batch_start:batch_end]
        X_test = current.loc[:, feature_columns]
        y_test = current[target]
        print("X_test (current)) dataset shape: ",  X_test.shape, y_test.shape)

        # Train model
        regressor = ensemble.RandomForestRegressor(random_state=0, n_estimators=50)
        regressor.fit(X_train, y_train)

        # Calculate metrics ('me' holds the mean squared error)
        preds = regressor.predict(X_test)
        me = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)

        # Log metrics (use Client)
        # >>> 'timestamp' - Time when this metric was calculated, in milliseconds. Defaults to the current system time
        # >>> 'step' - Integer training step (iteration) at which the metric was calculated. Defaults to 0.
        # 'me' is logged against the batch end time, 'mae' against the batch index.
        client.log_metric(run.info.run_id, 'me', round(me, 3), timestamp=int(timestamp)*1000)
        client.log_metric(run.info.run_id, 'mae', round(mae, 3), step=k)

        # Extend the reference window so the next batch also trains on this one
        ref_end_data = batch_end

    # Persist the model from the last batch before logging it; otherwise the
    # artifact would be a stale or missing file.
    joblib.dump(regressor, model_path)
    mlflow.log_artifact(model_path)

    # Log parameters: the date range of the last batch, i.e. the batch the
    # logged model was evaluated on
    mlflow.log_param("begin", batch_start)
    mlflow.log_param("end", batch_end)
    