In [1]:
import os
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd

import plotly.graph_objs as go

from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score_v2

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the cleaned dataset; later cells read its `Time_taken` (target) and
# `kfold` (fold id) columns.
num_df = pd.read_csv("../dataset/cleaned_data.csv")

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
num_df.shape

(45593, 19)

In [4]:
# Directory the tuned models are loaded from and the fitted conformal
# regressors are written to.
MODEL_PATH = '../model'

## Loading the Tuned Models

In [5]:
# Maps a model name (e.g. 'xgb') to its list of per-fold models.
model_dct = defaultdict(list)

In [None]:
# Build the list in a single assignment instead of appending, so re-running
# this cell cannot leave duplicate entries behind in `model_dct['xgb']`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model_dct['xgb'] = [
    joblib.load(os.path.join(MODEL_PATH, f"tuned_xgb_{fold}.bin"))
    for fold in range(5)  # position in the list == fold number
]

In [7]:
def get_data(df, fold):
    """Split `df` into train/validation parts for the given fold.

    Rows whose `kfold` equals `fold` form the validation part; all other
    rows form the training part. Each part gets a fresh 0..n-1 index.

    Returns a dict with the feature frames ('x_train', 'x_valid', i.e. the
    part without the `Time_taken` and `kfold` columns) and the `Time_taken`
    values as arrays ('y_train', 'y_valid').
    """
    target = "Time_taken"
    non_features = [target, "kfold"]

    def _features_and_target(mask):
        # Select the rows, renumber them, then separate features from target.
        part = df[mask].reset_index(drop=True)
        return part.drop(non_features, axis=1), part[target].values

    x_train, y_train = _features_and_target(df.kfold != fold)
    x_valid, y_valid = _features_and_target(df.kfold == fold)

    return {
        'x_train': x_train,
        'y_train': y_train,
        'x_valid': x_valid,
        'y_valid': y_valid,
    }

In [8]:
def get_preds(models, fold, alpha=0.1):
    """Fit a conformal regressor per model on one fold and collect its outputs.

    For every name in `models`, the fold's estimator from the notebook-level
    `model_dct` is wrapped in a `MapieRegressor`, fitted on the fold's
    training split of the notebook-level `num_df`, used to predict on both
    splits, and saved under `MODEL_PATH` as ``conf_<name>_<fold>.bin``.

    Args:
        models: Iterable of keys into `model_dct` (iterating a dict yields
            its keys, so the dict itself may be passed).
        fold: Fold index; selects both the data split and the estimator.
        alpha: Forwarded to `MapieRegressor.predict`; per the MAPIE docs the
            target coverage is ``1 - alpha`` (0.1 -> 90 % intervals).

    Returns:
        dict with keys 'y_train', 'train_preds', 'y_valid', 'valid_preds',
        'train_confs', 'val_confs'. The per-model arrays are stacked with
        `np.array(...)` and transposed, so the model axis comes last.
        NOTE(review): with one model the predictions presumably end up as
        (n_samples, 1) and the intervals as (n_alpha, 2, n_samples, 1) --
        confirm against the installed MAPIE version.
        With an empty `models` the targets are still returned and the
        prediction/interval entries are empty arrays.
    """
    # The split depends only on `fold`, so build it once rather than once per
    # model. Binding the targets here also keeps them defined when `models`
    # is empty (previously that raised UnboundLocalError at the return).
    data = get_data(num_df, fold)
    y_train = data['y_train']
    y_val = data['y_valid']

    model_train_preds, model_val_preds = [], []
    model_train_confs, model_val_confs = [], []

    for model_name in models:
        conformal_reg = MapieRegressor(model_dct[model_name][fold], n_jobs=-1)
        conformal_reg.fit(data['x_train'], y_train)

        # With `alpha` given, predict returns (point predictions, intervals).
        train_preds, train_confs = conformal_reg.predict(data['x_train'], alpha=alpha)
        valid_preds, valid_confs = conformal_reg.predict(data['x_valid'], alpha=alpha)

        # Persist the fitted conformal regressor next to the tuned models.
        joblib.dump(conformal_reg, os.path.join(MODEL_PATH, f"conf_{model_name}_{fold}.bin"))

        model_train_preds.append(train_preds)
        model_val_preds.append(valid_preds)

        model_train_confs.append(train_confs)
        model_val_confs.append(valid_confs)

    # Stack per-model results; the transpose moves the model axis to the end.
    train_preds = np.array(model_train_preds).T
    val_preds = np.array(model_val_preds).T

    train_confs = np.array(model_train_confs).T
    val_confs = np.array(model_val_confs).T

    return {
        'y_train': y_train,
        'train_preds': train_preds,
        'y_valid': y_val,
        'valid_preds': val_preds,
        'train_confs': train_confs,
        'val_confs': val_confs
    }

In [None]:
# Per-fold accumulators; the cells below concatenate them across folds.
pred_df_lst = []
train_confs, val_confs = [], []
y_train, train_preds = [], []

for fold in range(5):
    # Passing the dict itself: get_preds iterates over its keys (model names).
    pred_data = get_preds(model_dct, fold)

    # Validation table for this fold: targets, the first model's predictions
    # (column 0), and the fold id repeated on every row.
    pred_df = pd.DataFrame({
        'y_val': pred_data['y_valid'],
        'val_preds': pred_data['valid_preds'][:, 0],
        'kfold': fold,
    })
    pred_df_lst.append(pred_df)

    val_confs.append(pred_data['val_confs'])
    train_confs.append(pred_data['train_confs'])
    y_train.append(pred_data['y_train'])
    train_preds.append(pred_data['train_preds'])

In [10]:
# Stack the per-fold validation tables into one frame covering every row.
# Each fold keeps its own 0..n-1 index (no ignore_index), so index labels
# repeat across folds; later cells only use `.values`.
xgb_df = pd.concat(pred_df_lst, axis=0)
xgb_df.shape

(45593, 3)

In [11]:
# Peek at the first rows of the stacked validation predictions.
xgb_df.head()

Unnamed: 0,y_val,val_preds,kfold
0,24.0,23.212982,0
1,38.0,39.657745,0
2,32.0,26.067078,0
3,35.0,40.889267,0
4,15.0,12.677548,0


In [40]:
# Collapse the per-fold lists into single arrays (joined along axis 0, in
# fold order).
# NOTE(review): both names are rebound from list to ndarray, so this cell is
# not idempotent -- re-run the fold loop before running it a second time.
y_train = np.concatenate(y_train)
train_preds = np.concatenate(train_preds)

In [33]:
def get_confs(confs):
    """Split per-fold interval arrays into concatenated lower/upper bounds.

    For each array in `confs`, only its first leading entry (`conf[0]`) is
    used; index 0 of that entry is taken as the lower bound and index 1 as
    the upper bound. The per-fold bounds are then concatenated along axis 0.

    Returns:
        (lower_confs, upper_confs) as a tuple of arrays.
    """
    leading = [conf[0] for conf in confs]

    lower_confs = np.concatenate([bounds[0] for bounds in leading])
    upper_confs = np.concatenate([bounds[1] for bounds in leading])

    return (lower_confs, upper_confs)

In [34]:
# Separate the per-fold interval arrays into lower and upper bounds,
# concatenated across folds, for the validation and training splits.
val_lower_confs, val_upper_confs = get_confs(val_confs)
train_lower_confs, train_upper_confs = get_confs(train_confs)

In [35]:
# Put the bounds side by side as columns: lower first, then upper.
# NOTE(review): assumes each bound array is 2-D with a single column (one
# model), which gives an (n_samples, 2) result -- confirm.
# NOTE(review): `train_confs` / `val_confs` stop being per-fold lists here,
# so `get_confs` must not be re-run on them without re-running the fold loop.
train_confs = np.concatenate([train_lower_confs, train_upper_confs], axis=1)
val_confs = np.concatenate([val_lower_confs, val_upper_confs], axis=1)

In [44]:
# Bundle targets, point predictions and interval bounds for both splits and
# persist them; the next cell reloads this file, presumably so evaluation can
# run without re-fitting the conformal regressors.
pred_data = {
    "y_train": y_train,
    "train_preds": train_preds,
    "train_confs": train_confs,
    "y_val": xgb_df['y_val'].values,
    "val_preds": xgb_df['val_preds'].values,
    "val_confs": val_confs
}

# NOTE(review): joblib.dump does not create `../pred_data`; it must exist.
joblib.dump(pred_data, "../pred_data/data.bin")

['../pred_data/data.bin']

In [6]:
# Reload the persisted bundle and unpack it into the names used by the
# metric and plotting cells.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
pred_data = joblib.load("../pred_data/data.bin")

# Training split: targets, point predictions, interval bounds.
y_train = pred_data["y_train"]
train_preds = pred_data['train_preds']
train_confs = pred_data['train_confs']

# Validation split: same three pieces.
y_val =  pred_data['y_val']
val_preds = pred_data['val_preds']
val_confs = pred_data['val_confs']

In [7]:
# Empirical interval coverage for each split. The result is array-like: the
# notebook later reads its first element with `[0]`.
estimated_coverage_train = regression_coverage_score_v2(y_train, train_confs)
estimated_coverage_val = regression_coverage_score_v2(y_val, val_confs)

# Point-prediction error (MSE) for each split.
train_mse = mean_squared_error(y_train, train_preds)
val_mse = mean_squared_error(y_val, val_preds)

# Point-prediction goodness of fit (R^2) for each split.
train_r2 = r2_score(y_train, train_preds)
val_r2 = r2_score(y_val, val_preds)

In [10]:
# Display-ready metrics, derived from the loaded arrays on every run.
# Rounding the existing variables in place made this cell non-idempotent: on
# a second run the coverage is already a scalar, so `[0]` fails. Recomputing
# from `y_*`, `*_preds` and `*_confs` gives the same values on the first run
# and keeps re-runs safe.

# Coverage as a percentage (first entry of the returned array), 3 decimals.
estimated_coverage_val = round(regression_coverage_score_v2(y_val, val_confs)[0]*100, 3)
estimated_coverage_train = round(regression_coverage_score_v2(y_train, train_confs)[0]*100, 3)

train_mse = round(mean_squared_error(y_train, train_preds), 3)
val_mse = round(mean_squared_error(y_val, val_preds), 3)

train_r2 = round(r2_score(y_train, train_preds), 3)
val_r2 = round(r2_score(y_val, val_preds), 3)

In [8]:
# Positional x-axis for the plot: 0 .. len(y_val) - 1.
x = np.arange(len(y_val))

In [36]:
def get_conf_plot(data_slice: slice):
    """Plot validation predictions, targets and their interval band.

    Reads the notebook-level `x`, `val_preds`, `y_val`, `val_confs` and
    `estimated_coverage_val`. `data_slice` selects which positions are drawn.
    Column 0 of `val_confs` is plotted as the lower bound and column 1 as the
    upper bound.

    Returns:
        The plotly Figure; the caller decides whether to show it.
    """
    window = x[data_slice]
    upper_bound = val_confs[:, 1][data_slice]
    lower_bound = val_confs[:, 0][data_slice]

    def _bound_trace(label, values, **extra):
        # Zero-width line hidden from the legend; it only delimits the band.
        return go.Scatter(
            name=label,
            x=window,
            y=values,
            mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
            **extra,
        )

    traces = [
        go.Scatter(
            name='Val Preds',
            x=window,
            y=val_preds[data_slice],
            mode='lines',
            line=dict(color='rgb(209, 238, 234)'),
        ),
        go.Scatter(
            name='Val Target',
            x=window,
            y=y_val[data_slice],
            mode='lines',
            line=dict(color='rgb(79, 144, 166)'),
        ),
        _bound_trace('Upper Bound', upper_bound),
        # 'tonexty' shades the area between this trace and the one before it
        # (the upper bound), which draws the interval band.
        _bound_trace(
            'Lower Bound',
            lower_bound,
            fillcolor='rgba(133, 196, 201,0.2)',
            fill='tonexty',
        ),
    ]

    fig = go.Figure(traces)
    fig.update_layout(
        yaxis_title='ETA',
        title=f'XGBoost Conformalized Predictions(Target Coverage = 90,Estimated Coverage  = {estimated_coverage_val})',
        hovermode="x",
        template="plotly_dark"
    )
    return fig

In [37]:
# Plot only the first 50 validation points so individual intervals stay legible.
data_slice = slice(50)
fig = get_conf_plot(data_slice)
fig.show()