In [1]:
import os
from collections import defaultdict

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

from sklearn.ensemble import VotingRegressor, StackingRegressor, RandomForestRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
# Two CSV exports of the dataset. Later cells feed `cat_df` to the 'catb'
# models and `num_df` to every other model.
num_df = pd.read_csv("../dataset/cleaned_data.csv")
cat_df = pd.read_csv("../dataset/cat_data.csv")

In [3]:
# Row counts should match: get_preds takes the targets from either frame interchangeably.
num_df.shape, cat_df.shape

((45593, 19), (45593, 20))

In [4]:
# Directory holding the serialized per-fold models (tuned_<model_name>_<fold>.bin).
MODEL_PATH = '../model'

## Loading the Tuned Models

In [5]:
# Maps model name -> list of loaded models, indexed by fold.
model_dct = defaultdict(list)

In [6]:
# Load the five per-fold artifacts of every tuned base model.
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
for model_name in ('catb', 'lgbm', 'xgb'):
    fold_models = []
    for fold in range(5):
        file_name = os.path.join(MODEL_PATH, f"tuned_{model_name}_{fold}.bin")
        model = joblib.load(file_name)

        fold_models.append(model)

    # Assign a fresh list instead of appending to the existing entry, so that
    # re-running this cell keeps exactly one model per fold.
    model_dct[model_name] = fold_models

In [7]:
def get_data(df, fold):
    """Split `df` into the train/validation parts of one fold.

    Rows whose `kfold` equals `fold` form the validation part; all other rows
    form the training part. Both parts get a fresh 0..n-1 index.

    Returns a dict with the feature frames (`x_train`, `x_valid`, i.e. every
    column except `Time_taken` and `kfold`) and the `Time_taken` targets as
    NumPy arrays (`y_train`, `y_valid`).
    """
    non_feature_cols = ["Time_taken", "kfold"]

    def _features_and_target(row_mask):
        # Select the rows, renumber them, then separate features from target.
        part = df[row_mask].reset_index(drop=True)
        return part.drop(non_feature_cols, axis=1), part['Time_taken'].values

    x_train, y_train = _features_and_target(df.kfold != fold)
    x_valid, y_valid = _features_and_target(df.kfold == fold)

    return {
        'x_train': x_train,
        'y_train': y_train,
        'x_valid': x_valid,
        'y_valid': y_valid,
    }

In [8]:
def model_predict(model_name, data, fold):
    """Predict on both splits with the `fold`-th loaded model named `model_name`.

    `data` must provide `x_train` and `x_valid`; the result holds the matching
    predictions under `train_preds` and `valid_preds`.
    """
    fitted = model_dct[model_name][fold]
    return {
        'train_preds': fitted.predict(data['x_train']),
        'valid_preds': fitted.predict(data['x_valid']),
    }

In [9]:
def get_preds(models, fold):
    """Collect the predictions of several base models for one fold.

    Parameters
    ----------
    models : iterable of str
        Model names understood by `model_predict`. `'catb'` is fed the split
        of `cat_df`; every other name is fed the split of `num_df`.
    fold : int
        Fold index forwarded to `get_data` and `model_predict`.

    Returns
    -------
    dict
        `y_train` / `y_valid`: target arrays of the fold's split.
        `train_preds` / `valid_preds`: 2-D arrays with one row per sample and
        one column per model, in the order of `models`.

    Raises
    ------
    ValueError
        If `models` is empty, or if both frames are used and their targets
        for this fold are not identical.
    """
    models = list(models)
    if not models:
        # Previously this surfaced as an UnboundLocalError on `y_train`.
        raise ValueError("`models` must contain at least one model name")

    # Split each source frame at most once per call, however many models use it.
    splits = {}
    model_train_preds, model_val_preds = [], []

    for model in models:
        source = 'cat' if model == 'catb' else 'num'
        if source not in splits:
            splits[source] = get_data(cat_df if source == 'cat' else num_df, fold)
        data = splits[source]

        preds = model_predict(model, data, fold)

        model_train_preds.append(preds['train_preds'])
        model_val_preds.append(preds['valid_preds'])

    # Predictions from both frames are placed side by side and scored against
    # a single target vector, which is only valid when the rows line up.
    if len(splits) == 2:
        for key in ('y_train', 'y_valid'):
            if not np.array_equal(splits['num'][key], splits['cat'][key]):
                raise ValueError(
                    f"num_df and cat_df disagree on {key} for fold {fold}; "
                    "their predictions cannot be combined row by row"
                )

    # Targets come from the split used by the last model, as before.
    return {
        'y_train': data['y_train'],
        'train_preds': np.array(model_train_preds).T,
        'y_valid': data['y_valid'],
        'valid_preds': np.array(model_val_preds).T,
    }

In [10]:
# Base models to compare; get_preds returns one prediction column per name, in this order.
base_models = ['xgb', 'catb', 'lgbm']

In [11]:
# Validation-split predictions of every base model, one frame per fold.
pred_df_lst = []
for fold in range(5):
    pred_data = get_preds(base_models, fold)

    # Label the columns from `base_models` itself: get_preds returns one column
    # per model in that order, so the names cannot drift from the data if the
    # list is reordered or extended.
    pred_df = pd.DataFrame(pred_data['valid_preds'], columns=base_models)
    pred_df['kfold'] = fold

    pred_df_lst.append(pred_df)

model_pred_df = pd.concat(pred_df_lst, axis=0)

In [12]:
# Pairwise correlation between the base models' validation predictions.
model_pred_df.drop(['kfold'], axis=1).corr()

Unnamed: 0,xgb,catb,lgbm
xgb,1.0,0.992072,0.995236
catb,0.992072,1.0,0.992014
lgbm,0.995236,0.992014,1.0


## Average Predictions

In [13]:
def avg_preds(models, n_folds=5):
    """Score the unweighted mean of the models' validation predictions.

    For every fold, the validation predictions of all `models` are averaged
    row-wise and compared with the fold's validation targets. One line with
    the fold's R2 and RMSE is printed per fold.

    Parameters
    ----------
    models : list of str
        Model names accepted by `get_preds`.
    n_folds : int, default 5
        Number of folds to evaluate (folds `0 .. n_folds - 1`).

    Returns
    -------
    dict
        `'R2 Score'` and `'RMSE'`, each the mean of the per-fold values.
    """
    folds_r2 = []
    folds_rmse = []
    for fold in range(n_folds):
        pred_data = get_preds(models, fold)

        y_true = pred_data['y_valid']
        # Not named `avg_preds`, so the local does not shadow this function.
        blended = np.mean(pred_data['valid_preds'], axis=1)

        fold_r2 = r2_score(y_true, blended)
        fold_rmse = np.sqrt(mean_squared_error(y_true, blended))
        print(f"Fold={fold}, R2 score={fold_r2} and RMSE={fold_rmse}")

        folds_r2.append(fold_r2)
        folds_rmse.append(fold_rmse)

    return {'R2 Score': np.mean(folds_r2), 'RMSE': np.mean(folds_rmse)}

In [14]:
# CatBoost alone (single-model baseline)
avg_preds(['catb'])

Fold=0, R2 score=0.8281496228719227 and RMSE=3.890485405654631
Fold=1, R2 score=0.8341659623820337 and RMSE=3.8183004404859275
Fold=2, R2 score=0.8310404410758493 and RMSE=3.855217465710823
Fold=3, R2 score=0.8355913031175984 and RMSE=3.806598192546705
Fold=4, R2 score=0.8303631738609306 and RMSE=3.867548646721786


{'R2 Score': 0.831862100661667, 'RMSE': 3.847630030223974}

In [15]:
# XGBoost alone (single-model baseline)
avg_preds(['xgb'])

Fold=0, R2 score=0.8286804472891279 and RMSE=3.8844721460111944
Fold=1, R2 score=0.8327502553332626 and RMSE=3.8345640108756918
Fold=2, R2 score=0.8316599382141886 and RMSE=3.848143319027598
Fold=3, R2 score=0.8348982033758885 and RMSE=3.8146135149913736
Fold=4, R2 score=0.8310202046590399 and RMSE=3.860051560358724


{'R2 Score': 0.8318018097743016, 'RMSE': 3.8483689102529164}

In [16]:
# LightGBM alone (single-model baseline)
avg_preds(['lgbm'])

Fold=0, R2 score=0.8233879831897322 and RMSE=3.9440160235195134
Fold=1, R2 score=0.8317054572412466 and RMSE=3.846522497849808
Fold=2, R2 score=0.8286705288966598 and RMSE=3.8821609285138092
Fold=3, R2 score=0.8318201725030054 and RMSE=3.850007671197753
Fold=4, R2 score=0.8261666384390831 and RMSE=3.9150947787492467


{'R2 Score': 0.8283501560539452, 'RMSE': 3.8875603799660263}

In [17]:
# Equal-weight blend: XGBoost + LightGBM
avg_preds(['xgb', 'lgbm'])

Fold=0, R2 score=0.8278505346092476 and RMSE=3.8938694327920014
Fold=1, R2 score=0.8340642164810528 and RMSE=3.8194716018856405
Fold=2, R2 score=0.8320845321289947 and RMSE=3.8432872859742133
Fold=3, R2 score=0.8354477651894687 and RMSE=3.8082595158912005
Fold=4, R2 score=0.8306376968582678 and RMSE=3.8644179557971072


{'R2 Score': 0.8320169490534063, 'RMSE': 3.8458611584680327}

In [18]:
# Equal-weight blend: CatBoost + LightGBM
avg_preds(['catb', 'lgbm'])

Fold=0, R2 score=0.8289030561129096 and RMSE=3.881947627229874
Fold=1, R2 score=0.8358390180995241 and RMSE=3.7989907030026457
Fold=2, R2 score=0.8331257563619119 and RMSE=3.8313528681049243
Fold=3, R2 score=0.8369513953685784 and RMSE=3.7908202049917348
Fold=4, R2 score=0.8318865393086836 and RMSE=3.8501438858787846


{'R2 Score': 0.8333411530503214, 'RMSE': 3.8306510578415924}

In [19]:
# Equal-weight blend: XGBoost + CatBoost
avg_preds(['xgb', 'catb'])

Fold=0, R2 score=0.8315864516104199 and RMSE=3.851386101925955
Fold=1, R2 score=0.8364369828922829 and RMSE=3.7920653815887357
Fold=2, R2 score=0.8346429290239175 and RMSE=3.8138963204742415
Fold=3, R2 score=0.8382348892564195 and RMSE=3.775870345236101
Fold=4, R2 score=0.8343544011470438 and RMSE=3.8217798518984876


{'R2 Score': 0.8350511307860167, 'RMSE': 3.8109996002247035}

In [20]:
# Equal-weight blend of all three base models
avg_preds(['xgb', 'catb', 'lgbm'])

Fold=0, R2 score=0.8303491239977251 and RMSE=3.8655082006934762
Fold=1, R2 score=0.8363043549927662 and RMSE=3.7936025009311205
Fold=2, R2 score=0.8342268846525109 and RMSE=3.818691256854566
Fold=3, R2 score=0.8378029466957082 and RMSE=3.7809081182434814
Fold=4, R2 score=0.8333293924774364 and RMSE=3.833586130095705


{'R2 Score': 0.8344025405632294, 'RMSE': 3.8184592413636693}

In [23]:
# The xgb + catb blend had the highest mean R2 and lowest mean RMSE of the
# averaged combinations above, so these two models are used for stacking.
selected_models = ['xgb', 'catb']

## Stacking

In [21]:
# Candidate meta-learners (final estimators) for stacking, keyed by short name.
STACKER_SEED = 42

model_dispatch = {
    'scaled_lr': Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LinearRegression())
    ]),
    'lr': LinearRegression(),
    # Seeded so the RandomForest stacking scores are reproducible across runs;
    # an unseeded forest draws different bootstrap samples on every fit.
    'rf': RandomForestRegressor(random_state=STACKER_SEED),
    'lgbm': LGBMRegressor(),
    'xgb': XGBRegressor(),
    'catb': CatBoostRegressor(silent=True)
}

In [22]:
def fit_stacker(model_names, final_model, n_folds=5):
    """Fit a meta-learner on base-model predictions and score it per fold.

    For every fold, the base models' predictions on the training split are
    the features used to fit the meta-learner, which is then scored on the
    base models' predictions for the validation split. One line with the
    fold's R2 and RMSE is printed per fold.

    Parameters
    ----------
    model_names : list of str
        Base model names accepted by `get_preds`.
    final_model : str
        Key into `model_dispatch` selecting the meta-learner.
    n_folds : int, default 5
        Number of folds to evaluate (folds `0 .. n_folds - 1`).

    Returns
    -------
    dict
        `'R2 Score'` and `'RMSE'`, each the mean of the per-fold values.

    Raises
    ------
    KeyError
        If `final_model` is not a key of `model_dispatch`.

    Notes
    -----
    NOTE(review): if the fold-`k` base models were trained on the
    `kfold != k` rows, `train_preds` are in-sample predictions, and a
    flexible meta-learner can overfit them. Confirm how the saved models
    were trained before comparing stackers.
    """
    folds_r2 = []
    folds_rmse = []
    for fold in range(n_folds):
        pred_data = get_preds(model_names, fold)

        # Fit an unfitted copy so the shared estimator stored in
        # `model_dispatch` is never mutated by a fold or by an earlier call.
        stacker = clone(model_dispatch[final_model])

        stacker.fit(pred_data['train_preds'], pred_data['y_train'])

        preds = stacker.predict(pred_data['valid_preds'])

        fold_r2 = r2_score(pred_data['y_valid'], preds)
        fold_rmse = np.sqrt(mean_squared_error(pred_data['y_valid'], preds))
        print(f"Fold={fold}, R2 score={fold_r2} and RMSE={fold_rmse}")

        folds_r2.append(fold_r2)
        folds_rmse.append(fold_rmse)

    return {'R2 Score': np.mean(folds_r2), 'RMSE': np.mean(folds_rmse)}

### Stacking with LR

In [24]:
# Linear regression on standardized base-model predictions
fit_stacker(selected_models, 'scaled_lr')

Fold=0, R2 score=0.8291965399607814 and RMSE=3.87861683160572
Fold=1, R2 score=0.8349331035547413 and RMSE=3.8094585557377134
Fold=2, R2 score=0.8328373021100168 and RMSE=3.8346628242188663
Fold=3, R2 score=0.8370580357612276 and RMSE=3.7895803272345
Fold=4, R2 score=0.8307474042291664 and RMSE=3.863166131145348


{'R2 Score': 0.8329544771231866, 'RMSE': 3.8350969339884293}

In [25]:
# Plain linear regression; the scores match the standardized pipeline to
# within floating-point noise.
fit_stacker(selected_models, 'lr')

Fold=0, R2 score=0.8291965399607814 and RMSE=3.8786168316057195
Fold=1, R2 score=0.8349331035547413 and RMSE=3.809458555737713
Fold=2, R2 score=0.8328373021100168 and RMSE=3.8346628242188654
Fold=3, R2 score=0.8370580357612275 and RMSE=3.7895803272345003
Fold=4, R2 score=0.8307474042291665 and RMSE=3.863166131145347


{'R2 Score': 0.8329544771231866, 'RMSE': 3.8350969339884293}

### Stacking with RandomForest

In [29]:
# RandomForestRegressor as the meta-learner
fit_stacker(selected_models, 'rf')

Fold=0, R2 score=0.7979615888413981 and RMSE=4.218378239175165
Fold=1, R2 score=0.8048598325004402 and RMSE=4.141966641961395
Fold=2, R2 score=0.7981214183671536 and RMSE=4.214078483184794
Fold=3, R2 score=0.8016508010954811 and RMSE=4.1810933969307955
Fold=4, R2 score=0.7997809095859805 and RMSE=4.201733060901073


{'R2 Score': 0.8004749100780906, 'RMSE': 4.191449964430644}

### Stacking with LGBM

In [30]:
# LGBMRegressor as the meta-learner
fit_stacker(selected_models, 'lgbm')

Fold=0, R2 score=0.8204300707881773 and RMSE=3.9769062337087915
Fold=1, R2 score=0.8268882308525504 and RMSE=3.901185106663131
Fold=2, R2 score=0.8199923257554308 and RMSE=3.97926633398604
Fold=3, R2 score=0.823304355936272 and RMSE=3.9462769896473207
Fold=4, R2 score=0.8212869167036774 and RMSE=3.9696652880053036


{'R2 Score': 0.8223803800072215, 'RMSE': 3.954659990402117}

### Stacking with XGB

In [31]:
# XGBRegressor as the meta-learner
fit_stacker(selected_models, 'xgb')

Fold=0, R2 score=0.8176869701050521 and RMSE=4.007166611726781
Fold=1, R2 score=0.8235284962757825 and RMSE=3.93886007740206
Fold=2, R2 score=0.8167950973528034 and RMSE=4.014449906010593
Fold=3, R2 score=0.8201966118932835 and RMSE=3.980829518340261
Fold=4, R2 score=0.8170562833106725 and RMSE=4.016376939545849


{'R2 Score': 0.8190526917875189, 'RMSE': 3.9915366106051087}

### Stacking with CatBoost

In [32]:
# CatBoostRegressor as the meta-learner
fit_stacker(selected_models, 'catb')

Fold=0, R2 score=0.8181876005805403 and RMSE=4.001661001917198
Fold=1, R2 score=0.8251241945839034 and RMSE=3.9210115721732177
Fold=2, R2 score=0.8186827402713865 and RMSE=3.993715011537825
Fold=3, R2 score=0.8213855668085042 and RMSE=3.967646015050856
Fold=4, R2 score=0.8194375784984345 and RMSE=3.9901516430019686


{'R2 Score': 0.8205635361485539, 'RMSE': 3.9748370487362132}