# Ensemble Models

Random Forest and Gradient Boosting Bond Pricing Models

author: Maris

## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import yfinance as yf
import datetime as dt
import os
import seaborn as sns
import random 
from scipy.optimize import newton
from scipy.stats import randint

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
sns.set_theme()
sns.set_palette("tab20")
sns.set_style("darkgrid", rc = {"grid.color": "white"})
c10 = sns.color_palette("tab10").as_hex()[:]
c20 = sns.color_palette("tab20").as_hex()[:]

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
os.chdir('/Users/maris/Documents/FE800')

In [66]:
np.random.seed(10)

## Import Data

In [98]:
df = pd.read_pickle("factor_df.pkl")
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Return,Price,volume,Trades,TTM,Stock_Return,Stock_Volume,ETF_Return,PV,YTM,Duration,DV01,Convexity
cusip,trd_dt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
00206RBH4,2018-07-03,0.00885,86.513796,-0.785507,-0.94015,1.728678,0.015523,-0.171355,-0.005025,0.004849,0.902753,1.131115,0.498639,1.39185
00206RBH4,2018-07-05,0.003406,86.808383,-0.573559,1.089431,1.724879,-0.003363,-0.086349,0.004959,0.001749,0.878724,1.132837,0.478094,1.392504
00206RBH4,2018-07-06,0.004973,87.240021,-0.637707,-0.462601,1.72298,0.002454,0.232636,0.007858,0.001668,0.841574,1.137649,0.447232,1.395884
00206RBH4,2018-07-09,-0.001021,87.150785,-0.115591,-0.94015,1.717283,0.004123,0.357721,0.011967,-0.002931,0.852737,1.113003,0.46136,1.375882
00206RBH4,2018-07-10,0.002972,87.409758,-0.519514,-0.462601,1.715383,0.010887,0.103056,0.00215,-0.001502,0.830979,1.102112,0.447388,1.367151


In [99]:
macro_factors = pd.read_pickle("macro_factors.pkl").pct_change().drop(["Lower_Target", "Upper_Target"], axis = 1)
macro_factors.head()

Unnamed: 0,EFFR,1YRT,2YRT,3YRT,5YRT,7YRT,10YRT,30YRT,SPY,LQD
2018-07-02,,,,,,,,,,
2018-07-03,0.0,-0.004274,-0.015564,-0.007547,-0.010909,-0.014134,-0.013937,-0.010033,-0.003531,0.003677
2018-07-05,0.0,-0.004292,0.007905,0.007605,0.007353,0.003584,0.003534,-0.003378,0.008158,0.00314
2018-07-06,0.0,0.008621,-0.007843,-0.003774,-0.010949,-0.007143,-0.007042,-0.00339,0.008458,0.001391
2018-07-09,0.0,0.0,0.01581,0.007576,0.01476,0.014388,0.014184,0.006803,0.009004,0.000695


In [97]:
bad_cusips = pd.read_csv("bad_cusips.csv")
bad_cusips

Unnamed: 0.1,Unnamed: 0,0
0,0,92976GAJ0


# Random Forest

In [100]:
def adjust_value(value):
    if abs(value) > 0.04:
        return 0.04 if value > 0 else -0.04
    else:
        return value

In [101]:
df_rf = df.copy()
df_rf["Return"] = df_rf["Return"].apply(adjust_value)
df_rf["Price"] = df_rf["Price"].bfill().ffill()
df_rf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Return,Price,volume,Trades,TTM,Stock_Return,Stock_Volume,ETF_Return,PV,YTM,Duration,DV01,Convexity
cusip,trd_dt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
00206RBH4,2018-07-03,0.00885,86.513796,-0.785507,-0.94015,1.728678,0.015523,-0.171355,-0.005025,0.004849,0.902753,1.131115,0.498639,1.39185
00206RBH4,2018-07-05,0.003406,86.808383,-0.573559,1.089431,1.724879,-0.003363,-0.086349,0.004959,0.001749,0.878724,1.132837,0.478094,1.392504
00206RBH4,2018-07-06,0.004973,87.240021,-0.637707,-0.462601,1.72298,0.002454,0.232636,0.007858,0.001668,0.841574,1.137649,0.447232,1.395884
00206RBH4,2018-07-09,-0.001021,87.150785,-0.115591,-0.94015,1.717283,0.004123,0.357721,0.011967,-0.002931,0.852737,1.113003,0.46136,1.375882
00206RBH4,2018-07-10,0.002972,87.409758,-0.519514,-0.462601,1.715383,0.010887,0.103056,0.00215,-0.001502,0.830979,1.102112,0.447388,1.367151


In [102]:
def run_random_forest_no_tuning(df, feature, max_depth = None):

    X = df.drop([feature, "Price"], axis=1)[1:].fillna(0)
    Y = df[feature][1:].fillna(0)

    X_train = X.loc[:pd.Timestamp("2022-05-31")]
    X_test = X.loc[pd.Timestamp("2022-06-01"):]
    Y_train = Y.loc[:pd.Timestamp("2022-05-31")]
    Y_test = Y.loc[pd.Timestamp("2022-06-01"):]

    rf_model = RandomForestRegressor(n_estimators = 200, max_depth = max_depth)
    rf_model.fit(X_train, Y_train)
    
    Y_pred = rf_model.predict(X_test)
    r2_train = rf_model.score(X_train, Y_train)

    prices = pd.DataFrame({"Test": df["Price"].loc[pd.Timestamp("2022-06-01"):]}).ffill()
    prices["Return Pred"] = Y_pred + 1

    pred = []
    for i in range(len(prices)):
        if i == 0:
            pred.append(prices["Return Pred"].iloc[0] * df["Price"].loc[pd.Timestamp("2022-05-31")])
        else:
            pred.append(prices["Return Pred"].iloc[i] * prices["Test"].iloc[i - 1])
    prices["Pred"] = pred
    
    mse = mean_squared_error(prices["Test"], prices["Pred"])
    rmse = mean_squared_error(prices["Test"], prices["Pred"], squared=False)
    mae = mean_absolute_error(prices["Test"], prices["Pred"])

    r2_test = r2_score(Y_test, Y_pred)

    return r2_train, r2_test, mse, rmse, mae

In [103]:
%%time

run_random_forest_no_tuning(pd.concat([df_rf.loc[random.sample(list(df_rf.index.droplevel(1).unique()), 1)[0]]
                             ,  macro_factors], axis = 1), "Return")

CPU times: user 2.35 s, sys: 24.7 ms, total: 2.38 s
Wall time: 2.38 s


(0.8827050254115572,
 -5.041160749711688,
 1.6630431800029226,
 1.2895903147910668,
 1.0460034466814594)

In [104]:
%%time

rf_results = pd.DataFrame(columns = ["Cusip", "R2 Train", "R2 Test", "MSE", "RMSE", "MAE"])
for col in list(df.index.droplevel(1).unique()):
    outputs = run_random_forest_no_tuning(pd.concat([df_rf.loc[col],  macro_factors], axis = 1), "Return")
    rf_results.loc[len(rf_results)] = [col, outputs[0], outputs[1], outputs[2], outputs[3], outputs[4]]

CPU times: user 27min 59s, sys: 10.4 s, total: 28min 9s
Wall time: 28min 11s


In [105]:
rf_results.sort_values("R2 Test").dropna()

Unnamed: 0,Cusip,R2 Train,R2 Test,MSE,RMSE,MAE
609,828807CY1,0.882232,-28.063178,4.595872,2.143799,1.917661
513,654106AF0,0.876128,-18.188570,2.275890,1.508605,1.243075
485,606822AJ3,0.848327,-15.617332,2.740492,1.655443,1.151591
289,35177PAL1,0.873450,-11.640540,5.575677,2.361287,1.850113
625,86562MAV2,0.866546,-10.501446,2.392665,1.546824,1.334921
...,...,...,...,...,...,...
211,20030NCH2,0.919355,0.646180,0.065953,0.256813,0.200342
479,594918BY9,0.893776,0.653172,0.041028,0.202554,0.148433
362,437076BN1,0.886260,0.653177,0.037470,0.193571,0.147551
520,666807BN1,0.900526,0.655616,0.053773,0.231889,0.169947


In [106]:
rf_results.drop(["Cusip"], axis = 1).mean()

R2 Train    0.889704
R2 Test    -0.029953
MSE         0.641534
RMSE        0.731822
MAE         0.523807
dtype: float64

## Cross Validation

In [107]:
random_grid = {'n_estimators': [10, 100, 200],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [2, 3, 5, 10, 20],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

In [108]:
def run_random_forest_cv(df, feature):
    X = df.drop([feature, "Price"], axis=1)[1:].fillna(0)
    Y = df[feature][1:].fillna(0)
    
    X_train = X.loc[:pd.Timestamp("2022-05-31")]
    X_test = X.loc[pd.Timestamp("2022-06-01"):]
    Y_train = Y.loc[:pd.Timestamp("2022-05-31")]
    Y_test = Y.loc[pd.Timestamp("2022-06-01"):]

    rf = RandomForestRegressor()

    tscv_initial = TimeSeriesSplit(n_splits=3)
    best_params_list = []
    best_r2_train = -float('inf')  

    for train_index, val_index in tscv_initial.split(X_train):
        X_train1, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        Y_train1, Y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]
        
        rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=50, cv=3,
                                   verbose=0, n_jobs=-1, scoring='r2', random_state=42)
        rf_cv.fit(X_train1, Y_train1)
        
        if rf_cv.best_score_ > best_r2_train:
            best_r2_train = rf_cv.best_score_
            best_params = rf_cv.best_params_
            best_params_list = list(best_params.values())
    
    best_rf_model = RandomForestRegressor(**best_params)
    best_rf_model.fit(X_train, Y_train)
    Y_pred = best_rf_model.predict(X_test)

    prices = pd.DataFrame({"Test": df["Price"].loc[pd.Timestamp("2022-06-01"):]}).ffill()
    prices["Return Pred"] = Y_pred + 1

    pred = []
    for i in range(len(prices)):
        if i == 0:
            pred.append(prices["Return Pred"].iloc[0] * df["Price"].loc[pd.Timestamp("2022-05-31")])
        else:
            pred.append(prices["Return Pred"].iloc[i] * prices["Test"].iloc[i - 1])
    prices["Pred"] = pred

    try:
        mse = mean_squared_error(prices["Test"], prices["Pred"])
        rmse = mean_squared_error(prices["Test"], prices["Pred"], squared=False)
        mae = mean_absolute_error(prices["Test"], prices["Pred"])

        r2_train = best_rf_model.score(X_train, Y_train)
        r2_test = r2_score(Y_test, Y_pred)  

        return best_r2_train, r2_test, mse, rmse, mae, 

    except:
        return None, None, None, None, None

In [109]:
%%time

run_random_forest_cv(pd.concat([df_rf.loc[random.sample(list(df_rf.index.droplevel(1).unique()), 1)[0]]
                             ,  macro_factors], axis = 1), "Return")

CPU times: user 840 ms, sys: 125 ms, total: 965 ms
Wall time: 4.7 s


(0.18314275118624682,
 0.4198185963532267,
 0.08471893560926166,
 0.2910651741608083,
 0.21998064853266106)

In [110]:
%%time

rf_results_cv = pd.DataFrame(columns = ["Cusip", "R2 Train", "R2 Test", "MSE", "RMSE", "MAE"])
for col in list(df.index.droplevel(1).unique()):
    outputs = run_random_forest_cv(pd.concat([df_rf.loc[col],  macro_factors], axis = 1), "Return")
    rf_results_cv.loc[len(rf_results_cv)] = [col, outputs[0], outputs[1], outputs[2], outputs[3], outputs[4]]

CPU times: user 10min 10s, sys: 31.5 s, total: 10min 42s
Wall time: 46min 18s


In [111]:
rf_results_cv.dropna().sort_values("R2 Test").dropna()

Unnamed: 0,Cusip,R2 Train,R2 Test,MSE,RMSE,MAE
289,35177PAL1,0.177091,-1.925934,1.287273,1.134580,0.919569
609,828807CY1,0.179311,-1.665763,0.422176,0.649751,0.544755
634,871829BF3,0.038385,-1.608367,0.414144,0.643540,0.504826
404,478160CE2,0.139451,-1.563949,0.378592,0.615298,0.485882
36,03040WAD7,0.202068,-1.413482,2.210916,1.486915,1.188415
...,...,...,...,...,...,...
576,747525AU7,0.227009,0.640807,0.042959,0.207264,0.150743
205,20030NBY6,0.256798,0.656622,0.037871,0.194606,0.146695
677,913017BT5,0.278673,0.677499,0.228710,0.478237,0.360003
211,20030NCH2,0.325936,0.679338,0.059754,0.244447,0.186946


In [112]:
rf_results_cv.drop(rf_results_cv.dropna().sort_values("R2 Test").head(10).index)[["R2 Train", "R2 Test", "MSE", "RMSE", "MAE"]].mean()

R2 Train    0.183767
R2 Test     0.314459
MSE         0.518426
RMSE        0.662088
MAE         0.463133
dtype: float64

# Gradient Boosting

In [113]:
def run_gradient_boosting_no_tuning(df, feature, max_depth = None):
    
    X = df.drop([feature, "Price"], axis=1)[1:].fillna(0)
    Y = df[feature][1:].fillna(0)

    X_train = X.loc[:pd.Timestamp("2022-05-31")]
    X_test = X.loc[pd.Timestamp("2022-06-01"):]
    Y_train = Y.loc[:pd.Timestamp("2022-05-31")]
    Y_test = Y.loc[pd.Timestamp("2022-06-01"):]

    gb_model = GradientBoostingRegressor(n_estimators=200, max_depth = max_depth)
    gb_model.fit(X_train, Y_train)
    
    Y_pred = gb_model.predict(X_test)
    r2_train = gb_model.score(X_train, Y_train)
    r2_test = r2_score(Y_test, Y_pred)

    prices = pd.DataFrame({"Test": df["Price"].loc[pd.Timestamp("2022-06-01"):]}).ffill()
    prices["Return Pred"] = Y_pred + 1

    pred = []
    for i in range(len(prices)):
        if i == 0:
            pred.append(prices["Return Pred"].iloc[0] * df["Price"].loc[pd.Timestamp("2022-05-31")])
        else:
            pred.append(prices["Return Pred"].iloc[i] * prices["Test"].iloc[i - 1])
    prices["Pred"] = pred
    
    mse = mean_squared_error(prices["Test"], prices["Pred"])
    rmse = mean_squared_error(prices["Test"], prices["Pred"], squared=False)
    mae = mean_absolute_error(prices["Test"], prices["Pred"])

    return r2_train, r2_test, mse, rmse, mae

In [114]:
%%time

run_gradient_boosting_no_tuning(pd.concat([df.loc[random.sample(list(df.index.droplevel(1).unique()), 1)[0]]
                             ,  macro_factors], axis = 1), "Return")

CPU times: user 2.64 s, sys: 13.5 ms, total: 2.66 s
Wall time: 2.66 s


(0.9999999999929562,
 -0.9095215335285807,
 0.3516040628192265,
 0.5929621090923319,
 0.4148026209876423)

In [115]:
%%time

gb_results = pd.DataFrame(columns = ["Cusip", "R2 Train", "R2 Test", "MSE", "RMSE", "MAE"])
for col in list(df.index.droplevel(1).unique()):
    outputs = run_gradient_boosting_no_tuning(pd.concat([df_rf.loc[col],  macro_factors], axis = 1), "Return")
    gb_results.loc[len(gb_results)] = [col, outputs[0], outputs[1], outputs[2], outputs[3], outputs[4]]

CPU times: user 28min 28s, sys: 17.5 s, total: 28min 45s
Wall time: 28min 46s


In [116]:
gb_results.sort_values("R2 Test").dropna()

Unnamed: 0,Cusip,R2 Train,R2 Test,MSE,RMSE,MAE
247,26441CAX3,1.0,-69.333401,9.663434,3.108606,2.821914
609,828807CY1,1.0,-56.473739,9.096367,3.016018,2.692796
449,565849AP1,1.0,-49.982732,8.177067,2.859557,2.197985
487,606822AR5,1.0,-47.677121,11.670018,3.416141,3.124588
625,86562MAV2,1.0,-46.663250,9.915713,3.148923,2.705030
...,...,...,...,...,...,...
190,172967LD1,1.0,0.271776,0.077964,0.279220,0.220998
208,20030NCC3,1.0,0.276230,0.541971,0.736187,0.570468
478,594918BT0,1.0,0.331833,0.580628,0.761990,0.593837
55,036752AG8,1.0,0.335369,0.122489,0.349984,0.265996


In [117]:
gb_results.drop(["Cusip"], axis = 1).mean()

R2 Train    0.999999
R2 Test    -1.941029
MSE         1.563489
RMSE        1.118979
MAE         0.820535
dtype: float64

## Cross Validation

In [118]:
param_grid_gb = {
    'n_estimators': [10, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

In [119]:
def time_series_cv(X, n_splits=3):
    tscv = TimeSeriesSplit(n_splits=n_splits + 1)  # Adding 1 split for test data
    for train_index, test_index in tscv.split(X):
        yield train_index[:-1], test_index

In [120]:
def run_gradient_boosting_cv(df, feature):
    X = df.drop([feature, "Price"], axis=1)[1:].fillna(0)
    Y = df[feature][1:].fillna(0)
    
    X_train = X.loc[:pd.Timestamp("2022-05-31")]
    X_test = X.loc[pd.Timestamp("2022-06-01"):]
    Y_train = Y.loc[:pd.Timestamp("2022-05-31")]
    Y_test = Y.loc[pd.Timestamp("2022-06-01"):]

    gb = GradientBoostingRegressor()

    tscv_initial = TimeSeriesSplit(n_splits=3)
    best_params_list = []
    best_r2_train = -float('inf')  

    for train_index, val_index in tscv_initial.split(X_train):
        X_train1, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        Y_train1, Y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]
        
        gb_cv = RandomizedSearchCV(estimator=gb, param_distributions=param_grid_gb, n_iter=50, cv=3,
                                   verbose=0, n_jobs=-1, scoring='r2', random_state=42)
        gb_cv.fit(X_train1, Y_train1)
        
        if gb_cv.best_score_ > best_r2_train:
            best_r2_train = gb_cv.best_score_
            best_params = gb_cv.best_params_
            best_params_list = list(best_params.values())
    
    best_gb_model = GradientBoostingRegressor(**best_params)
    best_gb_model.fit(X_train, Y_train)
    Y_pred = best_gb_model.predict(X_test)

    prices = pd.DataFrame({"Test": df["Price"].loc[pd.Timestamp("2022-06-01"):]}).ffill()
    prices["Return Pred"] = Y_pred + 1

    pred = []
    for i in range(len(prices)):
        if i == 0:
            pred.append(prices["Return Pred"].iloc[0] * df["Price"].loc[pd.Timestamp("2022-05-31")])
        else:
            pred.append(prices["Return Pred"].iloc[i] * prices["Test"].iloc[i - 1])
    prices["Pred"] = pred

    try:
        mse = mean_squared_error(prices["Test"], prices["Pred"])
        rmse = mean_squared_error(prices["Test"], prices["Pred"], squared=False)
        mae = mean_absolute_error(prices["Test"], prices["Pred"])

        r2_train = best_gb_model.score(X_train, Y_train)
        r2_test = r2_score(Y_test, Y_pred)  

        return best_r2_train, r2_test, mse, rmse, mae, 

    except:
        return None, None, None, None, None

In [121]:
%%time

run_gradient_boosting_cv(pd.concat([df.loc[random.sample(list(df.index.droplevel(1).unique()), 1)[0]]
                             ,  macro_factors], axis = 1), "Return")

CPU times: user 2.21 s, sys: 136 ms, total: 2.35 s
Wall time: 17.2 s


(0.28929281418527486,
 0.4668320272005715,
 0.2531214993987156,
 0.5031118159998984,
 0.3744591556614832)

In [122]:
%%time

gb_results_cv = pd.DataFrame(columns = ["Cusip", "R2 Train", "R2 Test", "MSE", "RMSE", "MAE"])
for col in list(df.index.droplevel(1).unique()):
    outputs = run_gradient_boosting_cv(pd.concat([df_rf.loc[col],  macro_factors], axis = 1), "Return")
    gb_results_cv.loc[len(gb_results_cv)] = [col, outputs[0], outputs[1], outputs[2], outputs[3], outputs[4]]

CPU times: user 19min 12s, sys: 44.5 s, total: 19min 56s
Wall time: 3h 16min 24s


In [123]:
gb_results_cv.sort_values("R2 Test").dropna()

Unnamed: 0,Cusip,R2 Train,R2 Test,MSE,RMSE,MAE
609,828807CY1,0.151401,-16.556715,2.775037,1.665844,1.483387
89,05964HAF2,0.117763,-7.010948,1.934280,1.390784,0.954625
289,35177PAL1,0.163109,-3.598133,2.026898,1.423692,1.170680
625,86562MAV2,0.031120,-3.586494,0.956208,0.977859,0.816046
487,606822AR5,0.028572,-3.429322,1.060571,1.029840,0.896584
...,...,...,...,...,...,...
42,031162CQ1,0.244555,0.578837,0.070965,0.266393,0.201769
12,00287YAM1,0.255057,0.579546,0.304242,0.551582,0.420810
479,594918BY9,0.217879,0.585042,0.049264,0.221954,0.166656
375,458140AX8,0.210069,0.585225,0.059368,0.243654,0.177633


In [124]:
gb_results_cv.drop(gb_results_cv.dropna().sort_values("R2 Test").head(10).index)[["R2 Train", "R2 Test", "MSE", "RMSE", "MAE"]].mean()

R2 Train    0.174683
R2 Test     0.221875
MSE         0.571239
RMSE        0.697024
MAE         0.489313
dtype: float64

In [125]:
np.mean(gb_results.drop(["Cusip"], axis = 1), axis = 0)

R2 Train    0.999999
R2 Test    -1.941029
MSE         1.563489
RMSE        1.118979
MAE         0.820535
dtype: float64

# Compare Methods

In [126]:
compare_error = pd.DataFrame(columns = ["R2 Train", "R2 Test", "MSE", "RMSE", "MAE"])
compare_error.loc["Random Forest (w/o CV)"] = np.mean(rf_results.drop(["Cusip"], axis = 1), axis = 0)
compare_error.loc["Random Forest (w/CV)"] = rf_results_cv.drop(rf_results_cv.dropna().sort_values("R2 Test").head(10).index)[["R2 Train", "R2 Test", "MSE", "RMSE", "MAE"]].mean()
compare_error.loc["Gradient Boosting (w/o CV)"] = np.mean(gb_results.drop(["Cusip"], axis = 1), axis = 0)
compare_error.loc["Gradient Boosting (w/CV)"] = gb_results_cv.drop(gb_results_cv.dropna().sort_values("R2 Test").head(10).index)[["R2 Train", "R2 Test", "MSE", "RMSE", "MAE"]].mean()
compare_error

Unnamed: 0,R2 Train,R2 Test,MSE,RMSE,MAE
Random Forest (w/o CV),0.889704,-0.029953,0.641534,0.731822,0.523807
Random Forest (w/CV),0.183767,0.314459,0.518426,0.662088,0.463133
Gradient Boosting (w/o CV),0.999999,-1.941029,1.563489,1.118979,0.820535
Gradient Boosting (w/CV),0.174683,0.221875,0.571239,0.697024,0.489313
