In [1]:
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score as R2
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold
import joblib
import warnings 
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
rf_reg = joblib.load("rf.dat")
gbdt_reg = joblib.load("gbdt.dat")
xgb_reg = joblib.load("xgb.dat")

In [3]:
df2_backup = pd.read_csv("data_model.csv", index_col='Unnamed: 0')
df2 = deepcopy(df2_backup)
X, y = df2.iloc[:, :-1], df2.iloc[:, -1]
X_train, X_test, y_train, y_test = tts(X,y,test_size=0.3,random_state=1)

In [4]:
train_prediction_rf = rf_reg.predict(X_train)
train_prediction_gbdt = gbdt_reg.predict(X_train)
train_prediction_xgb = xgb_reg.predict(X_train)

test_prediction_rf = rf_reg.predict(X_test)
test_prediction_gbdt = gbdt_reg.predict(X_test)
test_prediction_xgb = xgb_reg.predict(X_test)

In [5]:
def train_cross(X_train, 
                y_train, 
                X_test, 
                estimators, 
                n_splits = 5, 
                random_state = 1,
                blending = False,
                regress = False):

    if type(y_train) == np.ndarray:
        y_train = pd.Series(y_train)
    
    if blending == True:
        X, X1, y, y1 = train_test_split(X_train, y_train, test_size=test_size, random_state=random_state)
        m = X1.shape[0]
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        X1 = X1.reset_index(drop=True)
        y1 = y1.reset_index(drop=True)
    else:
        m = X_train.shape[0]
        X = X_train.reset_index(drop=True)
        y = y_train.reset_index(drop=True)
    
    n = len(estimators)
    m_test = X_test.shape[0]
    
    columns = []
    for estimator in estimators:
        columns.append(estimator[0] + '_oof')
    
    train_oof = pd.DataFrame(np.zeros((m, n)), columns=columns)
    
    columns = []
    for estimator in estimators:
        columns.append(estimator[0] + '_predict')
    
    test_predict = pd.DataFrame(np.zeros((m_test, n)), columns=columns)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for estimator in estimators:
        model = estimator[1]
        oof_colName = estimator[0] + '_oof'
        predict_colName = estimator[0] + '_predict'
        
        for train_part_index, eval_index in kf.split(X, y):
            X_train_part = X.loc[train_part_index]
            y_train_part = y.loc[train_part_index]
            model.fit(X_train_part, y_train_part)

            if regress == True:
                if blending == True:
                    train_oof[oof_colName] += model.predict(X1) / n_splits
                    test_predict[predict_colName] += model.predict(X_test) / n_splits
                else:
                    X_eval_part = X.loc[eval_index]
                    train_oof[oof_colName].loc[eval_index] = model.predict(X_eval_part)
                    test_predict[predict_colName] += model.predict(X_test) / n_splits

            else:
                if blending == True:
                    train_oof[oof_colName] += model.predict_proba(X1)[:, 1] / n_splits
                    test_predict[predict_colName] += model.predict_proba(X_test)[:, 1] / n_splits
                else:
                    X_eval_part = X.loc[eval_index]
                    train_oof[oof_colName].loc[eval_index] = model.predict_proba(X_eval_part)[:, 1]
                    test_predict[predict_colName] += model.predict_proba(X_test)[:, 1] / n_splits

    if blending == True:
        train_oof[y1.name] = y1
    else:
        train_oof[y.name] = y
        
    return train_oof, test_predict

In [6]:
estimators = [("rf", rf_reg),
              ("gbdt", gbdt_reg),
              ("xgb", xgb_reg)
             ]   

In [7]:
train_oof, test_predict = train_cross(X_train, y_train, X_test, estimators, regress=True)

In [8]:
from sklearn.linear_model import LinearRegression
lr_reg = LinearRegression().fit(np.array(train_oof.iloc[:, :3]), y_train)
lr_train_prediction = lr_reg.predict(np.array(train_oof.iloc[:, :3]))
lr_test_prediction = lr_reg.predict(np.array(test_predict))
print('The results of LR-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(lr_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(lr_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(lr_train_prediction, y_train), 
                                      R2(lr_test_prediction, y_test)))

from sklearn.linear_model import Ridge
Ridge_reg = Ridge().fit(train_oof.iloc[:, :3], y_train)
Ridge_train_prediction = Ridge_reg.predict(np.array(train_oof.iloc[:, :3]))
Ridge_test_prediction = Ridge_reg.predict(np.array(test_predict))
print('The results of Ridge-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(Ridge_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(Ridge_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(Ridge_train_prediction, y_train), 
                                      R2(Ridge_test_prediction, y_test)))

from sklearn.linear_model import Lasso
Lasso_reg = Lasso().fit(train_oof.iloc[:, :3], y_train)
Lasso_train_prediction = Lasso_reg.predict(np.array(train_oof.iloc[:, :3]))
Lasso_test_prediction = Lasso_reg.predict(np.array(test_predict))
print('The results of Lasso-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(Lasso_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(Lasso_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(Lasso_train_prediction, y_train), 
                                      R2(Lasso_test_prediction, y_test)))


from sklearn.linear_model import ElasticNet
ElasticNet_reg = ElasticNet().fit(train_oof.iloc[:, :3], y_train)
ElasticNet_train_prediction = ElasticNet_reg.predict(np.array(train_oof.iloc[:, :3]))
ElasticNet_test_prediction = ElasticNet_reg.predict(np.array(test_predict))
print('The results of ElasticNet-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(ElasticNet_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(ElasticNet_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(ElasticNet_train_prediction, y_train), 
                                      R2(ElasticNet_test_prediction, y_test)))

from sklearn.linear_model import BayesianRidge
BayesianRidge_reg = BayesianRidge().fit(train_oof.iloc[:, :3], y_train)
BayesianRidge_train_prediction = BayesianRidge_reg.predict(np.array(train_oof.iloc[:, :3]))
BayesianRidge_test_prediction = BayesianRidge_reg.predict(np.array(test_predict))
print('The results of BayesianRidge-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(BayesianRidge_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(BayesianRidge_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(BayesianRidge_train_prediction, y_train), 
                                      R2(BayesianRidge_test_prediction, y_test)))

from sklearn.svm import SVR
SVR_reg = SVR().fit(train_oof.iloc[:, :3], y_train)
SVR_train_prediction = SVR_reg.predict(np.array(train_oof.iloc[:, :3]))
SVR_test_prediction = SVR_reg.predict(np.array(test_predict))
print('The results of SVR-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(SVR_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(SVR_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(SVR_train_prediction, y_train), 
                                      R2(SVR_test_prediction, y_test)))

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor().fit(train_oof.iloc[:, :3], y_train)
tree_train_prediction = tree_reg.predict(np.array(train_oof.iloc[:, :3]))
tree_test_prediction = tree_reg.predict(np.array(test_predict))
print('The results of tree-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(tree_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(tree_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(tree_train_prediction, y_train), 
                                      R2(tree_test_prediction, y_test)))


from sklearn.ensemble import BaggingRegressor
bagging_reg = BaggingRegressor().fit(train_oof.iloc[:, :3], y_train)
bagging_train_prediction = bagging_reg.predict(np.array(train_oof.iloc[:, :3]))
bagging_test_prediction = bagging_reg.predict(np.array(test_predict))
print('The results of bagging-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(bagging_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(bagging_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(bagging_train_prediction, y_train), 
                                      R2(bagging_test_prediction, y_test)))

RFR = RandomForestRegressor().fit(train_oof.iloc[:, :3], y_train)
RFR_train_prediction = RFR.predict(np.array(train_oof.iloc[:, :3]))
RFR_test_prediction = RFR.predict(np.array(test_predict))
print('The results of RFR-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(RFR_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(RFR_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(RFR_train_prediction, y_train), 
                                      R2(RFR_test_prediction, y_test)))

from sklearn.ensemble import AdaBoostRegressor
ABR = AdaBoostRegressor().fit(train_oof.iloc[:, :3], y_train)
ABR_train_prediction = ABR.predict(np.array(train_oof.iloc[:, :3]))
ABR_test_prediction = ABR.predict(np.array(test_predict))
print('The results of ABR-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(ABR_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(ABR_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(ABR_train_prediction, y_train), 
                                      R2(ABR_test_prediction, y_test)))

from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor().fit(train_oof.iloc[:, :3], y_train)
GBR_train_prediction = GBR.predict(np.array(train_oof.iloc[:, :3]))
GBR_test_prediction = GBR.predict(np.array(test_predict))
print('The results of GBR-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(GBR_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(GBR_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(GBR_train_prediction, y_train), 
                                      R2(GBR_test_prediction, y_test)))

from xgboost import XGBRegressor
XGB = XGBRegressor().fit(train_oof.iloc[:, :3], y_train)
XGB_train_prediction = XGB.predict(np.array(train_oof.iloc[:, :3]))
XGB_test_prediction = XGB.predict(np.array(test_predict))
print('The results of XGB-final:')
print('Train-RMSE: %f, Test-RMSE: %f' % (np.sqrt(mean_squared_error(XGB_train_prediction, y_train)), 
                                       np.sqrt(mean_squared_error(XGB_test_prediction, y_test))))
print('Train-R2: %f, Test-R2: %f' % (R2(XGB_train_prediction, y_train), 
                                      R2(XGB_test_prediction, y_test)))

The results of LR-final:
Train-RMSE: 0.028437, Test-RMSE: 0.028163
Train-R2: 0.985828, Test-R2: 0.985362
The results of Ridge-final:
Train-RMSE: 0.028741, Test-RMSE: 0.028941
Train-R2: 0.985375, Test-R2: 0.984372
The results of Lasso-final:
Train-RMSE: 0.240560, Test-RMSE: 0.237131
Train-R2: 0.000000, Test-R2: 0.000000
The results of ElasticNet-final:
Train-RMSE: 0.240560, Test-RMSE: 0.237131
Train-R2: 0.000000, Test-R2: 0.000000
The results of BayesianRidge-final:
Train-RMSE: 0.028437, Test-RMSE: 0.028172
Train-R2: 0.985826, Test-R2: 0.985351
The results of SVR-final:
Train-RMSE: 0.036866, Test-RMSE: 0.035137
Train-R2: 0.975324, Test-R2: 0.976590
The results of tree-final:
Train-RMSE: 0.001040, Test-RMSE: 0.038844
Train-R2: 0.999981, Test-R2: 0.972491
The results of bagging-final:
Train-RMSE: 0.013167, Test-RMSE: 0.032081
Train-R2: 0.996974, Test-R2: 0.980812
The results of RFR-final:
Train-RMSE: 0.011488, Test-RMSE: 0.031405
Train-R2: 0.997696, Test-R2: 0.981554
The results of ABR-fi