# **Modeling 2**
This notebook uses a stacking ensemble to reduce the model's RMSE.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

import re
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, SplineTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin 

#metrics
from sklearn.metrics import mean_squared_error
#model selection
from sklearn.model_selection import cross_val_score, GridSearchCV

#load preprocessed dataset:
import joblib
#models
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

In [2]:
# Load the pre-transformed training features that were saved as a joblib pickle.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by a trusted pipeline.
X = joblib.load("transformed_train.pkl")
# Display the shape of the loaded object as a quick sanity check.
X.shape

(2471, 80)

In [3]:
# Folder that holds the competition CSV files (relative to the working directory).
input_folder = "kaggle/input/linking-writing-processes-to-writing-quality/"

# Read the score table; row 0 of the file supplies the column names.
train_scores = pd.read_csv(
    input_folder + "train_scores.csv",
    sep=",",
    header=0,
)

# Re-key the 'score' column by the 'id' column so a score can be looked up by id.
scores = pd.Series(
    train_scores["score"].values,
    index=train_scores["id"].values,
    name="score",
)

In [4]:
# Target vector for the regressors: the score values as a plain NumPy array.
Y = scores.values

# Fail fast if features and targets cannot be paired row-for-row.
# NOTE(review): this only verifies the row count; it assumes the rows of X are in
# the same order as train_scores.csv -- confirm against the preprocessing step.
assert Y.shape[0] == X.shape[0], (
    "X has {} rows but Y has {} scores".format(X.shape[0], Y.shape[0])
)

In [5]:
# display scores
def display_scores(rmse_neg_score):
    """Print RMSE values together with their mean and sample standard deviation.

    Parameters
    ----------
    rmse_neg_score : array-like of float
        Negated mean-squared-error values (for example the per-fold output of a
        scorer using ``scoring="neg_mean_squared_error"``). Despite the
        parameter name these are negated MSEs, not RMSEs: each value is negated
        and square-rooted here to obtain an RMSE.

    Returns
    -------
    None
        The three summary lines are printed, nothing is returned.
    """
    # asarray lets plain lists/tuples through; unary minus on a list would raise.
    fold_rmse = np.sqrt(-np.asarray(rmse_neg_score, dtype=float))
    n_scores = fold_rmse.size

    nan = float("nan")
    # The mean of zero values and the sample SD (ddof=1) of fewer than two values
    # are undefined; report nan directly instead of triggering NumPy warnings.
    mean_rmse = fold_rmse.mean() if n_scores > 0 else nan
    sd_rmse = fold_rmse.std(ddof=1) if n_scores > 1 else nan

    print("score: {}".format(fold_rmse))
    print("Mean rmse: {}".format(mean_rmse))
    print("Sd rmse: {}".format(sd_rmse))
    

In [6]:
# Define the hyperparameter grid
# Candidate XGBRegressor settings: 1 * 1 * 3 * 3 * 3 * 5 = 135 combinations.
param_grid = dict(
    tree_method=["hist"],
    device=["cpu"],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
    gamma=[0, .001, .01, 0.1, 1],
)

# Base estimator whose hyperparameters are searched.
model = xgb.XGBRegressor()
# Alternative base estimator (bagged XGBoost), kept for reference:
# model = BaggingRegressor(estimator = xgb.XGBRegressor(tree_method = "hist", device = 'cpu'),
#                          random_state = 11, max_samples = .4, max_features = 0.6, n_estimators = 100)

# Exhaustive 5-fold search scored by negated MSE (higher is better), run on all cores.
grid_search_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=3,
)


In [7]:
# grid_search_model.fit(X,Y)

In [8]:
# print(grid_search_model.best_params_)
# predictions = grid_search_model.best_estimator_.predict(X)
# rmse = np.sqrt(mean_squared_error(predictions, Y))
# print("RMSE: {}".format(rmse))

In [9]:
# Best parameters found by the cross-validated grid search:
# {'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 1, 'tree_method': 'hist'}
# RMSE of the best estimator, computed on the training data: 0.5573103243244937

In [10]:
# The model below scores an RMSE of 0.6500 on the leaderboard; when executed here, the RMSE is 0.58.
# model = BaggingRegressor(estimator = xgb.XGBRegressor(tree_method = "hist", device = 'cpu', learning_rate = 0.1,
#                                                      subsample = 1, max_depth = 3),random_state = 11, 
#                          max_samples = .4, max_features = 0.6, n_estimators = 50, bootstrap = False)

In [15]:
# Bagged ensemble of 50 XGBoost regressors. With bootstrap=False each member is
# fit on 40% of the rows drawn without replacement, using 60% of the columns.
model = BaggingRegressor(
    estimator=xgb.XGBRegressor(
        tree_method="hist",
        device="cpu",
        learning_rate=0.1,
        subsample=1,
        max_depth=3,
    ),
    n_estimators=50,
    max_samples=.4,
    max_features=0.6,
    bootstrap=False,
    random_state=11,  # fixed seed so the row/column subsets are reproducible
    n_jobs=-1,        # fit the ensemble members on all available cores
)

In [16]:
# Fit on the full training set, then predict the very same rows.
# NOTE: the figure printed below is therefore an in-sample (training) RMSE and is
# an optimistic estimate of out-of-sample error; use cross-validation for an
# honest estimate.
model.fit(X, Y)
predictions = model.predict(X)

# scikit-learn's signature is mean_squared_error(y_true, y_pred): truth goes first.
rmse = np.sqrt(mean_squared_error(Y, predictions))
print("RMSE: {}".format(rmse))

RMSE: 0.5979054943501494


In [17]:
# cv_result = pd.DataFrame(grid_search_model.cv_results_)
# cv_result.sort_values(by = "rank_test_score").to_csv("xgboost_grid_csv.csv",index = False)