In [50]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import xgboost as xgb
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
# importing data
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)
# data processing
train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass','MoSold'],
                                             remove_PID = False
                                        )

# feature engineering wrapper
train, test = helper.feature_engineering_wrapper(train, test)

# importing school feature
schools = pd.read_csv('schoolFeatures.csv',index_col = 0)
school_keep = [
    'PID',
    'closestSchool'
]
schools = schools[school_keep]

# merge school feature onto original data set.
train = train.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')
test = test.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')

train = train.dropna(subset=['closestSchool'])
train = train.reset_index(drop=True)

test = test.dropna(subset=['closestSchool'])
test = test.reset_index(drop=True)

In [4]:
train_raw = train.copy()
test_raw = test.copy()

X_train = train_raw.drop(['SalePrice','PID'],axis='columns')
y_train = np.log(train_raw['SalePrice'])
X_test = test_raw.drop(['SalePrice','PID'],axis='columns')
y_test = np.log(test_raw['SalePrice'])

In [5]:
categorical = train.select_dtypes(['object','bool']).columns.to_list()

# XGBoost without Bayesian optimization (OneHotEncoder)

In [7]:
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                remainder='passthrough')

X = transformer.fit_transform(X_train)
y = y_train

boost = xgb.XGBRegressor()

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

scores0 = cross_val_score(boost, X, y, scoring='r2', cv=cv, n_jobs=-1)
scores1 = cross_val_score(boost, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

scores1 = abs(scores1)

print('R2 mean: %.3f (R2 sd: %.3f)' % (scores0.mean(), scores0.std()))
print('RMSE mean: %.3f (RMSE sd: %.3f)' % (scores1.mean(), scores1.std()))

R2 mean: 0.922 (R2 sd: 0.010)
RMSE mean: 0.106 (RMSE sd: 0.005)


In [8]:
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                remainder='passthrough')

X = transformer.fit_transform(X_train)
y = y_train

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=5, n_splits=4, random_state=42),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
             

In [9]:
grid.cv_results_['mean_test_score']

array([-7.50105689e+02, -1.12426350e+02,  9.30484861e-01, -7.50105689e+02,
       -1.12426350e+02,  9.29670915e-01, -4.53980027e+02,  1.06522988e-01,
        9.10997924e-01, -4.53980027e+02,  1.06522988e-01,  9.10551285e-01,
       -2.31817926e+02,  8.84795615e-01,  8.89527814e-01, -2.31817926e+02,
        8.84538672e-01,  8.88787510e-01])

In [10]:
print(grid.best_score_)
print(grid.best_params_)

0.9304848609458374
{'eta': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [13]:
split_scores_best_params = [grid.cv_results_[f'split{j}_test_score'][grid.best_index_] for j in range(20)]
split_scores_best_params

[0.9266094732669216,
 0.925089886701356,
 0.9367783628307563,
 0.9350228257994281,
 0.931905393955254,
 0.924291910720867,
 0.9390481513818065,
 0.928715176330204,
 0.9278364755762685,
 0.93738961631268,
 0.9349303235003271,
 0.930556154633325,
 0.9256630413554444,
 0.9400218219750167,
 0.9323589550484906,
 0.9114086982332865,
 0.9254541323572523,
 0.9386657643149838,
 0.9295163009874403,
 0.9284347536356401]

In [14]:
np.mean(split_scores_best_params)

0.9304848609458374

In [16]:
grid.score(X, y)

0.9999628571398433

In [17]:
X = transformer.transform(X_test)
y = y_test
grid.score(X, y)

0.917702155202159

# XGBoost without Bayesian optimization (OrdinalEncoder)

In [19]:
transformer = ColumnTransformer([("Cat", OrdinalEncoder(), categorical)], 
                                remainder='passthrough')

X = transformer.fit_transform(X_train)
y = y_train

boost = xgb.XGBRegressor()

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

scores0 = cross_val_score(boost, X, y, scoring='r2', cv=cv, n_jobs=-1)
scores1 = cross_val_score(boost, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

scores1 = abs(scores1)

print('R2 mean: %.3f (R2 sd: %.3f)' % (scores0.mean(), scores0.std()))
print('RMSE mean: %.3f (RMSE sd: %.3f)' % (scores1.mean(), scores1.std()))

R2 mean: 0.922 (R2 sd: 0.007)
RMSE mean: 0.105 (RMSE sd: 0.004)


In [28]:
transformer = ColumnTransformer([("Cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), categorical)], 
                                remainder='passthrough')

X = transformer.fit_transform(X_train)
y = y_train

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=5, n_splits=4, random_state=42),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
             

In [29]:
grid.cv_results_['mean_test_score']

array([-7.50105689e+02, -1.12426350e+02,  9.30520471e-01, -7.50105689e+02,
       -1.12426350e+02,  9.30222396e-01, -4.53980027e+02,  1.08390614e-01,
        9.10748454e-01, -4.53980027e+02,  1.08390614e-01,  9.10117497e-01,
       -2.31817926e+02,  8.83535228e-01,  8.88583313e-01, -2.31817926e+02,
        8.83013537e-01,  8.88483969e-01])

In [30]:
print(grid.best_score_)
print(grid.best_params_)

0.9305204707917465
{'eta': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [31]:
split_scores_best_params = [grid.cv_results_[f'split{j}_test_score'][grid.best_index_] for j in range(20)]
split_scores_best_params

[0.9265817339741325,
 0.9237718138392742,
 0.9398831615418026,
 0.9370476680173376,
 0.9285377837231272,
 0.9274072439206237,
 0.9403541504010273,
 0.9293170803093198,
 0.9261413240532121,
 0.9354011956047735,
 0.9384689028196186,
 0.9277287444140184,
 0.9238587546919652,
 0.9415214193446647,
 0.930387606686341,
 0.9142584857194735,
 0.9238715647013296,
 0.9394484547849568,
 0.9312522684814525,
 0.9251700588064814]

In [32]:
np.mean(split_scores_best_params)

0.9305204707917465

In [33]:
grid.score(X, y)

0.9999658058887606

In [34]:
X = transformer.transform(X_test)
y = y_test
grid.score(X, y)

0.9179438632948476

# XGBoost with Bayesian Optimization (OneHotEncoder)

In [35]:
xgb.set_config(verbosity=0)

In [36]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate):
    params = {'max_depth':int(max_depth), 
              'gamma':gamma, 
             'n_estimators':int(n_estimators), 
             'learning_rate':learning_rate,
             'subsample': 0.8,
             'eta':0.1,
             'eval_metric':'rmse'}
    cv_result = xgb.cv(params, dtrain, nfold=5, seed=42)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [37]:
transformer = ColumnTransformer([('Cat',OneHotEncoder(handle_unknown='ignore'), categorical)], remainder='passthrough')   
X = transformer.fit_transform(X_train)

y = y_train

dtrain = xgb.DMatrix(X, y, enable_categorical=True)



xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth':(3,10), 
                                           'gamma':(0,1),
                                           'learning_rate':(0,1),
                                           'n_estimators':(100,120)},
                             random_state = 42)

xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1466  [0m | [0m 0.3745  [0m | [0m 0.9507  [0m | [0m 8.124   [0m | [0m 112.0   [0m |
| [0m 2       [0m | [0m-2.133   [0m | [0m 0.156   [0m | [0m 0.156   [0m | [0m 3.407   [0m | [0m 117.3   [0m |
| [95m 3       [0m | [95m-0.1416  [0m | [95m 0.6011  [0m | [95m 0.7081  [0m | [95m 3.144   [0m | [95m 119.4   [0m |
| [0m 4       [0m | [0m-1.082   [0m | [0m 0.8324  [0m | [0m 0.2123  [0m | [0m 4.273   [0m | [0m 103.7   [0m |
| [95m 5       [0m | [95m-0.129   [0m | [95m 0.3042  [0m | [95m 0.5248  [0m | [95m 6.024   [0m | [95m 105.8   [0m |
| [0m 6       [0m | [0m-2.583   [0m | [0m 0.6119  [0m | [0m 0.1395  [0m | [0m 5.045   [0m | [0m 107.3   [0m |
| [0m 7       [0m | [0m-0.1439  [0m | [0m 0.4561  [0m | [0m 0.7852  [0m | [0m 4.398   [0m | [0m 1

In [38]:
params = xgb_bo.max['params']
print(params)

{'gamma': 0.3042422429595377, 'learning_rate': 0.5247564316322378, 'max_depth': 6.023615130494811, 'n_estimators': 105.82458280396084}


In [39]:
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

In [40]:
boost = xgb.XGBRegressor(**params).fit(X,y)
predicts = boost.predict(X)

In [41]:
r2_score(y,predicts)

0.931204587530553

In [42]:
X = transformer.transform(X_test)
predicts = boost.predict(X)
r2_score(y_test,predicts)

0.8871838885112392

# XGBoost with Bayesian Optimization (OrdinalEncoder)

In [43]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate):
    params = {'max_depth':int(max_depth), 
              'gamma':gamma, 
             'n_estimators':int(n_estimators), 
             'learning_rate':learning_rate,
             'subsample': 0.8,
             'eta':0.1,
             'eval_metric':'rmse'}
    cv_result = xgb.cv(params, dtrain, nfold=5, seed=42)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [44]:
transformer = ColumnTransformer([('Cat',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), categorical)], remainder='passthrough')   
X = transformer.fit_transform(X_train)

y = y_train

dtrain = xgb.DMatrix(X, y, enable_categorical=True)



xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth':(3,10), 
                                           'gamma':(0,1),
                                           'learning_rate':(0,1),
                                           'n_estimators':(100,120)},
                             random_state = 42)

xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1452  [0m | [0m 0.3745  [0m | [0m 0.9507  [0m | [0m 8.124   [0m | [0m 112.0   [0m |
| [0m 2       [0m | [0m-2.133   [0m | [0m 0.156   [0m | [0m 0.156   [0m | [0m 3.407   [0m | [0m 117.3   [0m |
| [95m 3       [0m | [95m-0.1416  [0m | [95m 0.6011  [0m | [95m 0.7081  [0m | [95m 3.144   [0m | [95m 119.4   [0m |
| [0m 4       [0m | [0m-1.082   [0m | [0m 0.8324  [0m | [0m 0.2123  [0m | [0m 4.273   [0m | [0m 103.7   [0m |
| [95m 5       [0m | [95m-0.1285  [0m | [95m 0.3042  [0m | [95m 0.5248  [0m | [95m 6.024   [0m | [95m 105.8   [0m |
| [0m 6       [0m | [0m-2.583   [0m | [0m 0.6119  [0m | [0m 0.1395  [0m | [0m 5.045   [0m | [0m 107.3   [0m |
| [0m 7       [0m | [0m-0.1438  [0m | [0m 0.4561  [0m | [0m 0.7852  [0m | [0m 4.398   [0m | [0m 1

In [45]:
params = xgb_bo.max['params']
print(params)

{'gamma': 0.26217155174161544, 'learning_rate': 0.49115317431045047, 'max_depth': 5.93465859048152, 'n_estimators': 105.82458656462529}


In [46]:
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

In [47]:
boost = xgb.XGBRegressor(**params).fit(X,y)
predicts = boost.predict(X)

In [48]:
r2_score(y,predicts)

0.9310885465150568

In [49]:
X = transformer.transform(X_test)
predicts = boost.predict(X)
r2_score(y_test,predicts)

0.8856938356468382

## Tried tweaking best result so far (no Bayesian with ordinal encoder) but results horrible.

### Instead we will try using the reduced number of features. 

In [71]:
coef_df = pd.read_csv('lasso_coef.csv',index_col=0)

In [72]:
selected_features = list(coef_df['features'])

In [73]:
selected_features

['GrLivArea_square_root',
 'year_since_built_square_root',
 'total_sf_square_root',
 'overall_score_square_root',
 'LotArea_square_root',
 'total_high_qual_finished_sf',
 'OverallQual_cubed',
 'Neighborhood_log_comp',
 'BsmtExposure_cubed',
 'GarageCars',
 'Neighborhood_Crawfor',
 'KitchenQual_cubed',
 'GarageArea',
 'MSZoning_log_comp',
 'YearRemodAdd',
 'Exterior1st_BrkFace',
 'Functional_Typ',
 'Condition1_Norm',
 'MSSubClass_30',
 'Neighborhood_BrkSide',
 'OverallCond',
 'Fireplaces',
 'BsmtFinSF1',
 'Neighborhood_SawyerW',
 'ScreenPorch',
 'PavedDrive',
 'closestSchool_Ames Middle School',
 'HeatingQC',
 'KitchenAbvGr',
 'closestSchool_Abbie Sawyer',
 'Exterior1st_PreCast',
 'BldgType_1Fam',
 'closestSchool_Ames High School',
 'total_deck_sf',
 'MSSubClass_160',
 'FireplaceQu',
 'Functional_Maj2',
 'BsmtFullBath',
 'SaleCondition_Normal',
 'BsmtFinType1_GLQ',
 'Foundation_BrkTil',
 'Neighborhood_CollgCr',
 'CentralAir_N',
 'MasVnrArea',
 'Neighborhood_MeadowV',
 'Neighborhood_Ston

In [74]:
cat_feats = X_train.select_dtypes(['object']).columns.to_list()
num_feats = X_train.select_dtypes(['int','float']).columns.to_list()

In [82]:
X_train.shape

(1856, 127)

In [78]:
X_train_transformed.shape

(1856, 323)

In [79]:
len(new_columns)

263

In [80]:
print(len(cat_feats))
print(len(num_feats))

32
35


In [81]:
len(list(columns_transformed))

228

In [84]:
X_train.dtypes.to_list()

[dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('int64'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('O'),
 dtype('float64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int

In [85]:
list(columns_transformed)

['MSSubClass_120',
 'MSSubClass_150',
 'MSSubClass_160',
 'MSSubClass_180',
 'MSSubClass_190',
 'MSSubClass_20',
 'MSSubClass_30',
 'MSSubClass_40',
 'MSSubClass_45',
 'MSSubClass_50',
 'MSSubClass_60',
 'MSSubClass_70',
 'MSSubClass_75',
 'MSSubClass_80',
 'MSSubClass_85',
 'MSSubClass_90',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_I (all)',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Street_Grvl',
 'Street_Pave',
 'LotShape_IR1',
 'LotShape_IR2',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_Bnk',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'Utilities_AllPub',
 'LotConfig_Corner',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LotConfig_Inside',
 'LandSlope_Gtl',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_Greens'

In [75]:
preprocessor = ColumnTransformer(transformers=[
    ('tf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_feats)],remainder='passthrough')

X_train_transformed = preprocessor.fit_transform(X_train)

columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features = cat_feats)
new_columns = list(columns_transformed) + num_feats

X_train_transformed = pd.DataFrame(X_train_transformed,columns=new_columns)

X_test_transformed = preprocessor.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed,columns=new_columns)

ValueError: Shape of passed values is (1856, 323), indices imply (1856, 263)

In [None]:
X_train = X_train_transformed[selected_features]
X_test = X_test_transformed[selected_features]

In [8]:
X = X_train
y = y_train

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=5, n_splits=4, random_state=42),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
             

In [9]:
grid.cv_results_['mean_test_score']

array([-7.50105689e+02, -1.12426350e+02,  9.30484861e-01, -7.50105689e+02,
       -1.12426350e+02,  9.29670915e-01, -4.53980027e+02,  1.06522988e-01,
        9.10997924e-01, -4.53980027e+02,  1.06522988e-01,  9.10551285e-01,
       -2.31817926e+02,  8.84795615e-01,  8.89527814e-01, -2.31817926e+02,
        8.84538672e-01,  8.88787510e-01])

In [10]:
print(grid.best_score_)
print(grid.best_params_)

0.9304848609458374
{'eta': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [13]:
split_scores_best_params = [grid.cv_results_[f'split{j}_test_score'][grid.best_index_] for j in range(20)]
split_scores_best_params

[0.9266094732669216,
 0.925089886701356,
 0.9367783628307563,
 0.9350228257994281,
 0.931905393955254,
 0.924291910720867,
 0.9390481513818065,
 0.928715176330204,
 0.9278364755762685,
 0.93738961631268,
 0.9349303235003271,
 0.930556154633325,
 0.9256630413554444,
 0.9400218219750167,
 0.9323589550484906,
 0.9114086982332865,
 0.9254541323572523,
 0.9386657643149838,
 0.9295163009874403,
 0.9284347536356401]

In [14]:
np.mean(split_scores_best_params)

0.9304848609458374

In [16]:
grid.score(X, y)

0.9999628571398433

In [17]:
X = transformer.transform(X_test)
y = y_test
grid.score(X, y)

0.917702155202159