In [105]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram
from sklearn.svm import LinearSVC, SVC

In [111]:
# train and real_test
train = pd.read_csv("./data/train.csv")
real_test = pd.read_csv("./data/test.csv")

# drop cols with less than 70% info
drop_cols = [col for col in train.columns if train[col].isnull().sum(axis=0)/len(train) >=0.3]
train.drop(drop_cols, axis=1, inplace=True)
real_test.drop(drop_cols, axis=1, inplace=True)

# find numeric and categoric columns 
num_cols = train.select_dtypes(include=np.number).columns
cat_cols = pd.Index(list(set(train.columns)- set(num_cols)))
num_cols = num_cols.drop(['SalePrice'])

# algorithms used
OHE = OneHotEncoder(handle_unknown = "ignore")
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
xgb = XGBRegressor()

# column transformation pipeline
transformer = ColumnTransformer([('cat_cols', OHE, cat_cols),
                                ('num_cols', knn_imputer, num_cols)])
# complete pipeline
pipe = Pipeline([('preprocessing', transformer),
                ('XGBR', xgb)])

# split in train test sets
y = train.SalePrice
X = train.drop(['SalePrice'], axis=1)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)


In [116]:
# Hyperparameter tunning with BayesSearchCV

fit_params = {
    'early_stopping_rounds': 10,
    'eval_set':[(X, y)],
    'verbose': False,
}

from skopt.space import Real, Categorical, Integer
# add XGBR__"hyperparameter" to distinguish 
search_space = {
    'XGBR__max_depth': Integer(0, 50),
    'XGBR__n_estimators': Integer(100, 1000),
    'XGBR__learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'XGBR__gamma': Real(1e-9, 0.5, 'log-uniform'),
    'XGBR__scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
}


    # scoring="neg_mean_squared_error",
opt = BayesSearchCV(
    pipe,
    search_spaces=search_space,
    fit_params=fit_params,
    cv=3,
    random_state=42,
    n_iter=3,
    verbose=1,
)

opt.fit(train_X, train_y)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(test_X, test_y))
print("best params: %s" % str(opt.best_params_))

# is the model overfitted? R2 vs neg_mean_squared_error

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
val. score: 0.8847372550776066
test score: 0.8563900521396924
best params: OrderedDict([('XGBR__gamma', 3.6938736545297165e-06), ('XGBR__learning_rate', 0.28539836866041823), ('XGBR__max_depth', 47), ('XGBR__n_estimators', 384), ('XGBR__scale_pos_weight', 0.6754557459174224)])


In [102]:
# pipe.fit(train_X, train_y)
# pipe.score(test_X, test_y), pipe.score(train_X, train_y)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_cols',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['KitchenQual', 'Functional', 'MasVnrType', 'Neighborhood', 'LandSlope',
       'PavedDrive', 'SaleCondition', 'Street', 'Condition2', 'HouseStyle',
       'CentralAir', 'GarageCond', 'LotConfig', 'BldgType', 'MSZoning',
       'BsmtFinType1', 'HeatingQC', 'GarageFi...
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.300000012, max_delta_step=0,
                              max_depth=6, min_child_weight=1, missing=nan,
                              monotone_constraints='()', n_estimators=100,
                              n_jobs=4, num_parallel_tree=1, predictor='auto',
                              random_state=0, reg_alp

In [118]:
opt.score(test_X, test_y), opt.score(train_X, train_y)

(0.8563900521396924, 0.99999999999994)