In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import xgboost as xgb
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
# importing data
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)
# data processing
train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass','MoSold'],
                                             remove_PID = False
                                        )

# feature engineering wrapper
train, test = helper.feature_engineering_wrapper(train, test)

# importing school feature
schools = pd.read_csv('schoolFeatures.csv',index_col = 0)
school_keep = [
    'PID',
    'closestSchool'
]
schools = schools[school_keep]

# merge school feature onto original data set.
train = train.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')
test = test.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')

train = train.dropna(subset=['closestSchool'])
train = train.reset_index(drop=True)

test = test.dropna(subset=['closestSchool'])
test = test.reset_index(drop=True)

In [4]:
train_raw = train.copy()
test_raw = test.copy()

X_train = train_raw.drop(['SalePrice','PID'],axis='columns')
y_train = np.log(train_raw['SalePrice'])
X_test = test_raw.drop(['SalePrice','PID'],axis='columns')
y_test = np.log(test_raw['SalePrice'])

In [5]:
coef_df = pd.read_csv('lasso_coef.csv',index_col=0)

In [6]:
selected_features = list(coef_df['features'])

In [7]:
cat_feats = X_train.select_dtypes(['object']).columns.to_list()
num_feats = X_train.select_dtypes(['int64','float64']).columns.to_list()

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ('tf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_feats)],remainder='passthrough')

X_train_transformed = preprocessor.fit_transform(X_train)

columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features = cat_feats)
new_columns = list(columns_transformed) + num_feats

X_train_transformed = pd.DataFrame(X_train_transformed,columns=new_columns)

X_test_transformed = preprocessor.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed,columns=new_columns)

In [9]:
X_train = X_train_transformed[selected_features]
X_test = X_test_transformed[selected_features]

# XGBoost without Bayesian optimization

In [15]:
X = X_train
y = y_train

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

print(grid.best_score_)
print(grid.best_params_)

0.9316554588904126
{'eta': 0.1, 'max_depth': 50, 'n_estimators': 100}


In [16]:
grid.score(X, y)

0.9999678413571278

In [18]:
X = X_test
y = y_test
grid.score(X, y)

0.9143127321077821

# XGBoost with Bayesian Optimization

In [19]:
xgb.set_config(verbosity=0)

In [50]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate):
    
    param_grid = {'max_depth':int(max_depth), 
                  'gamma':gamma, 
                  'n_estimators':int(n_estimators), 
                  'learning_rate':learning_rate,
                  'subsample':0.8,
                  'eta':0.1}
    
    boost = xgb.XGBRegressor()
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    cross = cross_val_score(boost, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1)
            
    return cross.mean()

In [51]:
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth':(5,60), 
                                           'gamma':(0,1),
                                           'learning_rate':(0,1),
                                           'n_estimators':(90,120)},
                             random_state = 42)

xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9263  [0m | [0m 0.3745  [0m | [0m 0.9507  [0m | [0m 45.26   [0m | [0m 108.0   [0m |
| [0m 2       [0m | [0m 0.9263  [0m | [0m 0.156   [0m | [0m 0.156   [0m | [0m 8.195   [0m | [0m 116.0   [0m |
| [0m 3       [0m | [0m 0.9263  [0m | [0m 0.6011  [0m | [0m 0.7081  [0m | [0m 6.132   [0m | [0m 119.1   [0m |
| [0m 4       [0m | [0m 0.9263  [0m | [0m 0.8324  [0m | [0m 0.2123  [0m | [0m 15.0    [0m | [0m 95.5    [0m |
| [0m 5       [0m | [0m 0.9263  [0m | [0m 0.3042  [0m | [0m 0.5248  [0m | [0m 28.76   [0m | [0m 98.74   [0m |
| [0m 6       [0m | [0m 0.9263  [0m | [0m 0.6119  [0m | [0m 0.1395  [0m | [0m 21.07   [0m | [0m 101.0   [0m |
| [0m 7       [0m | [0m 0.9263  [0m | [0m 0.4561  [0m | [0m 0.7852  [0m | [0m 15.98   [0m | [0m 105.4   [0m 

In [52]:
params = xgb_bo.max['params']
print(params)

{'gamma': 0.3745401188473625, 'learning_rate': 0.9507143064099162, 'max_depth': 45.25966679962728, 'n_estimators': 107.9597545259111}


In [53]:
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

In [54]:
boost = xgb.XGBRegressor(**params).fit(X_train, y_train)
predicts_train = boost.predict(X_train)

In [55]:
r2_score(y_train, predicts_train)

0.9136974131291594

In [56]:
predicts_test = boost.predict(X_test)
r2_score(y_test, predicts_test)

0.8328442015089369