## Modeling to get the predictions

In [133]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from xgboost import XGBRegressor

pd.pandas.set_option('display.max_columns', None)

In [134]:
train_data = pd.read_csv('X_train_selected.csv')

In [135]:
y = train_data.SalePrice
x = train_data.drop('SalePrice', axis=1)

In [136]:
# train test split
x_train, x_valid, y_train, y_valid = train_test_split(x,y, train_size=0.8, test_size=0.2, random_state=1)

In [137]:
# Hyperparameter optimization
n_estimators = [100, 500, 900, 1100, 1500,2000,5000,10000]
max_depth = [2, 3, 5, 7,8,10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.01,0.05,0.1,0.15,0.20,0.25]
min_child_weight=[1,2,3,4,5]
base_score=[0.2,0.4,0.6,0.8,1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

#Initialise the xGBRegressor
regressor = XGBRegressor()

In [138]:
# Random Search CV initialiser
random_cv = RandomizedSearchCV(estimator =  regressor,
                                param_distributions = hyperparameter_grid,
                                cv = 5, 
                                n_iter = 50,
                                scoring = 'neg_mean_absolute_error',
                                n_jobs = 5,
                                verbose = 5, 
                                return_train_score = True,
                                random_state = 52)

In [139]:
random_cv.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                   n_iter=50, n_jobs=5,
                   param_distributions={'base_score': [0.2, 0.4, 0.6, 0.8, 1],
                                        'booster': ['gbtree', 'gblinear'

In [140]:
# Get the best estimation
random_cv.best_estimator_

XGBRegressor(base_score=0.6, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=2, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [153]:
# Let's update the regressor with best estimated values

regressor = XGBRegressor(learning_rate=0.01,max_depth=5,n_estimators=2500,n_jobs=10)


In [154]:
# Let's fit the model
regressor.fit(x_train, y_train,
                early_stopping_rounds=34,
                eval_set=[(x_valid, y_valid)])

[0]	validation_0-rmse:11.37803
[1]	validation_0-rmse:11.26402
[2]	validation_0-rmse:11.15115
[3]	validation_0-rmse:11.03941
[4]	validation_0-rmse:10.92879
[5]	validation_0-rmse:10.81928
[6]	validation_0-rmse:10.71086
[7]	validation_0-rmse:10.60353
[8]	validation_0-rmse:10.49727
[9]	validation_0-rmse:10.39207
[10]	validation_0-rmse:10.28793
[11]	validation_0-rmse:10.18483
[12]	validation_0-rmse:10.08276
[13]	validation_0-rmse:9.98171
[14]	validation_0-rmse:9.88167
[15]	validation_0-rmse:9.78264
[16]	validation_0-rmse:9.68459
[17]	validation_0-rmse:9.58753
[18]	validation_0-rmse:9.49144
[19]	validation_0-rmse:9.39631
[20]	validation_0-rmse:9.30213
[21]	validation_0-rmse:9.20890
[22]	validation_0-rmse:9.11660
[23]	validation_0-rmse:9.02523
[24]	validation_0-rmse:8.93476
[25]	validation_0-rmse:8.84521
[26]	validation_0-rmse:8.75681
[27]	validation_0-rmse:8.66929
[28]	validation_0-rmse:8.58265
[29]	validation_0-rmse:8.49688
[30]	validation_0-rmse:8.41196
[31]	validation_0-rmse:8.32789
[32]	

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=2500, n_jobs=10,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [150]:
# Verifying with the validation data and getting the predictions, mean absolute error and root mean squared log error 
predictions = regressor.predict(x_valid)
mae = mean_absolute_error(y_valid, predictions)
print('MAE: {}'.format(mae))

rmsle = np.sqrt(mean_squared_error(np.log(y_valid), np.log(predictions)))
print('RMSLE: {}'.format(rmsle))

MAE: 0.09382401548148712
RMSLE: 0.011636075990609583


In [144]:
# Let's get the test data and predict with our model.
# Then we'll save the it as submissions.csv and submit in the Kaggle

OG_X_test = pd.read_csv('X_test.csv')
x_test = pd.read_csv('X_test_selected.csv')
submit_predictions = regressor.predict(x_test)
scaled_predictions = np.exp(submit_predictions)             # scaling out of log distribution
# Save test predictions to file
output = pd.DataFrame({'Id': OG_X_test.Id,
                       'SalePrice': scaled_predictions})
output.to_csv('submission.csv', index=False)