# About this notebook

This notebook fits an XGBoost regression model on all pre-processed features of the training data and evaluates it on a held-out split.

## Retrieve Dataset

In [1]:
# Libraries needed to load and transform the data
import numpy as np
import pandas as pd

# Build the modelling frame in a single chain:
#   1. read the pre-processed training data
#   2. drop the 'Unnamed: 0' and 'Id' columns
#   3. drop the rows labelled 907 and 465 (treated as outliers)
#   4. replace SalePrice with its natural log
df = (
    pd.read_csv('./data/train_model.csv')
    .drop(['Unnamed: 0', 'Id'], axis='columns')
    .drop([907, 465], axis='index')
    .assign(SalePrice=lambda frame: np.log(frame['SalePrice']))
)

df.head(5)

Unnamed: 0,LotFrontage,LotArea,YearBuilt,GrLivArea,YrSold,SalePrice,MSZoning_RL,MSZoning_RMH,Alley_NoAccess,LotShape_Regular,...,OpenPorchSF_OpenPorch_Yes,Fence_no_fence,MoSold_peak_months,SaleType_non_deed,SaleCondition_not_normal,IsVinyl_yes,FullBaths_<=1FullBath,FullBaths_>=3FullBaths,HalfBaths_>=1HalfBaths,Enclosed_combined_EnclosedPorch_Yes
0,65.0,8450,7,1710,2008,12.247694,1,0,1,1,...,1,1,0,0,0,1,0,1,1,0
1,80.0,9600,34,1262,2007,12.109011,1,0,1,1,...,0,1,1,0,0,0,0,0,1,0
2,68.0,11250,9,1786,2008,12.317167,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0
3,60.0,9550,95,1717,2006,11.849398,1,0,1,0,...,1,1,0,0,1,0,0,0,0,1
4,84.0,14260,10,2198,2008,12.429216,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0


### Rename the columns whose names contain the `>` or `<` symbol, because XGBoost does not accept such symbols in feature names

In [2]:
# Old name -> new name; each old name contains a '<' or '>' character,
# which is spelled out as "atmost" / "atleast" in the replacement.
comparison_renames = {
    'BedroomAbvGr_>=4Bedr': 'BedroomAbvGr_atleast4',
    'KitchenAbvGr_>=2Ktchn': 'KitchenAbvGr_atleast2',
    'TotRmsAbvGrd_<=4TotRms': 'TotRmsAbvGrd_atmost4',
    'TotRmsAbvGrd_>=8TotRms': 'TotRmsAbvGrd_atleast8',
    'FullBaths_>=3FullBaths': 'FullBaths_atleast3',
    'HalfBaths_>=1HalfBaths': 'HalfBaths_atleast1',
    'FullBaths_<=1FullBath': 'FullBaths_atmost1',
}

# Apply the mapping to the column labels, modifying df in place.
df.rename(columns=comparison_renames, inplace=True)

In [3]:
# Target: the SalePrice column of df
logsaleprice = df['SalePrice']

# Predictors: every column of df except SalePrice
features = df.drop('SalePrice', axis='columns')

In [4]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(features, logsaleprice, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [5]:
from xgboost import XGBRegressor

# Fit a regressor with the library's default settings on the training split.
xgb = XGBRegressor().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
xgb

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [6]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows and score against the held-out target.
xgb_pred = xgb.predict(X_test)
test_mse = mean_squared_error(y_test, xgb_pred)
test_rmse = np.sqrt(test_mse)

print("MSE: ", round(test_mse, 3))
print("RMSE: ", round(test_rmse, 3))

MSE:  0.022
RMSE:  0.147
