In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

data = pd.read_csv('train.csv')

In [2]:
# Drop all rows with missing data on column 'SalePrice'
data = data.dropna(axis=0, how='any',subset=['SalePrice'])

In [154]:
y = data.SalePrice
X = data.drop(['SalePrice'],axis=1).select_dtypes(exclude=['object'])

## Missing data handler

In [155]:
from sklearn.preprocessing import Imputer
my_imputer = Imputer()

def missing_data_handler(df):
    cols_with_missing = (col for col in df.columns if df[col].isnull().any())
    for col in cols_with_missing:
        df[col+'_was_missing'] = df[col].isnull()
    return df

In [156]:
X = missing_data_handler(X)

In [16]:
train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)

In [17]:
my_imputer.fit(train_X)
train_X = my_imputer.transform(train_X)
test_X = my_imputer.transform(test_X)

## fit into xgboost model

In [129]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(train_X, train_y, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Evaluate

In [131]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

Mean Absolute Error : 15812.436140839041


## Model Tuning

#### n_estimators & early_stopping_rounds

In [140]:
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(train_X, train_y, early_stopping_rounds=10, 
             eval_set=[(test_X, test_y)], verbose=True)

[0]	validation_0-rmse:183171
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:166324
[2]	validation_0-rmse:151329
[3]	validation_0-rmse:137937
[4]	validation_0-rmse:125763
[5]	validation_0-rmse:114871
[6]	validation_0-rmse:105410
[7]	validation_0-rmse:96790.3
[8]	validation_0-rmse:89062.7
[9]	validation_0-rmse:82159.3
[10]	validation_0-rmse:75693
[11]	validation_0-rmse:70204.6
[12]	validation_0-rmse:65350.7
[13]	validation_0-rmse:61000.3
[14]	validation_0-rmse:57072.3
[15]	validation_0-rmse:53597.5
[16]	validation_0-rmse:50514.2
[17]	validation_0-rmse:47784
[18]	validation_0-rmse:45266.8
[19]	validation_0-rmse:43268.2
[20]	validation_0-rmse:41430.8
[21]	validation_0-rmse:39760.4
[22]	validation_0-rmse:38125.1
[23]	validation_0-rmse:36836.1
[24]	validation_0-rmse:35694.7
[25]	validation_0-rmse:34890.1
[26]	validation_0-rmse:34009.5
[27]	validation_0-rmse:33403.5
[28]	validation_0-rmse:32560
[29]	validation_0-rmse:31748.1
[30]	validation_0-rmse:31231

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [127]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

Mean Absolute Error : 15788.98022260274


#### Learning rate

In [164]:
my_model = XGBRegressor(n_estimators=3000, learning_rate=0.01)
my_model.fit(train_X, train_y, early_stopping_rounds=10, 
             eval_set=[(test_X, test_y)], verbose=True)

[0]	validation_0-rmse:199976
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:198123
[2]	validation_0-rmse:196273
[3]	validation_0-rmse:194443
[4]	validation_0-rmse:192654
[5]	validation_0-rmse:190882
[6]	validation_0-rmse:189130
[7]	validation_0-rmse:187374
[8]	validation_0-rmse:185673
[9]	validation_0-rmse:183973
[10]	validation_0-rmse:182277
[11]	validation_0-rmse:180597
[12]	validation_0-rmse:178965
[13]	validation_0-rmse:177333
[14]	validation_0-rmse:175705
[15]	validation_0-rmse:174128
[16]	validation_0-rmse:172551
[17]	validation_0-rmse:170986
[18]	validation_0-rmse:169423
[19]	validation_0-rmse:167877
[20]	validation_0-rmse:166371
[21]	validation_0-rmse:164871
[22]	validation_0-rmse:163385
[23]	validation_0-rmse:161927
[24]	validation_0-rmse:160463
[25]	validation_0-rmse:159047
[26]	validation_0-rmse:157619
[27]	validation_0-rmse:156214
[28]	validation_0-rmse:154818
[29]	validation_0-rmse:153420
[30]	validation_0-rmse:152068
[31]	validation

[261]	validation_0-rmse:35193.6
[262]	validation_0-rmse:35103.3
[263]	validation_0-rmse:35002.9
[264]	validation_0-rmse:34888.7
[265]	validation_0-rmse:34796.2
[266]	validation_0-rmse:34701.8
[267]	validation_0-rmse:34609.6
[268]	validation_0-rmse:34523.4
[269]	validation_0-rmse:34435.3
[270]	validation_0-rmse:34349.4
[271]	validation_0-rmse:34267.7
[272]	validation_0-rmse:34164.9
[273]	validation_0-rmse:34078.2
[274]	validation_0-rmse:33996.3
[275]	validation_0-rmse:33911.4
[276]	validation_0-rmse:33833
[277]	validation_0-rmse:33754.3
[278]	validation_0-rmse:33673.9
[279]	validation_0-rmse:33595.1
[280]	validation_0-rmse:33497.9
[281]	validation_0-rmse:33425.9
[282]	validation_0-rmse:33347.7
[283]	validation_0-rmse:33255.7
[284]	validation_0-rmse:33181.7
[285]	validation_0-rmse:33111.6
[286]	validation_0-rmse:33040.6
[287]	validation_0-rmse:32970.9
[288]	validation_0-rmse:32899
[289]	validation_0-rmse:32847.3
[290]	validation_0-rmse:32775.1
[291]	validation_0-rmse:32694.5
[292]	valida

[519]	validation_0-rmse:26639.1
[520]	validation_0-rmse:26628.5
[521]	validation_0-rmse:26621.1
[522]	validation_0-rmse:26611.1
[523]	validation_0-rmse:26606.9
[524]	validation_0-rmse:26607
[525]	validation_0-rmse:26604.2
[526]	validation_0-rmse:26593.6
[527]	validation_0-rmse:26595.3
[528]	validation_0-rmse:26588.1
[529]	validation_0-rmse:26581.1
[530]	validation_0-rmse:26574.8
[531]	validation_0-rmse:26558.8
[532]	validation_0-rmse:26551.3
[533]	validation_0-rmse:26538.8
[534]	validation_0-rmse:26534.9
[535]	validation_0-rmse:26528
[536]	validation_0-rmse:26528.7
[537]	validation_0-rmse:26521.5
[538]	validation_0-rmse:26520.9
[539]	validation_0-rmse:26520.7
[540]	validation_0-rmse:26517.9
[541]	validation_0-rmse:26509.1
[542]	validation_0-rmse:26499.9
[543]	validation_0-rmse:26496.2
[544]	validation_0-rmse:26489.7
[545]	validation_0-rmse:26482.5
[546]	validation_0-rmse:26477.8
[547]	validation_0-rmse:26473.5
[548]	validation_0-rmse:26467.2
[549]	validation_0-rmse:26461.7
[550]	valida

[777]	validation_0-rmse:25427.9
[778]	validation_0-rmse:25423.8
[779]	validation_0-rmse:25419.9
[780]	validation_0-rmse:25418.9
[781]	validation_0-rmse:25414.4
[782]	validation_0-rmse:25410
[783]	validation_0-rmse:25403.3
[784]	validation_0-rmse:25401
[785]	validation_0-rmse:25399.4
[786]	validation_0-rmse:25395
[787]	validation_0-rmse:25394.4
[788]	validation_0-rmse:25389.5
[789]	validation_0-rmse:25386.9
[790]	validation_0-rmse:25382.4
[791]	validation_0-rmse:25378.1
[792]	validation_0-rmse:25378.8
[793]	validation_0-rmse:25374.6
[794]	validation_0-rmse:25376.7
[795]	validation_0-rmse:25374.4
[796]	validation_0-rmse:25370.2
[797]	validation_0-rmse:25368.6
[798]	validation_0-rmse:25364.2
[799]	validation_0-rmse:25359.2
[800]	validation_0-rmse:25358.9
[801]	validation_0-rmse:25355.1
[802]	validation_0-rmse:25354.1
[803]	validation_0-rmse:25349.8
[804]	validation_0-rmse:25346
[805]	validation_0-rmse:25343.6
[806]	validation_0-rmse:25340.2
[807]	validation_0-rmse:25338.8
[808]	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=3000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [119]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

Mean Absolute Error : 15663.995869006849


#### n_jobs
set the parameter n_jobs equal to the number of cores

In [157]:
X = my_imputer.fit_transform(X)

In [158]:
my_model = XGBRegressor(n_estimators=952, learning_rate=0.01)
my_model.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=952,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [162]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

Mean Absolute Error : 11401.224315068494
