In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [104]:
RANDOM_STATE=27

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [105]:
train_df.dropna(subset = ["SalePrice"], inplace=True)
train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [106]:
X = train_df.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
y = train_df['SalePrice']

X_test = test_df.select_dtypes(exclude=['object'])

columns_to_drop = ['Id']
X = X.drop(columns_to_drop, axis=1)
X_test = X_test.drop(columns_to_drop, axis=1)

feature_names = X.columns
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=RANDOM_STATE)

In [107]:
imputer = SimpleImputer()

X_train = imputer.fit_transform(X_train)
X_valid = imputer.fit_transform(X_valid)
X_test = imputer.fit_transform(X_test)

In [108]:
xgb_params = {
    "learning_rate": 0.05
}

## DMatrix notation

# dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
# dvalid = xgb.DMatrix(X_valid, label=y_valid, feature_names=feature_names)

# reserch_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=5, 
#           evals=[(dvalid,'valid')], verbose_eval=True)

# y_train_pred = reserch_model.predict(dtrain)
# y_valid_pred = reserch_model.predict(dvalid)


reserch_model = XGBRegressor(n_estimators=1000, **xgb_params)

reserch_model.fit(X_train, y_train, early_stopping_rounds=5, 
          eval_set=[(X_valid, y_valid)], eval_metric='rmse', verbose=True)

y_train_pred = reserch_model.predict(X_train)
y_valid_pred = reserch_model.predict(X_valid)

print("RMSE train: {:.2f}".format(sqrt(mean_squared_error(y_train, y_train_pred))))
print("RMSE: {:.2f}".format(sqrt(mean_squared_error(y_valid, y_valid_pred))))
print("R2: {:.5f}".format(r2_score(y_valid, y_valid_pred)))
print("Best: {:.5f}, iter={:d}".format(reserch_model.best_score, reserch_model.best_iteration))

[0]	validation_0-rmse:184399.60938
[1]	validation_0-rmse:176009.73438
[2]	validation_0-rmse:167824.53125
[3]	validation_0-rmse:160276.65625
[4]	validation_0-rmse:153159.60938
[5]	validation_0-rmse:146408.32812
[6]	validation_0-rmse:139999.92188
[7]	validation_0-rmse:134068.54688
[8]	validation_0-rmse:128380.54688
[9]	validation_0-rmse:122996.05469
[10]	validation_0-rmse:117890.08594
[11]	validation_0-rmse:112936.81250
[12]	validation_0-rmse:108382.40625
[13]	validation_0-rmse:103911.57812
[14]	validation_0-rmse:99789.50000
[15]	validation_0-rmse:95947.78125
[16]	validation_0-rmse:92073.63281
[17]	validation_0-rmse:88537.06250
[18]	validation_0-rmse:85321.39062
[19]	validation_0-rmse:82259.56250
[20]	validation_0-rmse:79347.16406
[21]	validation_0-rmse:76568.85938
[22]	validation_0-rmse:74003.14062
[23]	validation_0-rmse:71617.53125
[24]	validation_0-rmse:69339.78906
[25]	validation_0-rmse:67168.17188
[26]	validation_0-rmse:65131.71875
[27]	validation_0-rmse:63302.27344
[28]	validation_

In [109]:
X_train = imputer.fit_transform(X)
y_train = y

model_final = XGBRegressor(n_estimators=reserch_model.best_iteration, **xgb_params)
model_final.fit(X_train, y)

y_train_pred = model_final.predict(X_train)

print("RMSE train: {:.2f}".format(sqrt(mean_absolute_error(y_train, y_train_pred))))
print("R2 train: {:.5f}".format(r2_score(y_train, y_train_pred)))

RMSE train: 76.99
R2 train: 0.98992


In [110]:
y_test_pred = model_final.predict(X_test)

output = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_test_pred})
output.to_csv(f'../data/SimpleXGBoost{model.best_iteration}_remove_id.csv', index=False)