In [1]:
import numpy as np

from importlib.util import find_spec
# If the home_value_predictor package cannot be found on the current import
# path, add the parent directory so the project imports below can resolve.
# The '..' entry is relative to the kernel's working directory.
# NOTE(review): presumably the notebook lives one level below the package
# root -- confirm.
if find_spec("home_value_predictor") is None:
    import sys
    sys.path.append('..')

from home_value_predictor.datasets.home_dataset import HomeDataset
from home_value_predictor.models.xgboost_model import XGBoostModel

In [2]:
# Dataset helper; used below to load the data and to split it into train/test.
data = HomeDataset()

In [3]:
# Load with processed=False. The preview that follows shows the original
# columns (Id, MSZoning, ..., SalePrice) with unscaled values.
df = data.load_data(processed=False)

In [4]:
# Preview the first rows of the unprocessed frame.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Load again with default arguments. The preview that follows shows rescaled
# numeric features, indicator columns (e.g. YrBltAndRemod_4012.0) and a
# SalePrice around 12.
# NOTE(review): the raw 208500 appearing as 12.247699 is consistent with a
# log1p transform of the target -- confirm in HomeDataset.
processed_df = data.load_data()

In [6]:
# Preview the first rows of the processed frame.
processed_df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,YrBltAndRemod_4012.0,YrBltAndRemod_4013.0,YrBltAndRemod_4014.0,YrBltAndRemod_4015.0,YrBltAndRemod_4016.0,YrBltAndRemod_4017.0,YrBltAndRemod_4018.0,YrBltAndRemod_4019.0,YrBltAndRemod_4020.0,SalePrice
0,0.054054,-0.267772,0.5,0.0,0.652174,0.243243,1.08636,0.336331,0.0,-0.683706,...,0,0,0,0,0,0,0,0,0,12.247699
1,0.459459,0.029676,0.0,2.821656,0.065217,-0.486486,0.0,0.580666,0.0,-0.364213,...,0,0,0,0,0,0,0,0,0,12.109016
2,0.135135,0.398921,0.5,0.0,0.608696,0.216216,0.993527,0.114555,0.0,-0.075511,...,0,0,0,0,0,0,0,0,0,12.317171
3,-0.081081,0.01751,0.5,0.0,-1.26087,-0.648649,0.0,-0.212787,0.0,0.103665,...,0,0,0,0,0,0,0,0,0,11.849405
4,0.567568,0.949876,1.0,0.0,0.586957,0.162162,1.419234,0.287252,0.0,0.021165,...,0,0,0,0,0,0,0,0,0,12.42922


In [7]:
# Split the processed frame into train/test features and targets.
# NOTE(review): whether split_data shuffles or uses a fixed seed is not
# visible here -- confirm so the reported scores are reproducible.
X_train, X_test, y_train, y_test = data.split_data(processed_df)

In [8]:
# Report the dimensions of every split so features and targets can be
# checked against each other.
print(f"X_train Shape: {X_train.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train Shape: (1168, 505)
y_train Shape: (1168,)
X_test Shape: (292, 505)
y_test Shape: (292,)


In [9]:
# Model wrapper that the saved model is loaded into in the next step.
xgb_regressor = XGBoostModel()

In [10]:
# Load the model from 'xgb_model.json' (a relative path).
# NOTE(review): this file must already exist when the notebook is run from a
# fresh kernel -- confirm it is committed or produced by an earlier run.
xgb_regressor.load('xgb_model.json')

In [11]:
# Predict on the held-out features. No output transform is requested, so the
# values are compared directly against y_test further down.
preds = xgb_regressor.predict(X_test)

In [12]:
# Check the shape of the predictions (compare with y_test's shape).
preds.shape

(292,)

In [13]:
# Spot-check one test row: model prediction next to the target, both on the
# processed SalePrice scale.
print(preds[1], y_test.iloc[1])

12.693726 12.691583538230217


In [14]:
# Predict again with transform_output=True; the printed value below is of the
# same magnitude as the raw SalePrice column rather than the processed one.
transformed_preds = xgb_regressor.predict(X_test, transform_output=True)

In [15]:
# Same spot-check after transformation: the target is passed through the
# model's transform_output helper so both numbers are on the same scale.
print(transformed_preds[1], xgb_regressor.transform_output(y_test.iloc[1]))

325696.94 324999.9999999999


In [16]:
# Score the untransformed predictions against the test targets.
# NOTE(review): the metric is not visible here; a value near 0.97 suggests
# R^2 -- confirm in XGBoostModel.evaluate.
xgb_regressor.evaluate(y_test, preds)

0.9664211952889856

In [17]:
# Candidate hyper-parameter values handed to the training call as `params`.
# range(210, 220, 10) contains only 210, so max_depth is the single setting
# with more than one candidate.
test_params = {
    "n_estimators": range(210, 220, 10),
    "learning_rate": [0.07],
    "max_depth": [3, 5],
    "min_child_weight": [2],
}

In [18]:
# Second model wrapper, separate from xgb_regressor, used for training below.
xgb = XGBoostModel()

In [19]:
# Train on the training split using the candidate values in test_params.
# NOTE(review): save_best=True presumably persists the best model -- confirm
# where it is written and whether it overwrites the file loaded earlier.
xgb.train(X_train, y_train, params=test_params, save_best=True)



In [20]:
# Show the selected parameter combination and its score.
# NOTE(review): best_score is presumably a cross-validation score and so not
# directly comparable with the evaluate() result above -- confirm.
print(xgb.best_params, xgb.best_score)

{'n_estimators': 210, 'min_child_weight': 2, 'max_depth': 3, 'learning_rate': 0.07} 0.8924783304998813
