In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import sys
from category_encoders import *
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
sys.path.append('./Data')

In [6]:
df = pd.read_csv('./Data/train.csv')

In [7]:
df.shape

(1460, 81)

In [8]:
cont_col = []
cat_col = []
for c in df.columns:
#     print(f'{c}:{df[c].dtypes}')
    if c not in ['SalePrice', 'Id']:
        if df[c].dtypes== 'object':
            cat_col.append(c)
        else:
            cont_col.append(c)

print(f'cat_col: {cat_col}')
print(f'con_col: {cont_col}')

cat_col: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
con_col: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'Ope

In [9]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", TargetEncoder(handle_missing=np.nan, handle_unknown="value"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, cont_col),
        ("cat", categorical_transformer, cat_col),
    ]
)

# clf = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=0)
clf = xgb.XGBRegressor(n_estimators=150, max_depth=15, eta=0.1, subsample=0.7, colsample_bytree=0.8)

params = {'n_estimators':np.arange(200, 500, 10),
        'max_depth': np.arange(5, 20, 1)
         }

skf = StratifiedKFold(n_splits=10, shuffle= True, random_state= 17)

best_clf = GridSearchCV(estimator= clf, param_grid= params, scoring = 'neg_root_mean_squared_error',
                         cv= skf, verbose= True, n_jobs= -1)

pipeline_ = Pipeline([('preprocessor', preprocessor),
#                      ('regressor', clf)]
                     ('regressor', best_clf)
                    ])

In [10]:
# df_temp = preprocessor.fit_transform(df)

In [11]:
# pipeline.fit(df[cont_col+cat_col], df['SalePrice'])
pipeline_.fit(df[cont_col+cat_col], df['SalePrice'])

Fitting 10 folds for each of 450 candidates, totalling 4500 fits




In [12]:
pipeline_.named_steps['regressor'].best_params_

{'max_depth': 6, 'n_estimators': 210}

In [13]:
pipeline_.named_steps['regressor'].best_score_

-24605.04431992219

In [14]:
pipeline_.named_steps['regressor'].best_estimator_

In [15]:
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('regressor', pipeline_.named_steps['regressor'].best_estimator_)])

In [16]:
pipeline.fit(df[cont_col+cat_col], df['SalePrice'])

In [26]:
import pickle
pickle.dump(pipeline, open(f'./HousingPricePrediction.p', 'wb'))

In [17]:
df['y_est'] = pipeline.predict(df[cont_col+cat_col])

In [18]:
df['error'] = abs(df['y_est']-df['SalePrice'])

In [19]:
print(df[['SalePrice', 'y_est', 'error']])

      SalePrice          y_est        error
0        208500  208548.234375    48.234375
1        181500  181458.484375    41.515625
2        223500  219563.031250  3936.968750
3        140000  142251.640625  2251.640625
4        250000  253466.359375  3466.359375
...         ...            ...          ...
1455     175000  174022.984375   977.015625
1456     210000  207251.953125  2748.046875
1457     266500  267021.562500   521.562500
1458     142125  142128.968750     3.968750
1459     147500  148667.109375  1167.109375

[1460 rows x 3 columns]


In [20]:
mean_squared_error(df['SalePrice'], df['y_est'])

6587263.063762351

In [21]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,y_est,error
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,0,2,2008,WD,Normal,208500,208548.234375,48.234375
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,0,5,2007,WD,Normal,181500,181458.484375,41.515625
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,0,9,2008,WD,Normal,223500,219563.03125,3936.96875
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,0,2,2006,WD,Abnorml,140000,142251.640625,2251.640625
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,0,12,2008,WD,Normal,250000,253466.359375,3466.359375


# Test score generation

In [22]:
df_test = pd.read_csv('./Data/test.csv')

In [23]:
df_test['SalePrice'] = pipeline.predict(df_test[cont_col+cat_col])

In [24]:
df_test[['Id', 'SalePrice']].to_csv('submission.csv',index=False)