In [309]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import sys
from category_encoders import *
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
sys.path.append('/home/ec2-user/SageMaker/Coding/HousingPrice/Data/')

In [310]:
df = pd.read_csv('/home/ec2-user/SageMaker/Coding/HousingPrice/Data/train.csv')

In [311]:
df.shape

(1460, 81)

In [312]:
cont_col = []
cat_col = []
for c in df.columns:
#     print(f'{c}:{df[c].dtypes}')
    if c not in ['SalePrice', 'Id']:
        if df[c].dtypes== 'object':
            cat_col.append(c)
        else:
            cont_col.append(c)

print(f'cat_col: {cat_col}')
print(f'con_col: {cont_col}')

cat_col: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
con_col: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'Ope

In [313]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", TargetEncoder(handle_missing=np.nan, handle_unknown="value"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, cont_col),
        ("cat", categorical_transformer, cat_col),
    ]
)

# clf = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=0)
clf = xgb.XGBRegressor(n_estimators=150, max_depth=15, eta=0.1, subsample=0.7, colsample_bytree=0.8)

params = {'n_estimators':np.arange(200, 500, 10),
        'max_depth': np.arange(5, 20, 1)
         }

skf = StratifiedKFold(n_splits=10, shuffle= True, random_state= 17)

best_clf = GridSearchCV(estimator= clf, param_grid= params, scoring = 'neg_root_mean_squared_error',
                         cv= skf, verbose= True, n_jobs= -1)

pipeline_ = Pipeline([('preprocessor', preprocessor),
#                      ('regressor', clf)]
                     ('regressor', best_clf)
                    ])

In [314]:
# df_temp = preprocessor.fit_transform(df)

In [315]:
# pipeline.fit(df[cont_col+cat_col], df['SalePrice'])
pipeline_.fit(df[cont_col+cat_col], df['SalePrice'])

Fitting 10 folds for each of 450 candidates, totalling 4500 fits




Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFull...
                                  

In [316]:
pipeline_.named_steps['regressor'].best_params_

{'max_depth': 5, 'n_estimators': 420}

In [317]:
pipeline_.named_steps['regressor'].best_score_

-24684.203551808394

In [318]:
pipeline_.named_steps['regressor'].best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
             early_stopping_rounds=None, enable_categorical=False, eta=0.1,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=420, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, ...)

In [319]:
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('regressor', pipeline_.named_steps['regressor'].best_estimator_)])

In [320]:
pipeline.fit(df[cont_col+cat_col], df['SalePrice'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFull...
                              eval

In [321]:
df['y_est'] = pipeline.predict(df[cont_col+cat_col])

In [322]:
df['error'] = abs(df['y_est']-df['SalePrice'])

In [323]:
print(df[['SalePrice', 'y_est', 'error']])

      SalePrice          y_est        error
0        208500  208575.359375    75.359375
1        181500  180777.546875   722.453125
2        223500  221787.656250  1712.343750
3        140000  140980.984375   980.984375
4        250000  251901.578125  1901.578125
...         ...            ...          ...
1455     175000  174784.796875   215.203125
1456     210000  207669.765625  2330.234375
1457     266500  266032.500000   467.500000
1458     142125  141930.484375   194.515625
1459     147500  148283.812500   783.812500

[1460 rows x 3 columns]


In [324]:
mean_squared_error(df['SalePrice'], df['y_est'])

2651527.6275997735

In [325]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,y_est,error
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,0,2,2008,WD,Normal,208500,208575.359375,75.359375
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,0,5,2007,WD,Normal,181500,180777.546875,722.453125
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,0,9,2008,WD,Normal,223500,221787.65625,1712.34375
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,0,2,2006,WD,Abnorml,140000,140980.984375,980.984375
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,0,12,2008,WD,Normal,250000,251901.578125,1901.578125


# Test score generation

In [326]:
df_test = pd.read_csv('/home/ec2-user/SageMaker/Coding/HousingPrice/Data/test.csv')

In [327]:
df_test['SalePrice'] = pipeline.predict(df_test[cont_col+cat_col])

In [328]:
df_test[['Id', 'SalePrice']].to_csv('submission.csv',index=False)