In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import ipynb.fs.full.CustomTransformersHP as ct
import ipynb.fs.full.preprocessFunctions as pp

In [2]:
test = pd.read_csv("test.csv", index_col="Id")
train = pd.read_csv("train.csv", index_col="Id")

After analyzing our data we ended up with the following ideas:

In [3]:
outliers = [1299, 524, 935]
dismiss = ['EnclosedPorch', 'MiscVal', 'BsmtHalfBath',
           '3SsnPorch', 'PoolArea', 'ScreenPorch',
           'Kitchen', 'PoolQC', 'MiscFeature', 'Alley',
           'Fence', 'Utilities', 'LandSlope', 'Street',
           'PoolQC', 'MiscFeature', 'Functional']

catBivs = {'Conditions': ('Condition1', 'Condition2'),
 'Roof': ('RoofStyle', 'RoofMatl'),
 'Exterior': ('Exterior1st', 'Exterior2nd'),
 'External': ('ExterQual', 'ExterCond'),
 'Basement': ('BsmtQual', 'BsmtCond'),
 'BasementFin': ('BsmtFinType1', 'BsmtFinType2'),
 'Sale': ('SaleType', 'SaleCondition'),
 'Garage': ('GarageQual', 'GarageCond'),
 'GarageTF': ('GarageType', 'GarageFinish'),
 'HeatingCond': ('Heating', 'HeatingQC'),
 'Lot': ('LotShape', 'LotConfig')}

numBivMult = {'LotFrontageOverArea': ('LotFrontage', 'LotArea'),
 'YearsBTWbuiltAndRemod': ('YearRemodAdd', 'YearBuilt'),
 'BsmtUnfPCT': ('BsmtUnfSF', 'TotalBsmtSF'),
 'GrOverLotArea': ('GrLivArea', 'LotArea')}

numBivBool = {'RemodAfter1984': ('YearRemodAdd', 1983),
 '2ndFlr': ('2ndFlrSF', 0),
 'LowQualFin': ('LowQualFinSF', 0)}

cbrt = ['MasVnrArea', 'WoodDeckSF', 'OpenPorchSF']
 
log = ['LotFrontage', 'LotArea']

logBiv = ['GrOverLotArea']
cbrtBiv = ['BsmtUnfPCT']


Now that we have analyzed our data, let's test it into a model.
We need to make all the desired transformations both in the train set and the test set. So let's begin
We'll rename the Bedroom and Kitchen column and eliminate outliers from train set

In [4]:
pp.renameCols(train)
pp.renameCols(test)

Adding price by sf in neighborhood feature (https://towardsdatascience.com/feature-engineering-and-ensembled-models-for-the-top-10-in-kaggle-housing-prices-competition-efb35828eef0)
did not improve the results

In [5]:
# train['Sqr'] = train['SalePrice'] / train['LotArea']
# train['Sqr'] = train.groupby('Neighborhood')['Sqr'].transform(lambda x: x.median())
# d = {}
# for indice_fila, x_train in train.iterrows():
#     d.update({x_train['Neighborhood']:x_train['Sqr']})
# test['Sqr'] = 0.00
# for indice, x_test in test.iterrows():
#     test.loc[test.index == indice ,'Sqr'] = d[x_test['Neighborhood']]

We'll make a list of categorical variables and numerical variables and then create our transformers

In [6]:
num = [col for col in train.select_dtypes(include='number').columns if col not in dismiss]
cat = [col for col in train.select_dtypes(include='object').columns if col not in dismiss]

catEnc = cat + ['Conditions', 'Roof', 'Exterior', 'External',
                'Basement', 'BasementFin', 'Sale', 'Garage', 'GarageTF',
                'HeatingCond', 'Lot']

remove = ['Condition1', 'Condition2', 'RoofStyle', 'RoofMatl',
          'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond',
          'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2',
          'SaleType', 'SaleCondition', 'GarageQual', 'GarageCond',
          'GarageType', 'GarageFinish', 'Heating', 'HeatingQC',
          'LotShape', 'LotConfig']

catEnc = [cat for cat in catEnc if cat not in remove]

In [7]:
num.remove('SalePrice')

In [8]:
cbe = ct.CatBoostCustom(cat=catEnc)
cbi = ct.CatBivariates(features=catBivs)
nbi = ct.NumBivariates(features=numBivMult)
nbib = ct.NumBivariatesBool(features=numBivBool)
lcr = ct.LogRtTransformer(log=log, cbrt=cbrt)
lcrBivs = ct.LogRtTransformer(log=logBiv, cbrt=cbrtBiv)

In [9]:
#Apply log to our target variable

train['SalePrice'] = np.log(train['SalePrice'])

Now we can create our pipeline with the transformers and our model

In [10]:
#For hyperparameter search
from sklearn.model_selection import GridSearchCV, train_test_split
#For imputing values
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

#Scaling data
from sklearn.preprocessing import RobustScaler, Normalizer

#Regressor
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import mean_squared_error


X_train, X_test, y_train, y_test = train_test_split(train[num + cat],train['SalePrice'])

# Preprocessing for numerical data
numerical_transformer= Pipeline(steps=[
            ('locb', lcr),
            ('bivm', nbi),  #Create bivariates before imputing
            ('bivb', nbib),
            ('locbb', lcrBivs),
            ('imputer', SimpleImputer()),
            ('scaler', RobustScaler(quantile_range=(5.0, 95.0)))
        ])

# Preprocessing for categorical data
categorical_transformer= Pipeline(steps=[
            ('biv', cbi),
            ('encoder', cbe),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-1)), #impute with mean after encoding else encoder can't select columns
        ])

preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, num),
        ('cat', categorical_transformer, cat)
        ]#,remainder='passthrough'
)


model = XGBRegressor(random_state=1, n_jobs=-1)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ]) 

# params_grid = { 'model__learning_rate': np.arange(0.01, 0.08, 0.01),
#                 'model__n_estimators': [400, 500, 600, 675, 695]
# }

params_grid = { 'model__learning_rate': np.arange(0.01, 0.05, 0.01),
                'model__n_estimators': [400, 695]
}


In [11]:
# df_data = pd.DataFrame.from_records(
#     data=preprocessor.fit_transform(X_train, y_train)
# )
# df_data.head()

In [12]:
search1 = GridSearchCV(pipe, params_grid,cv=5, n_jobs=-1, verbose=3)
search1.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search1.best_score_)
print(search1.best_params_)

preds1 = search1.predict(X_test)
score1 = mean_squared_error(y_test, preds1, squared=False)
print('RMSE:', score1)

#2.2 score
# Best parameter (CV score=0.896):
# {'model__learning_rate': 0.03, 'model__n_estimators': 695}
# RMSE: 0.11940361998669048

# Best parameter (CV score=0.895):
# {'model__learning_rate': 0.060000000000000005, 'model__n_estimators': 400}
# RMSE: 0.10750837063664301
# KAGGLE 0.13365

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   47.0s finished
  elif pd.api.types.is_categorical(cols):


Best parameter (CV score=0.900):
{'model__learning_rate': 0.04, 'model__n_estimators': 695}
RMSE: 0.12576040166600205


In [14]:
feature_importances = search1.best_estimator_.named_steps['model'].feature_importances_
attr = num + ['LotFrontageOverArea', 'YearsBTWbuiltAndRemod', 'BsmtUnfPCT', 'GrOverLotArea', 'RemodAfter1984', '2ndFlr', 'LowQualFin'] + catEnc

In [17]:
sorted(zip(feature_importances, attr), reverse=True)

[(0.19005175, 'KitchenQual'),
 (0.09780381, 'Garage'),
 (0.09421939, 'GrOverLotArea'),
 (0.078029655, 'GarageCars'),
 (0.07556239, 'Neighborhood'),
 (0.06544442, 'OverallQual'),
 (0.044862937, 'CentralAir'),
 (0.044433713, 'GarageTF'),
 (0.038598306, 'FireplaceQu'),
 (0.034205236, 'TotalBsmtSF'),
 (0.03408941, 'Basement'),
 (0.028024297, 'YearsBTWbuiltAndRemod'),
 (0.015430753, 'BsmtFinSF1'),
 (0.014932733, 'MSZoning'),
 (0.011074987, 'OverallCond'),
 (0.008979955, '1stFlrSF'),
 (0.008396079, 'Conditions'),
 (0.007476665, 'FullBath'),
 (0.0069506704, 'LandContour'),
 (0.006491297, 'GarageArea'),
 (0.0055178166, 'BasementFin'),
 (0.005457019, 'GrLivArea'),
 (0.005111319, 'Sale'),
 (0.0043622553, 'YearRemodAdd'),
 (0.004282508, 'BldgType'),
 (0.004226153, 'HalfBath'),
 (0.0040258546, 'BsmtExposure'),
 (0.003393792, 'External'),
 (0.0033171761, 'Fireplaces'),
 (0.0032223726, 'HeatingCond'),
 (0.0031861532, 'BsmtFullBath'),
 (0.0029651443, 'Bedroom'),
 (0.002879795, 'GarageYrBlt'),
 (0.002

In [None]:
# best_model = XGBRegressor(random_state=1, n_jobs=-1, learning_rate=0.03, n_estimators=695)
# pipe_bm = Pipeline(steps=[('preprocessor', preprocessor),
#                                ('model', best_model)
#                               ]) 

# pipe_bm.fit(train[num+cat], train['SalePrice'])

# preds_bm = pipe_bm.predict(test[num+cat])

# output = pd.DataFrame({'Id': test[num+cat].index,
#                         'SalePrice': np.exp(preds_bm)})
# output.to_csv('submission_2.2Last.csv', index=False)

# # ##Kaggle score 2.2Last 0.13163

In [None]:
output.head()

It's important to remember since we have applied log transformation to our target variable, our model will predict the log of the Sale Price, so if we want the actual Sale Price we need to inverse transform the predicted outputs!!