In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import ipynb.fs.full.CustomTransformersHP as ct
import ipynb.fs.full.preprocessFunctions as pp

In [2]:
test = pd.read_csv("test.csv", index_col="Id")
train = pd.read_csv("train.csv", index_col="Id")

After analyzing our data we ended up with the following ideas:

In [3]:
outliers = [1299, 524, 935]
dismiss = ['EnclosedPorch', 'MiscVal', 'BsmtHalfBath',
           '3SsnPorch', 'PoolArea', 'ScreenPorch',
           'Kitchen', 'PoolQC', 'MiscFeature', 'Alley',
           'Fence', 'Utilities', 'LandSlope', 'Street',
           'PoolQC', 'MiscFeature', 'Functional']

catBivs = {'Conditions': ('Condition1', 'Condition2'),
 'Roof': ('RoofStyle', 'RoofMatl'),
 'Exterior': ('Exterior1st', 'Exterior2nd'),
 'External': ('ExterQual', 'ExterCond'),
 'Basement': ('BsmtQual', 'BsmtCond'),
 'BasementFin': ('BsmtFinType1', 'BsmtFinType2'),
 'Sale': ('SaleType', 'SaleCondition'),
 'Garage': ('GarageQual', 'GarageCond'),
 'GarageTF': ('GarageType', 'GarageFinish'),
 'HeatingCond': ('Heating', 'HeatingQC'),
 'Lot': ('LotShape', 'LotConfig')}

numBivMult = {'LotFrontageOverArea': ('LotFrontage', 'LotArea'),
 'YearsBTWbuiltAndRemod': ('YearRemodAdd', 'YearBuilt'),
 'BsmtUnfPCT': ('BsmtUnfSF', 'TotalBsmtSF'),
 'GrOverLotArea': ('GrLivArea', 'LotArea')}

numBivBool = {'RemodAfter1984': ('YearRemodAdd', 1983),
 '2ndFlr': ('2ndFlrSF', 0),
 'LowQualFin': ('LowQualFinSF', 0)}

cbrt = ['MasVnrArea', 'WoodDeckSF', 'OpenPorchSF']
 
log = ['LotFrontage', 'LotArea']

logBiv = ['GrOverLotArea']
cbrtBiv = ['BsmtUnfPCT']


Now that we have analyzed our data, let's test it into a model.
We need to make all the desired transformations both in the train set and the test set. So let's begin
We'll rename the Bedroom and Kitchen column and eliminate outliers from train set

In [4]:
pp.renameCols(train)
pp.renameCols(test)

We'll make a list of categorical variables and numerical variables and then create our transformers

In [5]:
num = [col for col in train.select_dtypes(include='number').columns if col not in dismiss]
cat = [col for col in train.select_dtypes(include='object').columns if col not in dismiss]

catEnc = cat + ['Conditions', 'Roof', 'Exterior', 'External',
                'Basement', 'BasementFin', 'Sale', 'Garage', 'GarageTF',
                'HeatingCond', 'Lot']

remove = ['Condition1', 'Condition2', 'RoofStyle', 'RoofMatl',
          'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond',
          'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2',
          'SaleType', 'SaleCondition', 'GarageQual', 'GarageCond',
          'GarageType', 'GarageFinish', 'Heating', 'HeatingQC',
          'LotShape', 'LotConfig']

catEnc = [cat for cat in catEnc if cat not in remove]

In [6]:
num.remove('SalePrice')

In [7]:
cbe = ct.CatBoostCustom(cat=catEnc)
cbi = ct.CatBivariates(features=catBivs)
nbi = ct.NumBivariates(features=numBivMult)
nbib = ct.NumBivariatesBool(features=numBivBool)
lcr = ct.LogRtTransformer(log=log, cbrt=cbrt)
lcrBivs = ct.LogRtTransformer(log=logBiv, cbrt=cbrtBiv)

In [8]:
#Apply log to our target variable

train['SalePrice'] = np.log(train['SalePrice'])

Now we can create our pipeline with the transformers and our model

In [9]:
#For hyperparameter search
from sklearn.model_selection import GridSearchCV, train_test_split
#For imputing values
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

#Scaling data
from sklearn.preprocessing import RobustScaler, Normalizer

#Regressor
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import mean_squared_error


X_train, X_test, y_train, y_test = train_test_split(train[num + cat],train['SalePrice'])

# Preprocessing for numerical data
numerical_transformer= Pipeline(steps=[
            ('locb', lcr),
            ('bivm', nbi),  #Create bivariates before imputing
            ('bivb', nbib),
            ('locbb', lcrBivs),
            ('imputer', SimpleImputer()),
            ('scaler', RobustScaler(quantile_range=(5.0, 95.0)))
        ])

# Preprocessing for categorical data
categorical_transformer= Pipeline(steps=[
            ('biv', cbi),
            ('encoder', cbe),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-1)), #impute with mean after encoding else encoder can't select columns
        ])

preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, num),
        ('cat', categorical_transformer, cat)
        ]#,remainder='passthrough'
)


model = XGBRegressor(random_state=1, n_jobs=-1)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ]) 

params_grid = { 'model__learning_rate': np.arange(0.01, 0.08, 0.01),
                'model__n_estimators': [400, 500, 600, 675, 680, 695, 700]
}


In [10]:
# df_data = pd.DataFrame.from_records(
#     data=preprocessor.fit_transform(X_train, y_train)
# )
# df_data.head()

In [11]:
search1 = GridSearchCV(pipe, params_grid,cv=5, n_jobs=-1, verbose=3)
search1.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search1.best_score_)
print(search1.best_params_)

preds1 = search1.predict(X_test)
score1 = mean_squared_error(y_test, preds1, squared=False)
print('RMSE:', score1)


# Best parameter (CV score=0.887):
# {'model__learning_rate': 0.05, 'model__n_estimators': 400}
# RMSE: 0.12164477481117247

# Best parameter (CV score=0.903):
# {'model__learning_rate': 0.04, 'model__n_estimators': 500}
# RMSE: 0.13256410834961588

# Best parameter (CV score=0.888):
# {'model__learning_rate': 0.03, 'model__n_estimators': 600}
# RMSE: 0.10040369745049361

# Best parameter (CV score=0.885):
# {'model__learning_rate': 0.06999999999999999, 'model__n_estimators': 650}
# RMSE: 0.10991221426674046

# Best parameter (CV score=0.900):
# {'model__learning_rate': 0.05, 'model__n_estimators': 675}
# RMSE: 0.14582261721881837

# Best parameter (CV score=0.896):
# {'model__learning_rate': 0.03, 'model__n_estimators': 695}
# RMSE: 0.11940361998669048

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  6.2min finished
  elif pd.api.types.is_categorical(cols):


Best parameter (CV score=0.897):
{'model__learning_rate': 0.060000000000000005, 'model__n_estimators': 400}
RMSE: 0.12971202551169855


In [14]:
best_model = XGBRegressor(random_state=1, n_jobs=-1, learning_rate=0.06, n_estimators=400)
pipe_bm = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', best_model)
                              ]) 

pipe_bm.fit(train[num+cat], train['SalePrice'])

preds_bm = pipe_bm.predict(test[num+cat])

output = pd.DataFrame({'Id': test[num+cat].index,
                        'SalePrice': np.exp(preds_bm)})
output.to_csv('submission_2.2.csv', index=False)

output.head()

# ##Kaggle score 0.13156

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Id,SalePrice
0,1461,125149.539062
1,1462,162096.921875
2,1463,188658.296875
3,1464,195777.171875
4,1465,180081.5


It's important to remember since we have applied log transformation to our target variable, our model will predict the log of the Sale Price, so if we want the actual Sale Price we need to inverse transform the predicted outputs!!

In [13]:
num

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'Bedroom',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'MoSold',
 'YrSold']