In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.neural_network import MLPRegressor


# Train Dataset

In [94]:
df = pd.read_csv('./Dataset/train.csv', index_col='Id')
print('Shape of train dataset : ', df.shape)
all_cols = df.columns
print('Final columns of the dataset : ', all_cols)

Shape of train dataset :  (1460, 80)
Final columns of the dataset :  Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars

In [95]:
targets = df.SalePrice
df.drop(columns=['SalePrice'], axis=1, inplace=True)
print('Length of the targets : ', len(targets))
targets.head()

Length of the targets :  1460


Id
1    208500
2    181500
3    223500
4    140000
5    250000
Name: SalePrice, dtype: int64

**List of differnt that needs to handled separately**

In [96]:
cols_dict =  pkl.load(open('./data/Columns_List.pkl', 'rb'))
cols_dict.keys()

dict_keys(['Numerical_Columns', 'Cateogical_Columns', 'Continuous_Numerical_Columns', 'Discrete_Numerical_Columns', 'Discrete_Columns', 'Log_Transform_Columns', 'StandardScaler_Transform_Columns'])

In [97]:
num_cols = cols_dict['Numerical_Columns']
cont_num_cols = cols_dict['Continuous_Numerical_Columns']
cat_cols = cols_dict['Cateogical_Columns']
discrete_num_cols = cols_dict['Discrete_Numerical_Columns']
discrete_cols = cols_dict['Discrete_Columns']
log_num_cols = cols_dict['Log_Transform_Columns']
ss_num_cols = cols_dict['StandardScaler_Transform_Columns']

**Transformations**

In [98]:
impute_transformer = ColumnTransformer(transformers=[
    ('impute_median', SimpleImputer(strategy='median'), cont_num_cols),
    ('mode_impute', SimpleImputer(strategy='most_frequent'), discrete_cols)
], remainder='drop')

encoder_transformer = ColumnTransformer(transformers=[
    ('label_enc', OrdinalEncoder(), cat_cols)
], remainder='drop')

scale_transformer = ColumnTransformer(transformers=[
    ('log', FunctionTransformer(np.log1p, validate=False), log_num_cols),
    ('ss', StandardScaler(), ss_num_cols)
], remainder='drop')

In [99]:
all_cols = list(cont_num_cols) + list(discrete_cols)
cont_num_cols = list(log_num_cols) + list(ss_num_cols)

In [100]:
df_impute = impute_transformer.fit_transform(df)
df_impute = pd.DataFrame(df_impute, columns = all_cols, index = df.index)
df_impute[cont_num_cols] = df_impute[cont_num_cols].astype('float32')

df_encode = encoder_transformer.fit_transform(df_impute)
df_encode = pd.DataFrame(df_encode, columns=cat_cols, index=df.index)

df_scale = scale_transformer.fit_transform(df_impute)
df_scale = pd.DataFrame(df_scale, columns=cont_num_cols, index=df.index)

df_train = pd.merge(df_impute[discrete_num_cols], df_encode[cat_cols], on='Id')
df_train = pd.merge(df_train, df_scale, on='Id')
df_train.head()


Unnamed: 0_level_0,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,...,LotFrontage,MasVnrArea,WoodDeckSF,OpenPorchSF,BsmtFinSF1,2ndFlrSF,GrLivArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,7,5,2003,2003,1,2,1,3,8,...,4.189655,5.283204,0.0,4.127134,0.575425,1.161852,0.370333,-0.944591,-0.459303,-0.793434
2,20,6,8,1976,1976,0,2,0,3,6,...,4.394449,0.0,5.700444,0.0,1.171992,-0.795163,-0.482512,-0.641228,0.466465,0.25714
3,60,7,5,2001,2002,1,2,1,3,6,...,4.234107,5.09375,0.0,3.7612,0.092907,1.189351,0.515013,-0.301643,-0.313369,-0.627826
4,70,7,5,1915,1970,1,1,0,3,7,...,4.110874,0.0,0.0,3.583519,-0.499274,0.937276,0.383659,-0.06167,-0.687324,-0.521734
5,60,8,5,2000,2000,1,2,1,4,9,...,4.442651,5.860786,5.26269,4.442651,0.463568,1.617877,1.299326,-0.174865,0.19968,-0.045611


In [101]:
df_train = df_train.astype('float32')
df_train.dtypes


MSSubClass       float32
OverallQual      float32
OverallCond      float32
YearBuilt        float32
YearRemodAdd     float32
BsmtFullBath     float32
FullBath         float32
HalfBath         float32
BedroomAbvGr     float32
TotRmsAbvGrd     float32
Fireplaces       float32
GarageYrBlt      float32
GarageCars       float32
MoSold           float32
YrSold           float32
MSZoning         float32
LotShape         float32
LandContour      float32
LotConfig        float32
Neighborhood     float32
Condition1       float32
BldgType         float32
HouseStyle       float32
RoofStyle        float32
Exterior1st      float32
Exterior2nd      float32
MasVnrType       float32
ExterQual        float32
ExterCond        float32
Foundation       float32
BsmtQual         float32
BsmtExposure     float32
BsmtFinType1     float32
BsmtFinType2     float32
HeatingQC        float32
KitchenQual      float32
FireplaceQu      float32
GarageType       float32
GarageFinish     float32
SaleType         float32


In [102]:
xgb = XGBRegressor()
params = {'n_estimators': [150, 200, 250, 300, 350, 400],
          'learning_rate': [0.1, 0.15, 0.2, 0.25],
          'max_depth': [4, 5, 6, 7, 8, 9],
          }

grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='neg_mean_squared_log_error',
                    return_train_score=True, cv=5, n_jobs=-1)
grid.fit(df_train, targets)
df_predict = grid.predict(df_train)
print('Final score obtained : ', grid.score(df_train, targets))
print('Final r2 score obtained on train dataset : ',
      r2_score(targets, df_predict))

Final score obtained :  -0.002035673428180533
Final r2 score obtained on train dataset :  0.9930834256678608


In [105]:
grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=150, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [106]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}

# Test Dataset

In [103]:
df_test = pd.read_csv('./Dataset/test.csv', index_col='Id')

cols_dict = pkl.load(open('./data/Columns_List.pkl', 'rb'))
num_cols = cols_dict['Numerical_Columns']
cont_num_cols = cols_dict['Continuous_Numerical_Columns']
cat_cols = cols_dict['Cateogical_Columns']
discrete_num_cols = cols_dict['Discrete_Numerical_Columns']
discrete_cols = cols_dict['Discrete_Columns']
log_num_cols = cols_dict['Log_Transform_Columns']
ss_num_cols = cols_dict['StandardScaler_Transform_Columns']

df_impute_test = impute_transformer.transform(df_test)
df_impute_test = pd.DataFrame(df_impute_test, columns=all_cols, index=df_test.index)
df_impute_test[cont_num_cols] = df_impute_test[cont_num_cols].astype('float32')

df_encode_test = encoder_transformer.transform(df_impute_test)
df_encode_test = pd.DataFrame(
    df_encode_test, columns=cat_cols, index=df_test.index)

df_scale_test = scale_transformer.transform(df_impute_test)
df_scale_test = pd.DataFrame(
    df_scale_test, columns=cont_num_cols, index=df_test.index)

df_test = pd.merge(df_scale_test, df_impute_test[discrete_num_cols], on='Id')
df_test = pd.merge(df_test, df_encode_test[cat_cols], on='Id')
df_test = df_test.astype('float32')
df_test.head()

Unnamed: 0_level_0,LotFrontage,LotArea,WoodDeckSF,EnclosedPorch,BsmtFinSF1,1stFlrSF,2ndFlrSF,GrLivArea,BsmtUnfSF,OpenPorchSF,...,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,KitchenQual,FireplaceQu,GarageType,GarageFinish,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,9.360742,6.594413,0.0,4.394449,0.0,4.94876,0.0,0.053428,-0.795163,-1.179256,...,3.0,4.0,3.0,4.0,3.0,2.0,1.0,2.0,8.0,4.0
1462,9.565775,5.746203,0.0,4.406719,4.691348,5.976351,3.610918,1.051363,-0.795163,-0.354966,...,3.0,0.0,5.0,4.0,2.0,2.0,1.0,2.0,8.0,4.0
1463,9.534668,6.180017,0.0,4.317488,0.0,5.361292,3.555348,0.761852,0.811239,0.216136,...,3.0,2.0,5.0,2.0,3.0,4.0,1.0,0.0,8.0,4.0
1464,9.208239,6.154858,0.0,4.369448,3.044523,5.888878,3.610918,0.347326,0.758532,0.168544,...,3.0,2.0,5.0,0.0,2.0,2.0,1.0,0.0,8.0,4.0
1465,8.518393,6.228511,0.0,3.78419,0.0,0.0,4.41884,-0.39619,-0.795163,-0.448246,...,3.0,0.0,5.0,0.0,2.0,2.0,1.0,1.0,8.0,4.0


In [104]:
df_test_predict = grid.predict(df_test)

submission = pd.DataFrame({'SalePrice': df_test_predict}, columns=[
                          'SalePrice'], index=df_test.index)
submission.head()


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,224665.140625
1462,151095.96875
1463,156246.21875
1464,143537.859375
1465,146760.375


In [87]:
submission.to_csv('./Kaggle Submissions/Submission5.csv')