In [25]:
import pickle as pkl
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, OneHotEncoder
from xgboost import XGBRegressor

In [26]:
# Read the training CSV, using its 'Id' column as the row index,
# then preview the first five rows.
df = pd.read_csv(
    './Dataset/train.csv',
    index_col='Id',
)
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [27]:
# Pull the target column out of the feature frame: `pop` returns the column and
# removes it from `df` in place, so `df` holds only predictors afterwards.
# NOTE: because `df` is mutated, re-running this cell without reloading `df`
# fails (the 'SalePrice' column is already gone).
targets = df.pop('SalePrice')
print('Length of the targets : ', len(targets))
targets.head()

Length of the targets :  1460


Id
1    208500
2    181500
3    223500
4    140000
5    250000
Name: SalePrice, dtype: int64

In [28]:
# Switch the target to log scale; the test-set predictions are mapped back
# with np.exp later in the notebook.
# NOTE: this overwrites `targets`, so running the cell twice applies the log twice.
targets = targets.pipe(np.log)
targets.describe()

count    1460.000000
mean       12.024051
std         0.399452
min        10.460242
25%        11.775097
50%        12.001505
75%        12.273731
max        13.534473
Name: SalePrice, dtype: float64

**Splitting the data into Train and Validation sets**

In [29]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
df_train, df_val, targets_train, targets_val = train_test_split(
    df, targets, test_size=0.2, random_state=32)
print('Length of Train dataset : ', len(df_train))
# Label spelling corrected (was 'Vaidataion').
print('Length of Validation dataset  : ', len(df_val))

Length of Train dataset :  1168
Length of Vaidataion dataset  :  292


In [30]:
# Categorical columns treated as too skewed to keep as features.
skewed_cat_cols = ['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating']

# Reassign rather than drop in place: `df_train` comes out of train_test_split,
# where in-place mutation may raise SettingWithCopyWarning, and an in-place drop
# raises KeyError when the cell is run a second time. errors='ignore' makes the
# cell safe to re-run once the columns are already gone.
df_train = df_train.drop(columns=skewed_cat_cols, errors='ignore')

# Column names that remain available to the preprocessing steps.
all_cols = df_train.columns
print('Total number of columns after removing skewed categorical columns : ', len(all_cols))
all_cols

Total number of columns after removing skewed categorical columns :  74


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'Scr

In [31]:
# Partition the training features by dtype: object columns are treated as
# categorical, every other dtype as numerical.
df_train_cat = df_train.select_dtypes(include=['object'])
df_train_num = df_train.select_dtypes(exclude=['object'])

print(
    'Shape of the dataset with numerical columns : ',
    df_train_num.shape,
)
print(
    'Shape of the dataset with categorical columns : ',
    df_train_cat.shape,
)

Shape of the dataset with numerical columns :  (1168, 36)
Shape of the dataset with categorical columns :  (1168, 38)


**Categorizing the Columns**

In [32]:
# All non-object columns of the training frame.
num_cols = df_train_num.columns

# Numeric columns handled as discrete values (imputed, but not scaled).
discrete_num_cols = ['MSSubClass', 'OverallQual', 'OverallCond',
                     'YearBuilt', 'YearRemodAdd',  'BsmtFullBath',
                     'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
                     'GarageYrBlt', 'GarageCars', 'MoSold', 'YrSold']

# Remaining numeric columns are treated as continuous. An order-preserving
# comprehension is used instead of a set difference: set iteration order for
# strings can change between interpreter runs, which would change the order of
# the features handed to the model and hurt reproducibility.
cont_num_cols = [col for col in num_cols if col not in discrete_num_cols]

# Continuous columns that get a log1p transform before scaling.
log_num_cols = ['LowQualFinSF', '3SsnPorch', 'LotArea', 'PoolArea', 'MiscVal']

# Continuous columns that are only standard-scaled.
ss_num_cols = [col for col in cont_num_cols if col not in log_num_cols]

# Object-dtype columns, one-hot encoded downstream.
cat_cols = df_train_cat.columns


**Preprocessing pipelines and column transformers**

In [34]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_cols_pipline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Discrete numeric columns: mode imputation only, no scaling.
discrete_num_cols_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
])

# Continuous columns without the log step: median imputation + standardisation.
nonskew_cols_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('ss', StandardScaler()),
])

# Continuous columns listed in `log_num_cols`: median imputation, log1p, then
# standardisation.
skew_cols_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(func=np.log1p, validate=False)),
    ('ss', StandardScaler()),
])

# Route each column group to its pipeline. Columns not named in any group are
# not passed through (ColumnTransformer's default remainder is 'drop').
preprocessing_pipeline = ColumnTransformer([
    ('p1', skew_cols_pipeline, log_num_cols),
    ('p2', nonskew_cols_pipeline, ss_num_cols),
    ('p3', discrete_num_cols_pipeline, discrete_num_cols),
    ('p4', cat_cols_pipline, cat_cols),
])

**Training Pipeline**

In [35]:
# Base regressor whose hyper-parameters are tuned by the grid search below.
xgb = XGBRegressor()

# Candidate hyper-parameter values; every combination is evaluated.
params = {
    'n_estimators': list(range(150, 401, 50)),
    'learning_rate': [0.1, 0.15, 0.2, 0.25],
    'max_depth': list(range(4, 10)),
}

# 5-fold cross-validated search scored with negative mean squared error,
# run on all available cores.
grid = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# The search is the final pipeline step, so it is fitted on the output of the
# preprocessing step.
pipline = Pipeline([
    ('Preprocessing', preprocessing_pipeline),
    ('Model', grid),
])
pipline.fit(df_train, targets_train)

# Diagnostics on the data the model was fitted on.
df_train_predict = pipline.predict(df_train)
train_score = pipline.score(df_train, targets_train)
train_r2 = r2_score(targets_train, df_train_predict)
print('Final score obtained : ', train_score)
print('Final r2 score obtained on train dataset : ',
      train_r2)

# Same diagnostics on the held-out validation rows.
df_val_predict = pipline.predict(df_val)
val_score = pipline.score(df_val, targets_val)
val_r2 = r2_score(targets_val, df_val_predict)
print('Final score obtained on the validation dataset : ',
      val_score)
print('Final r2 score obtained on validation dataset : ',
      val_r2)

Final score obtained :  -0.0006369094746995268
Final r2 score obtained on train dataset :  0.9959189456939197
Final score obtained on the validation dataset :  -0.023763483239570712
Final r2 score obtained on validation dataset :  0.8625526023543768


**Training on the whole training dataset provided in the competition**

In [46]:
# Show the hyper-parameter combination selected by the grid search.
# Requires the training cell (which fits `grid` via `pipline.fit`) to have run first.
grid.best_params_


{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}

In [37]:
# Refit the tuned configuration on every labelled row before predicting the
# competition test set.
#
# Both components are cloned (unfitted copies with identical parameters).
# Without cloning, `final_pipline` would share the very same preprocessing and
# model objects with `pipline` / `grid`, so fitting here would silently refit
# them and change what `pipline.predict` / `pipline.score` return afterwards.
xgb_best = clone(grid.best_estimator_)
final_pipline = Pipeline(steps=[
    ('Preprocessing', clone(preprocessing_pipeline)),
    ('Model', xgb_best)
])
final_pipline.fit(df, targets)

# In-sample metrics only: these are computed on the same rows used for fitting,
# so they do not estimate generalisation error.
df_predict = final_pipline.predict(df)
print('Final mean squared error obtained : ', mean_squared_error(targets, df_predict))
print('Final r2 score obtained on train dataset : ',
      r2_score(targets, df_predict))

Final mean squared error obtained :  0.0009487608821592846
Final r2 score obtained on train dataset :  0.9940498841628276


# Predictions on the test dataset

In [38]:
# Read the competition test set, indexed by 'Id' like the training data.
df_test = pd.read_csv(
    './Dataset/test.csv',
    index_col='Id',
)

# The model was trained on log-transformed targets, so exponentiate its output.
df_test_predict = final_pipline.predict(df_test)
test_targets = np.exp(df_test_predict)

In [40]:
# Peek at the first five back-transformed predictions.
test_targets[:5]

array([124771.414, 163543.02 , 192772.45 , 193167.94 , 179870.56 ],
      dtype=float32)

In [42]:
# Build the submission frame: one 'SalePrice' column, indexed by the test Ids.
submission = pd.DataFrame({'SalePrice': test_targets}, index=df_test.index)

# Create the output directory if it is missing; to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
submission_path = Path('./Kaggle Submissions/Final_Submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path)

# Preview last so the notebook actually renders it (a bare expression in the
# middle of a cell produces no output).
submission.head()

# Thank you