In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

In [2]:
df = pd.read_csv('./Dataset/train.csv', index_col='Id')
df.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [115]:
print('Shape of the train.csv file : ', df.shape)


Shape of the train.csv file :  (1460, 79)


In [3]:
print('Nuner of features : ', len(df.columns))
print(df.columns)

Nuner of features :  80
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',


In [4]:
targets = df.SalePrice
df.drop(columns=['SalePrice'], axis = 1, inplace=True)
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [5]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [6]:
df.isnull().sum()

MSSubClass       0   
MSZoning         0   
LotFrontage      259 
LotArea          0   
Street           0   
Alley            1369
LotShape         0   
LandContour      0   
Utilities        0   
LotConfig        0   
LandSlope        0   
Neighborhood     0   
Condition1       0   
Condition2       0   
BldgType         0   
HouseStyle       0   
OverallQual      0   
OverallCond      0   
YearBuilt        0   
YearRemodAdd     0   
RoofStyle        0   
RoofMatl         0   
Exterior1st      0   
Exterior2nd      0   
MasVnrType       8   
MasVnrArea       8   
ExterQual        0   
ExterCond        0   
Foundation       0   
BsmtQual         37  
BsmtCond         37  
BsmtExposure     38  
BsmtFinType1     37  
BsmtFinSF1       0   
BsmtFinType2     38  
BsmtFinSF2       0   
BsmtUnfSF        0   
TotalBsmtSF      0   
Heating          0   
HeatingQC        0   
CentralAir       0   
Electrical       1   
1stFlrSF         0   
2ndFlrSF         0   
LowQualFinSF     0   
GrLivArea 

List of Null columns

In [7]:
null_cols = [col for col in df.columns if df[col].isnull().any()]
print('Number of columns with null values : ', len(null_cols))
print(null_cols)

Number of columns with null values :  19
['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


Dividing all the columns into Categorical and Numerical while discarding the columns with more than 30% of null values. 

In [12]:
categorical_cols = [col for col in df.columns if df[col].dtype == 'object' 
                                and df[col].isnull().sum() < 0.3 * len(df)]
numerical_cols = [col for col in df.columns if df[col].dtype !=
                  'object' and df[col].isnull().sum() < 0.3 * len(df)]
print('Number of Categorical Cols : ', len(categorical_cols))
print('Categorical : ', categorical_cols)
print('-------------------------------------------')
print('Number of numerical Cols : ', len(numerical_cols))
print('Numerical cols : ', numerical_cols)

Number of Categorical Cols :  38
Categorical :  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
-------------------------------------------
Number of numerical Cols :  36
Numerical cols :  ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplace

Removing columns found to be  unimportant intuitively (Based on domain knowledge)

In [14]:
unimportant_feat = ['LowQualFinSF', 'BsmtFinSF2', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
numerical_imp = list(set(numerical_cols) - set(unimportant_feat))
print(numerical_imp)

['1stFlrSF', '2ndFlrSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath', 'OverallCond', 'BsmtUnfSF', 'Fireplaces', 'YearBuilt', 'BsmtFinSF1', 'GarageYrBlt', 'OverallQual', 'MasVnrArea', 'WoodDeckSF', 'BsmtFullBath', 'BedroomAbvGr', 'YearRemodAdd', 'LotArea', 'LotFrontage', 'GrLivArea', 'FullBath', 'TotalBsmtSF', 'KitchenAbvGr', 'GarageCars', 'OpenPorchSF', 'HalfBath', 'TotRmsAbvGrd']


Replacing null values with the mean in all the numerical columns

In [16]:
numerical_transform = Pipeline(
    steps=[('impute', SimpleImputer(strategy='mean'))])
numerical_reduced = numerical_transform.fit_transform(df[numerical_imp])
numerical_reduced = pd.DataFrame(numerical_reduced, columns=df[numerical_imp].columns)
numerical_reduced.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,MSSubClass,GarageArea,BsmtHalfBath,OverallCond,BsmtUnfSF,Fireplaces,YearBuilt,BsmtFinSF1,GarageYrBlt,OverallQual,MasVnrArea,WoodDeckSF,BsmtFullBath,BedroomAbvGr,YearRemodAdd,LotArea,LotFrontage,GrLivArea,FullBath,TotalBsmtSF,KitchenAbvGr,GarageCars,OpenPorchSF,HalfBath,TotRmsAbvGrd
0,856.0,854.0,60.0,548.0,0.0,5.0,150.0,0.0,2003.0,706.0,2003.0,7.0,196.0,0.0,1.0,3.0,2003.0,8450.0,65.0,1710.0,2.0,856.0,1.0,2.0,61.0,1.0,8.0
1,1262.0,0.0,20.0,460.0,1.0,8.0,284.0,1.0,1976.0,978.0,1976.0,6.0,0.0,298.0,0.0,3.0,1976.0,9600.0,80.0,1262.0,2.0,1262.0,1.0,2.0,0.0,0.0,6.0
2,920.0,866.0,60.0,608.0,0.0,5.0,434.0,1.0,2001.0,486.0,2001.0,7.0,162.0,0.0,1.0,3.0,2002.0,11250.0,68.0,1786.0,2.0,920.0,1.0,2.0,42.0,1.0,6.0
3,961.0,756.0,70.0,642.0,0.0,5.0,540.0,1.0,1915.0,216.0,1998.0,7.0,0.0,0.0,1.0,3.0,1970.0,9550.0,60.0,1717.0,1.0,756.0,1.0,3.0,35.0,0.0,7.0
4,1145.0,1053.0,60.0,836.0,0.0,5.0,490.0,1.0,2000.0,655.0,2000.0,8.0,350.0,192.0,1.0,4.0,2000.0,14260.0,84.0,2198.0,2.0,1145.0,1.0,3.0,84.0,1.0,9.0


Finding the numerical columns with high mutual information. (Feature selection)

In [17]:
numerical_mi_scores = mutual_info_regression(X=numerical_reduced, y=targets)
numerical_mi_scores = pd.DataFrame(numerical_mi_scores, index=numerical_reduced.columns, columns = ['Mi_Score'])
numerical_mi_scores_sort = numerical_mi_scores.sort_values(by='Mi_Score', ascending=False)
numerical_top_indices = numerical_mi_scores_sort[ : 23].index
print(numerical_top_indices)

Index(['OverallQual', 'GrLivArea', 'YearBuilt', 'TotalBsmtSF', 'GarageCars',
       'GarageArea', '1stFlrSF', 'GarageYrBlt', 'FullBath', 'MSSubClass',
       'YearRemodAdd', 'TotRmsAbvGrd', '2ndFlrSF', 'LotFrontage', 'Fireplaces',
       'LotArea', 'BsmtFinSF1', 'OpenPorchSF', 'BsmtUnfSF', 'OverallCond',
       'MasVnrArea', 'WoodDeckSF', 'HalfBath'],
      dtype='object')


Training the model only with nnumerical columns to finetune the parameter (numerical_top_indices : number of features to consider)

In [14]:
model = XGBRegressor()
params = {'n_estimators': [150, 200, 250, 300, 350, 400], 'learning_rate': [0.1, 0.15, 0.2, 0.25]}
grid = GridSearchCV(model, param_grid=params, cv=5,
                    scoring='neg_mean_squared_log_error', return_train_score=True)
train_score = grid.fit(numerical_reduced[numerical_top_indices], Y)
print('Train score : ', grid.score(numerical_reduced[numerical_top_indices], Y))

Train score :  -0.00041241850161472383


Replacing null values in categorical columns with most frequent value

In [18]:
cat_transform = Pipeline(
    steps=[('impute', SimpleImputer(strategy='most_frequent'))])
cat_reduced = cat_transform.fit_transform(df[categorical_cols])
cat_reduced = pd.DataFrame(
    cat_reduced, columns=df[categorical_cols].columns)
cat_reduced.head(5)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


One hot encoding of features ignoring the first column(category)

In [35]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, drop = 'first')
cat_reduced_ohe = ohe.fit_transform(cat_reduced)
cat_reduced_ohe = pd.DataFrame(cat_reduced_ohe, columns=list(ohe.get_feature_names_out(cat_reduced.columns)), index = cat_reduced.index)
cat_reduced_ohe.head(5)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Categories present in  different features

In [36]:
ohe.categories_

[array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array(['Grvl', 'Pave'], dtype=object),
 array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['Bnk', 'HLS', 'Low', 'Lvl'], dtype=object),
 array(['AllPub', 'NoSeWa'], dtype=object),
 array(['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside'], dtype=object),
 array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
        'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
        'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown',
        'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
        'Veenker'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn'],
       dtype=object),
 array(['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], dtype=object),
 array(['1.5Fin', '1.5Unf', '1Story', '2.

In [38]:
print('Number of one hot encoded features obtained : ', len(cat_reduced_ohe.columns))

Number of one hot encoded features obtained :  196


In [116]:
Finding out the mutual information of all features

SyntaxError: invalid syntax (1020206324.py, line 1)

In [59]:
cat_mi_scores = mutual_info_classif(X=cat_reduced_ohe, y=targets)
cat_mi_scores = pd.DataFrame(
    cat_mi_scores, index=cat_reduced_ohe.columns, columns=['Mi_Score'])
cat_mi_scores_sort = cat_mi_scores.sort_values(by='Mi_Score', ascending=False)
cat_top_indices = cat_mi_scores_sort.index
print(cat_mi_scores_sort)
print(cat_top_indices)

                       Mi_Score
RoofMatl_CompShg       3.851858
Street_Pave            3.836145
Condition2_Norm        3.785634
GarageQual_TA          3.747651
GarageCond_TA          3.706913
Heating_GasA           3.621182
Electrical_SBrkr       3.573394
CentralAir_Y           3.406586
SaleType_WD            3.400391
Functional_Typ         3.338048
LandContour_Lvl        3.304631
PavedDrive_Y           3.270098
BsmtCond_TA            3.221939
BsmtFinType2_Unf       3.143137
SaleCondition_Normal   3.130916
ExterCond_TA           2.834756
Condition1_Norm        2.788013
MSZoning_RL            2.414917
RoofStyle_Gable        2.278396
ExterQual_TA           2.028208
BsmtExposure_No        1.921989
LotConfig_Inside       1.621732
GarageType_Attchd      1.611040
KitchenQual_TA         1.604997
LotShape_Reg           1.490767
MasVnrType_None        1.471609
GarageFinish_Unf       1.347627
BsmtQual_TA            1.276268
KitchenQual_Gd         0.994904
Foundation_PConc       0.921816
ExterQua

In [48]:
len(cat_top_indices)

234

In [40]:
print('Top five columns with high mutual information : ', cat_top_indices[ : 5])
len(cat_top_indices)

Top five columns with high mutual information :  Index(['Street_Pave', 'Utilities_AllPub', 'RoofMatl_CompShg',
       'Condition2_Norm', 'GarageCond_TA'],
      dtype='object')


234

In [47]:
# cat_mi_scores     # Mi score for each feature

Selecting the features with high mutual information

In [90]:
cat_top_cols = cat_top_indices[ : 80]  # Considering only the top 10 columns


Training the model only using categorical features to fine tune parameter(cat_top_cols : Top features with high mutual information)

In [91]:
model = XGBRegressor()   # top100
params = {'n_estimators': [150, 200, 250, 300, 350, 400], 'learning_rate': [0.1, 0.15, 0.2, 0.25]}
grid = GridSearchCV(model, param_grid=params, cv=5,
                    scoring='neg_mean_squared_log_error', return_train_score=True)
                    
train_score = grid.fit(cat_reduced_ohe[cat_top_cols], Y)
print('Train score : ', grid.score(cat_reduced_ohe[cat_top_cols], Y))


Train score :  -0.0060918877735018796


In [92]:
cat_reduced_final = cat_reduced_ohe[cat_top_cols]
cat_reduced_final.shape

(1460, 80)

Concatenating both numerical columns and categorical columns

In [93]:
X = pd.concat([numerical_reduced, cat_reduced_final], axis=1)
print(len(X.columns))
print(X.columns)
X.head(5)

107
Index(['1stFlrSF', '2ndFlrSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath',
       'OverallCond', 'BsmtUnfSF', 'Fireplaces', 'YearBuilt', 'BsmtFinSF1',
       ...
       'GarageCond_Fa', 'Neighborhood_BrDale', 'Neighborhood_Veenker',
       'Neighborhood_StoneBr', 'Exterior1st_BrkFace', 'BsmtFinType1_LwQ',
       'Exterior1st_BrkComm', 'Neighborhood_Sawyer', 'Exterior1st_WdShing',
       'Exterior2nd_BrkFace'],
      dtype='object', length=107)


Unnamed: 0,1stFlrSF,2ndFlrSF,MSSubClass,GarageArea,BsmtHalfBath,OverallCond,BsmtUnfSF,Fireplaces,YearBuilt,BsmtFinSF1,GarageYrBlt,OverallQual,MasVnrArea,WoodDeckSF,BsmtFullBath,BedroomAbvGr,YearRemodAdd,LotArea,LotFrontage,GrLivArea,FullBath,TotalBsmtSF,KitchenAbvGr,GarageCars,OpenPorchSF,HalfBath,TotRmsAbvGrd,RoofMatl_CompShg,Street_Pave,Condition2_Norm,GarageQual_TA,GarageCond_TA,Heating_GasA,Electrical_SBrkr,CentralAir_Y,SaleType_WD,Functional_Typ,LandContour_Lvl,PavedDrive_Y,BsmtCond_TA,BsmtFinType2_Unf,SaleCondition_Normal,ExterCond_TA,Condition1_Norm,MSZoning_RL,RoofStyle_Gable,ExterQual_TA,BsmtExposure_No,LotConfig_Inside,GarageType_Attchd,KitchenQual_TA,LotShape_Reg,MasVnrType_None,GarageFinish_Unf,BsmtQual_TA,KitchenQual_Gd,Foundation_PConc,ExterQual_Gd,BsmtQual_Gd,Foundation_CBlock,HouseStyle_1Story,MasVnrType_BrkFace,GarageFinish_RFn,HeatingQC_TA,BsmtFinType1_GLQ,HouseStyle_2Story,Exterior1st_VinylSd,Exterior2nd_VinylSd,GarageType_Detchd,BsmtFinType1_Unf,MSZoning_RM,Exterior1st_Wd Sdng,Neighborhood_CollgCr,Exterior2nd_HdBoard,RoofStyle_Hip,Exterior2nd_Wd Sdng,Condition2_Feedr,BsmtFinType2_LwQ,Neighborhood_NAmes,Neighborhood_Gilbert,GarageQual_Fa,BldgType_TwnhsE,GarageType_CarPort,BldgType_Duplex,Exterior2nd_ImStucc,SaleType_ConLI,Neighborhood_Somerst,HouseStyle_2.5Fin,HeatingQC_Gd,Neighborhood_NoRidge,Functional_Min1,BsmtExposure_Gd,BsmtFinType1_Rec,LotConfig_FR2,Electrical_FuseP,SaleType_Con,MSZoning_FV,GarageCond_Fa,Neighborhood_BrDale,Neighborhood_Veenker,Neighborhood_StoneBr,Exterior1st_BrkFace,BsmtFinType1_LwQ,Exterior1st_BrkComm,Neighborhood_Sawyer,Exterior1st_WdShing,Exterior2nd_BrkFace
0,856.0,854.0,60.0,548.0,0.0,5.0,150.0,0.0,2003.0,706.0,2003.0,7.0,196.0,0.0,1.0,3.0,2003.0,8450.0,65.0,1710.0,2.0,856.0,1.0,2.0,61.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1262.0,0.0,20.0,460.0,1.0,8.0,284.0,1.0,1976.0,978.0,1976.0,6.0,0.0,298.0,0.0,3.0,1976.0,9600.0,80.0,1262.0,2.0,1262.0,1.0,2.0,0.0,0.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,920.0,866.0,60.0,608.0,0.0,5.0,434.0,1.0,2001.0,486.0,2001.0,7.0,162.0,0.0,1.0,3.0,2002.0,11250.0,68.0,1786.0,2.0,920.0,1.0,2.0,42.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,961.0,756.0,70.0,642.0,0.0,5.0,540.0,1.0,1915.0,216.0,1998.0,7.0,0.0,0.0,1.0,3.0,1970.0,9550.0,60.0,1717.0,1.0,756.0,1.0,3.0,35.0,0.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1145.0,1053.0,60.0,836.0,0.0,5.0,490.0,1.0,2000.0,655.0,2000.0,8.0,350.0,192.0,1.0,4.0,2000.0,14260.0,84.0,2198.0,2.0,1145.0,1.0,3.0,84.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Training the final model with both numerical and categorical features

In [94]:
model = XGBRegressor()
params = {'n_estimators': [150, 200, 250, 300, 350,
                           400], 'learning_rate': [0.1, 0.15, 0.2, 0.25]}
grid = GridSearchCV(model, param_grid=params, cv=5,
                    scoring='neg_mean_squared_log_error', return_train_score=True)
train_score = grid.fit(X, Y)
print('Train score : ', grid.score(X, Y))


Train score :  -4.2164306596046455e-05


Loading the test dataset

In [None]:
test_df = pd.read_csv('./Dataset/test.csv', index_col='Id')

Numerical transform on numerical columns in test set

In [97]:
numerical_reduced_test = numerical_transform.transform(test_df[numerical_reduced.columns])
numerical_reduced_test = pd.DataFrame(numerical_reduced_test, columns=numerical_reduced.columns)
numerical_reduced_test.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,MSSubClass,GarageArea,BsmtHalfBath,OverallCond,BsmtUnfSF,Fireplaces,YearBuilt,BsmtFinSF1,GarageYrBlt,OverallQual,MasVnrArea,WoodDeckSF,BsmtFullBath,BedroomAbvGr,YearRemodAdd,LotArea,LotFrontage,GrLivArea,FullBath,TotalBsmtSF,KitchenAbvGr,GarageCars,OpenPorchSF,HalfBath,TotRmsAbvGrd
0,896.0,0.0,20.0,730.0,0.0,6.0,270.0,0.0,1961.0,468.0,1961.0,5.0,0.0,140.0,0.0,2.0,1961.0,11622.0,80.0,896.0,1.0,882.0,1.0,1.0,0.0,0.0,5.0
1,1329.0,0.0,20.0,312.0,0.0,6.0,406.0,0.0,1958.0,923.0,1958.0,6.0,108.0,393.0,0.0,3.0,1958.0,14267.0,81.0,1329.0,1.0,1329.0,1.0,1.0,36.0,1.0,6.0
2,928.0,701.0,60.0,482.0,0.0,5.0,137.0,1.0,1997.0,791.0,1997.0,5.0,0.0,212.0,0.0,3.0,1998.0,13830.0,74.0,1629.0,2.0,928.0,1.0,2.0,34.0,1.0,6.0
3,926.0,678.0,60.0,470.0,0.0,6.0,324.0,1.0,1998.0,602.0,1998.0,6.0,20.0,360.0,0.0,3.0,1998.0,9978.0,78.0,1604.0,2.0,926.0,1.0,2.0,36.0,1.0,7.0
4,1280.0,0.0,120.0,506.0,0.0,5.0,1017.0,0.0,1992.0,263.0,1992.0,8.0,0.0,0.0,0.0,2.0,1992.0,5005.0,43.0,1280.0,2.0,1280.0,1.0,2.0,82.0,0.0,5.0


In [99]:
print(numerical_reduced.shape)
print(numerical_reduced_test.shape)

(1460, 27)
(1459, 27)


Categorical tansform on categorical columns in test set 

In [106]:
cat_reduced_test = cat_transform.transform(test_df[cat_reduced.columns])
cat_reduced_test = pd.DataFrame(cat_reduced_test, columns=cat_reduced.columns)
# for col in cat_reduced_test.columns:
#     le = LabelEncoder()
#     cat_reduced_test[col] = le.fit_transform(cat_reduced_test[col])
cat_reduced_test.head(5)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RH,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,CBlock,TA,TA,No,Rec,LwQ,GasA,TA,Y,SBrkr,TA,Typ,Attchd,Unf,TA,TA,Y,WD,Normal
1,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,TA,Y,SBrkr,Gd,Typ,Attchd,Unf,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Gd,Y,SBrkr,TA,Typ,Attchd,Fin,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,Fin,TA,TA,Y,WD,Normal
4,RL,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,Gable,CompShg,HdBoard,HdBoard,,Gd,TA,PConc,Gd,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [101]:
cat_reduced_test.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

One hot encoding of categorical columns in test set

In [107]:
cat_reduced_test_ohe = ohe.transform(cat_reduced_test)
cat_reduced_test_ohe = pd.DataFrame(cat_reduced_test_ohe, columns=list(
    ohe.get_feature_names_out(cat_reduced_test.columns)), index=cat_reduced_test.index)
cat_reduced_test_ohe.head(5)


Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [108]:
len(cat_reduced_test_ohe.columns)


196

In [109]:
cat_reduced_test_final = cat_reduced_test_ohe[cat_top_cols]
len(cat_reduced_test_final.columns)

80

Concatenating both Numerical columns and Categorical columns in test set

In [110]:
X_test = pd.concat([numerical_reduced_test, cat_reduced_test_final], axis=1)
print(len(X_test.columns))
print(X_test.columns)
X_test.head(5)

107
Index(['1stFlrSF', '2ndFlrSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath',
       'OverallCond', 'BsmtUnfSF', 'Fireplaces', 'YearBuilt', 'BsmtFinSF1',
       ...
       'GarageCond_Fa', 'Neighborhood_BrDale', 'Neighborhood_Veenker',
       'Neighborhood_StoneBr', 'Exterior1st_BrkFace', 'BsmtFinType1_LwQ',
       'Exterior1st_BrkComm', 'Neighborhood_Sawyer', 'Exterior1st_WdShing',
       'Exterior2nd_BrkFace'],
      dtype='object', length=107)


Unnamed: 0,1stFlrSF,2ndFlrSF,MSSubClass,GarageArea,BsmtHalfBath,OverallCond,BsmtUnfSF,Fireplaces,YearBuilt,BsmtFinSF1,GarageYrBlt,OverallQual,MasVnrArea,WoodDeckSF,BsmtFullBath,BedroomAbvGr,YearRemodAdd,LotArea,LotFrontage,GrLivArea,FullBath,TotalBsmtSF,KitchenAbvGr,GarageCars,OpenPorchSF,HalfBath,TotRmsAbvGrd,RoofMatl_CompShg,Street_Pave,Condition2_Norm,GarageQual_TA,GarageCond_TA,Heating_GasA,Electrical_SBrkr,CentralAir_Y,SaleType_WD,Functional_Typ,LandContour_Lvl,PavedDrive_Y,BsmtCond_TA,BsmtFinType2_Unf,SaleCondition_Normal,ExterCond_TA,Condition1_Norm,MSZoning_RL,RoofStyle_Gable,ExterQual_TA,BsmtExposure_No,LotConfig_Inside,GarageType_Attchd,KitchenQual_TA,LotShape_Reg,MasVnrType_None,GarageFinish_Unf,BsmtQual_TA,KitchenQual_Gd,Foundation_PConc,ExterQual_Gd,BsmtQual_Gd,Foundation_CBlock,HouseStyle_1Story,MasVnrType_BrkFace,GarageFinish_RFn,HeatingQC_TA,BsmtFinType1_GLQ,HouseStyle_2Story,Exterior1st_VinylSd,Exterior2nd_VinylSd,GarageType_Detchd,BsmtFinType1_Unf,MSZoning_RM,Exterior1st_Wd Sdng,Neighborhood_CollgCr,Exterior2nd_HdBoard,RoofStyle_Hip,Exterior2nd_Wd Sdng,Condition2_Feedr,BsmtFinType2_LwQ,Neighborhood_NAmes,Neighborhood_Gilbert,GarageQual_Fa,BldgType_TwnhsE,GarageType_CarPort,BldgType_Duplex,Exterior2nd_ImStucc,SaleType_ConLI,Neighborhood_Somerst,HouseStyle_2.5Fin,HeatingQC_Gd,Neighborhood_NoRidge,Functional_Min1,BsmtExposure_Gd,BsmtFinType1_Rec,LotConfig_FR2,Electrical_FuseP,SaleType_Con,MSZoning_FV,GarageCond_Fa,Neighborhood_BrDale,Neighborhood_Veenker,Neighborhood_StoneBr,Exterior1st_BrkFace,BsmtFinType1_LwQ,Exterior1st_BrkComm,Neighborhood_Sawyer,Exterior1st_WdShing,Exterior2nd_BrkFace
0,896.0,0.0,20.0,730.0,0.0,6.0,270.0,0.0,1961.0,468.0,1961.0,5.0,0.0,140.0,0.0,2.0,1961.0,11622.0,80.0,896.0,1.0,882.0,1.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1329.0,0.0,20.0,312.0,0.0,6.0,406.0,0.0,1958.0,923.0,1958.0,6.0,108.0,393.0,0.0,3.0,1958.0,14267.0,81.0,1329.0,1.0,1329.0,1.0,1.0,36.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,928.0,701.0,60.0,482.0,0.0,5.0,137.0,1.0,1997.0,791.0,1997.0,5.0,0.0,212.0,0.0,3.0,1998.0,13830.0,74.0,1629.0,2.0,928.0,1.0,2.0,34.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,926.0,678.0,60.0,470.0,0.0,6.0,324.0,1.0,1998.0,602.0,1998.0,6.0,20.0,360.0,0.0,3.0,1998.0,9978.0,78.0,1604.0,2.0,926.0,1.0,2.0,36.0,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1280.0,0.0,120.0,506.0,0.0,5.0,1017.0,0.0,1992.0,263.0,1992.0,8.0,0.0,0.0,0.0,2.0,1992.0,5005.0,43.0,1280.0,2.0,1280.0,1.0,2.0,82.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
Y_pred_test = grid.predict(X_test)
print(Y_pred_test)
print(test_df.index)

[122857.08 149807.08 189923.42 ... 161186.47 113599.19 220783.16]
Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name='Id', length=1459)


Creating the csv file to submit predictions in the specified format (as mentioned in kaggle)

In [113]:
submission = pd.DataFrame({'SalePrice': Y_pred_test}, columns=[
                          'SalePrice'], index=test_df.index)
submission.head()


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,122857.078125
1462,149807.078125
1463,189923.421875
1464,182665.734375
1465,186064.3125


In [114]:
submission.to_csv('./Submissions/submssion4.csv')