In [99]:
#loading necessary libraries and packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#import warning_filters
import warnings
warnings.filterwarnings('ignore')

In [100]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn import neighbors
from sklearn.impute import SimpleImputer

In [101]:
#reading the testing and training dataset into csv
train = pd.read_csv('train.csv', index_col = 'Id')
test = pd.read_csv('test.csv', index_col = 'Id')

#subsetting the saleprice column from train
target = train['SalePrice']

In [102]:
#removing SalePrice column from train file
train.drop('SalePrice', axis=1, inplace = True)

#combining the two files into one
data =  pd.concat([train,test], axis = 0)

In [103]:
#glancing through the datasets 
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [104]:
print(data.columns)
print(f'\nThe total number of columns in the dataset is: {len(data.columns)}')

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [105]:
display(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2919 non-null   int64  
 1   MSZoning       2915 non-null   object 
 2   LotFrontage    2433 non-null   float64
 3   LotArea        2919 non-null   int64  
 4   Street         2919 non-null   object 
 5   Alley          198 non-null    object 
 6   LotShape       2919 non-null   object 
 7   LandContour    2919 non-null   object 
 8   Utilities      2917 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18  YearBuil

None

In [106]:
#splitting the data based on the datatype
categorical = data.select_dtypes(include = 'object')
numerical = data.select_dtypes(exclude = 'object')

In [107]:
categorical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       2915 non-null   object
 1   Street         2919 non-null   object
 2   Alley          198 non-null    object
 3   LotShape       2919 non-null   object
 4   LandContour    2919 non-null   object
 5   Utilities      2917 non-null   object
 6   LotConfig      2919 non-null   object
 7   LandSlope      2919 non-null   object
 8   Neighborhood   2919 non-null   object
 9   Condition1     2919 non-null   object
 10  Condition2     2919 non-null   object
 11  BldgType       2919 non-null   object
 12  HouseStyle     2919 non-null   object
 13  RoofStyle      2919 non-null   object
 14  RoofMatl       2919 non-null   object
 15  Exterior1st    2918 non-null   object
 16  Exterior2nd    2918 non-null   object
 17  MasVnrType     2895 non-null   object
 18  ExterQual      2919 non-null

In [108]:
#glancing through categorical data at random
categorical.sample(n=10)

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,RL,Pave,,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,,,,WD,Normal
22,RM,Pave,Grvl,Reg,Bnk,AllPub,Inside,Gtl,IDOTRR,Norm,...,Attchd,Unf,TA,TA,N,,GdPrv,,WD,Normal
1643,FV,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,RRNn,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
2458,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Norm,...,Attchd,Unf,TA,TA,Y,,GdWo,,WD,Normal
303,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
306,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
1467,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,Attchd,Fin,TA,TA,Y,,GdPrv,Shed,WD,Normal
517,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,Norm,...,Attchd,RFn,TA,TA,Y,,GdPrv,,COD,Abnorml
786,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Feedr,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
2690,FV,Pave,,Reg,Lvl,AllPub,Corner,Gtl,Somerst,Feedr,...,Attchd,RFn,TA,TA,Y,,,,New,Partial


In [109]:
#checking the number of missing vlaues in categorical data
categorical.isnull().sum().sort_values(ascending=False)

PoolQC           2909
MiscFeature      2814
Alley            2721
Fence            2348
FireplaceQu      1420
GarageCond        159
GarageQual        159
GarageFinish      159
GarageType        157
BsmtCond           82
BsmtExposure       82
BsmtQual           81
BsmtFinType2       80
BsmtFinType1       79
MasVnrType         24
MSZoning            4
Functional          2
Utilities           2
Electrical          1
KitchenQual         1
SaleType            1
Exterior2nd         1
Exterior1st         1
Heating             0
Condition1          0
Street              0
PavedDrive          0
LotShape            0
LandContour         0
LotConfig           0
LandSlope           0
Neighborhood        0
RoofStyle           0
Condition2          0
BldgType            0
HouseStyle          0
HeatingQC           0
RoofMatl            0
ExterQual           0
ExterCond           0
Foundation          0
CentralAir          0
SaleCondition       0
dtype: int64

In [110]:
#converting the categorical missing value into dictionary
cat_missing_value = categorical.isnull().sum().sort_values(ascending=False).to_dict()

In [111]:
#dropping columns with more than 5% missing data
for key, value in cat_missing_value.items():
    if value > round((len(categorical)/100) * 5, ):
        categorical.drop([key], axis=1, inplace =True)

In [112]:
#checking to see if those columns have been dropped
categorical.head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Y,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal


In [113]:
categorical = pd.DataFrame(categorical)
categorical = categorical.astype('category')

In [114]:
print(categorical.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 34 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSZoning       2915 non-null   category
 1   Street         2919 non-null   category
 2   LotShape       2919 non-null   category
 3   LandContour    2919 non-null   category
 4   Utilities      2917 non-null   category
 5   LotConfig      2919 non-null   category
 6   LandSlope      2919 non-null   category
 7   Neighborhood   2919 non-null   category
 8   Condition1     2919 non-null   category
 9   Condition2     2919 non-null   category
 10  BldgType       2919 non-null   category
 11  HouseStyle     2919 non-null   category
 12  RoofStyle      2919 non-null   category
 13  RoofMatl       2919 non-null   category
 14  Exterior1st    2918 non-null   category
 15  Exterior2nd    2918 non-null   category
 16  MasVnrType     2895 non-null   category
 17  ExterQual      2919 non-null   ca

In [115]:
categorical.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [116]:
#imputting the missing categorical value with the most frequent among the data
imp_frequent = SimpleImputer(strategy = "most_frequent")
cat = imp_frequent.fit_transform(categorical)
print(cat)

[['RL' 'Pave' 'Reg' ... 'Y' 'WD' 'Normal']
 ['RL' 'Pave' 'Reg' ... 'Y' 'WD' 'Normal']
 ['RL' 'Pave' 'IR1' ... 'Y' 'WD' 'Normal']
 ...
 ['RL' 'Pave' 'Reg' ... 'Y' 'WD' 'Abnorml']
 ['RL' 'Pave' 'Reg' ... 'Y' 'WD' 'Normal']
 ['RL' 'Pave' 'Reg' ... 'Y' 'WD' 'Normal']]


In [117]:
header = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']
categorical = pd.DataFrame(cat, columns = header, dtype = 'object')

In [118]:
categorical.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal


In [119]:
#checking for any missing value after imputting
categorical.isnull().sum().sort_values(ascending=False)

SaleCondition    0
Condition1       0
Exterior1st      0
RoofMatl         0
RoofStyle        0
HouseStyle       0
BldgType         0
Condition2       0
Neighborhood     0
SaleType         0
LandSlope        0
LotConfig        0
Utilities        0
LandContour      0
LotShape         0
Street           0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
PavedDrive       0
MSZoning         0
dtype: int64

In [120]:
#converting the numerical missing values into dictionary
num_missing_value = numerical.isnull().sum().sort_values(ascending=False).to_dict()

In [121]:
#dropping numerical columns with more than 5% missing data
for key, value in num_missing_value.items():
    if value > round((len(numerical)/100) * 5, ):
        numerical.drop([key], axis=1, inplace =True)

In [122]:
len(numerical.columns)

34

In [123]:
#imputting the missing value with the mean value of the data
imp_mean = SimpleImputer(strategy = "mean")
num = imp_frequent.fit_transform(numerical)
print(num)

[[6.0000e+01 8.4500e+03 7.0000e+00 ... 0.0000e+00 2.0000e+00 2.0080e+03]
 [2.0000e+01 9.6000e+03 6.0000e+00 ... 0.0000e+00 5.0000e+00 2.0070e+03]
 [6.0000e+01 1.1250e+04 7.0000e+00 ... 0.0000e+00 9.0000e+00 2.0080e+03]
 ...
 [2.0000e+01 2.0000e+04 5.0000e+00 ... 0.0000e+00 9.0000e+00 2.0060e+03]
 [8.5000e+01 1.0441e+04 5.0000e+00 ... 7.0000e+02 7.0000e+00 2.0060e+03]
 [6.0000e+01 9.6270e+03 7.0000e+00 ... 0.0000e+00 1.1000e+01 2.0060e+03]]


In [124]:
#reconverting imputted arrays into proper dataframe
num_header = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
numerical = pd.DataFrame(num, columns = num_header, dtype='object')

In [125]:
numerical.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,8450,7,5,2003,2003,196,706,0,150,...,548,0,61,0,0,0,0,0,2,2008
1,20,9600,6,8,1976,1976,0,978,0,284,...,460,298,0,0,0,0,0,0,5,2007
2,60,11250,7,5,2001,2002,162,486,0,434,...,608,0,42,0,0,0,0,0,9,2008
3,70,9550,7,5,1915,1970,0,216,0,540,...,642,0,35,272,0,0,0,0,2,2006
4,60,14260,8,5,2000,2000,350,655,0,490,...,836,192,84,0,0,0,0,0,12,2008


In [126]:
categorical.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Y,WD,Normal


- cross checking the columns b4 splitting into Label_encoding or OneHotEncoding.
- most of the data are in ordinal value, and those that are not ordinal have more than 10 unique values.
- label encoding is hereby adopted.


In [127]:
cat_col = categorical.columns.to_list()
cat_col

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [128]:
#importing label_encoding package
from sklearn.preprocessing import LabelEncoder
label_encoding = LabelEncoder()

#categorical encoding of all columns in categorcial data
for col in cat_col:
    categorical[col] = label_encoding.fit_transform(categorical[col].astype(str))

In [129]:
categorical.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
0,3,1,3,3,0,4,0,5,2,2,...,5,1,0,1,4,2,6,2,8,4
1,3,1,3,3,0,2,0,24,1,2,...,5,1,0,1,4,3,6,2,8,4
2,3,1,0,3,0,4,0,5,2,2,...,5,1,0,1,4,2,6,2,8,4
3,3,1,0,3,0,0,0,6,2,2,...,5,1,2,1,4,2,6,2,8,0
4,3,1,0,3,0,2,0,15,2,2,...,5,1,0,1,4,2,6,2,8,4


In [130]:
#check
len(categorical.Neighborhood.unique())

25

In [131]:
numerical.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,8450,7,5,2003,2003,196,706,0,150,...,548,0,61,0,0,0,0,0,2,2008
1,20,9600,6,8,1976,1976,0,978,0,284,...,460,298,0,0,0,0,0,0,5,2007
2,60,11250,7,5,2001,2002,162,486,0,434,...,608,0,42,0,0,0,0,0,9,2008
3,70,9550,7,5,1915,1970,0,216,0,540,...,642,0,35,272,0,0,0,0,2,2006
4,60,14260,8,5,2000,2000,350,655,0,490,...,836,192,84,0,0,0,0,0,12,2008


In [132]:
numerical.isnull().sum().sort_values(ascending=False)

YrSold           0
BsmtFinSF2       0
GrLivArea        0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
TotalBsmtSF      0
BsmtUnfSF        0
BsmtFinSF1       0
MoSold           0
MasVnrArea       0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
OverallQual      0
LotArea          0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MSSubClass       0
dtype: int64

In [133]:
print(f'The number of rows in numerical data is: {len(numerical)}\n')
print(f'The number of columns in numerical data is: {len(numerical.columns)}\n')
print(f'The number of rows in categorical data is: {len(categorical)}\n')
print(f'The number of columns in categorical data is: {len(categorical.columns)}\n')

The number of rows in numerical data is: 2919

The number of columns in numerical data is: 34

The number of rows in categorical data is: 2919

The number of columns in categorical data is: 34



In [134]:
#joining the two data together
new_data = pd.concat([categorical, numerical], axis=1)

In [135]:
#picking just the first 1460 values to meet the target(SalePrice) length.
new_data = new_data.iloc[:1460,]
new_data.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,3,1,3,3,0,4,0,5,2,2,...,548,0,61,0,0,0,0,0,2,2008
1,3,1,3,3,0,2,0,24,1,2,...,460,298,0,0,0,0,0,0,5,2007
2,3,1,0,3,0,4,0,5,2,2,...,608,0,42,0,0,0,0,0,9,2008
3,3,1,0,3,0,0,0,6,2,2,...,642,0,35,272,0,0,0,0,2,2006
4,3,1,0,3,0,2,0,15,2,2,...,836,192,84,0,0,0,0,0,12,2008


- Modelling

In [136]:
#creating dictionary of estimators in order to pick the best model
estimators = {
    'LinearRegression': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Decision_Tree': DecisionTreeRegressor(),
    'Random_Forest': RandomForestRegressor()
}

In [146]:
#using kfold and cross_validation for picking the best model with highest r2 score
from sklearn.model_selection import KFold,cross_val_score


for key, value in estimators.items():
    kfold = KFold(n_splits = 10, shuffle=True, random_state =11)
    scores = cross_val_score(estimator = value, X = new_data, y = target, cv = kfold, scoring = 'r2')
    print(f'{key:>16}: ' + f'mean of r2 scores={scores.mean():.3f}')

LinearRegression: mean of r2 scores=0.796
      ElasticNet: mean of r2 scores=0.798
           Lasso: mean of r2 scores=0.796
           Ridge: mean of r2 scores=0.797
   Decision_Tree: mean of r2 scores=0.691
   Random_Forest: mean of r2 scores=0.851


- RandomForest gives the highest r2 mean score and seems to be the best.

In [138]:
#splitting the dataset into train and test data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_data, target, test_size = 0.3, random_state=11)

In [139]:
#modelling 
adv_reg = RandomForestRegressor(random_state =11)
adv_reg.fit(X_train, y_train)
pred = adv_reg.predict(X_test)

In [150]:
#calculating the mean absolute error of the final prediction.
mae = mean_absolute_error(pred, y_test)
print(f'The Mean Absolute Error: {mae:.2f}')

The Mean Absolute Error: 16822.82
