### Import libraries

In [2]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib as plt
import researchpy as rp
import numpy as np

### Data import and wrangling for model

Let's start by loading the cleaned data produced in EDA1.

In [4]:
# Load the cleaned housing data set from a CSV in the working directory.
# NOTE(review): the preview below shows "Unnamed: 0.1" and "Unnamed: 0" columns, which look like
# row indexes written out by earlier to_csv calls — confirm, and consider dropping them before modeling.
df=pd.read_csv("./houses_data_clean.csv")
# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,MSZoning,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior1st,...,Fireplaces,GarageType,GarageFinish,GarageCars,GarageArea,SalePrice,OverallCond_bin,FullBath_bin,ExterQual_bin,KitchenQual_bin
0,0,142,20,RL,CollgCr,1Story,7,2005,2005,VinylSd,...,0,Attchd,Fin,2,660,260000,1,1,0,0
1,1,170,20,RL,Timber,1Story,8,1981,1981,Plywood,...,1,Attchd,RFn,2,511,228000,0,1,0,1
2,2,303,20,RL,CollgCr,1Story,7,2001,2002,VinylSd,...,1,Attchd,RFn,3,843,205000,1,1,0,0
3,3,371,60,RL,Gilbert,2Story,6,2000,2000,VinylSd,...,1,Attchd,RFn,2,460,172400,1,1,1,1
4,4,412,190,RL,Gilbert,1Story,5,1955,1955,Wd Sdng,...,0,Attchd,Fin,2,572,145000,1,0,1,1


In [5]:
# Check column types: count how many columns there are of each dtype.
# Series.value_counts is used because the top-level pd.value_counts is deprecated (pandas >= 2.1).
df.dtypes.value_counts()

int64     16
object    11
dtype: int64

Object-typed columns most likely hold string data. Let's check:

In [6]:
# List the non-numeric columns (the ones reported as object dtype above).
# Allow up to 30 columns to be shown before pandas truncates the display.
pd.set_option("display.max_columns", 30)
print(df.select_dtypes(exclude=["number"]))

     MSZoning Neighborhood HouseStyle Exterior1st Exterior2nd MasVnrType  \
0          RL      CollgCr     1Story     VinylSd     VinylSd       None   
1          RL       Timber     1Story     Plywood     Plywood    BrkFace   
2          RL      CollgCr     1Story     VinylSd     VinylSd    BrkFace   
3          RL      Gilbert     2Story     VinylSd     VinylSd       None   
4          RL      Gilbert     1Story     Wd Sdng     Wd Sdng       None   
...       ...          ...        ...         ...         ...        ...   
1455       RL        NAmes     1Story     VinylSd     VinylSd       None   
1456       RM       IDOTRR     2Story     VinylSd     Wd Shng       None   
1457       RL      Edwards     1.5Fin     MetalSd     MetalSd       None   
1458       RL      NPkVill     2Story     Plywood     Brk Cmn       None   
1459       RL      NPkVill     1Story     Plywood     Plywood       None   

     Foundation BsmtQual BsmtFinType1 GarageType GarageFinish  
0         PConc       G

In [7]:
# Switch every object-dtype column to pandas' dedicated "string" dtype.
# A single astype call with a column -> dtype mapping returns a new frame, which avoids
# re-wrapping and reassigning each column one at a time inside a Python loop.
object_cols = df.select_dtypes(include="object").columns
df = df.astype({col: "string" for col in object_cols})

# Confirm the conversion: the former object columns should now be counted as string.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df.dtypes.value_counts()

int64     16
string    11
dtype: int64

The string data will have to be encoded, but so will some of the numeric columns, namely those that are really categorical/ordinal. Let's identify which variables those are.

In [8]:
# Flag a numeric column as categorical/ordinal when its number of distinct values
# is below 5% of its number of non-null values.
col_ordinal = (
    df.select_dtypes(include=["number"])
    .pipe(lambda numeric: numeric.nunique() / numeric.count())
    .loc[lambda ratio: ratio < 0.05]
    .index.tolist()
)
col_ordinal

['MSSubClass',
 'OverallQual',
 'YearRemodAdd',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'OverallCond_bin',
 'FullBath_bin',
 'ExterQual_bin',
 'KitchenQual_bin']

The "*_bin" variables are in fact already binary-encoded, so we should leave them out of the encoding step.

In [12]:
# Columns to one-hot encode: every non-numeric column, followed by the numeric
# ordinal columns whose name does not contain "_bin" (those are already encoded).
col_to_encode = [
    *df.select_dtypes(exclude=["number"]).columns,
    *(name for name in col_ordinal if "_bin" not in name),
]
col_to_encode

['MSZoning',
 'Neighborhood',
 'HouseStyle',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'BsmtQual',
 'BsmtFinType1',
 'GarageType',
 'GarageFinish',
 'MSSubClass',
 'OverallQual',
 'YearRemodAdd',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars']

So now let's encode them using one-hot encoding in pandas.

In [27]:
# One-hot encode the selected columns; drop_first removes one level per variable so the
# dummies are not perfectly collinear with an intercept.
# dtype=int pins the dummies to 0/1 integers: pandas >= 2.0 otherwise emits bool columns,
# which no longer match the 0/1 preview below and mix bool with integer columns in the
# design matrix handed to the model.
df_encoded = pd.get_dummies(df, columns=col_to_encode, drop_first=True, dtype=int)
df_encoded.head()

Unnamed: 0.1,Unnamed: 0,Id,YearBuilt,GrLivArea,GarageArea,SalePrice,OverallCond_bin,FullBath_bin,ExterQual_bin,KitchenQual_bin,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_Blueste,...,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,TotRmsAbvGrd_12,TotRmsAbvGrd_14,Fireplaces_1,Fireplaces_2,Fireplaces_3,GarageCars_1,GarageCars_2,GarageCars_3,GarageCars_4
0,0,142,2005,1734,660,260000,1,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,170,1981,1707,511,228000,0,1,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2,2,303,2001,1541,843,205000,1,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,3,371,2000,1664,460,172400,1,1,1,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,4,412,1955,1056,572,145000,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


Let's double-check that the columns left unencoded match the leading columns of the encoded frame.

In [25]:
# Show only the columns that were not passed to the encoder, for comparison with
# the leading columns of df_encoded.
df.loc[:, ~df.columns.isin(col_to_encode)]

Unnamed: 0.1,Unnamed: 0,Id,YearBuilt,GrLivArea,GarageArea,SalePrice,OverallCond_bin,FullBath_bin,ExterQual_bin,KitchenQual_bin
0,0,142,2005,1734,660,260000,1,1,0,0
1,1,170,1981,1707,511,228000,0,1,0,1
2,2,303,2001,1541,843,205000,1,1,0,0
3,3,371,2000,1664,460,172400,1,1,1,1
4,4,412,1955,1056,572,145000,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...
1455,1455,420,1968,1056,304,142000,0,0,1,1
1456,1456,706,1930,1092,0,55000,1,1,1,1
1457,1457,1145,1941,924,280,80000,0,0,1,1
1458,1458,1161,1978,1456,440,146000,1,1,1,1


Luckily for us - it does match :] Yay!
Now, let's see how many rows and columns we ended up with.

In [30]:
# (rows, columns) of the one-hot encoded frame.
df_encoded.shape

(1460, 198)