In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/deprecation messages
# (e.g. about chained in-place operations); consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Database connection settings.
# Each value is read from an environment variable so credentials do not have
# to live in the notebook; the fallbacks are the values originally hard-coded
# here, so the notebook still runs unchanged when no variable is set.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

In [3]:
# Build the connection URL from the settings above and load the whole
# `houseprices` table into a DataFrame.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Release the engine's pooled connections even when the query fails,
    # so a failed cell run does not leave connections open.
    engine.dispose()

In [4]:
# Print each column's dtype and non-null count to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18  overallc

In [5]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# EDA

## Checking for null values

In [6]:
# Count missing values per column, then show only the columns that have any.
nulls = df.isna().sum()
nulls.loc[nulls.ne(0)]

lotfrontage      259
alley           1369
masvnrtype         8
masvnrarea         8
bsmtqual          37
bsmtcond          37
bsmtexposure      38
bsmtfintype1      37
bsmtfintype2      38
electrical         1
fireplacequ      690
garagetype        81
garageyrblt       81
garagefinish      81
garagequal        81
garagecond        81
poolqc          1453
fence           1179
miscfeature     1406
dtype: int64

Is a value null because the feature is absent (as with alley), or because the data is truly missing? Each column with nulls needs to be examined individually.

### Lot Frontage

Lot frontage is the linear feet of street connected to the property, so it should not be missing.  I will fill in its missing values with KNNImputer after dealing with the other missing values.

### Alley

In [8]:
# List the distinct values stored in the alley column.
df['alley'].unique()

array([None, 'Grvl', 'Pave'], dtype=object)

From the codebook, NA means no alley access, so I will fill the NAs with 'None'.

In [9]:
# Replace missing alley values with the string 'None'.
# Assign the result back to the column: an in-place fillna on the column
# accessor is a chained operation that does not update df under pandas
# Copy-on-Write, and the warning for it is suppressed in this notebook.
df['alley'] = df['alley'].fillna('None')

### Masvnr type and area

In [11]:
# Show the two masonry veneer columns for rows where either one is missing.
df.loc[
    df['masvnrtype'].isna() | df['masvnrarea'].isna(),
    ['masvnrtype', 'masvnrarea'],
]

Unnamed: 0,masvnrtype,masvnrarea
235,,
529,,
650,,
936,,
973,,
977,,
1244,,
1278,,


The rows where masvnrtype is null also have masvnrarea as NaN.  The codebook uses None to mean there is no masonry veneer, so I'll fill the nulls in masvnrtype with 'None' and the NaNs in masvnrarea with 0.

In [12]:
# Fill missing masonry veneer type with 'None' and missing area with 0.
# The filled columns are assigned back explicitly: an in-place fillna on a
# selected column is a chained operation that does not update df under
# pandas Copy-on-Write.
df['masvnrtype'] = df['masvnrtype'].fillna('None')
df['masvnrarea'] = df['masvnrarea'].fillna(0)

### Working with basement quality, condition, exposure, and finish

In [13]:
# Categorical basement columns examined together below.
basement_variables = [
    'bsmtqual',
    'bsmtcond',
    'bsmtexposure',
    'bsmtfintype1',
    'bsmtfintype2',
]
# Show those columns for the rows where bsmtqual is missing.
df.loc[df['bsmtqual'].isna(), basement_variables]

Unnamed: 0,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfintype2
17,,,,,
39,,,,,
90,,,,,
100,,,,,
103,,,,,
157,,,,,
183,,,,,
260,,,,,
343,,,,,
363,,,,,


As the codebook says that a missing value here means there is no basement, these nulls will be replaced with 'None'.

In [31]:
# Fill the basement columns with 'None' for the rows where bsmtqual is null.
# Calling fillna(..., inplace=True) directly on df[mask][cols] or on
# df.loc[rows, cols] had no effect on df: both expressions produce a new
# object, so the in-place fill only changed that temporary. Assigning the
# filled selection back through .loc is what actually writes into df.
missing_indices = df[df.bsmtqual.isnull()].index
df.loc[missing_indices,basement_variables] = df.loc[missing_indices,basement_variables].fillna('None')

However, there are still some observations with null basement variables.

In [32]:
# Count the nulls that remain in each basement column.
df.loc[:, basement_variables].isna().sum()

bsmtqual        0
bsmtcond        0
bsmtexposure    1
bsmtfintype1    0
bsmtfintype2    1
dtype: int64

In [33]:
# Show the basement columns for rows still missing exposure or second finish type.
df.loc[
    df['bsmtexposure'].isna() | df['bsmtfintype2'].isna(),
    basement_variables,
]

Unnamed: 0,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfintype2
333,Gd,TA,No,GLQ,
948,Gd,TA,,Unf,Unf


For these two, I will fill in with the KNN Imputer later.

### Electrical

For the one value missing with the electrical system, I will fill in with the KNN imputer.

### Fireplace quality

According to the codebook, a missing fireplace quality means there is no fireplace.  Therefore the nulls will be replaced with 'None'.

In [34]:
# Replace missing fireplace quality with the string 'None'.
# Assign the result back to the column: an in-place fillna on the column
# accessor is a chained operation that does not update df under pandas
# Copy-on-Write.
df['fireplacequ'] = df['fireplacequ'].fillna('None')

### Working with garage variables