# Modeling - Housing Price Prediction

## By: Nick Roller

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

We first load our preprocessed data. As we can see, we have already one-hot encoded the categorical variables and scaled all continuous ones.

In [2]:
df = pd.read_csv('Data Files/data_preprocessed.csv')
df.head()

Unnamed: 0,SalePrice,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,208500,65.0,8450,196.0,706,150,856,856,854,1710,...,0,0,0,1,0,0,0,0,1,0
1,181500,80.0,9600,0.0,978,284,1262,1262,0,1262,...,0,0,0,1,0,0,0,0,1,0
2,223500,68.0,11250,162.0,486,434,920,920,866,1786,...,0,0,0,1,0,0,0,0,1,0
3,250000,84.0,14260,350.0,655,490,1145,1145,1053,2198,...,0,0,0,1,0,0,0,0,1,0
4,307000,75.0,10084,186.0,1369,317,1686,1694,0,1694,...,0,0,0,1,0,0,0,0,1,0


In [3]:
df.shape

(1137, 251)

In [7]:
cols_with_missing = [col for col in df.columns 
                                 if df[col].isnull().any()]                                  
candidate_train_predictors = df.drop(['SalePrice'] + cols_with_missing, axis=1)

In [19]:
candidate_train_predictors.shape

(1137, 250)

In [9]:
low_cardinality_cols = [cname for cname in candidate_train_predictors.columns if 
                                candidate_train_predictors[cname].nunique() < 10 and
                                candidate_train_predictors[cname].dtype == "object"]
numeric_cols = [cname for cname in candidate_train_predictors.columns if 
                                candidate_train_predictors[cname].dtype in ['int64', 'float64']]
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]

In [18]:
train_predictors.dtypes.tail(50)

FireplaceQu_Po           int64
FireplaceQu_TA           int64
GarageType_2Types        int64
GarageType_Attchd        int64
GarageType_Basment       int64
GarageType_BuiltIn       int64
GarageType_CarPort       int64
GarageType_Detchd        int64
GarageType_None          int64
GarageFinish_Fin         int64
GarageFinish_None        int64
GarageFinish_RFn         int64
GarageFinish_Unf         int64
GarageQual_Ex            int64
GarageQual_Fa            int64
GarageQual_Gd            int64
GarageQual_None          int64
GarageQual_Po            int64
GarageQual_TA            int64
GarageCond_Ex            int64
GarageCond_Fa            int64
GarageCond_Gd            int64
GarageCond_None          int64
GarageCond_Po            int64
GarageCond_TA            int64
PavedDrive_N             int64
PavedDrive_P             int64
PavedDrive_Y             int64
Fence_GdPrv              int64
Fence_GdWo               int64
Fence_MnPrv              int64
Fence_MnWw               int64
Fence_No

In [17]:
train_predictors.shape

(1137, 250)