# Iowa Housing Group Project
## STAD 2019

###### First, we handle all imports. We are primarily using pandas and numpy for our data management and manipulation; matplotlib and seaborn for visualization; sklearn and scipy for the ML models and performance metrics.

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

###### We read the "train.csv" file into a pandas DataFrame and print its columns for reference.

In [21]:
# Load the training data from "train.csv" (path is relative to the working directory).
ames_train_data = pd.read_csv("train.csv")

# Display the column labels for reference.
ames_train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

###### Data Preprocessing, Step 1: drop "subjective" categorical columns - OverallQual, OverallCond, ExterQual, ExterCond, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond, PoolQC, Fence, LotShape, LandContour, LandSlope. The Id column is dropped in the same step.

In [22]:
# Remove the Id column together with the "subjective" rating/description columns.
ames_train_data.drop(
    columns=[
        'Id',
        'OverallQual',
        'OverallCond',
        'ExterQual',
        'ExterCond',
        'BsmtCond',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'HeatingQC',
        'KitchenQual',
        'Functional',
        'FireplaceQu',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'PoolQC',
        'Fence',
        'LotShape',
        'LandContour',
        'LandSlope',
    ],
    inplace=True,
)



###### Here, we check for missing data and output the top 10 results. Because the tenth entry, YearRemodAdd, has zero missing values, only 9 columns contain missing data.

In [23]:
# Count missing values per column and show the ten columns with the most.
(
    ames_train_data
    .isnull()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

MiscFeature     1406
Alley           1369
LotFrontage      259
GarageType        81
GarageYrBlt       81
BsmtQual          37
MasVnrType         8
MasVnrArea         8
Electrical         1
YearRemodAdd       0
dtype: int64

In [24]:
# Data Preprocessing - Step 2: Remove Columns With High Percentage of Missing Data
# These are the three columns with the largest missing-value counts in the summary above.
ames_train_data.drop(
    columns=['MiscFeature', 'Alley', 'LotFrontage'],
    inplace=True,
)


In [25]:
# Data Preprocessing - Step 3: Remove Rows with Missing Values

# Labels of the numeric columns; reused later when computing per-feature correlations.
ames_train_data_numerical = ames_train_data.select_dtypes([np.number]).columns

# A single dropna() removes every row that has a missing value in any column.
# This yields the same rows as dropping column by column, without re-scanning
# the frame once per column.
ames_train_data.dropna(axis=0, how="any", inplace=True)

In [26]:
#Show numerical data using dot plot:
#for col in ames_train_data_numerical:
    #data = pd.concat([ames_train_data['SalePrice'], ames_train_data[col]], axis=1)
    #data.plot.scatter(x=col, y='SalePrice', ylim=(0,800000))

In [27]:
# Feature Selection - Calculate and Display R^2 For Each Numerical Feature

def takeCoef(elem):
    """Sort key: return the item at index 1 of ``elem`` (the coefficient in a (name, coefficient) pair)."""
    coefficient = elem[1]
    return coefficient

# Pair every numeric column with its squared Pearson correlation against SalePrice.
# SalePrice is itself one of the numeric columns, so it appears with a value of 1.0.
correlations = [
    (
        column,
        np.corrcoef(ames_train_data[column], ames_train_data['SalePrice'])[0][1] ** 2,
    )
    for column in ames_train_data_numerical
]

# Strongest relationships first.
correlations_sorted = sorted(correlations, key=takeCoef, reverse=True)

print(correlations_sorted)
print(len(correlations_sorted))

[('SalePrice', 1.0), ('GrLivArea', 0.5061234892664401), ('GarageCars', 0.41045995580385386), ('GarageArea', 0.3697971952517224), ('1stFlrSF', 0.36604622513305035), ('TotalBsmtSF', 0.36046692430329647), ('FullBath', 0.32431695289363444), ('TotRmsAbvGrd', 0.304428143848862), ('YearBuilt', 0.2546428006318737), ('YearRemodAdd', 0.2517535513883513), ('GarageYrBlt', 0.2323990346909284), ('MasVnrArea', 0.21743267612019038), ('Fireplaces', 0.19857059159735066), ('BsmtFinSF1', 0.12983406018455657), ('OpenPorchSF', 0.10358038510184744), ('2ndFlrSF', 0.09626961040583962), ('WoodDeckSF', 0.09304158007263354), ('HalfBath', 0.0661312648659248), ('LotArea', 0.06482894599077604), ('BsmtFullBath', 0.044260368432504534), ('BsmtUnfSF', 0.037373840881455976), ('BedroomAbvGr', 0.028674366426636334), ('EnclosedPorch', 0.01629236966606662), ('KitchenAbvGr', 0.012434271116146959), ('ScreenPorch', 0.009271286440882319), ('PoolArea', 0.008419637104922158), ('MSSubClass', 0.006453091125641996), ('MoSold', 0.0017

In [28]:
#Show categorical data using boxplot:
#ames_train_data_categorical = ames_train_data.select_dtypes(include=['object']).columns
#for i in ames_train_data_categorical:
 #   ax = sns.boxplot(x=i, y="SalePrice",data=ames_train_data)
 #   plt.show()


In [29]:
# Preview Remaining Columns:

print(ames_train_data.shape)

# NOTE(review): this is a plain alias of ames_train_data (no copy, no dtype filtering),
# despite the "numerical_only" name.
ames_numerical_only = ames_train_data

# Print one column label per line.
for column_label in ames_train_data.columns:
    print(column_label)

(1340, 57)
MSSubClass
MSZoning
LotArea
Street
Utilities
LotConfig
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
Foundation
BsmtQual
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
TotRmsAbvGrd
Fireplaces
GarageType
GarageYrBlt
GarageCars
GarageArea
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
SaleType
SaleCondition
SalePrice


In [30]:
# Convert numerical categorical data to strings:
# The list of such columns is empty and this cell performs no conversion.

numerical_categorical = list()

In [31]:
# Encode non-numerical and numerical categorical data:

# One-hot encode the frame; the result replaces ames_train_data.
ames_train_data = pd.get_dummies(ames_train_data)

# List any columns that still have object dtype after encoding, then drop them
# all in one call.
ames_train_data_categorical = ames_train_data.select_dtypes(include=['object']).columns
print(ames_train_data_categorical)

ames_train_data.drop(columns=ames_train_data_categorical, inplace=True)

print(ames_train_data.shape)

Index([], dtype='object')
(1340, 195)


In [32]:
# Define and remove Y from data
# The target is captured first, then its column is removed from the feature frame.
Y = ames_train_data['SalePrice']
ames_train_data.drop(columns='SalePrice', inplace=True)

# Define features
# NOTE(review): this shortlist is not referenced by any later cell visible here.
features = ['GrLivArea', 'GarageCars']

# test all features
# x is the same object as ames_train_data, i.e. every remaining column is used.
x = ames_train_data

In [33]:
# Hold out 20% of train.csv for validation; the remaining 80% is used for training.
# random_state is fixed so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    Y,
    test_size=0.2,
    random_state=0,
)

print("Train set has {}".format(len(x_train)))
print("Validation set has {}".format(len(x_val)))

Train set has 1072
Validation set has 268


In [34]:
# Define and fit model

# Three candidates, all seeded with random_state=0:
#   house_model  - decision tree with the default split criterion
#   house_model2 - random forest
#   house_model3 - decision tree with the "mae" split criterion
# NOTE(review): newer scikit-learn releases appear to name this criterion
# "absolute_error" — confirm against the installed version.
house_model = DecisionTreeRegressor(random_state=0)
house_model2 = RandomForestRegressor(random_state=0)
house_model3 = DecisionTreeRegressor(criterion="mae", random_state=0)

for estimator in (house_model, house_model2):
    estimator.fit(x_train, y_train)

# Kept as the final expression so the fitted estimator is displayed as the cell output.
house_model3.fit(x_train, y_train)



DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [35]:
# Make predictions

# Score the validation features with each fitted model, in the order they were defined.
predictions, predictions2, predictions3 = (
    fitted_model.predict(x_val)
    for fitted_model in (house_model, house_model2, house_model3)
)

In [36]:

# Report the validation mean absolute error of each model.
# mean_absolute_error is documented as (y_true, y_pred), so the true values go first.
# MAE is symmetric in its two arguments, so the reported numbers do not change.
for label, model_predictions in (
    ("Decision Tree MAE: ", predictions),
    ("Random Forest MAE: ", predictions2),
    ("Decision Tree Improved? MAE: ", predictions3),
):
    print(label, end='')
    print(mean_absolute_error(y_val, model_predictions))

Decision Tree MAE: 23192.20895522388
Random Forest MAE: 18686.317537313433
Decision Tree Improved? MAE: 28354.589552238805


In [19]:
# Ridge regression baseline (Ridge is imported in the notebook's import cell).
# NOTE(review): the `normalize` argument appears to be deprecated/removed in newer
# scikit-learn releases — confirm against the installed version.
ridge1 = Ridge(alpha=0.05, normalize=True)
ridge1.fit(x_train, y_train)
predR = ridge1.predict(x_val)

# mean_absolute_error is documented as (y_true, y_pred); the value is unchanged by the swap.
print("MAE:", mean_absolute_error(y_val, predR))

MAE: 18822.6690894025


In [18]:
# Lasso regression baseline (Lasso is imported in the notebook's import cell).
# NOTE(review): the `normalize` argument appears to be deprecated/removed in newer
# scikit-learn releases — confirm against the installed version.
lasso1 = Lasso(alpha=0.05, normalize=True)
lasso1.fit(x_train, y_train)
predL = lasso1.predict(x_val)

# mean_absolute_error is documented as (y_true, y_pred); the value is unchanged by the swap.
print("MAE:", mean_absolute_error(y_val, predL))

MAE: 19516.63167162993


