# Kaggle Home price prediction

#### In this version, we use only the numerical variables in the dataset

In [2]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6



## Load train and test data

In [3]:
# get home price train & test csv files as a DataFrame
train = pd.read_csv("../Data/train.csv")
test    = pd.read_csv("../Data/test.csv")
full = train.append(test, ignore_index=True)
print (train.shape, test.shape, full.shape)

((1460, 81), (1459, 80), (2919, 81))


In [4]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [25]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
train.columns

Index([u'Id', u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea',
       u'Street', u'Alley', u'LotShape', u'LandContour', u'Utilities',
       u'LotConfig', u'LandSlope', u'Neighborhood', u'Condition1',
       u'Condition2', u'BldgType', u'HouseStyle', u'OverallQual',
       u'OverallCond', u'YearBuilt', u'YearRemodAdd', u'RoofStyle',
       u'RoofMatl', u'Exterior1st', u'Exterior2nd', u'MasVnrType',
       u'MasVnrArea', u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual',
       u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1',
       u'BsmtFinType2', u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF',
       u'Heating', u'HeatingQC', u'CentralAir', u'Electrical', u'1stFlrSF',
       u'2ndFlrSF', u'LowQualFinSF', u'GrLivArea', u'BsmtFullBath',
       u'BsmtHalfBath', u'FullBath', u'HalfBath', u'BedroomAbvGr',
       u'KitchenAbvGr', u'KitchenQual', u'TotRmsAbvGrd', u'Functional',
       u'Fireplaces', u'FireplaceQu', u'GarageType', u'GarageYrBlt',
       u'GarageFinish',

## Selecting only numerical variables based on the test description file

In [43]:
col = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 
       'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
      ]

In [50]:
full_numerical = full[col]
full_numerical.shape

(2919, 36)

In [51]:
full_numerical.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,548.0,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,460.0,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,608.0,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,642.0,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,836.0,192,84,0,0,0,0,0,12,2008


In [64]:
train_numerical = train[col]
train_numerical.shape

(1460, 36)

In [65]:
test_numerical = test[col]
test_numerical.shape

(1459, 36)

In [66]:
test_numerical.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,506.0,0,82,0,0,144,0,0,1,2010


## Missing Data

In [67]:
#missing data in train data
total = train_numerical.isnull().sum().sort_values(ascending=False)
percent = (train_numerical.isnull().sum()/train_numerical.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(36)

Unnamed: 0,Total,Percent
LotFrontage,259,0.177397
GarageYrBlt,81,0.055479
MasVnrArea,8,0.005479
YrSold,0,0.0
BsmtFinSF2,0,0.0
LowQualFinSF,0,0.0
2ndFlrSF,0,0.0
1stFlrSF,0,0.0
TotalBsmtSF,0,0.0
BsmtUnfSF,0,0.0


We remove the following variables ('LotFrontage', 'GarageYrBlt' and 'MasVnrArea')

In [68]:
#missing data in test data
total = test_numerical.isnull().sum().sort_values(ascending=False)
percent = (test_numerical.isnull().sum()/test_numerical.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(36)

Unnamed: 0,Total,Percent
LotFrontage,227,0.155586
GarageYrBlt,78,0.053461
MasVnrArea,15,0.010281
BsmtHalfBath,2,0.001371
BsmtFullBath,2,0.001371
GarageArea,1,0.000685
BsmtFinSF1,1,0.000685
BsmtFinSF2,1,0.000685
BsmtUnfSF,1,0.000685
TotalBsmtSF,1,0.000685


Same observation as for the train data. We remove the first 3 variables and fill the missing ones with -1.

In [69]:
col_updated = ['MSSubClass', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 
       'GarageCars', 'GarageArea', 'WoodDeckSF', 
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 
       'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
      ]

In [70]:
train_numerical = train[col_updated]
train_numerical.shape

(1460, 33)

In [71]:
test_numerical = test[col_updated]
test_numerical.shape

(1459, 33)

In [72]:
test_numerical.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,20,11622,5,6,1961,1961,468.0,144.0,270.0,882.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,20,14267,6,6,1958,1958,923.0,0.0,406.0,1329.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,60,13830,5,5,1997,1998,791.0,0.0,137.0,928.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,60,9978,6,6,1998,1998,602.0,0.0,324.0,926.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,120,5005,8,5,1992,1992,263.0,0.0,1017.0,1280.0,...,506.0,0,82,0,0,144,0,0,1,2010


In [90]:
#missing data in train data
total = train_numerical.isnull().sum().sort_values(ascending=False)
percent = (train_numerical.isnull().sum()/train_numerical.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(36)

Unnamed: 0,Total,Percent
YrSold,0,0.0
BsmtHalfBath,0,0.0
LotArea,0,0.0
OverallQual,0,0.0
OverallCond,0,0.0
YearBuilt,0,0.0
YearRemodAdd,0,0.0
BsmtFinSF1,0,0.0
BsmtFinSF2,0,0.0
BsmtUnfSF,0,0.0


In [88]:
test_numerical.BsmtHalfBath.fillna(-1, inplace=True);
test_numerical.BsmtFullBath.fillna(-1, inplace=True);
test_numerical.GarageArea.fillna(-1, inplace=True);
test_numerical.BsmtFinSF1.fillna(-1, inplace=True);
test_numerical.BsmtFinSF2.fillna(-1, inplace=True);
test_numerical.BsmtUnfSF.fillna(-1, inplace=True);
test_numerical.TotalBsmtSF.fillna(-1, inplace=True);
test_numerical.GarageCars.fillna(-1, inplace=True);

Make sure Test data has no more missing data

In [89]:
#missing data in test data
total = test_numerical.isnull().sum().sort_values(ascending=False)
percent = (test_numerical.isnull().sum()/test_numerical.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(33)

Unnamed: 0,Total,Percent
YrSold,0,0.0
BsmtHalfBath,0,0.0
LotArea,0,0.0
OverallQual,0,0.0
OverallCond,0,0.0
YearBuilt,0,0.0
YearRemodAdd,0,0.0
BsmtFinSF1,0,0.0
BsmtFinSF2,0,0.0
BsmtUnfSF,0,0.0


## Train, validation split

In [91]:
train_numerical_y = train.SalePrice

In [97]:
train_x, valid_x, train_y, valid_y = train_test_split(train_numerical, 
                                                      train_numerical_y,
                                                     train_size=0.7)
train_x.shape, valid_x.shape, train_y.shape, valid_y.shape, test_numerical.shape

((1021, 33), (439, 33), (1021,), (439,), (1459, 33))

## Modelling

In [139]:
#model = RandomForestClassifier(n_estimators=100)
#model = SVC()
model = GradientBoostingClassifier()
#model = KNeighborsClassifier(n_neighbors = 3)
#model = GaussianNB()
#model = LogisticRegression()

In [None]:
model.fit(train_x, train_y)

In [137]:
# Score the model
print (model.score(train_x, train_y), model.score(valid_x, valid_y))

(0.73653281096963763, 0.0045558086560364463)


## Submission

In [138]:
test_y = model.predict(test_numerical)
test_id = test.Id
test_submit = pd.DataFrame({'Id': test_id, 'SalePrice': test_y})
test_submit.shape
test_submit.head()
test_submit.to_csv('house_price_pred_LogisticReg.csv', index=False)