# Predict Housing Price 

Import Libraries

In [1]:
# !pip install nbimporter
import numpy as np
import pandas as pd
import nbimporter
import preprocessing_utils as pu

Importing Jupyter notebook from preprocessing_utils.ipynb


# Data Pre-Processing

Pull test data

In [5]:
training_dataset_csv = pd.read_csv('../data/housing_price/train.csv')
test_dataset_csv = pd.read_csv('../data/housing_price/test.csv')
training_dataset = training_dataset_csv
test_dataset = test_dataset_csv

Get List of missing valued Columns

In [7]:
missing_valued_columns = pu.get_missing_valued_columns_list(training_dataset)
print(missing_valued_columns)

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


Calculate number of missing data per column

In [8]:
missing_values_with_count = pu.get_missing_valued_column_details(training_dataset)
print(missing_values_with_count)

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


Removed items which missde more than 1000 rows of information

In [9]:
remove_columns =['Alley','PoolQC','Fence','MiscFeature','Id','Street',
                 'Utilities','LandSlope','Condition2','Heating','CentralAir',
                 'BsmtHalfBath','KitchenAbvGr','PavedDrive','MSSubClass',
                 'MSZoning','LotFrontage']

In [10]:
training_dataset = pu.remove_columns(training_dataset,remove_columns)
test_dataset = pu.remove_columns(test_dataset,remove_columns)

Update missing valued columns after removing

In [11]:
missing_valued_columns = pu.get_missing_valued_columns_list(training_dataset)
missing_valued_columns = missing_valued_columns.append(pu.get_missing_valued_columns_list(test_dataset))

Determine Categorical Data

In [12]:
categorical_columns = ['MSZoning','LotShape','LandContour','LotConfig',
                       'LandSlope','Neighborhood','Condition1','Condition2',
                       'BldgType','HouseStyle','OverallCond',
                       'OverallQual','RoofStyle','RoofMatl','Exterior1st',
                       'Exterior2nd','MasVnrType','ExterQual','ExterCond',
                       'Foundation','BsmtQual','BsmtCond','BsmtExposure',
                       'BsmtFinType1','BsmtFinType2','Heating','HeatingQC',
                       'CentralAir','Electrical','KitchenQual','Functional',
                       'FireplaceQu','FireplaceQu','GarageFinish','GarageQual',
                       'GarageCond','PavedDrive', 'SaleType','SaleCondition',
                       'GarageType']

In [13]:
categorical_columns = [x for x in categorical_columns if x not in remove_columns]

Factorise Categorical Data

In [14]:
training_dataset = pu.factorize_categorical_columns(training_dataset,categorical_columns)
test_dataset = pu.factorize_categorical_columns(test_dataset,categorical_columns)

Fill missing valued Columns

In [15]:
training_dataset = pu.fill_missing_values(training_dataset,categorical_columns, missing_valued_columns)
test_dataset = pu.fill_missing_values(test_dataset,categorical_columns, missing_valued_columns)

MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
BsmtFullBath
KitchenQual
Functional
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
SaleType
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
BsmtFullBath
KitchenQual
Functional
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
SaleType


Specify that these columns are categorical

In [16]:
training_dataset = pu.set_categorical_columns(training_dataset,categorical_columns)
test_dataset = pu.set_categorical_columns(test_dataset,categorical_columns)

Linear Regression with 75 variables

In [17]:
# Decision Tree seem to have overfitted as it has no mean square errors
#from sklearn.tree import DecisionTreeRegressor
#regressor = DecisionTreeRegressor()

In [18]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()

In [19]:
#from sklearn.preprocessing import PolynomialFeatures
#from sklearn.linear_model import LinearRegression

In [20]:
non_low_variance_columns = pu.get_low_variance_columns(training_dataset)

In [21]:
#poly = PolynomialFeatures(degree = 2)
#training_input = poly.fit_transform(training_dataset.iloc[:,:-1])
#testing_input = poly.fit_transform(test_dataset)

In [22]:
#regressor = LinearRegression()
#regressor.fit(training_input,training_dataset.iloc[:,-1])
#
#train_columns = list(training_dataset.columns.values)
#test_columns  = list(test_dataset.columns.values)
#
#low_variance_columns = [x for x in train_columns if x not in non_low_variance_columns]
#
#y_pred = regressor.predict(training_input)
#
#from sklearn.metrics import mean_squared_error
#mean_squared_error(training_dataset.iloc[:,-1],y_pred)
#
#y_pred[1:10]
#
#ids=test_dataset['Id'].tolist()
#
#y_test_pred = regressor.predict(test_dataset)
#
#val = list(zip(ids,y_test_pred.tolist()))
#
#dataset = pd.DataFrame(val,columns = ['Id','SalePrice']).to_csv('out.csv',index=False)

In [23]:

#y_test_pred = regressor.predict(testing_input)
#
#training_dataset.iloc[1:10,76]
#y_pred[1:10]
#
#ids=test_dataset['Id'].tolist()
#
#val = list(zip(ids,y_test_pred.tolist()))
#
#dataset = pd.DataFrame(val,columns = ['Id','SalePrice']).to_csv('out.csv',index=False)
#frames = [test_dataset['Id'], y_test_pred]
#
#result = pd.concat(frames)


For normal Regressions like Decision Tree and Random Forest 
(Non - Polynomial Features)

In [24]:
regressor.fit(training_dataset.iloc[:,:-1],training_dataset.iloc[:,-1])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [34]:

train_columns = list(training_dataset.columns.values)
test_columns  = list(test_dataset.columns.values)

missing_column = [x for x in train_columns if x not in test_columns]

y_pred = regressor.predict(training_dataset.iloc[:,:-1])

from sklearn.metrics import mean_squared_error
mean_squared_error(training_dataset.iloc[:,-1],y_pred)


147440035.1866027

In [26]:
y_test_pred = regressor.predict(test_dataset)

In [28]:
training_dataset.iloc[1:10,-1]
#y_pred[1:10]

1    181500
2    223500
3    140000
4    250000
5    143000
6    307000
7    200000
8    129900
9    118000
Name: SalePrice, dtype: int64

In [29]:
ids = test_dataset_csv['Id'].tolist()

In [30]:
val = list(zip(ids,y_test_pred.tolist()))

In [37]:
dataset = pd.DataFrame(val,columns = ['Id','SalePrice']).to_csv('../data/housing_price/out.csv',index=False)

In [32]:
#frames = [test_dataset['Id'], y_test_pred]
#
#result = pd.concat(frames)

Scores 2931st position