In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import zscore, stats
import datetime

In [2]:
# Load the raw house-price records from the CSV in the working directory
DATA_PATH = "./SydneyHousePrices.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to confirm the file loaded and to see which columns it provides
df.head()

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house


In [4]:
# Summary statistics of the numeric columns.
# NOTE(review): in the recorded output the counts for bed and car are lower than the row
# count (missing values), sellPrice spans 1 to ~2.1e9 and bed/bath peak at 99 -- these
# extremes look like placeholder values rather than real listings; confirm before modelling.
df.describe()

Unnamed: 0,Id,postalCode,sellPrice,bed,bath,car
count,199504.0,199504.0,199504.0,199350.0,199504.0,181353.0
mean,99752.5,2196.379155,1269776.0,3.516479,1.890669,1.936224
std,57591.98839,193.053467,6948239.0,1.066555,0.926001,1.060237
min,1.0,2000.0,1.0,1.0,1.0,1.0
25%,49876.75,2082.0,720000.0,3.0,1.0,1.0
50%,99752.5,2144.0,985000.0,3.0,2.0,2.0
75%,149628.25,2211.0,1475000.0,4.0,2.0,2.0
max,199504.0,4878.0,2147484000.0,99.0,99.0,41.0


In [5]:
# Filter out the columns that are not used as model features.
# drop() returns an independent frame, whereas a boolean-mask .loc slice can leave the
# result flagged as a copy and make the later `df[col] = ...` assignments raise
# SettingWithCopyWarning. errors="ignore" keeps the cell re-runnable once the columns
# have already been removed.
df = df.drop(columns=["Id", "Date", "postalCode"], errors="ignore")
df

Unnamed: 0,suburb,sellPrice,bed,bath,car,propType
0,Avalon Beach,1210000,4.0,2,2.0,house
1,Avalon Beach,2250000,4.0,3,4.0,house
2,Whale Beach,2920000,3.0,3,2.0,house
3,Avalon Beach,1530000,3.0,1,2.0,house
4,Whale Beach,8000000,5.0,4,4.0,house
...,...,...,...,...,...,...
199499,Illawong,1900000,5.0,3,7.0,house
199500,Illawong,980000,4.0,3,2.0,house
199501,Alfords Point,850000,4.0,2,2.0,house
199502,Illawong,640000,3.0,2,2.0,townhouse


In [30]:
# Check if column is date and parse 
# df['Date'] = pd.to_datetime(df['Date'])

In [6]:
# Helper that selects the columns whose null values get replaced by the loop below

def containsOneOrTwo(colName, frame=None):
    """Report whether a column holds the value 1 or the value 2.

    Parameters
    ----------
    colName : str
        Label of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame that owns the column. When omitted, the notebook-level ``df``
        is inspected, which is what existing single-argument calls rely on.

    Returns
    -------
    bool
        True when 1 or 2 is among the column's distinct values, else False.
    """
    if frame is None:
        frame = df
    # Collect the distinct values once instead of rebuilding the set per comparison.
    distinct = set(frame[colName].unique())
    return 1 in distinct or 2 in distinct

# Walk the columns in order and zero-fill the gaps of each one that
# containsOneOrTwo() flags; every other column keeps its missing values.
columnNames = df.columns
for col in filter(containsOneOrTwo, columnNames):
    df[col] = df[col].fillna(0)


In [None]:
# remove outliers
# z_scores = stats.zscore(df)

# print(z_scores)
# abs_z_scores = np.abs(z_scores)
# filtered_entries = (abs_z_scores < 3).all(axis=1)
# print(filtered_entries)
# new_df = df[filtered_entries]
# new_df

In [7]:
# One-hot encode every column that is neither numeric nor a datetime.
# select_dtypes() replaces the hand-written dtype comparison: it recognises every numeric
# width (not only int64/float64) and accepts pandas extension dtypes (category, string,
# tz-aware datetime), which np.issubdtype() cannot interpret and rejects with a TypeError.
cols = df.select_dtypes(exclude=["number", "datetime", "datetimetz"]).columns.tolist()

# Pin the indicator dtype so the dummy columns are 0/1 integers, as in the recorded output
# below; recent pandas releases would otherwise emit boolean columns.
newDf = pd.get_dummies(data=df, columns=cols, dtype="uint8")

# get_dummies() already replaces the encoded source columns, so the frame is shown as is.
newDf


Unnamed: 0,sellPrice,bed,bath,car,suburb_Abbotsbury,suburb_Abbotsford,suburb_Agnes Banks,suburb_Airds,suburb_Alexandria,suburb_Alfords Point,...,suburb_Yowie Bay,suburb_Zetland,propType_acreage,propType_duplex/semi-detached,propType_house,propType_other,propType_terrace,propType_townhouse,propType_villa,propType_warehouse
0,1210000,4.0,2,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2250000,4.0,3,4.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2920000,3.0,3,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1530000,3.0,1,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,8000000,5.0,4,4.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199499,1900000,5.0,3,7.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
199500,980000,4.0,3,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
199501,850000,4.0,2,2.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
199502,640000,3.0,2,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
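# Remove outliers (disabled): in the recorded output below the mask is False for every
# row and the filtered frame comes back empty. Presumably the one-hot columns cause this
# (a 1 in a rarely-set dummy column lies far from that column's mean) -- TODO confirm,
# and restrict the z-score to the numeric columns before re-enabling.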
# z_scores = stats.zscore(newDf)

# print(z_scores)
# abs_z_scores = np.abs(z_scores)
# filtered_entries = (abs_z_scores < 3).all(axis=1)
# print(filtered_entries)
# new_df = df[filtered_entries]
# new_df

[[-0.46297733 -0.00860311  0.45416954 ... -0.28694489 -0.1715123
  -0.01416112]
 [-0.46297733  0.14107548  0.45416954 ... -0.28694489 -0.1715123
  -0.01416112]
 [-0.46297733  0.23750302 -0.47988237 ... -0.28694489 -0.1715123
  -0.01416112]
 ...
 [ 0.19487316 -0.06041492  0.45416954 ... -0.28694489 -0.1715123
  -0.01416112]
 [ 0.19487316 -0.09063848 -0.47988237 ...  3.48498979 -0.1715123
  -0.01416112]
 [ 0.19487316  0.0491095   1.38822145 ... -0.28694489 -0.1715123
  -0.01416112]]
[False False False ... False False False]


Unnamed: 0,suburb,postalCode,sellPrice,bed,bath,car,propType


In [None]:
# df=pd.DataFrame({'data':['bed', 'bath']})
# df['z_score']=stats.zscore(df['data'])

In [None]:
# Identify the predictive parameters


In [8]:
# Split the encoded frame: 65% of the rows for training, 35% held out for testing
import sklearn.model_selection as model_selection

# The sale price is the target; every remaining column is a feature.
y = newDf['sellPrice']
X = newDf.loc[:, newDf.columns != 'sellPrice']

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.65,
    test_size=0.35,
    random_state=101,  # fixed seed so the split is repeatable
)

# Print each partition with its label.
for label, part in (
    ("X_train: ", X_train),
    ("y_train: ", y_train),
    ("X_test: ", X_test),
    ("y_test: ", y_test),
):
    print(label, part)

X_train:          bed  bath  car  suburb_Abbotsbury  suburb_Abbotsford  \
85660   3.0     1  1.0                  0                  0   
163168  4.0     2  1.0                  0                  0   
31484   4.0     2  2.0                  0                  0   
147486  4.0     3  4.0                  0                  0   
197965  4.0     2  2.0                  0                  0   
...     ...   ...  ...                ...                ...   
151535  3.0     2  2.0                  0                  0   
55293   4.0     1  1.0                  0                  0   
49751   3.0     1  2.0                  0                  0   
136767  4.0     2  2.0                  0                  0   
176991  4.0     1  2.0                  0                  0   

        suburb_Agnes Banks  suburb_Airds  suburb_Alexandria  \
85660                    0             0                  0   
163168                   0             0                  0   
31484                    0      

In [10]:
# Wrap the train and test partitions in XGBoost DMatrix containers
import xgboost as xgb

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [24]:
# Booster settings: depth-3 trees, squared-error objective, MAE reported for each round
param = {
    'max_depth': 3,
    'eta': 1,
    'objective': 'reg:squarederror',
    'nthread': 6,
    'eval_metric': 'mae',
}

# Both partitions are scored after every boosting round (see the eval/train log below).
evallist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 100

# Fit the booster, then write it to disk so it can be loaded again later.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)
bst.save_model('house-prices.model')


[0]	eval-mae:579245.25000	train-mae:502918.62500
[1]	eval-mae:558143.12500	train-mae:482134.28125
[2]	eval-mae:550905.75000	train-mae:474206.65625
[3]	eval-mae:546223.75000	train-mae:469493.46875
[4]	eval-mae:541955.37500	train-mae:465376.84375
[5]	eval-mae:534723.75000	train-mae:457960.62500
[6]	eval-mae:529641.25000	train-mae:452930.90625
[7]	eval-mae:527886.25000	train-mae:451171.78125
[8]	eval-mae:522780.81250	train-mae:446596.40625
[9]	eval-mae:520151.46875	train-mae:443376.71875
[10]	eval-mae:516573.06250	train-mae:440160.31250
[11]	eval-mae:514067.50000	train-mae:437724.90625
[12]	eval-mae:511264.28125	train-mae:434442.84375
[13]	eval-mae:508395.03125	train-mae:431631.75000
[14]	eval-mae:506319.50000	train-mae:429458.40625
[15]	eval-mae:503186.62500	train-mae:426478.96875
[16]	eval-mae:501337.34375	train-mae:424371.75000
[17]	eval-mae:498560.81250	train-mae:421553.90625
[18]	eval-mae:496878.87500	train-mae:419581.43750
[19]	eval-mae:494972.87500	train-mae:417539.09375
[20]	eval-

In [None]:
# bst = xgb.Booster({'nthread': 4})  # init model
# bst.load_model('house-prices.model')  # load data
