### Melbourne House Prices Model
### Nichollas Tidow
### March 13, 2021

In [27]:
# import data set 
import pandas as pd # import pandas
melbourne_data = pd.read_csv(r"/Users/nicktidow/Downloads/melb_data.csv")

melbourne_data.describe() # summary statistics

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [7]:
melbourne_data.columns # check variables

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
melbourne_data= melbourne_data.dropna(axis=0) #remove NA rows, if axis=1 it would remove columns
melbourne_data.isnull().values.any() #check if there are any values 

False

In [8]:
y= melbourne_data.Price # set the prediction variable target

In [9]:
mlbn_features = ['Rooms','Postcode','Landsize',"Bathroom",'YearBuilt'] 
# select the features used to make the prediction

In [10]:
X = melbourne_data[mlbn_features] #set features equal to X

In [11]:
X.describe() #summary statistics of the features

Unnamed: 0,Rooms,Postcode,Landsize,Bathroom,YearBuilt
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,3101.947708,471.00694,1.57634,1964.081988
std,0.971079,86.421604,897.449881,0.711362,38.105673
min,1.0,3000.0,0.0,1.0,1196.0
25%,2.0,3044.0,152.0,1.0,1940.0
50%,3.0,3081.0,373.0,1.0,1970.0
75%,4.0,3147.0,628.0,2.0,2000.0
max,8.0,3977.0,37000.0,8.0,2018.0


###### Now comes the model

In [12]:
from sklearn.tree import DecisionTreeRegressor # import sklearn 
mlbn_model_sk = DecisionTreeRegressor(random_state=1)
mlbn_model_sk.fit(X,y)

DecisionTreeRegressor(random_state=1)

In [28]:
print("Predictions of house prices for the last 5 houses in the data set:")
print(X.tail())
print("")
print("")
print("The predictions are....")
print(mlbn_model_sk.predict(X.tail()))
#print(melbourne_data["Price"].tail())

Predictions of house prices for the last 5 houses in the data set:
       Rooms  Postcode  Landsize  Bathroom  YearBuilt
12205      3    3757.0     972.0       2.0     1996.0
12206      3    3016.0     179.0       1.0     1890.0
12207      1    3016.0       0.0       1.0     1967.0
12209      2    3181.0       0.0       1.0     2012.0
12212      6    3013.0    1087.0       3.0     1920.0


The predictions are....
[ 601000. 1050000.  385000.  560000. 2450000.]
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, dtype: float64


###### Model Validation

In [20]:
from sklearn.metrics import mean_absolute_error
# Error is actual value minus predicted value 
predicted_prices = mlbn_model_sk.predict(X)
mean_absolute_error(y, predicted_prices)

4494.467157510734

In [21]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
# Split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model
melbourne_model = DecisionTreeRegressor()
# Fit model
melbourne_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

296094.77008208056


In [23]:
###### We see that the mean absolute error using in-sample data is around $4,500
###### while the mean absolute error using out-of-sample data is around $296,100
melbourne_data["Price"].mean()
###### For reference the average actual price is $1,068,828 so the error is substantial 

1068828.202065849

###### Underfitting and Overfitting

In [24]:
# the more leaves the more we move from underfit to overfit
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [25]:
# compare MAE with different values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  381052
Max leaf nodes: 50  		 Mean Absolute Error:  299625
Max leaf nodes: 500  		 Mean Absolute Error:  280676
Max leaf nodes: 5000  		 Mean Absolute Error:  296849


##### Random Forests

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))
# with a MAE of $241,013 the random forest model has a smaller MAE than out-of-sample model and the leaf nodes model

241013.06606390633
