In [7]:
# We will Import the libraries we need
# pandas is a useful helper lib
import pandas as pd
# from sklearn we need the method to split test data from training data
from sklearn.model_selection import train_test_split
# we want to use the GradientBoostingRegressor ensemble method
from sklearn.ensemble import GradientBoostingRegressor
# metrics will help us to calculate the prediction error
from sklearn.metrics import mean_absolute_error

In [8]:
# Lets import the data
full_dataset = pd.read_csv("./Melbourne_housing_FULL.csv")

In [9]:
# Data Scrubbing
# let's remove not needed colums
# and delete all uncomplete rows
# then we will use One-hot encoding
# and define the price column as the value to predict

del full_dataset['Address']
del full_dataset['Method']
del full_dataset['SellerG']
del full_dataset['Date']
del full_dataset['Postcode']
del full_dataset['Lattitude']
del full_dataset['Longtitude']
del full_dataset['Regionname']
del full_dataset['Propertycount']

# Remove missing values, see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html for details
full_dataset.dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True)

# Convert categorical variable into indicator variables https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
full_dataset = pd.get_dummies(full_dataset, columns = ['Suburb', 'CouncilArea', 'Type'])

# as input we use everything apart from price
input_vars = full_dataset.drop('Price', axis = 1)
# aus output variable we will use only price
output_vars = full_dataset['Price']
full_dataset

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,1,0,1,0,0
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,...,0,0,0,0,0,1,0,1,0,0
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,0,...,0,0,0,1,0,0,0,1,0,0
34849,3,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,0,...,0,0,0,1,0,0,0,1,0,0
34853,2,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,0,...,0,0,0,0,0,0,0,1,0,0
34854,2,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Define a test and training set, as split we will use 70/30% and we will shuffle the data before splitting it
input_vars_train, input_vars_test, output_vars_train, output_vars_test = train_test_split(input_vars,output_vars, test_size = 0.3, shuffle = True)

In [11]:
# Define an algorithm/model
# We will use https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
model = GradientBoostingRegressor(
  n_estimators = 250,
  max_depth = 8,
  min_samples_split = 4,
  min_samples_leaf = 6,
  max_features = 0.6
)

In [12]:
# Train the model
model.fit(input_vars_train, output_vars_train)

GradientBoostingRegressor(max_depth=8, max_features=0.6, min_samples_leaf=6,
                          min_samples_split=4, n_estimators=250)

In [13]:
# Calculate the quality of results
model_prediction_training_data = model.predict(input_vars_train)
model_prediction_test_data = model.predict(input_vars_test)
error_training_data = mean_absolute_error(output_vars_train, model_prediction_training_data)
error_test_data = mean_absolute_error(output_vars_test, model_prediction_test_data)

print("mean error training data vs test data %.2f %.2f" % (error_training_data,error_test_data))
print("%.2f" % error_training_data)
print("%.2f" % error_test_data)

mean error training data vs test data 90968.76 157144.21
90968.76
157144.21


In [14]:
# let's say we are getting new values, then we could use forecast the price.
# let's for example assume the line 3 of our dataset would be new and let's see
# what our model predicts
sample = full_dataset.iloc[[2]]
del sample['Price']
samplePrediction = model.predict(sample)
print("%.2f" % samplePrediction)

1587509.42


In [15]:
full_dataset.iloc[[2]]

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,1,0,1,0,0
