# Predicting House Prices in Melbourne

In [23]:
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Location of the Melbourne housing CSV. The default is the original author's
# local path; set the MELBOURNE_DATA_PATH environment variable to point the
# notebook at the file on another machine without editing this cell.
melbourne_file_path = os.environ.get(
    "MELBOURNE_DATA_PATH",
    r"C:\Users\Duda\Desktop\melb_data.csv",
)

In [5]:
# Load the raw CSV into a DataFrame.
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

In [6]:
# Display the column labels of the loaded frame to see which fields are available.
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [7]:
# Keep only rows with no missing values, then discard rows whose Landsize is 0.
melbourne_data = (
    melbourne_data
    .dropna(axis=0)
    .loc[lambda frame: frame['Landsize'].ne(0)]
)

In [8]:
# Prediction target: the Price column.
y = melbourne_data['Price']

In [9]:
# Predictor columns; the spellings 'Lattitude' and 'Longtitude' match the
# column labels of the dataset exactly.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [10]:
# Feature matrix: every row, restricted to the selected predictor columns.
X = melbourne_data.loc[:, melbourne_features]

In [11]:
# Split features and target into training and validation parts; the fixed
# random_state makes the split repeatable across runs.
(train_X, val_X,
 train_y, val_y) = train_test_split(X, y, random_state=1)

In [12]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [13]:
# Baseline decision tree with no size limit specified; only the seed is fixed.
melbourne_model = DecisionTreeRegressor(
    random_state=1,
)

In [14]:
# Fit the baseline tree on the training split only.
melbourne_model.fit(X=train_X, y=train_y)

DecisionTreeRegressor(random_state=1)

In [15]:
# Show the true validation prices, then the model's predictions for the same rows.
print(
    "Making predictions for the following exemples:",
    val_y,
    "The predictions are",
    sep="\n",
)
val_predictions = melbourne_model.predict(val_X)
print(val_predictions)

Making predictions for the following exemples:
696       875000.0
8229      520000.0
4153     1090000.0
6096      470000.0
1490     1000000.0
           ...    
3064     2600000.0
5220      885000.0
6413      880000.0
10109    1082000.0
1709      860000.0
Name: Price, Length: 1296, dtype: float64
The predictions are
[ 870000.  550055.  800000. ...  650000. 1306000. 2000000.]


In [16]:
# Mean absolute error of the baseline tree on the validation split.
# Arguments follow scikit-learn's documented (y_true, y_pred) order, matching
# the call inside get_mae; the value is unchanged because MAE is symmetric.
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 276,576


In [17]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation mean absolute error of a size-limited decision tree.

    A fresh ``DecisionTreeRegressor`` is fitted on the training data and
    scored on the validation data.

    Parameters
    ----------
    max_leaf_nodes : int
        Value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y
        Features and target used to fit the tree.
    val_X, val_y
        Features and target used to score the fitted tree.
    random_state : int, optional
        Seed passed to the regressor. Defaults to 0, the value that was
        previously hard-coded, so existing calls give the same result.

    Returns
    -------
    The value returned by ``mean_absolute_error(val_y, predictions)``.
    """
    model = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

In [18]:
# Compare validation error across several leaf-node limits.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(
        "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
        % (max_leaf_nodes, my_mae)
    )

Max leaf nodes: 5  		 Mean Absolute Error:  408589
Max leaf nodes: 50  		 Mean Absolute Error:  297506
Max leaf nodes: 500  		 Mean Absolute Error:  266460
Max leaf nodes: 5000  		 Mean Absolute Error:  282475


In [19]:
# Choose the leaf-node limit with the lowest validation MAE instead of copying
# the number by hand, so the choice follows the data and the split.
# With the sweep results recorded in this notebook this evaluates to 500.
candidate_max_leaf_nodes = [5, 50, 500, 5000]
mae_by_tree_size = {
    n_leaves: get_mae(n_leaves, train_X, val_X, train_y, val_y)
    for n_leaves in candidate_max_leaf_nodes
}
best_tree_size = min(mae_by_tree_size, key=mae_by_tree_size.get)

In [20]:
# Final estimator, limited to the selected number of leaf nodes.
final_model = DecisionTreeRegressor(
    random_state=1,
    max_leaf_nodes=best_tree_size,
)

In [21]:
# Fit the final model on the full X and y rather than only the training split.
final_model.fit(X=X, y=y)

DecisionTreeRegressor(max_leaf_nodes=500, random_state=1)

In [24]:
# Persist the fitted model to a joblib file in the current working directory.
dump(value=final_model, filename="final_model_melbourne.joblib")

['final_model_melbourne.joblib']