In [1]:
from pathlib import Path
import pandas as pd

# Read melb_data.csv (path is relative to the current working directory)
# into a DataFrame.
file_path = Path('melb_data.csv')
data = pd.read_csv(file_path)

# Last expression of the cell: display summary statistics for the frame.
data.describe()

Unnamed: 0.1,Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,18396.0,18396.0,18396.0,18395.0,18395.0,14927.0,14925.0,14820.0,13603.0,7762.0,8958.0,15064.0,15064.0,18395.0
mean,11826.787073,2.93504,1056697.0,10.389986,3107.140147,2.913043,1.538492,1.61552,558.116371,151.220219,1965.879996,-37.809849,144.996338,7517.975265
std,6800.710448,0.958202,641921.7,6.00905,95.000995,0.964641,0.689311,0.955916,3987.326586,519.188596,37.013261,0.081152,0.106375,4488.416599
min,1.0,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,5936.75,2.0,633000.0,6.3,3046.0,2.0,1.0,1.0,176.5,93.0,1950.0,-37.8581,144.931193,4294.0
50%,11820.5,3.0,880000.0,9.7,3085.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.803625,145.00092,6567.0
75%,17734.25,3.0,1302000.0,13.3,3149.0,3.0,2.0,2.0,651.0,174.0,2000.0,-37.75627,145.06,10331.0
max,23546.0,12.0,9000000.0,48.1,3978.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [2]:
# Drop every row that has a missing value in any column and rebind `data`
# to the filtered frame.
# NOTE(review): this also discards rows whose only gaps are in columns the
# models never use (e.g. BuildingArea and YearBuilt have far fewer non-null
# values than the selected features) — confirm that this is intended.
data = data.dropna(axis=0)
# Last expression of the cell: display the column labels of the filtered frame.
data.columns

Index(['Unnamed: 0', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea',
       'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [3]:
# Predictor columns fed to every model fitted in this notebook.
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

# Feature matrix (predictors only) and the prediction target (sale price column).
X = data[features]
y = data['Price']

In [4]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Hold out a validation split; the fixed seed makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Baseline model: a decision tree with no size limit, fitted on the training split.
dt_model = DecisionTreeRegressor(random_state=1)
dt_model.fit(train_X, train_y)

dt_val_predictions = dt_model.predict(val_X)

# scikit-learn's signature is mean_absolute_error(y_true, y_pred). MAE is
# symmetric, so the value is unchanged, but passing the arguments in the
# documented order keeps this call consistent with get_mae and stays correct
# if the metric is later swapped for an asymmetric one.
dt_mae = mean_absolute_error(val_y, dt_val_predictions)

print(dt_mae)

251688.7630729503


In [6]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a size-limited decision tree.

    A ``DecisionTreeRegressor`` capped at ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y`` with
    ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Leaf-count limits to compare, from a very small tree to a very large one.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000, 2500, 5000]

# Validation MAE for each candidate, in the same order as the candidates.
dt_mae_list = [
    get_mae(leaf_nodes, train_X, val_X, train_y, val_y)
    for leaf_nodes in candidate_max_leaf_nodes
]

# Pair each candidate with its score via zip instead of indexing by position.
# The loop names are deliberately distinct from `dt_mae`: a for-loop target is
# a notebook-global name, so reusing `dt_mae` here would silently overwrite the
# unconstrained tree's score computed earlier in the notebook.
for leaf_nodes, leaf_mae in zip(candidate_max_leaf_nodes, dt_mae_list):
    print(f'{leaf_nodes: >4} | {leaf_mae:>.1f}')

   5 | 369673.0
  25 | 283377.6
  50 | 266644.2
 100 | 256533.3
 250 | 240719.9
 500 | 243613.3
1000 | 244793.1
2500 | 254895.2
5000 | 256227.6


In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters, fitted on the same training
# split as the decision tree; the fixed seed makes the fit repeatable.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

rf_val_predictions = rf_model.predict(val_X)

# scikit-learn's signature is mean_absolute_error(y_true, y_pred). MAE is
# symmetric, so the value is unchanged, but passing the arguments in the
# documented order keeps this call consistent with get_mae.
rf_mae = mean_absolute_error(val_y, rf_val_predictions)

print(rf_mae)

190414.59149025998
