In [110]:
import pandas as pd
import numpy as np
from collections.abc import Callable
from typing import TypeVar
# Generic placeholder used in annotations for the object a model factory returns.
T = TypeVar("T")
# Relative directory prefix that is prepended to CSV file names when loading data.
data_file_path = "../datasets/houses_melb/datasets/"

In [90]:
# Read the Melbourne housing CSV and keep only the rows without missing values.
melb_data = pd.read_csv(data_file_path + "melb_data.csv").dropna(axis=0)

# Preview the first rows, then the available column labels.
display(melb_data.head(), melb_data.columns)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.0,1890.0,Yarra,-37.806,144.9954,Northern Metropolitan,4019.0


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [91]:
# Columns used as model inputs; the sale price is the prediction target.
melb_features = ["Rooms", "Bathroom", "Landsize", "Lattitude", "Longtitude"]
y = melb_data["Price"]
X = melb_data[melb_features]

In [92]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold out part of the data for validation; random_state=1 keeps the split
# reproducible. train_test_split returns each piece with the same type as its
# input, so the feature splits are DataFrames while the target splits are
# Series (y is a single column of melb_data).
train_X: pd.DataFrame
val_X: pd.DataFrame
train_y: pd.Series
val_y: pd.Series
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)


In [None]:
def test_model_mae(model_type: Callable,
                   test_max_leaf_nodes: list[int],
                   train_X: pd.DataFrame,
                   train_y: pd.Series,
                   val_X: pd.DataFrame,
                   val_y: pd.Series) -> list[float]:
    """Return the validation MAE for each candidate ``max_leaf_nodes`` value.

    For every entry of ``test_max_leaf_nodes`` a fresh model is built with
    ``model_type(max_leaf_nodes=..., random_state=0)``, fitted on the training
    split and scored on the validation split.

    Args:
        model_type: Factory (typically an estimator class) accepting the
            keyword arguments ``max_leaf_nodes`` and ``random_state`` and
            returning an object with ``fit`` and ``predict`` methods.
        test_max_leaf_nodes: Candidate leaf-node limits to evaluate.
        train_X: Training features.
        train_y: Training targets.
        val_X: Validation features.
        val_y: Validation targets.

    Returns:
        One mean-absolute-error value per candidate, in the same order as
        ``test_max_leaf_nodes``. An empty candidate list yields an empty list.
    """
    test_mae: list[float] = []
    for max_leaf_nodes in test_max_leaf_nodes:
        # random_state is pinned so repeated runs score identical models.
        model = model_type(max_leaf_nodes=max_leaf_nodes, random_state=0)
        model.fit(train_X, train_y)

        val_predictions = model.predict(val_X)
        # mean_absolute_error expects (y_true, y_pred); pass the ground truth
        # first so the call stays correct if an asymmetric metric is swapped in.
        test_mae.append(mean_absolute_error(val_y, val_predictions))

    return test_mae

In [94]:
def get_minimized_mae_model_with_leaf_nodes(model_type: Callable[..., T],
                                            test_max_leaf_nodes: list[int],
                                            train_X: pd.DataFrame,
                                            train_y: pd.Series,
                                            val_X: pd.DataFrame,
                                            val_y: pd.Series,
                                            show_mae: bool = False) -> T:
    """Build a model configured with the best-scoring ``max_leaf_nodes``.

    Every candidate in ``test_max_leaf_nodes`` is scored with
    ``test_model_mae``; the candidate with the lowest validation MAE wins
    (the first one on ties).

    Args:
        model_type: Factory (typically an estimator class) accepting the
            keyword arguments ``max_leaf_nodes`` and ``random_state``.
        test_max_leaf_nodes: Candidate leaf-node limits to compare.
        train_X: Training features.
        train_y: Training targets.
        val_X: Validation features.
        val_y: Validation targets.
        show_mae: When true, print the winning leaf-node limit and its MAE.

    Returns:
        A new, **unfitted** ``model_type`` instance created with the winning
        ``max_leaf_nodes`` and ``random_state=0``; the caller must call
        ``fit`` on it before predicting.

    Raises:
        ValueError: If ``test_max_leaf_nodes`` is empty.
    """
    if len(test_max_leaf_nodes) == 0:
        # Fail with a clear message instead of numpy's generic argmin error.
        raise ValueError("test_max_leaf_nodes must contain at least one candidate")

    test_mae = test_model_mae(model_type,
                              test_max_leaf_nodes,
                              train_X,
                              train_y,
                              val_X,
                              val_y)

    # Read the reported MAE from the selected index so the printed value
    # always belongs to the candidate that is actually returned.
    min_mae_index = int(np.argmin(test_mae))
    min_mae_value = test_mae[min_mae_index]
    min_mae_leaf_nodes = test_max_leaf_nodes[min_mae_index]

    if show_mae:
        print(f"For min MAE\nmax leaf nodes: {min_mae_leaf_nodes}\nMAE: {min_mae_value}")

    return model_type(max_leaf_nodes=min_mae_leaf_nodes, random_state=0)

In [102]:
# Choose the leaf-node limit with the lowest validation MAE, then train the tree.
melb_model_DTR: DecisionTreeRegressor = get_minimized_mae_model_with_leaf_nodes(
    model_type=DecisionTreeRegressor,
    test_max_leaf_nodes=[5, 10, 50, 100, 500, 1000, 5000],
    train_X=train_X,
    train_y=train_y,
    val_X=val_X,
    val_y=val_y,
    show_mae=True,
)
melb_model_DTR.fit(train_X, train_y)

# Attach the validation index to the predictions so they join row-for-row.
melb_model_DTR_val_predictions = pd.DataFrame(
    melb_model_DTR.predict(val_X),
    index=val_X.index,
    columns=["DTR Predicted Price"],
)
display(val_X.join(val_y).join(melb_model_DTR_val_predictions))

For min MAE
max leaf nodes: 500
MAE: 243613.31456921576


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude,Price,DTR Predicted Price
6048,3,3.0,221.0,-37.77080,144.8401,620000.0,6.306351e+05
9186,4,2.0,528.0,-37.83539,145.0431,2320000.0,1.819800e+06
3991,3,2.0,0.0,-37.80950,144.9691,750000.0,9.124706e+05
5829,3,2.0,1039.0,-37.86380,144.9820,1120000.0,1.157500e+06
3616,6,6.0,1334.0,-37.80290,145.0267,6500000.0,4.025000e+06
...,...,...,...,...,...,...,...
829,2,1.0,0.0,-37.90820,145.0681,486500.0,5.967545e+05
5926,3,1.0,700.0,-37.79330,144.8411,950000.0,7.344111e+05
20,4,2.0,780.0,-37.80730,144.9952,1330000.0,2.477500e+06
3935,3,2.0,151.0,-37.78550,144.8958,865000.0,9.509444e+05


In [100]:
from sklearn.ensemble import RandomForestRegressor

# Same leaf-node search as for the single tree, this time with a random forest.
melb_model_RFR: RandomForestRegressor = get_minimized_mae_model_with_leaf_nodes(
    model_type=RandomForestRegressor,
    test_max_leaf_nodes=[5, 10, 50, 100, 500, 1000, 5000],
    train_X=train_X,
    train_y=train_y,
    val_X=val_X,
    val_y=val_y,
    show_mae=True,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
melb_model_RFR.fit(train_X, train_y)

For min MAE
max leaf nodes: 5000
MAE: 192642.48277045097


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,5000
,min_impurity_decrease,0.0
,bootstrap,True


In [103]:
# Attach the validation index to the forest's predictions so they join row-for-row.
melb_model_RFR_val_predictions = pd.DataFrame(
    melb_model_RFR.predict(val_X),
    index=val_X.index,
    columns=["RFR Predicted Price"],
)
display(val_X.join(val_y).join(melb_model_RFR_val_predictions))

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude,Price,RFR Predicted Price
6048,3,3.0,221.0,-37.77080,144.8401,620000.0,5.908460e+05
9186,4,2.0,528.0,-37.83539,145.0431,2320000.0,2.246410e+06
3991,3,2.0,0.0,-37.80950,144.9691,750000.0,8.124690e+05
5829,3,2.0,1039.0,-37.86380,144.9820,1120000.0,2.031300e+06
3616,6,6.0,1334.0,-37.80290,145.0267,6500000.0,4.163540e+06
...,...,...,...,...,...,...,...
829,2,1.0,0.0,-37.90820,145.0681,486500.0,5.578867e+05
5926,3,1.0,700.0,-37.79330,144.8411,950000.0,8.432250e+05
20,4,2.0,780.0,-37.80730,144.9952,1330000.0,3.123840e+06
3935,3,2.0,151.0,-37.78550,144.8958,865000.0,8.915500e+05


In [None]:
# A single hand-written listing to run through the fitted random forest.
val_test_X = pd.DataFrame(
    {
        "Rooms": 4,
        "Bathroom": 2.0,
        "Landsize": 332,
        "Lattitude": -37.90988936743714,
        "Longtitude": 145.04470105815463,
    },
    index=[0],  # scalar values need an explicit index to form a one-row frame
)

val_test_melb_model_RFR_predictions = melb_model_RFR.predict(val_test_X)
display(val_test_X, val_test_melb_model_RFR_predictions)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,4,2.0,332,-37.909889,145.044701


array([1428180.])