In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [2]:
file_path = './houses.csv'

In [3]:
data = pd.read_csv(file_path)

In [4]:
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [5]:
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = data[features]
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [6]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941


In [7]:
y = data.Price

In [8]:
train_X, val_X, train_y, val_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [9]:
train_X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,10864.0,10864.0,10864.0,10864.0,10864.0
mean,2.941458,1.535162,565.872423,-37.809911,144.995053
std,0.961697,0.692709,4382.784066,0.079467,0.103625
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,178.0,-37.8573,144.929667
50%,3.0,1.0,440.0,-37.802465,145.0002
75%,4.0,2.0,650.25,-37.757172,145.057518
max,10.0,8.0,433014.0,-37.40853,145.48273


In [10]:
val_X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,2716.0,2716.0,2716.0,2716.0,2716.0
mean,2.924153,1.53056,528.590943,-37.806372,144.995868
std,0.931617,0.687822,1671.270666,0.078376,0.105085
min,1.0,0.0,0.0,-38.16404,144.55784
25%,2.0,1.0,172.0,-37.8552,144.92915
50%,3.0,1.0,444.0,-37.80205,144.9995
75%,3.0,2.0,652.0,-37.75408,145.059955
max,8.0,7.0,76000.0,-37.49674,145.52635


In [11]:
model = DecisionTreeRegressor()

In [12]:
model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [13]:
print(model.predict(X))

[1480000. 1035000. 1465000. ... 1170000. 2500000. 1005000.]


In [14]:
val_predictions = model.predict(val_X)

In [15]:
print(mean_absolute_error(val_y, val_predictions))

243136.9358124693


In [16]:
# compare same model with different parameters ... for example max leaf nodes in Decision tree ...
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [17]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# A loop to find the ideal tree size from candidate_max_leaf_nodes
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  348049
Max leaf nodes: 25  		 Mean Absolute Error:  283302
Max leaf nodes: 40  		 Mean Absolute Error:  272336
Max leaf nodes: 100  		 Mean Absolute Error:  242797
Max leaf nodes: 250  		 Mean Absolute Error:  227321
Max leaf nodes: 500  		 Mean Absolute Error:  222263
