In [1]:
import pandas as pd

# Location of the input CSV; a relative path, so it is resolved against the
# notebook's current working directory.
path = "melb_data.csv"
# Read the raw file into a DataFrame; no cleaning happens at this point.
melbourne_data = pd.read_csv(filepath_or_buffer=path)

In [2]:
# Keep only complete rows: any row with at least one missing value is dropped.
# The result is a new frame; `melbourne_data` itself is left untouched.
melbourne_data_shaped = melbourne_data.dropna(axis="index")

In [3]:
# Selecting Data for Modeling
# Columns used as model inputs. The spellings 'Lattitude' and 'Longtitude'
# are intentional here: they are used verbatim as column labels.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
]
# Prediction target: the Price column of the cleaned frame.
y = melbourne_data_shaped['Price']
# Feature matrix: the cleaned frame restricted to the columns listed above.
X = melbourne_data_shaped[melbourne_features]

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
# Build a decision tree with a fixed seed and fit it on every cleaned row.
melbourne_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# Predict on the very same rows the tree was fitted on, so the error printed
# below is an in-sample figure rather than a held-out one.
predicted_home_prices = melbourne_model.predict(X)
print(mean_absolute_error(y, predicted_home_prices))
# First five predictions, followed by the first five actual prices.
print(predicted_home_prices[:5])
print(y.iloc[:5].tolist())

434.715945771
[ 1035000.  1465000.  1600000.  1876000.  1636000.]
[1035000.0, 1465000.0, 1600000.0, 1876000.0, 1636000.0]


In [5]:
# One hand-written record to run through the fitted decision tree.
test_data = pd.DataFrame({
    'Rooms': [4],
    'Bathroom': [1],
    'Landsize': [120],
    'BuildingArea': [142],
    'YearBuilt': [2014],
    'Lattitude': [-37.8072],
    'Longtitude': [144.9941],
})
# Order the columns through the shared feature list instead of a second
# hand-typed copy, so the layout always matches the columns in X that the
# model was fitted on (a missing feature raises KeyError here).
test_data = test_data[melbourne_features]

predicted_home_prices_test = melbourne_model.predict(test_data)
print(predicted_home_prices_test)

[ 1600000.]


In [12]:
from sklearn.model_selection import train_test_split
# Split features and target into training and validation parts; the seed is
# fixed so the same rows land in each part on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Refit the existing tree on the training part only (this replaces whatever
# it had learned before), then predict the held-out validation rows.
val_predicted_home_prices = melbourne_model.fit(train_X, train_y).predict(val_X)
print(mean_absolute_error(val_y, val_predicted_home_prices))
# First five validation predictions, followed by the first five actual prices.
print(val_predicted_home_prices[:5])
print(val_y.iloc[:5].tolist())

262442.656553
[  937500.   550000.  1015000.  1382500.   910000.]
[815000.0, 655000.0, 957500.0, 1330000.0, 722000.0]


In [27]:
# One hand-written record to run through the decision tree held in
# `melbourne_model`.
test_data = pd.DataFrame({
    'Rooms': [2],
    'Bathroom': [1],
    'Landsize': [96],
    'BuildingArea': [71],
    'YearBuilt': [1880],
    'Lattitude': [-37.85010],
    'Longtitude': [144.99530],
})
# Order the columns through the shared feature list instead of a second
# hand-typed copy, so the layout always matches the columns the model was
# fitted on (a missing feature raises KeyError here).
test_data = test_data[melbourne_features]

val_predicted_home_prices_test = melbourne_model.predict(test_data)
print(val_predicted_home_prices_test)

[ 937500.]


In [29]:
### Experimenting With Different Models
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree capped at `max_leaf_nodes`.

    A fresh DecisionTreeRegressor (random_state=0) is fitted on
    `train_X` / `train_y`; its predictions for `val_X` are then compared
    against `val_y` with mean_absolute_error.

    Parameters:
        max_leaf_nodes: passed through as the tree's `max_leaf_nodes` setting.
        train_X, train_y: features and target used for fitting.
        val_X, val_y: features and target used for scoring.

    Returns:
        The mean absolute error on the validation data.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
    
#### compare MAE with differing values of max_leaf_nodes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Fit and score each candidate exactly once; both the selection and the
# report below read from this mapping of tree size -> validation MAE.
scores = {max_leaf_nodes: get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) for max_leaf_nodes in candidate_max_leaf_nodes}
# Tree size with the lowest validation MAE.
best_tree_size = min(scores, key=scores.get)

# Report in the order of the candidate list, reusing the stored scores
# rather than refitting every tree a second time.
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = scores[max_leaf_nodes]
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

print(best_tree_size)

# NOTE(review): this model is seeded with random_state=1 while get_mae uses
# random_state=0, so the MAE printed below need not equal the score reported
# above for the same tree size. Confirm whether the differing seed is intended.
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# Fit on the training split only and evaluate on the held-out validation rows.
final_model.fit(train_X, train_y)
val_predicted_home_prices = final_model.predict(val_X)
print(mean_absolute_error(val_y, val_predicted_home_prices))
# First five validation predictions, followed by the first five actual prices.
print(val_predicted_home_prices[:5])
print(val_y.head().tolist())

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 25  		 Mean Absolute Error:  270649
Max leaf nodes: 50  		 Mean Absolute Error:  257829
Max leaf nodes: 100  		 Mean Absolute Error:  248457
Max leaf nodes: 250  		 Mean Absolute Error:  246998
Max leaf nodes: 500  		 Mean Absolute Error:  243176
500
246474.054599
[  980187.5          521973.49230769   980187.5         1163378.94736842
  1048000.        ]
[815000.0, 655000.0, 957500.0, 1330000.0, 722000.0]


In [30]:
# One hand-written record to run through the size-limited tree `final_model`.
test_data = pd.DataFrame({
    'Rooms': [2],
    'Bathroom': [1],
    'Landsize': [96],
    'BuildingArea': [71],
    'YearBuilt': [1880],
    'Lattitude': [-37.85010],
    'Longtitude': [144.99530],
})
# Order the columns through the shared feature list instead of a second
# hand-typed copy, so the layout always matches the columns the model was
# fitted on (a missing feature raises KeyError here).
test_data = test_data[melbourne_features]

val_predicted_home_prices_test = final_model.predict(test_data)
print(val_predicted_home_prices_test)

[ 980187.5]


In [31]:
# Random Forests
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed seed and otherwise default settings, fitted on
# the training split.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the held-out validation rows.
val_predicted_home_prices = forest_model.predict(val_X)
print(mean_absolute_error(val_y, val_predicted_home_prices))
# First five validation predictions, followed by the first five actual prices.
print(val_predicted_home_prices[:5])
print(val_y.iloc[:5].tolist())

202806.612825
[  842500.   554500.   951750.  1624650.   685200.]
[815000.0, 655000.0, 957500.0, 1330000.0, 722000.0]


In [32]:
# One hand-written record to run through the random forest `forest_model`.
test_data = pd.DataFrame({
    'Rooms': [2],
    'Bathroom': [1],
    'Landsize': [96],
    'BuildingArea': [71],
    'YearBuilt': [1880],
    'Lattitude': [-37.85010],
    'Longtitude': [144.99530],
})
# Order the columns through the shared feature list instead of a second
# hand-typed copy, so the layout always matches the columns the model was
# fitted on (a missing feature raises KeyError here).
test_data = test_data[melbourne_features]

val_predicted_home_prices_test = forest_model.predict(test_data)
print(val_predicted_home_prices_test)

[ 842500.]
