In [1]:
# BASIC DATA EXPLORATION

In [2]:
# import the standard library and pandas
import os

import pandas as pd

In [3]:
# save filepath to a variable for easier access.
# The location can be overridden with the MELB_DATA_PATH environment variable so the
# notebook also runs on machines where the original absolute path does not exist;
# when the variable is unset the original path is used unchanged.
data_filepath = os.environ.get(
    'MELB_DATA_PATH',
    '/home/oktavianu/my-Ai/ML-basics/melb_data.csv',
)

# read the data and store it in DataFrame
melb_data = pd.read_csv(data_filepath)

# show summary statistics of the numeric columns
melb_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [4]:
# Get the column names to see which fields are available in the dataset
melb_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# We can use dropna to drop missing values from our dataset, think of na as not available.
# axis=0 drops rows (not columns); a row is removed when any of its columns is missing.
# NOTE(review): this overwrites melb_data, so the unfiltered frame is only recoverable by
# re-running the read_csv cell. It also filters on every column, not just the ones used
# for modelling later in the notebook — confirm that discarding those rows is intended.
melb_data = melb_data.dropna(axis=0)

In [6]:
# The sale price is the quantity we want to predict, so it becomes the target y.
y = melb_data["Price"]
# `prediction` is kept as a second name for the same Series (it holds the target
# values, not model output).
prediction = y

In [7]:
# Print the target Series y (the Price column) to the console.
print(y)

1        1035000.0
2        1465000.0
4        1600000.0
6        1876000.0
7        1636000.0
           ...    
12205     601000.0
12206    1050000.0
12207     385000.0
12209     560000.0
12212    2450000.0
Name: Price, Length: 6196, dtype: float64


In [8]:
# Columns used as model inputs. The spellings 'Lattitude' and 'Longtitude' are the
# dataset's own column names and must be kept exactly as they are.
melb_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

In [9]:
# Assign the feature columns to x: a DataFrame holding only the melb_features columns
x = melb_data[melb_features]

In [10]:
# Review summary statistics of our feature columns
x.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [11]:
# See the first 5 rows of our feature table
x.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [12]:
# Build our model
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melb_model = DecisionTreeRegressor(random_state=1)

# Fit model on the full feature table x and target y (no rows are held out here)
melb_model.fit(x, y)

In [13]:
# Predict prices for the first five houses and print one prediction per line.
# These rows were part of the data the model was fitted on, so this is a sanity check
# rather than a measure of how the model performs on unseen houses.
print("Making predictions for the following five houses: ")
print(x.head())
print("The predictions are: ")
predictions = melb_model.predict(x.head())
# Use a loop name other than `prediction`: that name already refers to the target
# Series assigned earlier in the notebook and should not be silently overwritten.
for predicted_price in predictions:
    print(predicted_price)

Making predictions for the following five houses: 
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are: 
1035000.0
1465000.0
1600000.0
1876000.0
1636000.0


In [14]:
# Compare with the actual prices of the same five houses:
print(y.head())

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64


In [15]:
# Evaluate model quality using MAE (mean absolute error)
from sklearn.metrics import mean_absolute_error

# Score the earlier model on the very rows it was fitted on (in-sample error)
home_price_predictions = melb_model.predict(x)
mean_absolute_error(y_true=y, y_pred=home_price_predictions)

1115.7467183128902

In [16]:
# To make a better model we need to use different data for building and validating the model.
# We use the method provided by sklearn: train_test_split
from sklearn.model_selection import train_test_split

# Hold out part of the rows for validation; random_state fixes how the rows are split
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=0)

# Define model. Pass random_state, as the first model in this notebook does, so the
# fitted tree and therefore the validation score are the same on every run.
new_melb_model = DecisionTreeRegressor(random_state=1)

# Fitting the model
new_melb_model.fit(train_x, train_y)

# Get predicted price on validation data
val_pred = new_melb_model.predict(val_x)

# Calculate mean absolute error on validation data
print(mean_absolute_error(val_y, val_pred))

277294.01700021524


In [17]:
# Function to compare MAE scores from different values for max_leaf_nodes
def get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on ``train_x``/``train_y``,
    used to predict ``val_x``, and the predictions are scored against ``val_y``
    with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_x, train_y)
    return mean_absolute_error(val_y, tree.predict(val_x))

In [18]:
# Try several tree sizes and report the validation MAE of each, one line per size.
# Both numbers are formatted with %d, so the MAE is shown without its fractional part.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        train_x=train_x,
        val_x=val_x,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 5000  		 Mean Absolute Error:  271320


In [19]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Score every candidate tree size on the validation split, keyed by the size itself
scores = {
    leaf_size: get_mae(leaf_size, train_x, val_x, train_y, val_y)
    for leaf_size in candidate_max_leaf_nodes
}
print(scores)

# The best size is the key whose MAE is the smallest
best_leaf_node = min(scores, key=lambda leaf_size: scores[leaf_size])
print(best_leaf_node)

{5: 385696.54278937966, 25: 307919.7001056724, 50: 279794.61143891385, 100: 269191.989429751, 250: 269945.1501662939, 500: 261718.1134423186}
500


In [20]:
# Optimal Tree Size: build the tree with the max_leaf_nodes value stored in best_leaf_node
final_melb_model = DecisionTreeRegressor(max_leaf_nodes=best_leaf_node, random_state=0)

# Fitting model on all rows (x, y) rather than only the training split
final_melb_model.fit(x, y)

In [21]:
# We're going to use another model from scikit learn
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Train a random forest on the training split (fit returns the fitted estimator itself)
forest_model = RandomForestRegressor(random_state=1).fit(train_x, train_y)

# Predict the held-out validation rows and report the MAE of those predictions
melb_preds = forest_model.predict(val_x)
print(mean_absolute_error(y_true=val_y, y_pred=melb_preds))

207190.6873773146


In [24]:
# save predictions
# melb_preds holds one prediction per row of val_x, and train_test_split shuffles the
# rows by default, so each validation row's index label is written next to its
# prediction; without it the file cannot be matched back to the houses in melb_data.
output = pd.DataFrame({'Id': val_x.index, 'SalePrice': melb_preds})
output.to_csv('predictions.csv', index=False)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.00,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.00,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.00,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.00,1910.0,Yarra,-37.80240,144.99930,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.00,1890.0,Yarra,-37.80600,144.99540,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12205,Whittlesea,30 Sherwin St,3,h,601000.0,S,Ray,29/07/2017,35.5,3757.0,...,2.0,1.0,972.0,149.00,1996.0,Whittlesea,-37.51232,145.13282,Northern Victoria,2170.0
12206,Williamstown,75 Cecil St,3,h,1050000.0,VB,Williams,29/07/2017,6.8,3016.0,...,1.0,0.0,179.0,115.00,1890.0,Hobsons Bay,-37.86558,144.90474,Western Metropolitan,6380.0
12207,Williamstown,2/29 Dover Rd,1,u,385000.0,SP,Williams,29/07/2017,6.8,3016.0,...,1.0,1.0,0.0,35.64,1967.0,Hobsons Bay,-37.85588,144.89936,Western Metropolitan,6380.0
12209,Windsor,201/152 Peel St,2,u,560000.0,PI,hockingstuart,29/07/2017,4.6,3181.0,...,1.0,1.0,0.0,61.60,2012.0,Stonnington,-37.85581,144.99025,Southern Metropolitan,4380.0


In [26]:
# Display the target Series y (the Price column)
y

1        1035000.0
2        1465000.0
4        1600000.0
6        1876000.0
7        1636000.0
           ...    
12205     601000.0
12206    1050000.0
12207     385000.0
12209     560000.0
12212    2450000.0
Name: Price, Length: 6196, dtype: float64