In [1]:
import pandas as pd

In [2]:
melbourne_file_path = './melb_data.csv'

melbourne_data = pd.read_csv(melbourne_file_path)

# Get basic descriptive stats of each variable
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [3]:
# View data columns
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# Drop data with missing values
melbourne_data = melbourne_data.dropna(axis=0)

In [5]:
# rQ1: What features of a home predict how much a home costs?
# Start building out prediction model

# price of a home: "y" (y=mx+b)
y = melbourne_data.Price

# features/factors to investigate influencing price of a home: "x", also known as "features"
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

x = melbourne_data[features]

# View descriptive statistics of features you're investigating
x.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [6]:
# View first few rows of your dataset
x.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [7]:
# Import scikit-learn to create a decision tree
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Define the model. Specify random_state to ensure the same result every time
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit your model using the features your investigating and what you're trying to predict
melbourne_model.fit(x,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [9]:
# Let's use the model we've created to predict houses entering the market

print("Predicting the prices for the following 5 houses")
print(x.head())
print("The prices for the above 5 houses are:")
print(melbourne_model.predict(x.head()))

Predicting the prices for the following 5 houses
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The prices for the above 5 houses are:
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [10]:
# Model Validation
# Summarize model quality by starting with Mean Absolute Error MAE
        # Error = actual - expected (absolute value of this)
            # On average, our predictions are off by Error

In [11]:
# import MAE
from sklearn.metrics import mean_absolute_error

In [12]:
predicted_home_prices = melbourne_model.predict(x)
mean_absolute_error(y, predicted_home_prices)

1115.7467183128902

In [13]:
# Using one sample todo both: 
    # 1) create your model and 
    # 2) validate your model
# is bad because it causes your model to "look" accurate
# What happens when your model see's new data?
    # It becomes extremely inaccurate
    
# A MODEL'S REAL VALUE COMES FROM PREDICTING NEW DATA!!!
# Exclude data from creating your model and then test that excluded data on the model created
# This is called validation data, data validation,
# Use one sample size to develop model
# Use completely different sample size to test it

In [14]:
# Import train_test_split which breaks the data up into two samples
    # 1 sample for building a model
    # 1 sample for testing against the model
    
from sklearn.model_selection import train_test_split

# split data into training and validation data for both: features (x), and what we're predicting (y)
# random_state guarentees we get the same split everytime
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=0)

# define model
melbourne_model = DecisionTreeRegressor()

# fit model
melbourne_model.fit(train_x, train_y)

value_predictions = melbourne_model.predict(val_x)
print(mean_absolute_error(val_y, value_predictions))

275347.443081558


In [15]:
# Let's talk about a DecisionTree's depth
# Depth is determined by the number of splits in a tree before getting down to the prediction

# Overfitting
    # When a model matches the training data perfectly, but not new data
    
# Underfitting
    # When a model has too little predictors and fails to caputre other patterns in the data
        # Results in the model performing poor for both, training data and new data

# WE CARE ABOUT MODEL ACCURACY ON NEW DATA

# To Find how Accurate Something is, find the sweet spot between underfitting and overfitting

# max_leaf_nodes helps control under/overfitting of data

In [16]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Define a function to compare MAE scores for different values of max_leaf_nodes

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# compare MAE with different max_leaf_node values

# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
    
# The above is just a for-loop running our function with the list of max_leaf_nodes

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 5000  		 Mean Absolute Error:  271996


In [17]:
# From the output above, 500 is the optimal number of leaves for our model
# Once you have your ideal leaf_nodes, re-run your original model with that in it

In [18]:
# RANDOM FOREST
# Creates many Decision trees and makes predictions by averaging each one

# Import RandomForestGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_x,train_y)
predict_prices = forest_model.predict(val_x)

print(mean_absolute_error(val_y, predict_prices))

218482.25517538196




In [19]:
# How do we handle missing values? Let's talk about it

missing_values_count_by_column = (melbourne_data.isnull().sum())
print(missing_values_count_by_column[missing_values_count_by_column] > 0)

Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
Suburb    False
dtype: bool


In [27]:
# Ways to Drop Data with Missing Values
# 1) Drop Columns with Missing Values
data_without_missing_values = melbourne_data.dropna(axis=1)

# Drop the same datasets for both, training samples and test samples

columns_with_missing_data = [column for column in melbourne_data.columns
                            if melbourne_data[column].isnull().any()]

reduced_training_data = melbourne_data.drop(columns_with_missing_data, axis=1)
# reduced_test_data = test_data.drop(columns_with_missing_data, axis=1)


# The problem with dropping columns with missing values:
    # 1) You lose useful info from columning dropping
    # 2) If test data has missing values where og data doesnt, you get errors
    
# THE ABOVE IS NOT ALWAYS THE BEST SOLUTION


In [33]:
# 2) Imputation: Fill in the missing values
    # Fill in the missing values with some number identifier
    
        # import imputer from sklearn
    
        # from sklearn.impute import SimpleImputer

        # my_imputer = SimpleImputer()

        # data_with_imputed_values = my_imputer.fit_transform(melbourne_data)

# This fills in missing values with the MEAN value of that column

In [None]:
# 3) An Extension to Imputation
    # 