# BASICS

In [5]:
#Import Packages
import pandas as pd
# Set file path
melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)
# print a summary of the data in Melbourne data
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1068828.0,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,141.568645,1964.081988,-37.807904,144.990201,7435.489509
std,0.971079,675156.4,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,90.834824,38.105673,0.07585,0.099165,4337.698917
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16492,144.54237,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.45709,145.52635,21650.0


In [None]:
# Select certain column (Price) of melbourne_data
y = melbourne_data.Price
# Show first entries
y.head()

In [None]:
# Choosing Certain features of melbourne_data 
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
# Show first entries
X.head()

In [None]:
# Drop columns with missing values
data_without_missing_values = melbourne_data.dropna(axis=1)

In [None]:
# Mean Absolute Error (also called MAE).
# error=actual−predicted
#With the MAE metric, we take the absolute value of each error. This converts each error to a positive number. We then take the average of those absolute errors. This is our measure of model quality. In plain English, it can be said as

#On average, our predictions are off by about X.
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

# Handling missing values

In [None]:
# In many cases, you'll have both a training dataset and a test dataset. 
# You will want to drop the same columns in both DataFrames. In that case, you would write
#  if your test data has missing values in places where your training data did not, this will result in an error.

So, it's somewhat usually not the best solution. However, it can be useful when most values in a column are missing.
cols_with_missing = [col for col in original_data.columns 
                                 if original_data[col].isnull().any()]
redued_original_data = original_data.drop(cols_with_missing, axis=1)
reduced_test_data = test_data.drop(cols_with_missing, axis=1)

In [None]:
#Imputation fills in the missing value with some number. 
#The imputed value won't be exactly right in most cases, but it usually gives more accurate models than dropping the column entirely.
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
data_with_imputed_values = my_imputer.fit_transform(original_data)
#The default behavior fills in the mean value for imputation. 
#Statisticians have researched more complex strategies, but those complex strategies typically give no benefit once you plug the results into sophisticated machine learning models.
#One (of many) nice things about Imputation is that it can be included in a scikit-learn Pipeline. 

In [None]:
#Imputation #1 is the standard approach, and it usually works well. 
#However, imputed values may by systematically above or below their actual values 
#(which weren't collected in the dataset). Or rows with missing values may be unique in some other way. 
#In that case, your model would make better predictions by considering which values were originally missing. 

# Set file path
melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
# read the data and store data in DataFrame titled melbourne_data
orignal_data = pd.read_csv(melbourne_file_path) 
# print a summary of the data in Melbourne data

# make copy to avoid changing original data (when Imputing)
new_data = original_data.copy()

# make new columns indicating what will be imputed
cols_with_missing = (col for col in new_data.columns 
                                 if new_data[col].isnull().any())
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation
my_imputer = SimpleImputer()
new_data = pd.DataFrame(my_imputer.fit_transform(new_data))
new_data.columns = original_data.columns

In [None]:
# Imputation as directly above example #2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

data = pd.read_csv('../input/train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

# Regression tree and mean absolute error

In [15]:
#Import Packages
import pandas as pd
# Set file path
melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)


# Decision Tree Regressor model
# scikit-learn library to create your model
from sklearn.tree import DecisionTreeRegressor
# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

# Fit model
melbourne_model.fit(X, y)
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))

from sklearn.metrics import mean_absolute_error
predicted_home_prices = melbourne_model.predict(X)
print("Mean Absolute Error is")
print(mean_absolute_error(y, predicted_home_prices))


Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
Mean Absolute Error is
1115.7467183128902


# Score_dataset FUNCTION to calculate MAE on split training sample

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    train_size=0.7, 
                                                    test_size=0.3, 
                                                    random_state=0)

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# Same us above but splitting Samples

In [78]:
#Import Packages
import pandas as pd
# Set file path
melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



# Decision Tree Regressor model
# scikit-learn library to create your model
from sklearn.model_selection import train_test_split
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model
melbourne_model = DecisionTreeRegressor()
# Fit model
melbourne_model.fit(train_X, train_y)


print("Making predictions for the following 5 houses:")
print(val_X.head())
print("The predictions are")
print(melbourne_model.predict(val_X.head()))
# get predicted prices on validation data
val_predictions = melbourne_model.predict(val_X)
print("Mean Absolute Error is")
print(mean_absolute_error(val_y, val_predictions))

Making predictions for the following 5 houses:
       Rooms  Bathroom  Landsize  Lattitude  Longtitude
4850       2       1.0      96.0  -37.85010   144.99530
2307       2       1.0       0.0  -37.89020   144.99070
10090      2       1.0     136.0  -37.85542   144.99571
3645       3       2.0     205.0  -37.79930   145.02670
4930       2       1.0     400.0  -37.73520   144.98520
The predictions are
[ 900000.  696750. 1120000. 1447500.  630000.]
Mean Absolute Error is
273269.8620615451


# Same us above , splitting Samples , Loop through number of MAX LEAF NODE to minimize Mean Absolute Error 

In [27]:
# Compare Results from Regression Tree with defernet MAX LEAF NODES
# The max_leaf_nodes argument provides a very sensible way to control overfitting vs underfitting. 
# The more leaves we allow the model to make, the more we move from the underfitting area to the overfitting area.
#Import Packages
import pandas as pd
# Set file path
melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path) 
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



# Decision Tree Regressor model
# scikit-learn library to create your model
from sklearn.model_selection import train_test_split
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)


from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

for max_leaf_nodes in [5, 50, 500, 5000,]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261769
Max leaf nodes: 5000  		 Mean Absolute Error:  272464


# Same us above , splitting Samples , Loop through number of MAX LEAF NODE to minimize Mean Absolute Error

In [73]:
import pandas as pd

melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path) 
melbourne_data = melbourne_data.dropna(axis=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)


# Write loop to find the ideal tree size from candidate_max_leaf_nodes
max_leaf_nodes = [5, 25, 50, 100, 250, 500]
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in max_leaf_nodes}
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = min(scores, key=scores.get)

# Fit the model with best_tree_size. Fill in argument to make optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model
final_model.fit(X, y)


print("Making predictions for the following 5 houses:")
print(val_X.head())
print("The predictions are")
print(melbourne_model.predict(val_X.head()))
# get predicted prices on validation data
val_predictions = melbourne_model.predict(val_X)
print("Mean Absolute Error is")
print(mean_absolute_error(val_y, val_predictions))
print("best_tree_size")
print(best_tree_size)
print(" Tree size : MAE \n")
print(scores)

Making predictions for the following 5 houses:
       Rooms  Bathroom  Landsize  Lattitude  Longtitude
4850       2       1.0      96.0  -37.85010   144.99530
2307       2       1.0       0.0  -37.89020   144.99070
10090      2       1.0     136.0  -37.85542   144.99571
3645       3       2.0     205.0  -37.79930   145.02670
4930       2       1.0     400.0  -37.73520   144.98520
The predictions are
[ 900000.  550000. 1120000. 1447500.  630000.]
Mean Absolute Error is
276467.81342801807
best_tree_size
500
 Tree size : MAE 

{5: 385696.54278937966, 25: 307919.7001056724, 50: 279794.61143891385, 100: 269191.989429751, 250: 269945.1501662939, 500: 261769.60955432768}


# RANDOM FORESTS

In [75]:
import pandas as pd

melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path) 
melbourne_data = melbourne_data.dropna(axis=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

198797.7404992468



# Ensemble model: XGB Regressor

In [12]:
# XGBoost is an implementation of the Gradient Boosted Decision Trees algorithm. 
# Takes many weak learns and builds a strong learner. 
# https://www.youtube.com/watch?v=ufHo8vbk6g4

# We go through cycles that repeatedly builds new models and combines them into an ensemble model. 
# We start the cycle by calculating the errors for each observation in the dataset. 
# We then build a new model to predict those. We add predictions from this error-predicting model to the 
# "ensemble of models." To make a prediction, we add the predictions from all previous models. 
# We can use these predictions to calculate new errors, build the next model, and add it to the ensemble.
# Naive model -> Calculate errors -> Build model predicting -> Add last model to Ensemble -> Calculate errors -> ...

# n_estimators: specifies how many times to go through the modeling cycle described above.

# The argument early_stopping_rounds offers a way to automatically find the ideal value. 
# Early stopping causes the model to stop iterating when the validation score stops improving, 
# even if we aren't at the hard stop for n_estimators. It's smart to set a high value for n_estimators 
# and then use early_stopping_rounds to find the optimal time to stop iterating.

# Learning Rate: Instead of getting predictions by simply adding up the predictions from each component model, 
# we will multiply the predictions from each model by a small number before adding them in. 
# This means each tree we add to the ensemble helps us less. In practice, 
# this reduces the model's propensity to overfit. So, you can use a higher value of n_estimators without overfitting. 

# In general, a small learning rate (and large number of estimators) will yield more accurate XGBoost models, 
# though it will also take the model longer to train since it does more iterations through the cycle.

#n_jobs: On larger datasets where runtime is a consideration, you can use parallelism to build your models faster.
# It's common to set the parameter n_jobs equal to the number of cores on your machine. 


There's one piece outside that cycle. We need some base prediction to start the cycle. In practice, the initial predictions can be pretty naive. Even if it's predictions are wildly inaccurate, subsequent additions to the ensemble will address those errors.


import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd

melbourne_file_path = "/Users/nikschet/Documents/Jupyter Notebooks/melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path) 
melbourne_data = melbourne_data.dropna(axis=1)

# Select certain column (Price) of melbourne_data
y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]



from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)



from xgboost import XGBRegressor
my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model = XGBRegressor(n_estimators=100, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], verbose=False)


# make predictions
predictions = my_model.predict(val_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, val_y)))


Mean Absolute Error : 246458.3189386966
