## Boston Housing Data

In order to gain a better understanding of the metrics used in regression settings, we will be looking at the Boston Housing dataset.

In [5]:
#from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tests2 as t

# Location of the whitespace-delimited Boston housing text file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string for the regex separator: "\s" is not a valid escape in a normal
# string literal and triggers an invalid-escape warning on recent Pythons.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record is spread over two consecutive rows of raw_df: the even rows
# supply all of their columns, and the odd rows supply two further feature
# columns followed by the target in their third column.
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

# Hold out a third of the records for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)

In [10]:
# Display the frame exactly as parsed: full-width rows alternate with short
# continuation rows that pandas pads with NaN.
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [12]:
# When can you use the model - use each option as many times as necessary
a, b, c = 'regression', 'classification', 'both regression and classification'

# Pair every model with the kind of problem it can be applied to.
models = dict([
    ('decision trees', c),
    ('random forest', c),
    ('adaptive boosting', c),
    ('logistic regression', b),
    ('linear regression', a),
])

#checks your answer, no need to change this code
t.q1_check(models)

That's right!  All but logistic regression can be used for predicting numeric values.  And linear regression is the only one of these that you should not use for predicting categories.  Technically sklearn won't stop you from doing most of anything you want, but you probably want to treat cases in the way you found by answering this question!


In [14]:
# Import models from sklearn - notice you will want to use
# the regressor version (not classifier) - googling to find
# each of these is what we all do!
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [17]:
# Instantiate each of the models you imported
# For now use the defaults for all the hyperparameters; only the random seed
# is pinned (same seed as the train/test split) so that the tree-based models
# produce the same scores on every Restart & Run All.
linear = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
ada_boost = AdaBoostRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)


In [18]:
# Fit each of your models using the training data
for estimator in (linear, decision_tree, ada_boost, rf):
    estimator.fit(X_train, y_train)

# fit() hands back the estimator itself, so echoing the last fitted model
# keeps the cell's displayed value the same as before.
rf


In [19]:
# Predict on the test values for each model
(
    linear_predications,
    tree_predications,
    ada_boost_predications,
    rf_predications,
) = (
    # Evaluated in the same order as the names on the left-hand side.
    estimator.predict(X_test)
    for estimator in (linear, decision_tree, ada_boost, rf)
)

In [22]:
# potential model options
a, b, c = 'regression', 'classification', 'both regression and classification'

# Map every metric to the kind of problem it is used to evaluate.
# NOTE(review): 'mean_absolute_area' looks like a slip for
# 'mean_absolute_error', but the checker accepted this key - confirm against
# tests2 before renaming it.
metrics = dict(
    precision=b,
    recall=b,
    accuracy=b,
    r2_score=a,
    mean_squared_error=a,
    area_under_curve=b,
    mean_absolute_area=a,
)

#checks your answer, no need to change this code
t.q6_check(metrics)

That's right! Looks like you know your metrics!


In [23]:
# Import the metrics from sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [24]:
def r2(actual, preds):
    '''
    Compute the r-squared (coefficient of determination) of a set of predictions.

    INPUT:
    actual - list, numpy array or pd series of actual y values
    preds - list, numpy array or pd series of predicted y values
    OUTPUT:
    returns the r-squared score as a float (nan when there are no values)
    '''
    # Work on plain float arrays so values are paired by position. Two pandas
    # Series with different indexes would otherwise be aligned by label, and
    # the resulting NaNs are silently skipped by the sums below.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)

    if actual.size == 0:
        # Nothing to score.
        return float('nan')

    sse = np.sum((actual - preds) ** 2)
    sst = np.sum((actual - np.mean(actual)) ** 2)
    if sst == 0:
        # Constant target: sse/sst is undefined. Follow sklearn's default
        # finite convention: 1.0 for perfect predictions, otherwise 0.0.
        return 1.0 if sse == 0 else 0.0
    return float(1 - sse / sst)

# Check solution matches sklearn
r2_comparisons = (
    ('decision_tree', tree_predications),
    ('linear_reg', linear_predications),
    ('adaboost', ada_boost_predications),
    ('random_forest', rf_predications),
)
for label, model_preds in r2_comparisons:
    # Hand-rolled score first, sklearn's reference value directly beneath it.
    for scorer in (r2, r2_score):
        print(scorer(y_test, model_preds), label)
print("Since the above match, we can see that we have correctly calculated the r2 value.")

0.7406069508492645 decision_tree
0.7406069508492645 decision_tree
0.7261570836552483 linear_reg
0.7261570836552483 linear_reg
0.8150822754505168 adaboost
0.8150822754505168 adaboost
0.8622175041407224 random_forest
0.8622175041407224 random_forest
Since the above match, we can see that we have correctly calculated the r2 value.


In [25]:
def mse(actual, preds):
    '''
    Compute the mean squared error of a set of predictions.

    INPUT:
    actual - list, numpy array or pd series of actual y values
    preds - list, numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean squared error as a float
    '''
    # Work on plain float arrays so values are paired by position. Two pandas
    # Series with different indexes would otherwise be aligned by label, and
    # the resulting NaNs are silently skipped by the sum.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)
    return float(np.sum((actual - preds) ** 2) / len(actual))


# Check your solution matches sklearn
mse_comparisons = (
    ('decision_tree', tree_predications),
    ('linear_reg', linear_predications),
    ('adaboost', ada_boost_predications),
    ('random_forest', rf_predications),
)
for label, model_preds in mse_comparisons:
    # Hand-rolled error first, sklearn's reference value directly beneath it.
    for scorer in (mse, mean_squared_error):
        print(scorer(y_test, model_preds), label)
print("If the above match, you are all set!")

19.63047904191617 decision_tree
19.63047904191617 decision_tree
20.7240234373397 linear_reg
20.7240234373397 linear_reg
13.994297565537385 adaboost
13.994297565537385 adaboost
10.427173766467064 random_forest
10.427173766467064 random_forest
If the above match, you are all set!


In [26]:
def mae(actual, preds):
    '''
    Compute the mean absolute error of a set of predictions.

    INPUT:
    actual - list, numpy array or pd series of actual y values
    preds - list, numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean absolute error as a float
    '''
    # Work on plain float arrays so values are paired by position. Two pandas
    # Series with different indexes would otherwise be aligned by label, and
    # the resulting NaNs are silently skipped by the sum.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)
    return float(np.sum(np.abs(actual - preds)) / len(actual))

# Check your solution matches sklearn
mae_comparisons = (
    ('decision_tree', tree_predications),
    ('linear_reg', linear_predications),
    ('adaboost', ada_boost_predications),
    ('random_forest', rf_predications),
)
for label, model_preds in mae_comparisons:
    # Hand-rolled error first, sklearn's reference value directly beneath it.
    for scorer in (mae, mean_absolute_error):
        print(scorer(y_test, model_preds), label)
# Closing message printed once (it was duplicated by a copy-paste slip).
print("If the above match, you are all set!")

3.037724550898204 decision_tree
3.037724550898204 decision_tree
3.1482557548168235 linear_reg
3.1482557548168235 linear_reg
2.69955103873535 adaboost
2.69955103873535 adaboost
2.176592814371257 random_forest
2.176592814371257 random_forest
If the above match, you are all set!
If the above match, you are all set!


In [27]:
#match each metric to the model that performed best on it
a, b, c, d = (
    'decision tree',
    'random forest',
    'adaptive boosting',
    'linear regression',
)

# Every metric gets the same answer here, so build the mapping from the keys.
best_fit = dict.fromkeys(('mse', 'r2', 'mae'), b)

#Tests your answer - don't change this code
t.check_ten(best_fit)

That's right!  The random forest was best in terms of all the metrics this time!


In [28]:

# cells for work
def print_metrics(y_true, preds, model_name=None):
    '''
    Print the mean squared error, mean absolute error and r2 score of a model.

    INPUT:
    y_true - the y values that are actually true in the dataset (numpy array or pandas series)
    preds - the predictions for those values from some model (numpy array or pandas series)
    model_name - (str - optional) a name associated with the model if you would like to add it to the print statements;
                 when omitted, only the metric names are printed

    OUTPUT:
    None - prints the mse, mae, r2
    '''
    # With a name the label is "<metric> <name> :", exactly as before; without
    # one it collapses to "<metric> :".
    suffix = ' :' if model_name is None else ' ' + model_name + ' :'
    print('Mean Squared Error' + suffix, format(mean_squared_error(y_true, preds)))
    print('Mean Absolute Error' + suffix, format(mean_absolute_error(y_true, preds)))
    print('R2 Score' + suffix, format(r2_score(y_true, preds)))
    # Blank lines to separate this report from the next one.
    print('\n\n')

# Report all three metrics for each set of test predictions.
# NOTE(review): the 'ada_boost_predications' label looks like a pasted
# variable name rather than a model name - confirm before changing it.
for model_preds, model_label in (
    (tree_predications, 'decision tree'),
    (ada_boost_predications, 'ada_boost_predications'),
    (linear_predications, 'linear'),
    (rf_predications, 'random forest'),
):
    print_metrics(y_test, model_preds, model_label)

Mean Squared Error decision tree : 19.63047904191617
Mean Absolute Error decision tree : 3.037724550898204
R2 Score decision tree : 0.7406069508492645



Mean Squared Error ada_boost_predications : 13.994297565537385
Mean Absolute Error ada_boost_predications : 2.69955103873535
R2 Score ada_boost_predications : 0.8150822754505168



Mean Squared Error linear : 20.7240234373397
Mean Absolute Error linear : 3.1482557548168235
R2 Score linear : 0.7261570836552483



Mean Squared Error random forest : 10.427173766467064
Mean Absolute Error random forest : 2.176592814371257
R2 Score random forest : 0.8622175041407224



