In [None]:
# visit https://github.com/mfenner1/mlwpy_live
# click "Code"
# select "Download ZIP"
# unzip on your Desktop

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# if you are on a mac, better graphics output
# otherwise, comment this out
%config InlineBackend.figure_format = 'retina'

In [None]:
from sklearn import (datasets, dummy, ensemble,
                     linear_model, metrics,
                     model_selection as skms,
                     naive_bayes, neighbors, tree)

In [None]:
from utils import (make_learning_curve, 
                   make_complexity_curve, 
                   rms_error, rmse)

In [None]:
import warnings
# suppress every warning from here on, including any library warnings that
# later cells may trigger
warnings.filterwarnings("ignore")
# seed NumPy's global random generator; the splitters and estimators below
# are all created without an explicit random_state
np.random.seed(42)

# Part 1:  Fit-Predict-Evaluate

In [None]:
# read the gzipped CSV (the path is relative to the working directory)
cah_df = pd.read_csv('datasets/cali.csv.gz')
# show the column names
cah_df.columns

In [None]:
cah_df.head(2)

In [None]:
# features: every column except the two location columns and the target
housing_ftrs = cah_df.drop(columns=['Latitude', 
                                    'Longitude', 
                                    'MedHouseVal'])
# target: kept as a one-column DataFrame (double brackets), not a Series
housing_tgt  = cah_df[['MedHouseVal']]

In [None]:
# NOTE: there is no train-test split here -- the model is fit on, and then
# asked to predict, exactly the same rows
knn = neighbors.KNeighborsRegressor(n_neighbors=3)
fit = knn.fit(housing_ftrs, housing_tgt)  # learn: features (furry, purrs) --> target (cat)
preds = knn.predict(housing_ftrs)

In [None]:
preds  # ---> like saying "cat"

In [None]:
# how good are those predictions?
rms_error(housing_tgt, preds)

# rms_error is the "root-mean-squared-error"
# say the actual target is 10:
#   predict 11 ... error is (10 - 11) = -1
#   predict 9  ... error is (10 - 9)  =  1
# if we square those errors:  -1 --> 1   and 1 --> 1
# add up all the squared errors ...
#      and divide by the number of guesses we made
# (that's just a "mean" or "average")
# squaring makes the numbers big ... let's make them small again:
# take the square root ...

# errors --> square them --> mean --> square root
# root-mean-squared-error

# no train-test split (TTS) --> this is what's called an "in-sample"
#                               or "training" error

### Let's add train-test split

In [None]:
# same features and target as before
housing_ftrs = cah_df.drop(columns=['Latitude', 'Longitude', 'MedHouseVal'])
housing_tgt  = cah_df[['MedHouseVal']]

# hold out 33% of the rows as the test set; the rest is the training set
(train_ftrs, test_ftrs,
 train_tgt,  test_tgt) = skms.train_test_split(housing_ftrs, 
                                               housing_tgt, 
                                               test_size=.33)

In [None]:
# features:  columns ... examples:  rows
housing_ftrs.shape, train_ftrs.shape, test_ftrs.shape

In [None]:
# let's build a model ... just on the training data
knn = (neighbors.KNeighborsRegressor(n_neighbors=3)
                .fit(train_ftrs, train_tgt))

In [None]:
# we can predict on both train and test
train_preds = knn.predict(train_ftrs)
test_preds = knn.predict(test_ftrs)

In [None]:
# as above, this is "in sample" or "training error"
train_rmse = rms_error(train_tgt, train_preds)
print('train set rmse: {:0.4f}'.format(train_rmse))

In [None]:
# how do we do on novel data  (the test set)
test_rmse = rms_error(test_tgt, test_preds)
print('test set rmse: {:0.4f}'.format(test_rmse))

In [None]:
# we are worse on the test set .... that's normal!

# Part 2:  Models

# Baseline `Predict-the-Mean` Model

In [None]:
# this will "learn" a mean on the training data
# ... and predict that mean for all predictions
base = (dummy.DummyRegressor(strategy='mean') # default
            .fit(train_ftrs, train_tgt))

train_preds = base.predict(train_ftrs)
test_preds  = base.predict(test_ftrs)

In [None]:
train_rmse = rms_error(train_tgt, train_preds)
print('in-sample train rmse: {:0.4f}'.format(train_rmse))

In [None]:
test_rmse = rms_error(test_tgt, test_preds)
print('test rmse: {:0.4f}'.format(test_rmse))

# Two Different Nearest Neighbors Models

In [None]:
# 3-nearest-neighbors: fit on the training split, then score both splits
knr = neighbors.KNeighborsRegressor(n_neighbors=3)
knr.fit(train_ftrs, train_tgt)

# in-sample (training) error
train_preds = knr.predict(train_ftrs)
train_rmse = rms_error(train_tgt, train_preds)

# error on the held-out test rows
test_preds = knr.predict(test_ftrs)
test_rmse = rms_error(test_tgt, test_preds)

# the k in the label is read from the fitted model itself
print('kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'
      .format(knr.n_neighbors, train_rmse, test_rmse))

In [None]:
# 1-nearest-neighbor: fit on the training split, then score both splits
knr = (neighbors.KNeighborsRegressor(n_neighbors=1)
                .fit(train_ftrs, train_tgt))

train_preds = knr.predict(train_ftrs)
test_preds = knr.predict(test_ftrs)

train_rmse = rms_error(train_tgt, train_preds)
test_rmse = rms_error(test_tgt, test_preds)

# report the k the model was actually built with (the label used to be a
# hard-coded 3, which mislabelled this k=1 result)
print('kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'.format(knr.n_neighbors, 
                                                             train_rmse, 
                                                             test_rmse))

In [None]:
# 10-nearest-neighbors: fit on the training split, then score both splits
knr = (neighbors.KNeighborsRegressor(n_neighbors=10)
                .fit(train_ftrs, train_tgt))

train_preds = knr.predict(train_ftrs)
test_preds = knr.predict(test_ftrs)

train_rmse = rms_error(train_tgt, train_preds)
test_rmse = rms_error(test_tgt, test_preds)

# report the k the model was actually built with (the label used to be a
# hard-coded 3, which mislabelled this k=10 result)
print('kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'.format(knr.n_neighbors, 
                                                             train_rmse, 
                                                             test_rmse))

# Decision Trees

In [None]:
# decision trees (DTs) have a depth:
# a very deep tree can make very many splits,
# which carve the data into many, many little rectangles ...
# enough of them to memorize the training data
dtr = tree.DecisionTreeRegressor(max_depth=1)
dtr.fit(train_ftrs, train_tgt)

# in-sample (training) error
train_preds = dtr.predict(train_ftrs)
train_rmse = rms_error(train_tgt, train_preds)

# error on the held-out test rows
test_preds = dtr.predict(test_ftrs)
test_rmse = rms_error(test_tgt, test_preds)

# the depth in the label is read from the fitted model itself
print('DT-R(depth={:1d}) train/test rmse: {:0.4f} {:0.4f}'
      .format(dtr.max_depth, train_rmse, test_rmse))

In [None]:
# a much deeper tree (depth 8): fit on the training split, score both splits
dtr = (tree.DecisionTreeRegressor(max_depth=8)
           .fit(train_ftrs, train_tgt))

train_preds = dtr.predict(train_ftrs)
test_preds  = dtr.predict(test_ftrs)

train_rmse = rms_error(train_tgt, train_preds)
test_rmse  = rms_error(test_tgt,  test_preds)

# report the depth the model was actually built with (the label used to be
# a hard-coded 1, which mislabelled this depth-8 result)
print('DT-R(depth={:1d}) train/test rmse: {:0.4f} {:0.4f}'.format(dtr.max_depth, 
                                                                  train_rmse, 
                                                                  test_rmse))

### Cross-Validation

In [None]:
from sklearn.model_selection import KFold

# nine labelled items make it easy to see which ones land in each fold
seq = np.array(list('abcdefghi'))
print(seq, end='\n\n')

# warning:  shuffle=True is used here.  see sk docs for more details.
# https://scikit-learn.org/stable/common_pitfalls.html#randomness
three_fold = KFold(n_splits=3, shuffle=True)

# split() yields index arrays; use them to pick out the actual items
for train, test in three_fold.split(seq):
    print('train:', seq[train])
    print('\ttest:', seq[test])

### Evaluate with Cross-Validation

In [None]:
# default cv=None means do 5-fold CV
# train->test->score five times, giving one score per fold
# (`rmse`, imported from utils, is passed as the scoring function)
skms.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors=3), 
                     housing_ftrs, 
                     housing_tgt, 
                     scoring=rmse)

In [None]:
# same cross-validation as above, but keep the per-fold scores ...
cvs = skms.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors=3), 
                     housing_ftrs, 
                     housing_tgt, 
                     scoring=rmse)
# ... and boil them down to a single number: their mean
cvs.mean()

In [None]:
# candidates: the predict-the-mean baseline, two kNNs, and two shallow trees
models = [
    dummy.DummyRegressor(strategy='mean'),
    neighbors.KNeighborsRegressor(n_neighbors=3),
    neighbors.KNeighborsRegressor(n_neighbors=10),
    tree.DecisionTreeRegressor(max_depth=1),
    tree.DecisionTreeRegressor(max_depth=3),
]

# cross-validate each candidate on the full data set and summarize its
# per-fold scores as "mean ~ standard deviation"
for model in models:
    cvs = skms.cross_val_score(model, housing_ftrs, housing_tgt,
                               scoring=rmse)
    print(model,
          'mean ~ std: {:.3f} ~ {:.3f}'.format(cvs.mean(), cvs.std()),
          sep='\n')

### A Good k for Nearest Neighbors (Complexity Curves)

In [None]:
fig, ax = plt.subplots(1,1)
knn = neighbors.KNeighborsRegressor   # the class itself, not an instance

# candidate values of k; start at 1 because a nearest-neighbors model needs
# at least one neighbor (k=0 is not a valid setting)
n_neighbors = range(1, 11)
make_complexity_curve(knn(),        # model with ()
                      "KNN",        # name for label
                      'n_neighbors', n_neighbors,  #param name/vals
                      housing_ftrs, housing_tgt,  # data
                      ax=ax);

# with nearest neighbors ... low # neighbors is more complex
# ... 1000 examples .... 1-NN ---> 1000 different regions
# .... 1000 examples ... 10-NN ---> ~100 regions
# .... 1000 examples ... 1000-NN  --->  1 region

### Does More Data Help?  (Learning Curves)

In [None]:
# learning curves tell us what happens as we 
# use more and more data to train
# notes:  1. the data is CV-split first and then 
#         2. fractions of that data are used

In [None]:
fig, ax = plt.subplots(1,1)
knn = neighbors.KNeighborsRegressor   # the class itself, not an instance
# learning curve for a 5-nearest-neighbors regressor; the label names the k
make_learning_curve(knn(n_neighbors=5), "KNR(5)", 
                    housing_ftrs, housing_tgt, 
                    ax=ax);

In [None]:
# and for decision trees: complexity curve on top, learning curve below
fig, axes = plt.subplots(2,1)
dtr = tree.DecisionTreeRegressor   # the class itself, not an instance

# sweep tree depth from 1 to 10
max_depth = range(1,11)
make_complexity_curve(dtr(), "DT-R", 'max_depth', max_depth, 
                      housing_ftrs, housing_tgt, ax=axes[0])

# the label must name the depth actually used (it used to read "DT-R(6)"
# while the model was built with max_depth=2)
make_learning_curve(dtr(max_depth=2), "DT-R(2)", housing_ftrs, housing_tgt, ax=axes[1])
fig.tight_layout();

### Let's Be Selective about our Features

#### Feature Importances

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# fit a random forest with default settings on all rows, then read off the
# per-feature importances it reports
forest = RandomForestRegressor()
# NOTE(review): housing_tgt is a one-column DataFrame, not a 1-D Series;
# warnings are suppressed earlier in the notebook, so confirm the estimator
# is happy with that shape
forest.fit(housing_ftrs, housing_tgt)
fis = forest.feature_importances_
# label each importance with the name of its feature (column)
fis = pd.Series(fis, index=housing_ftrs.columns)

In [None]:
# one-column table of importances, sorted so the largest value comes first
imp_df = (pd.DataFrame({'ftr_imp':fis})
            .sort_values(by='ftr_imp', ascending=False))
print(len(imp_df))  # number of features
imp_df.T            # transposed: one column per feature

In [None]:
imp_df.plot.bar();

In [None]:
# names of the two top-ranked features (imp_df is sorted in descending order)
hfi = imp_df[:2].index
hfi

#### Using "good" Features

In [None]:
# NOTE: this rebinds housing_ftrs to just the selected columns; any earlier
# cell that reads housing_ftrs without redefining it will see this reduced
# table if it is re-run after this point
housing_ftrs = cah_df[hfi]
housing_tgt  = cah_df[['MedHouseVal']]

In [None]:
def do_two_graphs(model, hyper_name, 
                  hyper_values, hyper_focus):
    """Plot a complexity curve and a learning curve side by side for `model`.

    Parameters
    ----------
    model : an estimator class (not an instance); it is called with no
        arguments and with ``{hyper_name: hyper_focus}``.
    hyper_name : name of the hyperparameter to vary.
    hyper_values : values of that hyperparameter for the complexity curve.
    hyper_focus : the single value used for the learning curve and for the
        cross-validated score.

    The data is the module-level `housing_ftrs` / `housing_tgt`, looked up
    when the function is called.  Besides drawing the figure, this prints
    the mean 5-fold cross-validated `rmse` score.  Returns None.
    """
    model_name = model.__name__
    focus_label = "{}({})".format(model_name, hyper_focus)
    focus_kwargs = {hyper_name: hyper_focus}

    # left panel: complexity curve; right panel: learning curve (shared y)
    fig, (complexity_ax, learning_ax) = plt.subplots(1, 2,
                                                     figsize=(12, 3),
                                                     sharey=True)
    make_complexity_curve(model(), model_name,
                          hyper_name, hyper_values,
                          housing_ftrs, housing_tgt, ax=complexity_ax)
    make_learning_curve(model(**focus_kwargs), focus_label,
                        housing_ftrs, housing_tgt, ax=learning_ax)
    fig.tight_layout()

    # one summary number for the focused setting
    fold_scores = skms.cross_val_score(model(**focus_kwargs),
                                       housing_ftrs, housing_tgt,
                                       cv=5, scoring=rmse)
    print('mean CV(5) RMSE for {} {:0.3f}'.format(focus_label,
                                                  fold_scores.mean()))

In [None]:
# kNN: sweep k over 1..10 (k=0 is not a valid neighbor count), focus on k=5
do_two_graphs(neighbors.KNeighborsRegressor, 
              'n_neighbors', range(1, 11), 5)
# decision tree: sweep depth over 1..10, focus on depth 2
do_two_graphs(tree.DecisionTreeRegressor, 
              'max_depth', range(1,11), 2)