In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%config InlineBackend.figure_format = 'retina'

In [None]:
from sklearn import (datasets, dummy, ensemble,
                     linear_model, metrics,
                     model_selection as skms,
                     naive_bayes, neighbors, tree)

In [None]:
from utils import (make_learning_curve, make_complexity_curve, 
                   rms_error, rmse,
                   manage_ames_nans, manage_ames_ordinal)

In [None]:
import warnings

# Silence every warning category so the tutorial output stays uncluttered.
# NOTE: this also hides any fit / data-shape warnings raised by the
# libraries used further down.
warnings.simplefilter("ignore")

# Seed NumPy's legacy global random generator so np.random draws repeat
# from one run to the next.
np.random.seed(42)

# Part 1:  Fit-Predict-Evaluate

# Simple sklearn (in-sample only)

In [None]:
# read the (gzip-compressed) California housing table ...
cali_csv = 'datasets/cali.csv.gz'
cah_df = pd.read_csv(cali_csv)
# ... and show its column names as the cell output
cah_df.columns

In [None]:
# cah_df.MedHouseVal.plot(kind='hist');
# sns.pairplot(cah_df) # slow

In [None]:
# TODO: explain why dropping
# features: every column except the two location columns and the target
non_ftr_cols = ['Latitude', 'Longitude', 'MedHouseVal']
housing_ftrs = cah_df.drop(columns=non_ftr_cols)
# target: double brackets keep it a one-column DataFrame
housing_tgt  = cah_df[['MedHouseVal']]

In [None]:
# build the learner, fit it, then predict on the very same data (in-sample)
knn   = neighbors.KNeighborsRegressor(n_neighbors=3)
fit   = knn.fit(housing_ftrs, housing_tgt)   # .fit() hands back the fitted learner
preds = fit.predict(housing_ftrs)

In [None]:
# the same build -> fit -> predict steps as one chained expression
# (one method call per line for readability)
preds = (
    neighbors.KNeighborsRegressor(n_neighbors=3)
    .fit(housing_ftrs, housing_tgt)
    .predict(housing_ftrs)
)

In [None]:
# (in a live session, `??rms_error` shows the helper's source code)
# error of the in-sample predictions against the true targets
in_sample_rmse = rms_error(housing_tgt, preds)
in_sample_rmse

# Simple sklearn (train-test)

In [None]:
# rebuild features / target so this section stands on its own
housing_tgt  = cah_df[['MedHouseVal']]
housing_ftrs = cah_df.drop(columns=['Latitude', 'Longitude', 'MedHouseVal'])

# hold back 33% of the rows for testing
tts_parts = skms.train_test_split(housing_ftrs, housing_tgt, test_size=.33)
train_ftrs, test_ftrs, train_tgt, test_tgt = tts_parts

In [None]:
# here we keep the fitted model separate from its predictions
# b/c we want to predict with it twice
knr = neighbors.KNeighborsRegressor(n_neighbors=3)
knr.fit(train_ftrs, train_tgt)

# first the data it was trained on, then the held-out test data
train_preds, test_preds = knr.predict(train_ftrs), knr.predict(test_ftrs)

In [None]:
# error on the data the model was trained on
train_rmse = rms_error(train_tgt, train_preds)
train_report = 'train set rmse: {:0.4f}'.format(train_rmse)
print(train_report)

In [None]:
# error on the held-out test data
test_rmse = rms_error(test_tgt, test_preds)
test_report = 'test set rmse: {:0.4f}'.format(test_rmse)
print(test_report)

# Part 2:  Comparing Models on TTS

# Baseline `Predict-the-Mean` Model

In [None]:
# baseline learner: strategy='mean' (which is also the default)
base = dummy.DummyRegressor(strategy='mean')
base.fit(train_ftrs, train_tgt)

# baseline predictions for both the train and the test portion
train_preds, test_preds = base.predict(train_ftrs), base.predict(test_ftrs)

In [None]:
# baseline error on the training data
train_rmse = rms_error(train_tgt, train_preds)
train_report = 'in-sample train rmse: {:0.4f}'.format(train_rmse)
print(train_report)

In [None]:
# baseline error on the held-out test data
test_rmse = rms_error(test_tgt, test_preds)
test_report = 'test rmse: {:0.4f}'.format(test_rmse)
print(test_report)

# Two Different Nearest Neighbors Models

In [None]:
# nearest neighbors, small k; k is named once so model and report agree
n_nbrs = 3
knr = neighbors.KNeighborsRegressor(n_neighbors=n_nbrs)
knr.fit(train_ftrs, train_tgt)

train_preds = knr.predict(train_ftrs)
train_rmse  = rms_error(train_tgt, train_preds)

test_preds = knr.predict(test_ftrs)
test_rmse  = rms_error(test_tgt, test_preds)

report = 'kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'
print(report.format(n_nbrs, train_rmse, test_rmse))


In [None]:
# nearest neighbors, larger k; k is named once so model and report agree
n_nbrs = 10
knr = neighbors.KNeighborsRegressor(n_neighbors=n_nbrs)
knr.fit(train_ftrs, train_tgt)

train_preds = knr.predict(train_ftrs)
train_rmse  = rms_error(train_tgt, train_preds)

test_preds = knr.predict(test_ftrs)
test_rmse  = rms_error(test_tgt, test_preds)

report = 'kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'
print(report.format(n_nbrs, train_rmse, test_rmse))


# Two Decision Tree Models

In [None]:
# decision tree limited to depth 1; depth is named once for model and report
tree_depth = 1
dtr = tree.DecisionTreeRegressor(max_depth=tree_depth)
dtr.fit(train_ftrs, train_tgt)

train_preds = dtr.predict(train_ftrs)
train_rmse  = rms_error(train_tgt, train_preds)

test_preds = dtr.predict(test_ftrs)
test_rmse  = rms_error(test_tgt, test_preds)

report = 'DT-R(depth={:1d}) train/test rmse: {:0.4f} {:0.4f}'
print(report.format(tree_depth, train_rmse, test_rmse))


In [None]:
# decision tree limited to depth 3; depth is named once for model and report
tree_depth = 3
dtr = tree.DecisionTreeRegressor(max_depth=tree_depth)
dtr.fit(train_ftrs, train_tgt)

train_preds = dtr.predict(train_ftrs)
train_rmse  = rms_error(train_tgt, train_preds)

test_preds = dtr.predict(test_ftrs)
test_rmse  = rms_error(test_tgt, test_preds)

report = 'DT-R(depth={:1d}) train/test rmse: {:0.4f} {:0.4f}'
print(report.format(tree_depth, train_rmse, test_rmse))


In [None]:
# other model possibilities:
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

# Part 3:  Cross-Validation and Model Choice

### Cross-Validation

In [None]:
from sklearn.model_selection import KFold

# nine letters stand in for nine examples
seq = np.array([*'abcdefghi'])
print(seq, end='\n\n')

# warning:  here we are shuffling.  see sk docs for more details.
# https://scikit-learn.org/stable/common_pitfalls.html#randomness
three_fold = KFold(n_splits=3, shuffle=True)

# .split() yields one (train positions, test positions) pair per fold
for train, test in three_fold.split(seq):
    print('train:', seq[train])
    print('\ttest:', seq[test])

### Evaluate with Cross-Validation

In [None]:
# cv is left at its default (None), which means 5-fold CV:
# train -> test -> score, five times over
knr_3 = neighbors.KNeighborsRegressor(n_neighbors=3)
skms.cross_val_score(knr_3, housing_ftrs, housing_tgt, scoring=rmse)

In [None]:
# the same five learners compared on the single train-test split above
models = [
    dummy.DummyRegressor(strategy='mean'),
    neighbors.KNeighborsRegressor(n_neighbors=3),
    neighbors.KNeighborsRegressor(n_neighbors=10),
    tree.DecisionTreeRegressor(max_depth=1),
    tree.DecisionTreeRegressor(max_depth=3),
]

for model in models:
    cvs = skms.cross_val_score(model, housing_ftrs, housing_tgt, scoring=rmse)
    summary = 'mean ~ std: {:.3f} ~ {:.3f}'.format(cvs.mean(), cvs.std())
    # (pass cvs to print as well to see the individual fold scores)
    print(model, summary, sep='\n')

### A Good k for Nearest Neighbors (Complexity Curves)

In [None]:
fig, ax = plt.subplots(1,1)
knn = neighbors.KNeighborsRegressor

# candidate k values start at 1: a kNN model cannot be fit with
# n_neighbors=0, and the blanket warning filter would hide that failure
n_neighbors = range(1, 11)
make_complexity_curve(knn(), "KNN", 
                      'n_neighbors', n_neighbors, 
                      housing_ftrs, housing_tgt, 
                      ax=ax);

### Does More Data Help?  (Learning Curves)

In [None]:
# learning curves tell us what happens as we use more and more data to train
# notes:  1. the data is CV-split first and then 2. fractions of that data are used

In [None]:
# a single axes to draw the learning curve on
fig, ax = plt.subplots()
knn = neighbors.KNeighborsRegressor

# learning curve for a 5-nearest-neighbors regressor
knr_5 = knn(n_neighbors=5)
make_learning_curve(knr_5, "KNR(5)", housing_ftrs, housing_tgt, ax=ax);

In [None]:
# and for decision trees: complexity curve on top, learning curve below
fig, axes = plt.subplots(2,1)
dtr = tree.DecisionTreeRegressor

max_depth = range(1,11)
make_complexity_curve(dtr(), "DT-R", 'max_depth', max_depth, 
                      housing_ftrs, housing_tgt, ax=axes[0])

# build the label from the depth actually used, so the plot label and
# the model cannot disagree
focus_depth = 2
make_learning_curve(dtr(max_depth=focus_depth), 
                    "DT-R({})".format(focus_depth), 
                    housing_ftrs, housing_tgt, ax=axes[1])
fig.tight_layout();

# Part 4:  Improving our Fits

### We'll Be Doing This a Lot:  Function-ify!

In [None]:
def do_two_graphs(model, hyper_name, 
                  hyper_values, hyper_focus,
                  ftrs=None, tgt=None):
    """Plot a complexity curve and a learning curve side by side; print CV RMSE.

    Parameters
    ----------
    model : estimator class (not an instance)
        Called as ``model()`` for the complexity curve and as
        ``model(**{hyper_name: hyper_focus})`` for the learning curve and
        the cross-validation score.  Its ``__name__`` is used for labels.
    hyper_name : str
        Name of the hyperparameter to explore.
    hyper_values : iterable
        Values of ``hyper_name`` handed to ``make_complexity_curve``.
    hyper_focus : object
        The single value of ``hyper_name`` used for the learning curve
        and for the printed cross-validation score.
    ftrs, tgt : optional
        Features and target to use.  When left as ``None`` the
        notebook-level ``housing_ftrs`` / ``housing_tgt`` are looked up
        at call time, so existing four-argument calls behave as before.

    Returns
    -------
    None.  Two plots are drawn and one summary line is printed.
    """
    # fall back to the notebook's current data set
    if ftrs is None:
        ftrs = housing_ftrs
    if tgt is None:
        tgt = housing_tgt

    fig, axes = plt.subplots(1,2, figsize=(12,3), sharey=True)
    name = model.__name__
    args = {hyper_name:hyper_focus}
    label = "{}({})".format(name, hyper_focus)

    # left panel: score across all candidate hyperparameter values
    make_complexity_curve(model(), name, 
                          hyper_name, hyper_values, 
                          ftrs, tgt, ax=axes[0])
    # right panel: learning curve with the hyperparameter fixed at the focus value
    make_learning_curve(model(**args), label, 
                        ftrs, tgt, ax=axes[1])
    fig.tight_layout()

    # numeric summary for the focus value
    cvs = skms.cross_val_score(model(**args), 
                               ftrs, tgt, 
                               cv=5, scoring=rmse)
    print('mean CV(5) RMSE for {} {:0.3f}'.format(label, cvs.mean()))

In [None]:
# k starts at 1: a kNN model cannot be fit with n_neighbors=0
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

### Now to Improve!

In [None]:
# FIXME/TODO:  
# so, how to handle these modifications with Cali if i don't want to muck with different sets (!?!)

# not going to use different feature sets
# add notes for students to try with:
# (1) more features
# (2) more examples
# (3) both (more features and more examples!)

# i definitely want to demo selecting features
# (using feature importances)

# ask students to train on all data in a csv then evaluate on a
# separate HOT file


# so, i'm going to demo necessary code on Cali and students will have that for example code.
#     students can:
#     (1) rerun/experiment with code on Cali
#     (2) modify code ("directly") for Ames
#     (3) modify code for Ames and apply additional techniques 
#         (other learners, preprocessing, etc.)

# so my presentations will be:
# (1) conceptual
# (2) Cali code
# (3) [after they work on exercises]
#     (some) amounts of Ames code. 
#     (which should be quick as I have it laid out in ForwardProgress)


### Let's Be Selective about our Features

#### Feature Importances

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# fit a default random forest on the current features ...
forest = RandomForestRegressor().fit(housing_ftrs, housing_tgt)
# ... and label each importance value with its feature (column) name
fis = pd.Series(forest.feature_importances_, index=housing_ftrs.columns)

In [None]:
# one-column table of importances, largest first
imp_df = pd.DataFrame({'ftr_imp':fis})
imp_df = imp_df.sort_values(by='ftr_imp', ascending=False)

# number of features, then the table laid out as a single wide row
print(imp_df.shape[0])
imp_df.transpose()

In [None]:
# imp_df.plot.bar();

In [None]:
# names of the two most important features (imp_df is sorted descending)
hfi = imp_df.index[:2]
hfi

#### Using "good" Features

In [None]:
# NOTE: this rebinds housing_ftrs, so everything below (including
# do_two_graphs, which reads it) now sees only the selected columns
housing_tgt  = cah_df[['MedHouseVal']]
housing_ftrs = cah_df.loc[:, hfi]

In [None]:
# k starts at 1: a kNN model cannot be fit with n_neighbors=0
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

# Bonus: And Some Ensemble Learners

##### Boosted Learner

In [None]:
# boosting reduces bias (allows more complexity)

In [None]:
gbr = ensemble.GradientBoostingRegressor

# candidate ensemble sizes: 10, 20, ..., 90
n_estimators = np.arange(10, 100, 10)
param_grid = {'n_estimators':n_estimators}
gs = skms.GridSearchCV(gbr(), param_grid, scoring=rmse, cv=5)
gs.fit(housing_ftrs, housing_tgt)

# one row per candidate, paired with its mean cross-validated score
cv_results = gs.cv_results_
param_df = pd.DataFrame.from_records(cv_results['params'])
param_df['mean_rmse'] = cv_results['mean_test_score']

# ascending sort puts the smallest scores first
# NOTE(review): assumes the `rmse` scorer reports plain RMSE (lower is
# better) -- confirm against utils
param_df.sort_values('mean_rmse').head(3)

In [None]:
# 5-fold CV for a boosted model with 90 estimators
gbr_90 = gbr(n_estimators=90)
cvs = skms.cross_val_score(gbr_90, housing_ftrs, housing_tgt, 
                           cv=5, scoring=rmse)
print('mean 5-fold CV RMSE:', cvs.mean())

##### Bagged Learner

In [None]:
# RFR (bagging mainly reduces variance)

In [None]:
fig, ax = plt.subplots(figsize=(6,3))
rfr = ensemble.RandomForestRegressor # default = 100 estimators

# learning curve for a default random forest ...
make_learning_curve(rfr(), "RFR(default)", 
                    housing_ftrs, housing_tgt, 
                    ax=ax)

# ... and its 5-fold cross-validated score
cvs = skms.cross_val_score(rfr(), housing_ftrs, housing_tgt, cv=5, scoring=rmse)
print('mean 5-fold CV RMSE:', cvs.mean())

# Next Steps

  * A Book:
    * Machine Learning with Python for Everyone (by me!)
    * https://www.pearson.com/store/p/machine-learning-with-python-for-everyone/P200000009467/9780134845623
  * Videos:
    * Machine Learning in Python for Everyone (Video Collection) (by me!)
    * https://learning.oreilly.com/videos/machine-learning-in/9780138092818/
  * sklearn docs:
    * https://scikit-learn.org/stable/user_guide.html