In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%config InlineBackend.figure_format = 'retina'

In [None]:
from sklearn import (datasets, dummy, ensemble,
                     linear_model, metrics,
                     model_selection as skms,
                     naive_bayes, neighbors, tree)

In [None]:
from utils import (make_learning_curve, make_complexity_curve, 
                   rms_error, rmse,
                   manage_ames_nans, manage_ames_ordinal)

In [None]:
import warnings
# Blanket-suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings the estimators below may emit — confirm that is intended.
warnings.filterwarnings("ignore")
# Seed NumPy's legacy global RNG with 42.
# NOTE(review): presumably this is what keeps the unseeded splits/forests below repeatable — confirm.
np.random.seed(42)

# Part 1:  Fit-Predict-Evaluate

### Exercise 1:
  * Part A:
    * Read in the data from `datasets/housing_small.csv` using `pandas`.
    * There is a target in the `Target` column.  Make that the output target and everything else the input features.
    *  Build a 3-nearest neighbor model and train it on that entire dataset.
    *  Make predictions on that same dataset.
    *  Evaluate the predictions using root-mean-squared-error.
  * Part B:
    * Read in the data from `datasets/housing_small.csv` using `pandas`.
    * There is a target in the `Target` column.  Make that the output target and everything else the input features.
    * Make a training and testing set from that dataset.
    * Build a 3-nearest neighbor model and train it on the training set.
    * With that trained model, make predictions on both the training and testing sets.
    * Evaluate the predictions using root-mean-squared-error.

### Part A: Simple sklearn (in-sample only)

In [None]:
# Load the small housing dataset; the trailing bare expression displays its column labels.
housing_small_df = pd.read_csv('datasets/housing_small.csv')
housing_small_df.columns

In [None]:
# Inputs: every column except 'Target'.
housing_ftrs = housing_small_df.drop(columns='Target')
# Output: the double brackets select a one-column DataFrame rather than a Series.
housing_tgt  = housing_small_df[['Target']]

In [None]:
# 3-nearest-neighbor regressor, trained and evaluated on the same rows (in-sample only).
knn   = neighbors.KNeighborsRegressor(n_neighbors=3)
# `fit` holds whatever knn.fit returns; it is not referenced again in the visible cells.
fit   = knn.fit(housing_ftrs, housing_tgt)
preds = knn.predict(housing_ftrs)

In [None]:
# or a "one-liner" (broken up for readability)
# Builds a fresh 3-NN regressor, fits it, and predicts on the same rows;
# this rebinds `preds` from the previous cell.
preds = (neighbors.KNeighborsRegressor(n_neighbors=3)
                  .fit(housing_ftrs, housing_tgt)
                  .predict(housing_ftrs))

In [None]:
# In-sample error of the 3-NN predictions; rms_error comes from the local utils module.
rms_error(housing_tgt, preds)

### Part B: Simple sklearn (train-test)

In [None]:
# Reload the small dataset and echo its column labels.
housing_small_df = pd.read_csv('datasets/housing_small.csv')
print(housing_small_df.columns)

# The target stays a one-column DataFrame; every other column is an input feature.
housing_tgt  = housing_small_df[['Target']]
housing_ftrs = housing_small_df.drop(columns='Target')

# Split features and target together, with test_size=.33 for the held-out part.
train_ftrs, test_ftrs, train_tgt, test_tgt = skms.train_test_split(
    housing_ftrs, housing_tgt, test_size=.33
)

In [None]:
# Fit a 3-NN regressor on the training split only.
knr = neighbors.KNeighborsRegressor(n_neighbors=3).fit(train_ftrs, train_tgt)

# Predict on both splits: the training rows first, then the held-out rows.
train_preds, test_preds = knr.predict(train_ftrs), knr.predict(test_ftrs)

In [None]:
# Error on the rows the model was fitted on.
train_rmse = rms_error(train_tgt, train_preds)
print('in-sample train rmse: {:0.4f}'.format(train_rmse))

In [None]:
# Error on the held-out rows the model did not see during fitting.
test_rmse = rms_error(test_tgt, test_preds)
print('test rmse: {:0.4f}'.format(test_rmse))

# Part 2:  Comparing Models on TTS

### Exercise 2:
  * Part A:
    * On a train-test split built from `datasets/housing_small.csv`, fit and predict using a `dummy.DummyRegressor`.
    * Compute the root-mean-squared-error (RMSE) for training and testing.
  * Part B:
    * Create a train-test split from `datasets/housing_small.csv`.
    * Build and evaluate three different nearest neighbor models (varying the number of neighbors) using RMSE.
  * Part C:
    * Create a train-test split from `datasets/housing_small.csv`.
    * Build and evaluate three different decision tree models (varying the depth of the tree) using RMSE.

### Part A: Baseline `Predict-the-Mean` Model

In [None]:
# Baseline model; strategy='mean' is DummyRegressor's default.
# Reuses the train/test split created in Part 1.
base = dummy.DummyRegressor(strategy='mean').fit(train_ftrs, train_tgt)

# Rebind the prediction names: training rows first, then held-out rows.
train_preds, test_preds = base.predict(train_ftrs), base.predict(test_ftrs)

In [None]:
# Baseline error on the training split.
train_rmse = rms_error(train_tgt, train_preds)
print('in-sample train rmse: {:0.4f}'.format(train_rmse))

In [None]:
# Baseline error on the held-out split.
test_rmse = rms_error(test_tgt, test_preds)
print('test rmse: {:0.4f}'.format(test_rmse))

### Part B: Two Nearest Neighbors Models

In [None]:
# Compare nearest-neighbor regressors at two neighborhood sizes on the same split.
for n_neighbors in (3, 10):
    knr = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors).fit(train_ftrs, train_tgt)

    # Predictions, then errors, each in train-then-test order.
    train_preds, test_preds = knr.predict(train_ftrs), knr.predict(test_ftrs)
    train_rmse, test_rmse = (rms_error(train_tgt, train_preds),
                             rms_error(test_tgt, test_preds))

    print('kNN(k={:2d}) train/test rmse: {:0.4f} {:0.4f}'
          .format(n_neighbors, train_rmse, test_rmse))


### Part C: Two Decision Tree Models

In [None]:
# Compare decision-tree regressors at two depth limits on the same split.
for max_depth in (1, 3):
    dtr = tree.DecisionTreeRegressor(max_depth=max_depth).fit(train_ftrs, train_tgt)

    # Predictions, then errors, each in train-then-test order.
    train_preds, test_preds = dtr.predict(train_ftrs), dtr.predict(test_ftrs)
    train_rmse, test_rmse = (rms_error(train_tgt, train_preds),
                             rms_error(test_tgt, test_preds))

    print('DT-R(depth={:1d}) train/test rmse: {:0.4f} {:0.4f}'
          .format(max_depth, train_rmse, test_rmse))


# Part 3: Cross-Validation and Model Choice

### Exercise 3:
  * Part A:
    * Use `skms.cross_val_score` (imported above) to evaluate the RMSE of a 3-nearest neighbors model on `datasets/housing_small.csv`.  You can use `scoring=rmse` to have `cross_val_score` return the necessary values.
    * Use `skms.cross_val_score` to evaluate the RMSE of the models you built in Exercise 2.
  * Part B:
    * Still working with `datasets/housing_small.csv`, find a good value for the number of neighbors by using `make_complexity_curve`.
    * With the good number of neighbors, generate a learning curve with `make_learning_curve`.
  * Part C:
    * Repeat Part B using a decision tree.

### Part A: Cross-Validation

In [None]:
# The baseline plus the kNN and tree variants from Exercise 2, in the same order.
models = ([dummy.DummyRegressor(strategy='mean')]
          + [neighbors.KNeighborsRegressor(n_neighbors=k) for k in (3, 10)]
          + [tree.DecisionTreeRegressor(max_depth=d) for d in (1, 3)])

# Cross-validate each model on the full feature/target frames and print,
# one item per line: the model, its per-fold scores, and their mean and std.
for model in models:
    cvs = skms.cross_val_score(model, housing_ftrs, housing_tgt, scoring=rmse)
    print(model,
          cvs,
          'mean ~ std: {:.3f} ~ {:.3f}'.format(cvs.mean(), cvs.std()),
          sep='\n')

### Part B: A Good Hyper + A Learning Curve (kNN)

In [None]:
# Two stacked panels: a complexity curve over n_neighbors (top) and a
# learning curve at n_neighbors=5 (bottom).
fig, axes = plt.subplots(2,1)
knn = neighbors.KNeighborsRegressor   # the class itself; instantiated per use below

# Start the sweep at 1: a nearest-neighbor model needs at least one neighbor,
# so 0 is not a usable setting.
n_neighbors = range(1, 11)
make_complexity_curve(knn(), "KNN", 'n_neighbors', n_neighbors,
                      housing_ftrs, housing_tgt, ax=axes[0])

make_learning_curve(knn(n_neighbors=5), "KNR(5)", housing_ftrs, housing_tgt, ax=axes[1])
fig.tight_layout();

### Part C: A Good Hyper + A Learning Curve (DT)

In [None]:
# Two stacked panels: a complexity curve over max_depth (top) and a
# learning curve at max_depth=2 (bottom).
dtr = tree.DecisionTreeRegressor   # the class itself; instantiated per use below
max_depth = range(1, 11)

fig, axes = plt.subplots(nrows=2, ncols=1)
make_complexity_curve(
    dtr(), "DT-R", 'max_depth', max_depth,
    housing_ftrs, housing_tgt,
    ax=axes[0],
)
make_learning_curve(
    dtr(max_depth=2), "DT-R(2)",
    housing_ftrs, housing_tgt,
    ax=axes[1],
)
fig.tight_layout();

### Add-On:  We'll Be Doing This a Lot:  Function-ify!

In [None]:
def do_two_graphs(model, hyper_name,
                  hyper_values, hyper_focus,
                  ftrs=None, tgt=None, cv=5):
    """Draw a complexity curve and a learning curve side by side, then print a CV score.

    Parameters
    ----------
    model : class
        Estimator class (not an instance).  It is called with no arguments
        for the complexity curve and with ``{hyper_name: hyper_focus}`` for
        the learning curve and the cross-validation run.  ``model.__name__``
        is used in plot labels and the printed message.
    hyper_name : str
        Name of the hyperparameter being studied.
    hyper_values : iterable
        Values of ``hyper_name`` handed to ``make_complexity_curve``.
    hyper_focus
        Single value of ``hyper_name`` used for the learning curve and for
        the cross-validation score.
    ftrs, tgt : optional
        Features and target to evaluate on.  When left as ``None`` the
        notebook-level ``housing_ftrs`` / ``housing_tgt`` are looked up at
        call time, which is the original behavior.
    cv : int, default 5
        Passed as ``cv`` to ``skms.cross_val_score`` and echoed in the
        printed message.

    Returns
    -------
    None
        The figure is created as a side effect and the mean score is printed.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    # Compare against None explicitly: a DataFrame has no unambiguous truth value.
    if ftrs is None:
        ftrs = housing_ftrs
    if tgt is None:
        tgt = housing_tgt

    fig, axes = plt.subplots(1,2, figsize=(12,3), sharey=True)
    name = model.__name__
    args = {hyper_name:hyper_focus}

    # Left panel: the sweep over hyper_values.
    make_complexity_curve(model(), name,
                          hyper_name, hyper_values,
                          ftrs, tgt, ax=axes[0])

    # Right panel: the learning curve at the single focus value.
    label = "{}({})".format(name, hyper_focus)
    make_learning_curve(model(**args), label,
                        ftrs, tgt, ax=axes[1])
    fig.tight_layout()

    # Summarize the focus setting with one cross-validated number.
    cvs = skms.cross_val_score(model(**args),
                               ftrs, tgt,
                               cv=cv, scoring=rmse)
    print('mean CV({}) RMSE for {} {:0.3f}'.format(cv, label, cvs.mean()))

In [None]:
# Same analysis as the two preceding sections, via the helper.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

# Part 4: Now to Improve!

### Exercise 4:
  *  Part A:
      * We can train pretty well with more complex models, but they are overfitting. Can we use more examples to smooth things out?  Using the data in `datasets/housing_tall.csv`:
        * Reevaluate our baseline mean-only model.
        * Find a good nearest neighbors model and build a learning curve for it.
        * Find a good decision tree model and build a learning curve for it.
  *  Part B:
      * Does adding more features improve our results?  We'll go back to fewer examples, but use a lot more features.  Using the data in `datasets/housing_wide.csv`:
        * Find a good nearest neighbors model and build a learning curve for it.
        * Find a good decision tree model and build a learning curve for it.
  *  Part C:
      * Does it help to be selective about our features?  Using a `RandomForestRegressor` along with `feature_importances_` identify a top-10 set of features and use those to build a model.
  *  Part D:
      * Does using a lot of features and a lot of examples help?  Using the data in `datasets/housing_full.csv`:
        * Find good nearest neighbor and decision tree models.
        * Determine if selecting a top-10 set of features (as in Part C) helps.
  * Part E:
    * How have we done overall?  Using the best model you found for `housing_wide.csv` or `housing_full.csv`, train that model on *all* of the data in that `.csv` file.  Evaluate that trained model on the data in `datasets/housing_hot_wide.csv`.

### Part A: More Examples

In [None]:
# Load the taller dataset and report how many columns it has.
housing_tall_df = pd.read_csv('datasets/housing_tall.csv')
print(len(housing_tall_df.columns))

# Sanity check: same columns as the small dataset, plus both row counts.
# Index.equals returns False when the column counts differ, whereas an
# element-wise == between Index objects of different lengths raises.
print(housing_small_df.columns.equals(housing_tall_df.columns),
      len(housing_small_df),
      len(housing_tall_df))

# Rebind the notebook-wide features/target to the tall dataset.
housing_ftrs = housing_tall_df.drop(columns='Target')
housing_tgt  = housing_tall_df[['Target']]

In [None]:
# with more examples, we should have a better estimate of the mean so we need to redo baseline
#           also: let's use cv to estimate error so we are 100% comparing apples to apples

In [None]:
# Re-estimate the predict-the-mean baseline with 5-fold cross-validation on
# whatever housing_ftrs/housing_tgt currently hold.
base = dummy.DummyRegressor(strategy='mean')
cvs = skms.cross_val_score(base, 
                           housing_ftrs, housing_tgt, 
                           cv=5, scoring=rmse)
print('baseline mean 5-fold CV RMSE:', cvs.mean())

In [None]:
# Runs against whatever housing_ftrs/housing_tgt currently hold.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

### Part B: More Features (back to shorter dataset)

In [None]:
# Load the wide dataset and report how many columns it has.
housing_wide_df = pd.read_csv('datasets/housing_wide.csv')
print(len(housing_wide_df.columns))

# Rebind the notebook-wide features/target to the wide dataset;
# the target stays a one-column DataFrame.
housing_ftrs = housing_wide_df.drop(columns='Target')
housing_tgt  = housing_wide_df[['Target']]

In [None]:
# Runs against whatever housing_ftrs/housing_tgt currently hold.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

### Part C: Let's Be Selective about our Features

##### Feature Importances

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a default random forest on the current features and target.
forest = RandomForestRegressor()
forest.fit(housing_ftrs, housing_tgt)

# Label each importance score with the feature column it belongs to.
fis = pd.Series(forest.feature_importances_, index=housing_ftrs.columns)

In [None]:
# Rank the features by importance, largest first.
imp_df = (pd.DataFrame({'ftr_imp':fis})
            .sort_values(by='ftr_imp', ascending=False))
print(len(imp_df))
# Display the ten highest-ranked rows, transposed so the features run across the columns.
imp_df[:10].T

In [None]:
#imp_df[:10].plot.bar();

In [None]:
# Labels of the ten highest-ranked features; later cells use them to select columns.
hfi = imp_df[:10].index
hfi

##### Using "good" Features

In [None]:
# Restrict the notebook-wide features to the ten selected columns of the wide dataset.
housing_ftrs = housing_wide_df[hfi]
housing_tgt  = housing_wide_df[['Target']]

In [None]:
# Runs against whatever housing_ftrs/housing_tgt currently hold.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

### Part D:  All the Data

In [None]:
# Load the full dataset and report how many columns it has.
housing_full_df = pd.read_csv('datasets/housing_full.csv')
print(len(housing_full_df.columns))

# Rebind the notebook-wide features/target to the full dataset.
housing_ftrs = housing_full_df.drop(columns='Target')
housing_tgt  = housing_full_df[['Target']]

In [None]:
# Runs against whatever housing_ftrs/housing_tgt currently hold.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

##### Zoom In (Again) on Good Features

In [None]:
# Refit a default random forest, this time on the full dataset's features and target.
forest = RandomForestRegressor()
forest.fit(housing_ftrs, housing_tgt)

# Label each importance score with the feature column it belongs to.
fis = pd.Series(forest.feature_importances_, index=housing_ftrs.columns)

In [None]:
# Rank the features by importance, largest first (rebinds imp_df from Part C).
imp_df = (pd.DataFrame({'ftr_imp':fis})
            .sort_values(by='ftr_imp', ascending=False))
print(len(imp_df))
# Display the ten highest-ranked rows, transposed so the features run across the columns.
imp_df[:10].T

In [None]:
# Labels of the ten highest-ranked features (rebinds hfi from Part C).
hfi = imp_df[:10].index
hfi

In [None]:
# Restrict the notebook-wide features to the ten selected columns of the full dataset.
housing_ftrs = housing_full_df[hfi]
housing_tgt  = housing_full_df[['Target']]

In [None]:
# Runs against whatever housing_ftrs/housing_tgt currently hold.
# The kNN sweep starts at 1 because 0 neighbors is not a usable setting.
do_two_graphs(neighbors.KNeighborsRegressor, 'n_neighbors', range(1,11), 5)
do_two_graphs(tree.DecisionTreeRegressor, 'max_depth', range(1,11), 2)

### Part E: Train on All Data and Evaluate on Hold-Out Test Set

In [None]:
# Load the hold-out test data.
housing_hot_df = pd.read_csv('datasets/housing_hot_wide.csv')

# Keep only the columns named in hfi so the hold-out features line up with the training features.
# NOTE(review): selecting [hfi] already excludes other columns, so the drop looks redundant — confirm.
hot_ftrs = housing_hot_df.drop(columns='Target')[hfi]
hot_tgt  = housing_hot_df[['Target']]

print(len(hot_ftrs.columns))

In [None]:
# Train a 5-NN regressor on all of the current housing_ftrs/housing_tgt
# (in notebook order, the hfi columns of the full dataset) and predict the hold-out rows.
hot_preds = (neighbors.KNeighborsRegressor(n_neighbors=5)
                    .fit(housing_ftrs, housing_tgt)
                    .predict(hot_ftrs))

In [None]:
# Hold-out error of the 5-NN model.
rms_error(hot_tgt, hot_preds)

# Bonus:  And Some Ensemble Learners

##### Boosted Learner

In [None]:
# boosting reduces bias (allows more complexity, less underfit)

In [None]:
gbr = ensemble.GradientBoostingRegressor   # the class itself; instantiated per use below

# Candidate ensemble sizes: 10, 20, ..., 90.
n_estimators = np.arange(1,10) * 10
gs = skms.GridSearchCV(gbr(), {'n_estimators':n_estimators}, scoring=rmse, cv=5)
gs.fit(housing_ftrs, housing_tgt)

# One row per candidate with its mean cross-validated score; display the three smallest.
# NOTE(review): assumes the `rmse` scorer from utils reports positive RMSE, so smallest == best — confirm.
param_df = pd.DataFrame.from_records(gs.cv_results_['params'])
param_df['mean_rmse'] = gs.cv_results_['mean_test_score']
param_df.sort_values('mean_rmse').head(3)

In [None]:
# 5-fold CV score for a boosted model with a hard-coded 90 estimators.
# NOTE(review): presumably 90 was read off the grid-search table above — verify it still matches.
cvs = skms.cross_val_score(gbr(n_estimators=90), 
                           housing_ftrs, housing_tgt, 
                           cv=5, scoring=rmse)
print('mean 5-fold CV RMSE:', cvs.mean())

##### Bagged Learner

In [None]:
# RFR (bagging reduces bias and variance:  improves both under and overfitting)

In [None]:
# Learning curve for a default random forest, followed by its 5-fold CV score.
fig, ax = plt.subplots(1,1, figsize=(6,3))
rfr = ensemble.RandomForestRegressor # default = 100 estimators
# Pass the axes created above explicitly instead of relying on pyplot's "current axes".
make_learning_curve(rfr(), "RFR(default)", housing_ftrs, housing_tgt, ax=ax)

cvs = skms.cross_val_score(rfr(),
                           housing_ftrs, housing_tgt,
                           cv=5, scoring=rmse)
print('mean 5-fold CV RMSE:', cvs.mean())

# Train on All Data and Evaluate on Hold-Out Test Set

In [None]:
# Reload the hold-out test data (same file and selection as in Part E).
housing_hot_df = pd.read_csv('datasets/housing_hot_wide.csv')

# Keep only the columns named in hfi so the hold-out features line up with the training features.
hot_ftrs = housing_hot_df.drop(columns='Target')[hfi]
hot_tgt  = housing_hot_df[['Target']]

print(len(hot_ftrs.columns))

In [None]:
# Train a default random forest on all of the current housing_ftrs/housing_tgt
# and predict the hold-out rows; rebinds hot_preds from Part E.
hot_preds = (ensemble.RandomForestRegressor()
                    .fit(housing_ftrs, housing_tgt)
                    .predict(hot_ftrs))

In [None]:
# Hold-out error of the random-forest model.
rms_error(hot_tgt, hot_preds)