In [1]:
# general imports usually needed
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sbn
import pandas as pd

In [2]:
# libraries more specific to this lecture notebook
import os.path
import sys
sys.path.append('../../src')
from ml_python_class.config import DATA_DIR
from ml_python_class.custom_funcs import fetch_compressed_data

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Notebook wide global definitions
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

In [4]:
# notebook wide settings to make plots more readable and visually better to understand
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
mpl.rc('figure', titlesize=18)
mpl.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size if not specified in plot

# 2. (Chapter 2) End-to-End Machine Learning Project

To recap, we will recreate all of the steps to download the data, create test and 
training data test sets, and transform the data to make it ready for machine learning
training.


In [5]:
# fetch and uncompress housing data if it needs to be downloaded
housing_file = os.path.join(DATA_DIR, 'housing.csv')
fetch_compressed_data(HOUSING_URL, housing_file)

Received   409600 /   409488 bytes
File 'housing.tgz' successfully downloaded
Successfully extracted 'housing.csv'


In [6]:
# load the csv file into a pandas DataFrame
housing = pd.read_csv(housing_file)

In [7]:
# perform a 20%/80% test/train split using stratified shuffle sampling.
# We sample based on the temporarily defined income_cat categorical variable, to
# ensure we have a similar distribution of incomes in both training and test 
# data

# temporarily create and add an income category with 5 levels for the stratified split
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
# remove the income_cat attribute we no longer need
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

# result is 80% in train and 20% in test, stratified by income category,
# same attributes as in raw data
print(strat_train_set.shape)
print(strat_test_set.shape)

(16512, 10)
(4128, 10)


In [8]:
# strat_train_set and strat_test_set we will leave untouched so we can start from this
# point if needed.

# But to simplify things, we will reuse housing as our variable name, copying the 
# train data set, and splitting out the labels now
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

print(housing.shape)
print(housing_labels.shape)

(16512, 9)
(16512,)


In [9]:
# create our custom transformer to define combined ttributes
# indexes of the attributes we are using to create new combined attributes with
# For a real project, this type of transformer might be added to the project wide 
# module for reuse across notebooks
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Create a new Transformer class to add in combined attributes to the housing
    dataset.  This transformer inherits from both the sklearn BaseEstimator, 
    and the TransformerMixin.  The former automatically creates a `fit_transform()` 
    from our `fit()` and `transform()` methods.  The latter addes in 
    `get_params()` and `set_params()` methods to the transformer.
    """
    
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        """Class meta-parameter add_bedrooms_per_room.  By default this 
        transformer adds this attribute, but we can exclude by setting this 
        class metaparameter to false.
        """
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
        
    def fit(self, X, y=None):
        """Define a fit() function for this transformer.  We don't really have
        anything to do to "fit" this transformer in our case.
        """
        return self # nothing else to do
    
    def transform(self, X, y=None):
        """The purpose of this transformer is to add in the combined attributes
        rooms_per_household, population_per_household, and bedrooms_per_room if 
        desired.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            # we return the original array X with newly created features concatenated to end 
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            # we return the original array only with the default 2 features concatenated to end
            return np.c_[X, rooms_per_household, population_per_household]

In [10]:
# finally the data transformation pipeline.  In a real project we might pull out
# all of the above, and our standard pipeline into a function to delive 
# the cleaned and transformed training data set for our ML training and tuning
# first our numerical pipeline transformer
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [11]:
# then the full pipeline, adding in a transformer to do one-hot encoding on the 
# categorical attribute
num_attribs = list(housing.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])


In [12]:
# do the transformation to get our data, ready and prepared for training
housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared.shape)

(16512, 16)


## 2.5 Select and Train a Model

We are finally ready to try and train some ML models.  A lot of your time as a working 
data scientist would be spent performing the steps to get to this point.  In this class,
our focus is in learning about how various ML models work, and the internals of how they 
are implemented.  So after this example, we will usually start with a data set 
that has been pretty much prepared and transformed, and get right to trying to 
train ML models to model the data.

We will learn about all of the following ML algorithms, so don't try to understand the 
details yet.  We will start with the simplest model, a linear regression model.

All such models from `sklearn` are full estimator - predictors.  The purpose of a supervised 
learning ML algorithm is to generate a hypothesis function $h()$ from the training data, and
use the learned hypothesis to predict the labels or values of unseen data.

So to train `sklearn` models, we first create an instance of the model.  We then `fit()` 
the model to the training data, giving the prepared training data, and since this is 
supervised learning, the expected correct labels our outputs for the training samples.
Once the model has been fitted, we can then evaluate its performance by testing out the
quality of its predictions on our data.

First, create a linear regression instance, and "fit" it to the training data.  This 
causes the model to learn (to the best of the algorithms ability) its hypothesis of the
transformation of inputs to outputs.


In [13]:
# try a linear regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# these are actually the learned or fitted parameters, we will talk about
# this later
print(lin_reg.coef_)
print(lin_reg.intercept_)

[-55650.4116403  -56716.45236929  13732.83841856  -1933.1277138
   7330.04062103 -45708.26306673  45455.47519691  74714.39134154
   6605.12802802   1042.95709453   9249.75886697 -18016.52432168
 -55219.15208555 110357.78363967 -22479.84008184 -14642.2671506 ]
236914.9973281598


We now have a trained model.  We can now test out and evaluate how well
it makes predictions.

We can look at the quality of some of the predictions it makes for the data 
we used in its training.  For example, lets get the first 5 samples of the 
training data, and compare the predictions the model makes with the correct 
labels.  

Notice we are using the estimator part of the object here, the `preedict()`
function to return predictions for a set of input samples.

Also notice that it is critical that new data be put through the 
data transformation pipeline before we run the prediction.  It will of course 
not work if you feed in untransformed input to the model, it has learned the 
mapping from the transformed inputs to the output labels.

In [14]:
# test the model
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)

print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

Predictions:  [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Labels:  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


How does the model do in its predictions here?  You may or may not have had 
some expections on what you would see here.  It works, but the predictions 
don't exactly look real accurate.  The first prediction is off by over \\$76k,
which is about 76/210 or about 35%.

To evaluate the model we need a more formal measure of its performance.
We introduced the RMSE before.  Lets see what the root mean squared error is
over the whole training set.  Notice that the metric from `sklearn` only gives
the mean squared error, so we have to take the final square root ourself to 
get the RMSE.

In [15]:
# measure the regression models RMSE on the training set
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

print(lin_rmse)

68628.19819848922


What does this mean?  Since we took the square root of the average of the 
squared differences, this means that on average our models predictions are 
off by over \\$68k.  

This is an example of a model underfitting the training data.  It is doing 
something, but this may not be a good enough predictor to meet our 
business or data science needs.

To fix an underfitting model we can get better or more features, reduce the 
constraints on the model, or use a better model.  We can't get more data 
here too easily.  We could try and find some better custom features like the 
`bedrooms_per_room` feature we found and added. This model is not regularized, 
so we don't have any constraints we can loosen (we will talk about regularization
later in the class).

So for the moment we are reduced to trying out a more powerful ML modeling 
algorithm.  We will study Decision Trees in this class.  A `DecisionTreeRegressor` is potentially a much more powerful modeler than
a linear regrression.

The pattern will remain the same, fit the model, then use predict on data 
to evaluate model performance.

In [16]:
# try a more complex DecisionTree regressor model
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor()

In [17]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)

0.0


Wow! absolutely no error at all.  This means that the decision tree 
gave an absolute perfect prediction for all of the 16k+ samples we used 
in training.  So we are done right?

Well not really, the textbook is leading you down a socratic path here a bit 
to try and get you to understand the danger of overfitting.  But in general,
a models performance on the data it was trained with means very little.
It tells us nothing, really, about how well the model will really do with 
data it has never seen before.

So now we should pull out the test data set for the evaluation of how well 
the model does on unseen data, right?  Not quite. The pristine test set 
should only be used for the absolute final evaluation.  So if we are going to
train and compare multiple models, we actually need to split the training 
set into 2 pieces, and evaluate how well the different models do on those
parts.  This is known as splitting the training data into training and 
validation sets.  Then when we compare models to one another, we compare 
how well they don on the held back validation data.


### Better Evaluation Using Cross-Validation

We could just use a `train_test_split()` function on the current training
data to split into training and validation data.

A better alternative is to use what is known as **K-fold cross-validation**.
The textbook uses 10-fold (which is pretty standard) as an example. For a
10-fold cross-validation, we break the training data up into 10 roughly 
equal pieces.  Then we fit and evaluate the model 10 different times (and compute the average and variance of the performance across the 10 training 
runs).  Each run we take out 1 of the 10 folds, train with the data from the 
other 9 folds, and evaluate performance on the held out fold.

This gives a much better estimate of how well a model will generalize 
on unseen data.  And also, since we train the model multiple times, we will 
be able to get information about how much variance we can expect to see 
in model performance across different training runs of a model.

`sklearn` makes using K-fold cross-validation relatively straight forward.


In [18]:
# validate the decision tree regressor using K-fold cross-validation with 10 folds
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)

A few technical details here.  For some reason that I could not tell you 
why, this function expects a utility function (where greater is better,
rather than a cost function where smaller is better).  Thus we 
compute the negative of the mean squared error, so that larger values are 
actually closer to 0 and thus better.  To get the actual RMSE result, we just
take the negative of the scores and compute the square root as before.

But notice all of the details of creating the folds, training the 
model, and calculating the scores, is handled by this one function. 
The result is simply a list of scores that we can transform into the RMSE
score for each of the 10 folds that we can most easily understand.

So lets see how the decision tree really performs.

In [19]:
def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Standard Devaition: ', scores.std())
    
display_scores(tree_rmse_scores)

Scores:  [69033.8058949  68162.25430746 70706.76812406 68940.21283964
 72385.23257385 75892.76543723 72250.16684161 71998.98016165
 76560.55250058 68858.38063159]
Mean:  71478.91193125585
Standard Devaition:  2790.68164805698


So yea, not so impressive now.  A bit worse in fact than the linear regressor, 
though the linear regressor would probably fall a bit if we also evaluated 
it with cross-validation.

This says that the average error is a bit over \\$70k for each prediction.
It also say on average prediction errors are off by $\pm2676$ dollars.
The amount of variance you see from cross-validtion can be very 
valuable in letting you know if one model just got unusually lucky or 
unlucky in its training.

We can do the cross validation for linear regression also, just so we can 
compare apples with apples for all of our models.

In [20]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores:  [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean:  69052.46136345083
Standard Devaition:  2731.674001798347


A final note here.  Though the performance looks about the same, it is good
to remember and understand that the models are performing poorly for 
completely different reasons.

We know that the decision tree model is badly overfitting, because it gets 
perfect performance (no error) if evaluated on the data it was trained with.
So you should understand that the scores you get from cross-validation 
above are the scores on the held back, unseen validation data for each 
of the k-fold trained models.  If you look at the training error, you 
would see that the decision tree always gets close to 0 on the data it trained 
with, while the linear regressor is getting a similar high error both for 
the data it trains with and the unseen validation data.

We next try one last model.  We will also talk about ensemble ML models, and
random forests.  A random forest is actually a collection of many
smaller decision trees.  Random forests are usually in practice better performers
than single decision trees.  We can also though more easily tune 
decision trees to prevent overfitting.

In [21]:
# try a random forest regressor to try and fix overfitting of single decision trees
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor()

In [22]:
# warning, this will take significnatly more time than other stuff up to this
# point
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores:  [49448.36824694 47720.58800827 49639.46599524 52008.69428008
 49565.10941345 53498.59965755 48680.77985856 47895.2904889
 52769.33154869 50564.99914358]
Mean:  50179.122664126335
Standard Devaition:  1895.57113723086


This final model does finally show some improvement.  It moves the
average error from around \\$70k to down to \\$50k, which is a significant
improvement in performance of the model.

## 2.6 Fine-Tune Your Model

Once you have an idea of a model or models that might be promising, 
you want to see if  you can tune the models to optimize your 
performance.

Randomly changing meta-parameters or tweaking models will only get you 
so far.  At some point an experienced data scientist will want to take 
a more structured approach to fine tuning models to try and improve their 
performance.

We can think of fine tuning as a type of search optimization, where we have
different types of models, and meta-parameters we can set for the various 
models.  We want to explore the space of models and their tuning parameters
effectively to find the places where optimum performance occurs.

### Grid Search

Grid search of models and parameters is an attempt to exhaustively 
search ranges of combinations of parameters in order to evaluate 
model performance.  

Since the `RandomForestRegressor` looks promising, we might ask, might 
tweaking some of its main meta-parameters result in even better performance.
Our textbook picks several meta-parameters for the example.  Again don't 
worry about what these are right now, we will cover them when we talk
about random forests and ensembles a bit.

What you should understand from this next example is that we define a list of 
3 and 4 parameters to try for the first, and a list of 1, 2 and 3 parameters 
to try the second time.  All combinations of each line in the defined grid 
will be tried.  Thus we will try the $3 \times 4 = 12$ combinations of the 
first grid line, followed by the $1 \times 2 \times 3 = 6$ combinations 
of the second, for a total of 18 different meta-parameter settings.

So here we are not only performing a 5-fold cross validation, but we perfrom
this over all 18 different defined meta-paramter combinations of the grid.

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

The `GridSearchCV` estimator again allows for inspection.  So after completing
the grid search, we can find the scores, and access the best parameters 
found in the search, and the actual trained best estimator.

In [24]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [25]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [26]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

65147.98950297416 {'max_features': 2, 'n_estimators': 3}
55850.822000524095 {'max_features': 2, 'n_estimators': 10}
52972.25599239012 {'max_features': 2, 'n_estimators': 30}
59558.2448846579 {'max_features': 4, 'n_estimators': 3}
52617.09735067804 {'max_features': 4, 'n_estimators': 10}
50592.68794985156 {'max_features': 4, 'n_estimators': 30}
58813.29611864584 {'max_features': 6, 'n_estimators': 3}
52614.65281652817 {'max_features': 6, 'n_estimators': 10}
49983.94015678573 {'max_features': 6, 'n_estimators': 30}
58504.63785823987 {'max_features': 8, 'n_estimators': 3}
52380.935833131094 {'max_features': 8, 'n_estimators': 10}
49959.50605634534 {'max_features': 8, 'n_estimators': 30}
61924.517363363906 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54507.76649556334 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59884.00494717524 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52712.6343130774 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

The best performer with `max_features` of 6 and `n_estimators` of 30
made a slight improvement, breaking \\$50k for the average error.  
But since `n_estimators` as actually the maximum of the specified grid,
it might be worthwhile trying larger values of this parameter in another
grid search at least.

### Analyze the Best Models and their Errors

Another step you should perform is to inspect your best models (so far)
to peak into their innards and try and understand what is making them
work, or not work so well, as the case may be.

A common thing to do along these lines is to determine the relative 
importance of each attribute in making predictions.  Because of how
a random forest work, we can get estimates of the importance of each
feature of our dataset in the hypothesis function that was formed.

In [27]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.47696451e-02, 6.17643775e-02, 4.44793756e-02, 1.63952596e-02,
       1.49236638e-02, 1.50951175e-02, 1.44182782e-02, 3.74471083e-01,
       4.64529462e-02, 1.13691702e-01, 5.95413146e-02, 3.91677490e-03,
       1.65029163e-01, 1.74794176e-04, 1.62594273e-03, 3.25056242e-03])

Lets associate each score with the attribute name, and sort from the 
most important to the least

In [28]:
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3744710829825028, 'median_income'),
 (0.1650291631365935, 'INLAND'),
 (0.11369170164584665, 'pop_per_hhold'),
 (0.0647696450511246, 'longitude'),
 (0.06176437752373177, 'latitude'),
 (0.0595413145828299, 'bedrooms_per_room'),
 (0.04645294618328468, 'rooms_per_hhold'),
 (0.04447937562211069, 'housing_median_age'),
 (0.0163952596004554, 'total_rooms'),
 (0.015095117479236279, 'population'),
 (0.014923663790704509, 'total_bedrooms'),
 (0.014418278182784244, 'households'),
 (0.003916774898643951, '<1H OCEAN'),
 (0.0032505624186449, 'NEAR OCEAN'),
 (0.0016259427257860767, 'NEAR BAY'),
 (0.00017479417572013043, 'ISLAND')]

So not surprisingly, since we knew `median_income` was most highly correlated 
with the housing price, it ends up as the most important feature.  But 
one of the categorical attributes is contributing a lot.  Also the 
two custom derived features, `rooms_per_houshold` and `bedrooms_per_room` 
are contributing some to the model as well.

### Evaluate Your System on the Test Set

After tweaking your models for a while, you eventually 
have a system that is performing sufficiently well.  Now just before 
moving it to production (or publishing your results) it is time 
to make the final evaluation on the completely pristine 
test data set that you have not looked at. 

There is nothing special here, we will use our pipeline to transform 
the data, and use the best model we selected for reporting/production.

In [29]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print(final_rmse)

48481.90596516397


It also helps to know some statistics.  Given the final 
performance predictions, we can calculate confidence intervals
on our systems performance.  From the variation of the errors 
we can compute a t test confidence interval, like the following:

In [30]:
# compute a 95% confidence interval for the genrealization error
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test)**2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

array([46426.34509412, 50453.78968156])

This confidence interval is a very useful final result.  Given the assumption
that the errors are roughly normally distributed, we are 95% confident that 
the true average error of this final model is somewhere within this range.
This means that if we use it on live data, we are pretty sure the average 
predictions are no worse than the upper bounds shown in this 
confidence interval.

In [31]:
# display version information of library versions used in this notebook
from ml_python_class.custom_funcs import version_information
version_information()

              Module   Versions
--------------------   ------------------------------------------------------------
         matplotlib:   ['3.3.0']
              numpy:   ['1.18.5']
             pandas:   ['1.0.5']
            seaborn:   ['0.10.1']
