In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [58]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse, make_scorer, mean_absolute_error as mae

In [15]:
# Load the two CSV datasets used in this notebook:
# tic-tac-toe -> `ttt`, candy -> `candy`.
ttt, candy = (pd.read_csv(csv_path)
              for csv_path in ('./tic-tac-toe.csv', './candy-data.csv'))

# The problems with holdout sets

## Two samples

In [13]:
# draw two 200-row samples of `ttt`, each with its own fixed seed
sample1 = ttt.sample(n=200, random_state=1111)
sample2 = ttt.sample(n=200, random_state=1171)

# count the row labels of sample1 that also appear in sample2
n_shared = sample1.index.isin(sample2.index).sum()
print(f'common observations: {n_shared}\n')

# class balance of the `Class` column in each sample
class_counts1 = sample1['Class'].value_counts()
class_counts2 = sample2['Class'].value_counts()
print(f"Class in sample1:\n{class_counts1}\n")
print(f"Class in sample2:\n{class_counts2}")

common observations: 40

Class in sample1:
positive    134
negative     66
Name: Class, dtype: int64

Class in sample2:
positive    123
negative     77
Name: Class, dtype: int64


Notice that the number of positive observations differs between the two samples. A single holdout test set is therefore often not enough for reliable model validation; you need something more robust.

# Cross-validation

In [16]:
# preview the first five rows of the candy data
candy.head(n=5)

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [45]:
# Features: every column except the candy name and the target.
# NOTE(review): the recorded `candy.head()` output lists an 'Unnamed: 0' column;
# if that is a real CSV column (a saved row index) rather than just the displayed
# index, it is kept as a feature here -- confirm against the CSV.
X = candy.drop(columns=['competitorname', 'winpercent']).to_numpy()

# Target: the `winpercent` column as a NumPy array.
y = candy['winpercent'].to_numpy()

## sklearn's `KFold`

In [27]:
# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# one (train indices, validation indices) pair per fold
splits = kf.split(X)

# report the size of each fold's index arrays
for train_idx, val_idx in splits:
    n_train, n_val = len(train_idx), len(val_idx)
    print(f'No. of training indices: {n_train}')
    print(f'No. of validation indices: {n_val}')

No. of training indices: 68
No. of validation indices: 17
No. of training indices: 68
No. of validation indices: 17
No. of training indices: 68
No. of validation indices: 17
No. of training indices: 68
No. of validation indices: 17
No. of training indices: 68
No. of validation indices: 17


## Using KFold indices

In [48]:
# Random forest *regressor* for the `winpercent` target
# (the variable keeps its original name `rfc`).
rfc = RandomForestRegressor(n_estimators=25,
                            random_state=1111)

# refit on each KFold training split and score on the held-out split
for tr, vl in kf.split(X):
    # split data
    X_train, y_train = X[tr], y[tr]
    X_val, y_val = X[vl], y[vl]

    # fit
    rfc.fit(X_train, y_train)

    # eval: `mse` is mean_squared_error -- an error (lower is better),
    # not an accuracy, so label it as MSE
    pred = rfc.predict(X_val)
    print(f'split MSE: {mse(y_val, pred)}')

split accuracy: 151.5028145199104
split accuracy: 173.4624060357644
split accuracy: 132.7340977072911
split accuracy: 81.50364942339418
split accuracy: 217.17904656079338


# sklearn's `cross_val_score()`

## sklearn's methods

In [53]:
# load modules

## Implement `cross_val_score()`

In [57]:
# wrap mean_squared_error so it can be passed as `scoring`
mse_ = make_scorer(mse)

# fresh regressor with a fixed seed
rfr = RandomForestRegressor(n_estimators=25, random_state=1111)

# 10-fold cross-validation; `cv` holds one MSE value per fold
cv = cross_val_score(rfr, X, y, scoring=mse_, cv=10)

print(f'mean MSE: {cv.mean(): .2f}')

mean MSE:  155.56


# Leave-one-out cross-validation (LOOCV)
When to use?
* The amount of training data is limited
* You want the absolute best error estimate for new data

Be cautious when...
* Computational resources are limited
* You have a lot of data
* You have a lot of parameters to test

In [59]:
# fresh regressor with a fixed seed
rfr = RandomForestRegressor(n_estimators=15, random_state=1111)

# wrap mean_absolute_error so it can be passed as `scoring`
mae_ = make_scorer(mae)

# LOOCV: ask for as many folds as there are rows in X,
# so `scores` holds one absolute-error value per row
n_rows = len(X)
scores = cross_val_score(rfr, X, y, scoring=mae_, cv=n_rows)

print(f'mean MAE: {np.mean(scores): .2f}')
print(f'std of MAE: {np.std(scores): .2f}')

mean MAE:  9.46
std of MAE:  7.27
