# Cross Validation

In [44]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

## KFold

In [8]:
# Manual K-fold walk-through, adapted from
# https://campus.datacamp.com/courses/winning-a-kaggle-competition-in-python/dive-into-the-competition?ex=9
# and https://campus.datacamp.com/courses/model-validation-in-python/cross-validation
#
# For plain scoring, cross_val_score is usually the better tool; the explicit
# loop below only shows what each fold contains.
# NOTE(review): `train` is not defined in this cell -- presumably a DataFrame
# with an `interest_level` column loaded elsewhere; confirm.

kf = KFold(n_splits=3, shuffle=True, random_state=123)

for fold, (train_index, test_index) in enumerate(kf.split(train)):
    # split() yields integer row positions, hence positional .iloc lookups
    cv_train = train.iloc[train_index]
    cv_test = train.iloc[test_index]

    # Number of rows in this training fold labelled 'medium'
    medium_count = sum(cv_train.interest_level == 'medium')

    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    print('Medium interest listings in CV train: {}\n'.format(medium_count))

In [41]:
# Tiny, fully visible example: split the integers 0..19 into 4 folds so the
# train/test membership of every fold can be read straight off the output.
print("Data:")
arr = np.arange(20)
print(arr)

# No shuffling here, so the folds are consecutive slices of the data.
# random_state is deliberately omitted: it only has an effect when
# shuffle=True, and scikit-learn >= 0.24 raises a ValueError if it is set
# while shuffle is False.
kf = KFold(n_splits=4)
splits = kf.split(arr)

# Folds are numbered from 1 in the printed report
for fold, (train_index, test_index) in enumerate(splits, start=1):
    cv_train = arr[train_index]
    cv_test = arr[test_index]

    print("\nFold", fold)
    print(" Train (", len(cv_train), "values ):", cv_train)
    print(" Test (", len(cv_test), "values ):", cv_test)

Data:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

Fold 1
 Train ( 15 values ): [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
 Test ( 5 values ): [0 1 2 3 4]

Fold 2
 Train ( 15 values ): [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19]
 Test ( 5 values ): [5 6 7 8 9]

Fold 3
 Train ( 15 values ): [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19]
 Test ( 5 values ): [10 11 12 13 14]

Fold 4
 Train ( 15 values ): [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
 Test ( 5 values ): [15 16 17 18 19]


## Stratified KFold

In [None]:
# Stratified K-fold: folds are built from both the data and the class labels.
# "The general rule is to prefer Stratified K-Fold over usual K-Fold in any classification problem."
# via https://campus.datacamp.com/courses/winning-a-kaggle-competition-in-python/dive-into-the-competition?ex=9
# NOTE(review): `train` is not defined in this cell -- presumably a DataFrame
# with an `interest_level` column loaded elsewhere; confirm.

from sklearn.model_selection import StratifiedKFold

# Three shuffled folds with a fixed seed so the split is repeatable
str_kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# The second argument is the label column the stratification is based on
fold_splits = str_kf.split(train, train['interest_level'])

for fold, (train_index, test_index) in enumerate(fold_splits):
    # Positional row lookups for the training and held-out parts of this fold
    cv_train = train.iloc[train_index]
    cv_test = train.iloc[test_index]

    medium_count = sum(cv_train.interest_level == 'medium')

    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    print('Medium interest listings in CV train: {}\n'.format(medium_count))

## Time Series Split

In [None]:
# Time-ordered cross-validation: each fold trains on earlier rows and tests on
# later ones.
# "It works as expected, training only on the past data and predicting the future."
# via https://campus.datacamp.com/courses/winning-a-kaggle-competition-in-python/dive-into-the-competition?ex=12
# NOTE(review): `train` is not defined in this cell -- presumably a DataFrame
# with a `date` column loaded elsewhere; confirm.

# Create TimeSeriesSplit object
time_kfold = TimeSeriesSplit(n_splits=3)

# Sort train data by date. The splitter works on row positions, so the rows
# must already be in chronological order for "past vs. future" to hold.
train = train.sort_values("date")

# Iterate through each split and report the date span covered by each side
for fold, (train_index, test_index) in enumerate(time_kfold.split(train)):
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]

    print('Fold :', fold)
    print('Train date range: from {} to {}'.format(cv_train.date.min(), cv_train.date.max()))
    print('Test date range: from {} to {}\n'.format(cv_test.date.min(), cv_test.date.max()))

## Cross Val Score

In [None]:
# Score a random-forest regressor with 10-fold cross-validation, using mean
# squared error as the metric.
# NOTE(review): X_train / y_train are not defined in this cell -- presumably
# created elsewhere in the notebook; confirm.

# Wrap the metric function as a scorer. Per the scikit-learn docs, `scoring`
# can be "A str (see model evaluation documentation) or a scorer callable
# object / function with signature scorer(estimator, X, y) which should
# return only a single value."
mse = make_scorer(mean_squared_error)

rfc = RandomForestRegressor(random_state=1111, n_estimators=25)

cv = cross_val_score(
    estimator=rfc,
    X=X_train,
    y=y_train,
    scoring=mse,
    cv=10,
)

# Average the per-fold errors into a single number
print(cv.mean())

## Leave One Out Cross Validation (LOOCV)

* Have as many folds as there are observations
* In each fold, hold out 1 observation as the validation set and train on all the rest
* Good when training data is limited and you want the best error estimate for new data
* It's computationally expensive though
