# Validation

## Splitting data

In [1]:
import numpy as np

X = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 9],
    [7, 7]
])

y = np.array([0, 0, 0, 1, 1, 1])

groups = np.array([0, 1, 2, 2, 1, 0])

### Group k-fold

In [2]:
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=3)

for train_index, test_index in gkf.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [1 2 3 4], TEST: [0 5]


### Group shuffle split

In [3]:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=3, random_state=37)

for train_index, test_index in gss.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]


### Leave one group out

In [4]:
from sklearn.model_selection import LeaveOneGroupOut

logo = LeaveOneGroupOut()

for train_index, test_index in logo.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [1 2 3 4], TEST: [0 5]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 4 5], TEST: [2 3]


### Leave p-groups out

In [5]:
from sklearn.model_selection import LeavePGroupsOut

lpgo = LeavePGroupsOut(n_groups=2)

for train_index, test_index in lpgo.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [2 3], TEST: [0 1 4 5]
TRAIN: [1 4], TEST: [0 2 3 5]
TRAIN: [0 5], TEST: [1 2 3 4]


### Leave one out

In [6]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()

for train_index, test_index in loo.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [1 2 3 4 5], TEST: [0]
TRAIN: [0 2 3 4 5], TEST: [1]
TRAIN: [0 1 3 4 5], TEST: [2]
TRAIN: [0 1 2 4 5], TEST: [3]
TRAIN: [0 1 2 3 5], TEST: [4]
TRAIN: [0 1 2 3 4], TEST: [5]


### Leave p-out

In [7]:
from sklearn.model_selection import LeavePOut

lpo = LeavePOut(p=3)

for train_index, test_index in lpo.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [2 4 5], TEST: [0 1 3]
TRAIN: [2 3 5], TEST: [0 1 4]
TRAIN: [2 3 4], TEST: [0 1 5]
TRAIN: [1 4 5], TEST: [0 2 3]
TRAIN: [1 3 5], TEST: [0 2 4]
TRAIN: [1 3 4], TEST: [0 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]
TRAIN: [1 2 4], TEST: [0 3 5]
TRAIN: [1 2 3], TEST: [0 4 5]
TRAIN: [0 4 5], TEST: [1 2 3]
TRAIN: [0 3 5], TEST: [1 2 4]
TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [0 2 5], TEST: [1 3 4]
TRAIN: [0 2 4], TEST: [1 3 5]
TRAIN: [0 2 3], TEST: [1 4 5]
TRAIN: [0 1 5], TEST: [2 3 4]
TRAIN: [0 1 4], TEST: [2 3 5]
TRAIN: [0 1 3], TEST: [2 4 5]
TRAIN: [0 1 2], TEST: [3 4 5]


### K-fold

In [8]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=2, shuffle=True, random_state=37)

for train_index, test_index in kf.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [0 1 2], TEST: [3 4 5]


### Stratified k-fold

In [9]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=37)

for train_index, test_index in skf.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]


### Shuffle split

In [10]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=2, random_state=37)

for train_index, test_index in ss.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]


### Stratified shuffle split

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = ShuffleSplit(n_splits=2, random_state=37)

for train_index, test_index in sss.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]


### Predefined split

In [12]:
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([1, 1, 0, 1, 1, 0])

ps = PredefinedSplit(test_fold)

for train_index, test_index in ps.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [0 1 3 4], TEST: [2 5]
TRAIN: [2 5], TEST: [0 1 3 4]


In [13]:
test_fold = np.array([0, 1, 2, 0, 1, 2])

ps = PredefinedSplit(test_fold)

for train_index, test_index in ps.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')

TRAIN: [1 2 4 5], TEST: [0 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 3 4], TEST: [2 5]


## K-fold cross validation example

### Data

In [14]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple

def get_data(n_features=20, n_samples=2000, n_missing=100):
    def generate_coordinates(m, n):
        seen = set()

        x, y = randint(0, m - 1), randint(0, n - 1)

        while True:
            seen.add((x, y))
            yield (x, y)
            x, y = randint(0, m - 1), randint(0, n - 1)
            while (x, y) in seen:
                x, y = randint(0, m - 1), randint(0, n - 1)
                
    def make_missing(X):
        coords = generate_coordinates(n_samples, n_features)
    
        for _ in range(n_missing):
            i, j = next(coords)
            X[i][j] = np.nan
    
    X, y = make_classification(**{
        'n_samples': n_samples, 
        'n_features': n_features, 
        'n_informative': 2, 
        'n_redundant': 2, 
        'n_repeated': 0, 
        'n_classes': 2, 
        'n_clusters_per_class': 2, 
        'random_state': 37
    })
    
    make_missing(X)
    return X, y

np.random.seed(37)

X, y = get_data()

### Pipeline

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score

def get_rf_pipeline():
    imputer = IterativeImputer(missing_values=np.nan, random_state=37)
    scaler = StandardScaler()
    pca = PCA(n_components=3, random_state=37)
    rf = RandomForestClassifier(n_estimators=100)

    pipeline = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler), 
        ('pca', pca),
        ('rf', rf)
    ])
    
    return pipeline

def get_lr_pipeline():
    imputer = IterativeImputer(missing_values=np.nan, random_state=37)
    scaler = StandardScaler()
    lr = LogisticRegression(penalty='l1', solver='liblinear')

    pipeline = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler), 
        ('lr', lr)
    ])
    
    return pipeline

### Validation

In [16]:
import pandas as pd

def do_validation(train_index, test_index, X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    rf = get_rf_pipeline()
    rf.fit(X_train, y_train)
    
    y_preds = rf.predict_proba(X_test)[:,1]
    
    rf_roc, rf_pr = roc_auc_score(y_test, y_preds), average_precision_score(y_test, y_preds)

    lr = get_lr_pipeline()
    lr.fit(X_train, y_train)
    
    y_preds = lr.predict_proba(X_test)[:,1]
    
    lr_roc, lr_pr = roc_auc_score(y_test, y_preds), average_precision_score(y_test, y_preds)
    
    return rf_roc, lr_roc, rf_pr, lr_pr

kf = KFold(n_splits=10, shuffle=True, random_state=37)

results = [do_validation(train_index, test_index, X, y) 
           for train_index, test_index in kf.split(X, y)]
df = pd.DataFrame(results, columns=['rf_roc', 'lr_roc', 'rf_pr', 'lr_pr'])
df

Unnamed: 0,rf_roc,lr_roc,rf_pr,lr_pr
0,0.943409,0.948017,0.915508,0.909468
1,0.965886,0.963285,0.950626,0.940054
2,0.973927,0.976379,0.97609,0.976078
3,0.957862,0.972275,0.949563,0.967354
4,0.974789,0.956824,0.980452,0.949806
5,0.96722,0.971374,0.971068,0.976793
6,0.977564,0.969651,0.955176,0.945222
7,0.970403,0.955929,0.974535,0.94496
8,0.992799,0.990399,0.994749,0.98986
9,0.939094,0.940994,0.924518,0.928002


In [17]:
df.mean()

rf_roc    0.966295
lr_roc    0.964513
rf_pr     0.959228
lr_pr     0.952760
dtype: float64