In [56]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

def bootstrapping(X_train, y_train, frac=0.8, seed=0, n_bootstraps=20):
    """Draw bootstrap resamples (rows sampled with replacement) of a training set.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series or pandas.DataFrame
        Training targets, row-aligned with ``X_train``.
    frac : float, default 0.8
        Each resample contains ``int(len(X_train) * frac)`` rows.
    seed : int, default 0
        Seed of the random generator used for the draws.
    n_bootstraps : int, default 20
        Number of resamples to draw.

    Returns
    -------
    X_train_boots : list
        ``n_bootstraps`` resampled feature frames.
    y_train_boots : list
        The matching resampled targets.
    inds_train_boots : list of numpy.ndarray
        Index labels of the rows selected for each resample.
    """
    # A local RandomState yields the same draws as ``np.random.seed(seed)``
    # followed by ``np.random.choice`` but leaves the global NumPy RNG alone,
    # so calling this function no longer reseeds unrelated random code.
    rng = np.random.RandomState(seed)

    inds = np.asarray(X_train.index)
    n = len(X_train)
    num_samples = int(n * frac)

    # Selecting by position keeps every resample at exactly ``num_samples``
    # rows even when the index contains duplicate labels (label-based ``.loc``
    # would return every row carrying a drawn label). If the targets are
    # indexed differently, fall back to the label-based lookup.
    same_index = y_train.index.equals(X_train.index)

    X_train_boots, y_train_boots, inds_train_boots = [], [], []
    for _ in range(n_bootstraps):
        positions = rng.choice(n, num_samples)
        inds_sel = inds[positions]
        X_train_boots.append(X_train.iloc[positions])
        y_train_boots.append(
            y_train.iloc[positions] if same_index else y_train.loc[inds_sel]
        )
        inds_train_boots.append(inds_sel)
    return X_train_boots, y_train_boots, inds_train_boots

def cv_train_test_split(X, y, fold=10, seed=0, n_bootstraps=20, frac=0.8):
    """Split ``X``/``y`` into shuffled K folds and bootstrap each training part.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; rows are selected by position.
    y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    fold : int, default 10
        Number of K-fold splits.
    seed : int, default 0
        Seed used for the K-fold shuffle and passed to ``bootstrapping``.
    n_bootstraps : int, default 20
        Number of bootstrap resamples drawn from each fold's training part.
    frac : float, default 0.8
        Fraction of each fold's training rows contained in one resample.

    Returns
    -------
    X_trains, y_trains : list of list
        One entry per fold; each entry is the list of ``n_bootstraps``
        resampled training features / targets for that fold.
    X_tests, y_tests : list
        One held-out test part per fold.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    # Both inputs are indexed with the same positional indices below, so a
    # length mismatch would silently pair the wrong rows.
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    kf = KFold(n_splits=fold, shuffle=True, random_state=seed)

    X_trains, y_trains, X_tests, y_tests = [], [], [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        print(f"Train init: {len(X_train)}, Test init: {len(X_test)}")

        # Bootstrapping on the training set for each fold.
        # NOTE(review): the same seed is handed to every fold, so each fold
        # draws from an identically seeded generator -- confirm this is
        # intended before relying on independence across folds.
        X_train_boots, y_train_boots, _ = bootstrapping(
            X_train, y_train, frac=frac, seed=seed, n_bootstraps=n_bootstraps
        )

        print(f"Train boots: {len(X_train_boots[0])}")

        X_trains.append(X_train_boots)
        y_trains.append(y_train_boots)
        X_tests.append(X_test)
        y_tests.append(y_test)

        print(f"Train tot: {len(X_trains)}, Test tot: {len(X_tests)}")

    return X_trains, y_trains, X_tests, y_tests

In [57]:
# Toy data: the 100 evenly spaced values 1.0 .. 100.0.
x = np.linspace(start=1, stop=100, num=100)
# The target is the very same array object as the feature (no copy is made).
y = x

In [59]:
# 10-fold split of the toy data, with 20 bootstrap resamples per training part.
X_trains, y_trains, X_tests, y_tests = cv_train_test_split(
    X=pd.DataFrame(x),
    y=pd.Series(y),
    fold=10,
    seed=0,
    n_bootstraps=20,
)

Train init: 90, Test init: 10
Train boots: 72
Train tot: 1, Test tot: 1
Train init: 90, Test init: 10
Train boots: 72
Train tot: 2, Test tot: 2
Train init: 90, Test init: 10
Train boots: 72
Train tot: 3, Test tot: 3
Train init: 90, Test init: 10
Train boots: 72
Train tot: 4, Test tot: 4
Train init: 90, Test init: 10
Train boots: 72
Train tot: 5, Test tot: 5
Train init: 90, Test init: 10
Train boots: 72
Train tot: 6, Test tot: 6
Train init: 90, Test init: 10
Train boots: 72
Train tot: 7, Test tot: 7
Train init: 90, Test init: 10
Train boots: 72
Train tot: 8, Test tot: 8
Train init: 90, Test init: 10
Train boots: 72
Train tot: 9, Test tot: 9
Train init: 90, Test init: 10
Train boots: 72
Train tot: 10, Test tot: 10


In [43]:
# Sanity check: sizes of the train / test parts produced by a shuffled 5-fold split.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

for train_index, test_index in kf.split(x):
    print(*(len(part) for part in (train_index, test_index)))

80 20
80 20
80 20
80 20
80 20
