# SciKit Learn Training - RepeatedKFold

In [1]:
import pandas as pd
import numpy as np

### RepeatedKFold - API

__RepeatedKFold__ - jest wykorzystywany przy walidacji krzyżowej (cross-validation).
Cross Validation - polega na podziale całego zbioru wejściowego na N podzbiorów. Następnie N-1 podzbiorów jest traktowanych jako zbiór treningowy, a pozostały podzbiór jako zbiór testowy (walidacyjny).  
Cały proces uczenia i walidacji jest powtarzany N razy, przy czym za każdym razem jako zbiór testowy wybierany jest inny podzbiór z N (automatycznie zbiór uczący jest przez to inny).  
Dodatkowo cały proces dzielenia zbioru wejściowego na N podzbiorów też może być powtarzany.
Właśnie do tego służy klasa __RepeatedKFold__. Metoda split zwraca indeksy elementów ze zbioru train i test dla każdej iteracji.


In [2]:
from sklearn.model_selection import RepeatedKFold

In [3]:
# Toy dataset: 10 samples with 2 integer features each.
x = np.array([
    [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
    [6, 7], [7, 8], [8, 9], [9, 0], [0, 1],
])
# Labels: the values 0..4, each assigned to two consecutive samples.
y = np.repeat(np.arange(5), 2)

In [8]:
# Build a repeated K-fold splitter; each argument is explained inline.
rkf1 = RepeatedKFold(
    n_splits=2,        # number of folds the input set is divided into (one fold tests, the others train)
    n_repeats=1,       # how many times the whole splitting procedure is repeated (multiplies the number of results)
    random_state=100,  # seed of the random shuffling, so the splits are reproducible
)

In [9]:
# Total number of (train, test) splits this splitter will yield:
# 2 here, because n_splits=2 and the procedure runs once (n_repeats=1).
rkf1.get_n_splits(x, y)

2

In [10]:
# split() does not return a list: it returns a generator, which can be
# iterated only once - call split() again whenever another pass is needed.
split1 = rkf1.split(x) # yields index arrays that select the train and test samples
split1

<generator object _RepeatedSplits.split at 0x7f7560975cf0>

In [11]:
# Every item produced by split() is a (train, test) pair of index arrays.
# Samples are shuffled into folds at random (seeded by random_state).
for train, test in split1:
    print(f'train={train}', f'test={test}', sep='\n')

train=[0 2 3 8 9]
test=[1 4 5 6 7]
train=[1 4 5 6 7]
test=[0 2 3 8 9]


In [12]:
# To make the result easier to read, number each split with enumerate()
# (a single partition of the data is called a "fold").
# The previous generator is exhausted, so a fresh one is requested first.
split1 = rkf1.split(x)
for fold, (train, test) in enumerate(split1):
    print(f'fold={fold}', f'  train={train}', f'  test={test}', sep='\n')

fold=0
  train=[0 2 3 8 9]
  test=[1 4 5 6 7]
fold=1
  train=[1 4 5 6 7]
  test=[0 2 3 8 9]


In [13]:
# 5 folds over 10 samples: every test fold holds 2 samples, the other 8 train.
rkf2 = RepeatedKFold(n_splits=5, n_repeats=1, random_state=100)
split2 = rkf2.split(x)
for fold, (train, test) in enumerate(split2):
    print(f'fold={fold}', f'  train={train}', f'  test={test}', sep='\n')

fold=0
  train=[0 1 2 3 4 5 8 9]
  test=[6 7]
fold=1
  train=[0 2 3 4 6 7 8 9]
  test=[1 5]
fold=2
  train=[0 1 3 5 6 7 8 9]
  test=[2 4]
fold=3
  train=[1 2 4 5 6 7 8 9]
  test=[0 3]
fold=4
  train=[0 1 2 3 4 5 6 7]
  test=[8 9]


In [14]:
# 10 folds over 10 samples: each test fold contains exactly one sample.
rkf10 = RepeatedKFold(n_splits=10, n_repeats=1, random_state=100)
split10 = rkf10.split(x)
for fold, (train, test) in enumerate(split10):
    print(f'fold={fold}', f'  train={train}', f'  test={test}', sep='\n')

fold=0
  train=[0 1 2 3 4 5 6 8 9]
  test=[7]
fold=1
  train=[0 1 2 3 4 5 7 8 9]
  test=[6]
fold=2
  train=[0 2 3 4 5 6 7 8 9]
  test=[1]
fold=3
  train=[0 1 2 3 4 6 7 8 9]
  test=[5]
fold=4
  train=[0 1 2 3 5 6 7 8 9]
  test=[4]
fold=5
  train=[0 1 3 4 5 6 7 8 9]
  test=[2]
fold=6
  train=[1 2 3 4 5 6 7 8 9]
  test=[0]
fold=7
  train=[0 1 2 4 5 6 7 8 9]
  test=[3]
fold=8
  train=[0 1 2 3 4 5 6 7 8]
  test=[9]
fold=9
  train=[0 1 2 3 4 5 6 7 9]
  test=[8]


In [15]:
# Using n_repeats > 1: the 5-fold split is carried out twice, so the
# generator yields 10 (train, test) pairs; `fold` counts across both rounds.
rkf5_r2 = RepeatedKFold(n_splits=5, n_repeats=2, random_state=100)
split5_r2 = rkf5_r2.split(x)
for fold, (train, test) in enumerate(split5_r2):
    print(f'fold={fold}', f'  train={train}', f'  test={test}', sep='\n')

fold=0
  train=[0 1 2 3 4 5 8 9]
  test=[6 7]
fold=1
  train=[0 2 3 4 6 7 8 9]
  test=[1 5]
fold=2
  train=[0 1 3 5 6 7 8 9]
  test=[2 4]
fold=3
  train=[1 2 4 5 6 7 8 9]
  test=[0 3]
fold=4
  train=[0 1 2 3 4 5 6 7]
  test=[8 9]
fold=5
  train=[0 1 2 3 4 5 6 9]
  test=[7 8]
fold=6
  train=[0 1 2 4 6 7 8 9]
  test=[3 5]
fold=7
  train=[0 1 2 3 5 7 8 9]
  test=[4 6]
fold=8
  train=[2 3 4 5 6 7 8 9]
  test=[0 1]
fold=9
  train=[0 1 3 4 5 6 7 8]
  test=[2 9]


In [None]:
# Typical use of the splitter: hand it to cross_validate() as the `cv` strategy.
# The original call referenced names that this notebook never defines or imports
# (cross_validate, model, X, metrics), so it failed with NameError on a fresh kernel.
# Everything needed is set up here, so the cell runs after the data and import cells.
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the estimator and metric below are illustrative choices - swap in the intended ones.
model = KNeighborsClassifier(n_neighbors=1)
metrics = ['accuracy']

# An integer `cv` with a classifier selects stratified K-fold, which rejects this toy `y`
# (only 2 samples per class, fewer than 5 splits), so an explicit RepeatedKFold is passed.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=100)
scores = cross_validate(model, x, y, cv=cv, scoring=metrics)