In [1]:
%matplotlib inline
import numpy as np
import sklearn
from sklearn import linear_model, metrics
import pandas as pd
from matplotlib import pyplot as plt
from mylibs import transform as tf

In [2]:
np.random.seed(0)
x = np.random.rand(20) # 20 valores
x = (x * 100).round(2) # valores até 100
x = np.resize(x, (20, 1))

In [3]:
tf.normalize(x)

array([[0.56025437],
       [0.73661897],
       [0.61748808],
       [0.55612083],
       [0.42766296],
       [0.66316905],
       [0.44239534],
       [0.92379438],
       [1.        ],
       [0.38494966],
       [0.81770005],
       [0.53916269],
       [0.58060413],
       [0.95961844],
       [0.05384208],
       [0.0709062 ],
       [0.        ],
       [0.86104928],
       [0.80339163],
       [0.90068892]])

In [4]:
tf.standardize(x)

array([[-0.11870903],
       [ 0.48434953],
       [ 0.07699507],
       [-0.13284322],
       [-0.5720902 ],
       [ 0.23319593],
       [-0.52171451],
       [ 1.12437442],
       [ 1.38495081],
       [-0.71814345],
       [ 0.761597  ],
       [-0.19082962],
       [-0.04912535],
       [ 1.24687069],
       [-1.85032791],
       [-1.79197909],
       [-2.03443473],
       [ 0.90982474],
       [ 0.71267098],
       [ 1.04536795]])

In [5]:
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [2, 4], [1, 3]])
y = np.array([1, 2, 3, 4, 5, 6])
kf = KFold(n_splits=5)
kf.get_n_splits(X)

5

In [6]:
print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [7]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [2 3 4 5] TEST: [0 1]
TRAIN: [0 1 3 4 5] TEST: [2]
TRAIN: [0 1 2 4 5] TEST: [3]
TRAIN: [0 1 2 3 5] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


In [8]:
def iter_test_indices(X,y=None, groups=None):
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    if self.shuffle:
        check_random_state(self.random_state).shuffle(indices)

    n_splits = self.n_splits
    fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int)
    fold_sizes[:n_samples % n_splits] += 1
    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        yield indices[start:stop]
        current = stop

In [9]:
iter_test_indices(X.size)

<generator object iter_test_indices at 0x0000026489670FC0>

In [26]:
def slipt_k_fold(n_elem, n_splits, shuffle, seed):
    total = [ i for i in range(n_elem)]
    
    if shuffle:
        np.random.shuffle(total)# no array dos índices vamos misturar
    if seed:
        np.random.seed(seed)
    
    fold_sizes = (n_elem // n_splits) * np.ones(n_splits, dtype=np.int)
    fold_sizes[:n_elem % n_splits] += 1
    
    idx_train = [0] * n_splits
    idx_test = [0] * n_splits
    i = 0
    current = 0
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        idx_test[i] = total[start:stop]
        range2 = np.arange(start, stop)
        idx_train[i] = np.delete(total, range2)
        current = stop
        i = i + 1
        
    return idx_test, idx_train

In [27]:
test, train = slipt_k_fold(20, 5, True, 2)

In [28]:
train

[array([ 8, 15,  0, 19, 12,  1, 16, 18, 17, 11, 10,  4,  6,  3,  7,  5]),
 array([13, 14,  2,  9, 12,  1, 16, 18, 17, 11, 10,  4,  6,  3,  7,  5]),
 array([13, 14,  2,  9,  8, 15,  0, 19, 17, 11, 10,  4,  6,  3,  7,  5]),
 array([13, 14,  2,  9,  8, 15,  0, 19, 12,  1, 16, 18,  6,  3,  7,  5]),
 array([13, 14,  2,  9,  8, 15,  0, 19, 12,  1, 16, 18, 17, 11, 10,  4])]

In [29]:
test

[[13, 14, 2, 9],
 [8, 15, 0, 19],
 [12, 1, 16, 18],
 [17, 11, 10, 4],
 [6, 3, 7, 5]]

In [19]:
def __folds(n_elem, n_splits=2, shuffle=True, seed=0):
    total = np.arange(n_elem)
    if shuffle:
        np.random.shuffle(total)
    if seed:
        np.random.seed(seed)
    
    fold_size = n_elem // n_splits
    split_fold = np.arange(0, n_elem, fold_size)[1:n_splits]
    return np.split(total, split_fold)
    
def split_k_fold_gaby(n_elem, n_splits=2, shuffle=True, seed=0):
    '''Separa em K folds'''
    if n_splits < 2:
        raise Exception('Deve ter pelo menos 2 n_plits')
        
    train = []
    test  = []
    total = __folds(n_elem, n_splits, shuffle, seed)
    
    for i in range(n_splits):
        x = np.delete(total, [i])
        if type(x[0]) == np.ndarray:
            x = np.concatenate(x)
        train.append(x)
        test.append(total[i])
    
    return train, test


In [22]:
split_k_fold_gaby(20, 5, True, 2)

([array([ 4, 18,  0,  9,  5,  3, 10,  1, 17,  7, 16, 14,  2, 11,  6, 19, 13,
         15,  8]),
  array([12, 18,  0,  9,  5,  3, 10,  1, 17,  7, 16, 14,  2, 11,  6, 19, 13,
         15,  8]),
  array([12,  4,  0,  9,  5,  3, 10,  1, 17,  7, 16, 14,  2, 11,  6, 19, 13,
         15,  8]),
  array([12,  4, 18,  9,  5,  3, 10,  1, 17,  7, 16, 14,  2, 11,  6, 19, 13,
         15,  8]),
  array([12,  4, 18,  0,  5,  3, 10,  1, 17,  7, 16, 14,  2, 11,  6, 19, 13,
         15,  8])],
 [array([12,  4, 18,  0]),
  array([ 9,  5,  3, 10]),
  array([ 1, 17,  7, 16]),
  array([14,  2, 11,  6]),
  array([19, 13, 15,  8])])