# K fold from unbalanced dataset

the aim of this function is to define of which k_fold is associated each sample, by maintaining the proportions for each k_fold of the classes of the original dataset.

In [1]:
import pandas as pd
import numpy as np

In [5]:
def sampler_cross_validation_of_unbalanced_dataset(y, K, classes = np.array([0, 1, 2]), random_state = 123):
    '''this function return for each sample the associated K, 
    where each K will have the same the same proportion of classes'''
    
    #init the array with the labels
    k_label = np.zeros(y.shape[0])
    
    #set seed
    np.random.seed(random_state)
    
    #for each class, I split it equally in K classes
    for c in classes:
        #obtain the indexes of c elements
        c_idx = y == c #boolean array
        k_label[c_idx] = np.random.randint(K, size=sum(c_idx))#generate random fold
    
    return k_label

def sampler_report(y, k_label, classes = np.array([0, 1, 2])):
    '''show the proporions for each K-fold
    '''
    
    #obrain the number of K
    K = int(k_label.max()) + 1
    
    print("num of K-fold", K)
    
    #
    for k in range(K):
        #fold 
        print('\n Fold n ', k)
        k_idx = k_label == k
        y_fold = y[k_idx] #get the y of the k fold
        print('Num of samples', y_fold.shape[0])
        for c in classes:
            print('Class ', c, ':', (y_fold == c).sum()/y_fold.shape[0])
        

In [10]:
#test 
y = pd.read_csv('data/unbalanced_labels.csv')['2']

sampler_report(y,sampler_cross_validation_of_unbalanced_dataset(y, 5))

num of K-fold 5

 Fold n  0
Num of samples 4880
Class  0 : 0.05758196721311475
Class  1 : 0.7690573770491803
Class  2 : 0.17336065573770493

 Fold n  1
Num of samples 4949
Class  0 : 0.06061830672863205
Class  1 : 0.7639927258031926
Class  2 : 0.1753889674681754

 Fold n  2
Num of samples 5094
Class  0 : 0.05182567726737338
Class  1 : 0.7767962308598351
Class  2 : 0.17137809187279152

 Fold n  3
Num of samples 5016
Class  0 : 0.061004784688995214
Class  1 : 0.7812998405103668
Class  2 : 0.15769537480063797

 Fold n  4
Num of samples 4843
Class  0 : 0.05760892009085278
Class  1 : 0.7805079496180054
Class  2 : 0.16188313029114185
