In [1]:
import pandas as pd
import yaml
import numpy as np
import random
from math import log2

In [2]:
class Dataset():
    """Tabular dataset plus the column metadata read from a YAML structure file.

    Attributes:
        data: DataFrame read from the delimited text file.
        categorical_features: names of the features whose ``type`` in the
            structure file is anything other than ``'continuous'``.
        continuous_features: names of the features whose ``type`` is
            ``'continuous'``.
        yaml_structure: parsed content of the structure file.
        target_feature: target column name, as passed by the caller.
        target_type: value of ``yaml_structure['target']['type']``.
    """

    # Class-level defaults, kept for backward compatibility. load_dataset and
    # read_structure rebind every one of them on the instance, so the mutable
    # objects below are never shared between instances built via __init__.
    data = pd.DataFrame()
    categorical_features = []
    continuous_features = []
    yaml_structure = {}
    target_feature = ''
    target_type = ''

    def __init__(self, file_dataset_path, file_structure_path, char_separator='\t', target_column='target'):
        """Load the dataset file and then its structure file.

        Args:
            file_dataset_path: path of the CSV/TSV file to read.
            file_structure_path: path of the YAML structure file.
            char_separator: column separator of the dataset file.
            target_column: name stored in ``target_feature``.
        """
        self.load_dataset(file_dataset_path, char_separator)
        self.read_structure(file_structure_path, target_column)

    def load_dataset(self, input_path, separator):
        """Read the CSV/TSV file at ``input_path`` and store it in ``self.data``.

        Args:
            input_path: path of the delimited text file.
            separator: column separator handed to ``pandas.read_csv``.
        """
        self.data = pd.read_csv(input_path, sep=separator)

    def read_structure(self, input_file, target_column):
        """Parse the structure file and classify the features by type.

        Fills ``categorical_features`` and ``continuous_features`` from the
        ``features`` list of the file, and sets ``target_feature`` and
        ``target_type``.

        Args:
            input_file: path of the YAML structure file. It must provide
                ``target.type`` and a ``features`` list whose items carry
                ``name`` and ``type``.
            target_column: name stored in ``target_feature``.

        Raises:
            KeyError: if one of the keys listed above is missing.
        """
        with open(input_file) as f:
            # NOTE(review): FullLoader still builds some Python-specific
            # objects; consider yaml.safe_load if the file may be untrusted.
            self.yaml_structure = yaml.load(f, Loader=yaml.FullLoader)
        self.target_feature = target_column
        self.target_type = self.yaml_structure['target']['type']

        # Fresh lists on every call, so re-reading a structure never
        # accumulates names from a previous read.
        self.categorical_features = []
        self.continuous_features = []

        for feature in self.yaml_structure['features']:
            # Every type other than 'continuous' is treated as categorical.
            if feature['type'] == 'continuous':
                self.continuous_features.append(feature['name'])
            else:
                self.categorical_features.append(feature['name'])

In [3]:
def Boostrap(data, max_test_fraction=0.35, max_attempts=None):
    """Draw one bootstrap resample of ``data`` plus its out-of-bag rows.

    ``m`` row positions are drawn uniformly with replacement (``m`` being the
    number of rows) to form the training set; the positions that were never
    drawn form the test set. A draw is rejected and repeated while the test
    set holds more than ``round(max_test_fraction * m)`` rows.

    Args:
        data: DataFrame to resample; rows are selected by position.
        max_test_fraction: largest accepted out-of-bag share of the rows.
        max_attempts: maximum number of draws, or ``None`` to retry until a
            draw is accepted.

    Returns:
        dict with keys ``'train'`` (``m`` rows, duplicates possible) and
        ``'test'`` (the rows never drawn, in their original order).

    Raises:
        RuntimeError: if ``max_attempts`` draws were all rejected.
    """
    m = data.shape[0]
    max_test_size = round(max_test_fraction * m)

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        # Scalar draws keep the empty-frame case (m == 0) valid: no draw is
        # made at all, instead of asking randint for an empty range.
        index_train = [np.random.randint(0, m) for _ in range(m)]
        drawn = set(index_train)  # constant-time membership test
        index_test = [i for i in range(m) if i not in drawn]

        if len(index_test) <= max_test_size:
            return {'train': data.iloc[index_train], 'test': data.iloc[index_test]}

    raise RuntimeError(
        'no bootstrap sample with at most %d out-of-bag rows found in %d attempts'
        % (max_test_size, max_attempts)
    )

In [4]:
# Relative paths of the wine-recognition data file and of its YAML structure file.
INPUT_PATH = 'data/wine_recognition/wine-recognition.tsv'
STRUCTURE_PATH = 'data/wine_recognition/metadata.yaml'
# Load the tab-separated data and read its structure; 'target' is passed as
# the name of the target column.
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, '\t', 'target')

In [5]:
def K_folds(data,k): # Split a group in k- subgroups
    """Randomly partition the rows of ``data`` into ``k`` disjoint folds.

    The row positions are shuffled with a permutation, so every row lands in
    exactly one fold. The first ``k - 1`` folds hold ``N // k`` rows each and
    the last fold also takes the ``N % k`` leftover rows.

    Args:
        data: DataFrame to split; rows are selected by position.
        k: number of folds.

    Returns:
        dict mapping the fold number ``0 .. k-1`` to a DataFrame.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the row count.
    """
    N = data.shape[0]
    if k < 1 or N < k:
        raise ValueError('cannot split %d rows into %d folds' % (N, k))

    # A permutation, not a draw with replacement: folds must not share rows
    # and no row may be left out.
    shuffled = np.random.permutation(N)
    fold_size = N // k

    k_folds = {}
    for i in range(k):
        start = i * fold_size
        stop = N if i == k - 1 else start + fold_size
        k_folds[i] = data.iloc[shuffled[start:stop]]
    return k_folds
    
def K_folds_final(k_folds,k): # Takes k-1 folds for training, and the remaining fold for testing
    """Build the ``k`` train/test combinations of a k-fold partition.

    For every fold number ``i`` the result holds ``'test'``, which is
    ``k_folds[i]`` itself, and ``'train'``, the remaining folds stacked along
    axis 0 in ascending fold order.

    Args:
        k_folds: mapping from fold number ``0 .. k-1`` to the fold's rows.
        k: number of folds.

    Returns:
        dict mapping each fold number to ``{'train': ..., 'test': ...}``.
    """
    splits = {}
    for held_out in range(0, k):
        # Remaining folds, renumbered 0 .. k-2 in ascending fold order.
        remaining = (fold for fold in range(0, k) if fold != held_out)
        others = {pos: k_folds[fold] for pos, fold in enumerate(remaining)}
        test_part = k_folds[held_out]

        # Stack the remaining folds one after another along the row axis.
        stacked = others[0]
        for pos in range(1, k - 1):
            stacked = np.append(stacked, others[pos], axis=0)

        splits[held_out] = {'train': stacked, 'test': test_part}
    return splits

In [8]:
# Stratified k-fold cross-validation: each class is split into k folds on its
# own, then the i-th folds of the three classes are stacked into fold i.
c1 = obj.data[obj.data.target == 1]  # rows whose target is class 1
c2 = obj.data[obj.data.target == 2]  # rows whose target is class 2
c3 = obj.data[obj.data.target == 3]  # rows whose target is class 3

k = 10  # number of folds

# Per-class partitions into k folds.
k_folds_c1 = K_folds(c1, k)
k_folds_c2 = K_folds(c2, k)
k_folds_c3 = K_folds(c3, k)

# Fold i of the whole dataset: the i-th fold of every class, stacked row-wise.
k_folds = {}
for i in range(0, k):
    per_class = (k_folds_c1[i], k_folds_c2[i], k_folds_c3[i])
    k_folds[i] = np.concatenate(per_class, axis=0)

# Every train/test combination of the k folds.
k_folds_final = K_folds_final(k_folds, k)


In [9]:
# Print the number of training and test rows of every cross-validation split.
# Iterates over k rather than a literal 10, so the cell keeps working when the
# number of folds is changed.
for i in range(k):
    print(len(k_folds_final[i]['train']),len(k_folds_final[i]['test']))

162 16
162 16
162 16
162 16
162 16
162 16
162 16
162 16
162 16
144 34


In [10]:
# size of the dataset
[m,n] = obj.data.shape

N_BOOTSTRAPS = 50  # bootstrap resamples drawn for every cross-validation split

# S_data_boostrap_total[j] holds, for cross-validation split j, the bootstrap
# resamples of its training part ('train') and its held-out fold ('test').
S_data_boostrap_total = {}
for j in range(k):
    aux = pd.DataFrame(k_folds_final[j]['train'])
    # A fresh dict per split: a single shared dict would be stored in every
    # entry, leaving all splits pointing at the resamples of the last one.
    S_data_boostrap = {}
    for i in range(0, N_BOOTSTRAPS):
        S_data_boostrap[i] = Boostrap(aux)
    S_data_boostrap_total[j] = {'train': S_data_boostrap, 'test': pd.DataFrame(k_folds_final[j]['test'])}

In [20]:
# Show the 'test' part of bootstrap sample 0 stored under entry 2 of
# S_data_boostrap_total.
print(S_data_boostrap_total[2]['train'][0]['test'])

      0      1     2     3     4      5     6     7     8     9     10    11  \
0    1.0  14.39  1.87  2.45  14.6   96.0  2.50  2.52  0.30  1.98  5.25  1.02   
2    1.0  13.05  1.73  2.04  12.4   92.0  2.72  3.27  0.17  2.91  7.20  1.12   
3    1.0  13.76  1.53  2.70  19.5  132.0  2.95  2.74  0.50  1.35  5.40  1.25   
5    2.0  11.96  1.09  2.30  21.0  101.0  3.38  2.14  0.13  1.65  3.21  0.99   
7    2.0  12.04  4.30  2.38  22.0   80.0  2.10  1.75  0.42  1.35  2.60  0.79   
15   3.0  13.17  5.19  2.32  22.0   93.0  1.74  0.63  0.61  1.55  7.90  0.60   
19   1.0  14.22  1.70  2.30  16.3  118.0  3.20  3.00  0.26  2.03  6.38  0.94   
21   2.0  12.21  1.19  1.75  16.8  151.0  1.85  1.28  0.14  2.50  2.85  1.28   
23   2.0  12.77  3.43  1.98  16.0   80.0  1.63  1.25  0.43  0.83  3.40  0.70   
25   2.0  12.64  1.36  2.02  16.8  100.0  2.02  1.41  0.53  0.62  5.75  0.98   
26   2.0  11.65  1.67  2.62  26.0   88.0  1.92  1.61  0.40  1.34  2.60  1.36   
28   3.0  12.51  1.24  2.25  17.5   85.0