In [1]:
import pandas as pd
import yaml
import numpy as np
import random
from math import log2, sqrt
from dataset import Dataset
from node_tree import Node

# Gain Ratio

In [2]:
def get_gain_ratio(df, TGT_COL='target', EVAL_COL='', CATEGORICAL=False):
#     print(df)
    uniq_tgts = df[TGT_COL].unique()
    uniq_d_a_vals = df[EVAL_COL].unique() #uniq_evals
    len_d_tot = len(df)
    
    # Validate if len(df)>0, else return 0
    if len_d_tot == 0: return 0
    
    sel_cut_point = None
    
    inf_d_tot = 0
    for curr_tgt in uniq_tgts:
        curr_len = len(df.loc[df[TGT_COL] ==curr_tgt])
        if curr_len!=0:
            inf_d_tot -= (curr_len/len_d_tot)*log2(curr_len/len_d_tot) ## Respect to Target Column
    
    gain_ratio_a = 0
    if CATEGORICAL is True:
        inf_d_a = 0
        split_info_a = 0
        for curr_d_aj in uniq_d_a_vals: #curr_eval
            '''Gain A: Begin'''
            len_d_aj = len(df[df[EVAL_COL] == curr_d_aj]) #len_eval
            inf_d_aj = 0
            for curr_tgt in uniq_tgts:
                curr_len = len(df.loc[(df[EVAL_COL] == curr_d_aj) & (df[TGT_COL] ==curr_tgt)])
                if curr_len != 0 and len_d_aj != 0:
                    inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
    #         print(f'curr_val: {curr_eval}, value: {curr_info}')
            inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
            '''Gain A: End'''
            '''Split Info A: Begin'''
            if len_d_aj != 0 and len_d_tot != 0:
                split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)  ## Respect to column to be split
            '''Split Info A: End'''
        gain_a = inf_d_tot - inf_d_a
        if split_info_a != 0:
            gain_ratio_a = gain_a / split_info_a
        else:
            gain_ratio_a = 0
#         print(f'{EVAL_COL}:   inf_d_tot: {inf_d_tot}, inf_d_a: {inf_d_a}, gain_a: {gain_a}, split_info_a: {split_info_a}, gain_ratio: {gain_ratio_a}')
    else: 
        ### Mean for cut point:
        mean_val_point = df[EVAL_COL].mean()
    
#         for curr_cut_point in qt_points:
        inf_d_a = 0
        split_info_a = 0
        '''Gain A: Begin'''
        '''    < cut_point '''
        len_d_aj = len(df[df[EVAL_COL] < mean_val_point]) #len_eval
        inf_d_aj = 0
        for curr_tgt in uniq_tgts:
            curr_len = len(df.loc[(df[EVAL_COL] < mean_val_point) & (df[TGT_COL] ==curr_tgt)])
            if curr_len != 0 and len_d_aj != 0:
                inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#         print(f'curr_val: {curr_eval}, value: {curr_info}')
        inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
        '''   >= cut_point '''
        len_d_aj =  len_d_tot - len_d_aj
        inf_d_aj = 0
        for curr_tgt in uniq_tgts:
            curr_len = len(df.loc[(df[EVAL_COL] >= mean_val_point) & (df[TGT_COL] ==curr_tgt)])
            if curr_len != 0 and len_d_aj != 0:
                inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#         print(f'curr_val: {curr_eval}, value: {curr_info}')
        inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
        '''Gain A: End'''
        '''Split Info A: Begin'''
        if len_d_aj != 0 and len_d_tot != 0 and len_d_aj != len_d_tot:
            #if len_d_aj < 10: print(f'values: {list(df[EVAL_COL])}')
            #print(f'col2:{EVAL_COL} len_d_aj: {len_d_aj}, len_d_tot: {len_d_tot}, point:{mean_val_point}')
            len_d_aj = len(df[df[EVAL_COL] < mean_val_point]) #len_eval
            split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
            len_d_aj = len_d_tot - len_d_aj
            split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
        '''Split Info A: End'''
        gain_a = inf_d_tot - inf_d_a
        if split_info_a != 0: 
            gain_ratio_a = gain_a / split_info_a
        else:
            gain_ratio_a = 0
        sel_cut_point = mean_val_point
#         print(f'For cut_point: {mean_val_point}, inf_d_tot: {inf_d_tot}, inf_d_a: {inf_d_a}, gain_a: {gain_a}, split_info_a: {split_info_a}, gain_ratio: {gain_ratio_a}')
            
    return (gain_ratio_a, sel_cut_point)

In [3]:
'''
cols: [(nom_col, True), (nom_col1, True), (nom_col2, False) ... ]  ........ True if categorical, Otherwise False
'''
def select_best_column(df, tgt_col='target', cols=[], m=5):
    best_param = ('None', 0, 0)
    rand_cols = random.sample(cols, k=m)
#     print(f'rand_cols: {rand_cols}')
    for col in rand_cols:
        curr_entr, cut_point = get_gain_ratio(df, TGT_COL=tgt_col, EVAL_COL=col[0], CATEGORICAL=col[1])
        if best_param[2] <= curr_entr:
            best_param = (col[0], cut_point, curr_entr)
    return best_param
    

In [4]:
def build_tree(parent_node, df, tgt_col, cols, m):
#     print(df)
    # (col_name, cut_point, score)
    best_param = select_best_column(df, tgt_col=tgt_col, cols=cols, m=m)
#     print(f'best_param: {best_param}')
    new_cols = []
    for c in cols:
        if c[0] != best_param[0]:
            new_cols.append(c)
#     print(f'new_cols: {new_cols}')
    
    '''Numerical'''
    if best_param[1] is not None: # Numerical
        parent_node.set_children_type('N')
               
        ''' Left child '''
        df_temp = df[df[best_param[0]] < best_param[1]]
        data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': best_param[1], 'check_less': True}
        tgt_uniq_vals = df_temp[tgt_col].unique()

        data_temp['answer'] = max(set(list(df_temp[tgt_col])), key = list(df_temp[tgt_col]).count)
        
        if len(tgt_uniq_vals) == 1:  # Is leaf
            data_temp['is_leaf'] = True
            data_temp['answer'] = tgt_uniq_vals[0]

        node_temp = Node(data_temp)
        parent_node.insert_node(node_temp)

        if len(tgt_uniq_vals) > 1:
#             print(f'1. tgt_uniq_vals: {list(df_temp[tgt_col])}')
            build_tree(node_temp, df_temp, tgt_col, new_cols, m)
            
        ''' Right child '''
        df_temp = df[df[best_param[0]] >= best_param[1]]
        data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': best_param[1], 'check_less': False}
        tgt_uniq_vals = df_temp[tgt_col].unique()

        data_temp['answer'] = max(set(list(df_temp[tgt_col])), key = list(df_temp[tgt_col]).count)
        
        if len(tgt_uniq_vals) == 1:  # Is leaf
            data_temp['is_leaf'] = True
            data_temp['answer'] = tgt_uniq_vals[0]

        node_temp = Node(data_temp)
        parent_node.insert_node(node_temp)

        if len(tgt_uniq_vals) > 1:
#             print(f'2. tgt_uniq_vals: {list(df_temp[tgt_col])}')
            build_tree(node_temp, df_temp, tgt_col, new_cols, m)
    else:
        '''Categorical'''
        parent_node.set_children_type('C')
        uniq_vals = df[best_param[0]].unique()
        
        for val in uniq_vals:
            df_temp = df[df[best_param[0]] == val]
            data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': val, 'check_less': None}
            tgt_uniq_vals = df_temp[tgt_col].unique()
            
            data_temp['answer'] = max(set(list(df_temp[tgt_col])), key = list(df_temp[tgt_col]).count)
            
            if len(tgt_uniq_vals) == 1:  # Is leaf
                data_temp['is_leaf'] = True
                data_temp['answer'] = tgt_uniq_vals[0]
            
            node_temp = Node(data_temp)
            parent_node.insert_node(node_temp)
            
            if len(tgt_uniq_vals) > 1:
#                 print(f'cat: {list(df_temp[tgt_col])}')
                build_tree(node_temp, df_temp, tgt_col, new_cols, m)
        
#     print(f'best_param: {best_param}')

In [5]:
def generate_matrix(obj, cols, k, n_tree):
    k_folds_strat = obj.get_n_class_k_final_folds(k)
    num_test_max = max(len(k_folds_strat[k-1]["test"]), len(k_folds_strat[k-2]["test"]))
#     mat = np.zeros((k, n_tree, num_test_max))
    mat = np.full((k, n_tree, num_test_max), -1)
    mat_real_value_by_fold = np.full((k, num_test_max), -1)
    mat_real_value_by_fold.astype(int)
    for i in range(k): # Iterate over k folds
        curr_boots = obj.get_n_bootstrap(k_folds_strat[i]["train"], n_tree)
        print(f'\n____Fold{i}____')
        for j in range(len(curr_boots)): # Iterate over each tree
            print(f'__TREE{j}__', end='')
            # Declare and initialize root tree node 
            ans = max(set(list(curr_boots[j]['train']['target'])), key = list(curr_boots[j]['train']['target']).count)
            data_root = {'is_leaf': False, 'answer': ans, 'column': None, 'value_to_check': None, 'check_less': None}
            root_node = Node(data_root)
            num_rand_cols = int(sqrt(len(cols)))
            build_tree(root_node, curr_boots[j]['train'], 'target', cols, num_rand_cols)
#             root_node.print_tree(level=0) # print the tree structure
            k_it = 0
            for idx, row in k_folds_strat[i]['test'].iterrows():
                mat[i][j][k_it] = root_node.get_classification(row)
                mat_real_value_by_fold[i][k_it] = row[obj.target_feature]
                k_it+=1


    
    return (mat, mat_real_value_by_fold)

# WINE DATASET

In [6]:
'''
INITIAL PARAMETERS

'''
INPUT_PATH = 'data/wine_recognition/wine-recognition.tsv'
STRUCTURE_PATH = 'data/wine_recognition/metadata.yaml'
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, '\t', 'target')
random.seed(1)
k = 10
n_tree = 50

cols = []
cols = cols + (list(zip([str(j['name']) for j in obj.categorical_features], [True for i in range(len(obj.categorical_features))])))
cols = cols + (list(zip([str(j['name']) for j in obj.continuous_features], [False for i in range(len(obj.continuous_features))])))

(mat_pred_1, mat_real_1) = generate_matrix(obj, cols, k, n_tree)


____Fold0____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48____TREE49__
____Fold1____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48__

In [7]:
temp = mat_real_1[0]
c1 = temp[temp==1] 
c2 = temp[temp==2]
c3 = temp[temp==3]
print(c1,c2,c3)
print(c1.shape[0],c2.shape[0],c3.shape[0])
temp_aprox = mat_pred_1[0,2]
c1_aprox = temp_aprox[0:c1.shape[0]]
c2_aprox = temp_aprox[c1.shape[0]:c1.shape[0]+c2.shape[0]]
c3_aprox = temp_aprox[c2.shape[0]+c1.shape[0]:c3.shape[0]+c2.shape[0]+c1.shape[0]]
c_aprox = {1:c1_aprox,2:c2_aprox,3:c3_aprox}
print(c1_aprox,c2_aprox,c3_aprox)
# Compute of the confusition matrix
M_C = np.zeros((3,3))
cont1 = 0
cont2 = 0
cont3 = 0
for i in range(1,4):
    c = temp[temp==i]
    c1_aprox = c_aprox[i]
    for j in range(0,c.shape[0]):
        if c1_aprox[j]==1:
            cont1 = cont1+1
        elif c1_aprox[j]==2:
             cont2 = cont2+1
        elif c1_aprox[j]==3:
             cont3 = cont3+1
    M_C[i-1,0] = cont1
    M_C[i-1,1] = cont2
    M_C[i-1,2] = cont3
    cont  = 0
    cont1 = 0
    cont2 = 0
    cont3 = 0
# we calculate the Accuracy
error = np.array(np.where(mat_real_1[0]!=mat_pred_1[0,0]))
n = np.where(mat_real_1[0]!=-1)
Accuracy = 1-error.shape[1]/len(n[0])

### Reduce of the confusition matrix for a two dimensional matrix
## +--
c_p  = M_C[0,0]
M_c_2d = np.zeros((2,2))
M_c_2d[0,0] = M_C[0,0]
M_c_2d[1,1] = np.sum(M_C[1:3,1:3])
M_c_2d[0,1] = np.sum(M_C[0,1:2])
M_c_2d[1,0] = np.sum(M_C[1:2,0])
## -+-
c_p  = M_C[0,0]
M_c_2d1 = np.zeros((2,2))
M_c_2d1[0,0] = M_C[1,1]
M_c_2d1[1,1] = np.sum([M_C[0,0],M_C[0,1],M_C[2,0],M_C[2,2]])
M_c_2d1[0,1] = np.sum([M_C[1,0],M_C[1,2]])
M_c_2d1[1,0] = np.sum([M_C[0,1],M_C[2,1]])
## --+
c_p  = M_C[0,0]
M_c_2d2 = np.zeros((2,2))
M_c_2d2[0,0] = M_C[2,2]
M_c_2d2[1,1] = np.sum(M_C[0:2,0:2])
M_c_2d2[0,1] = np.sum(M_C[2,0:2])
M_c_2d2[1,0] = np.sum(M_C[0:2,2])


## computing recall and precision for F_measure
beta = 1
recall1  =  M_c_2d[0,0]/np.sum(M_c_2d[0,0:3]) 
precition1 = M_c_2d[0,0]/np.sum(M_c_2d[0:3,0]) 
f_m1 =  (1+beta**2)*precition1*recall1/(beta**2*precition1+recall1)
print(f_m1)
recall2  =  M_c_2d1[0,0]/np.sum(M_c_2d1[0,0:3]) 
precition2 = M_c_2d1[0,0]/np.sum(M_c_2d1[0:3,0]) 
f_m2 =  (1+beta**2)*precition2*recall2/(beta**2*precition2+recall2)
print(f_m2)
recall3  =  M_c_2d2[0,0]/np.sum(M_c_2d2[0,0:3]) 
precition3 = M_c_2d2[0,0]/np.sum(M_c_2d2[0:3,0]) 
f_m3 =  (1+beta**2)*precition3*recall3/(beta**2*precition3+recall3)
print(f_m3)

[1 1 1 1 1] [2 2 2 2 2 2 2] [3 3 3 3]
5 7 4
[1 1 1 2 1] [2 2 2 2 3 2 2] [3 3 2 2]
0.888888888888889
0.75
0.5714285714285715


In [213]:
M_C

array([[5., 0., 0.],
       [0., 7., 0.],
       [2., 1., 1.]])

In [75]:
print(mat_real_1[0,0])
print(mat_pred_1)

1
[[[ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  ...
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]]

 [[ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  ...
  [ 2  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]]

 [[ 1  2  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  ...
  [ 2  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]]

 ...

 [[ 1  1  1 ... -1 -1 -1]
  [ 1  2  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  ...
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]]

 [[ 1  1  1 ... -1 -1 -1]
  [ 1  1  2 ... -1 -1 -1]
  [ 1  1  2 ... -1 -1 -1]
  ...
  [ 1  1  2 ... -1 -1 -1]
  [ 1  2  1 ... -1 -1 -1]
  [ 1  1  1 ... -1 -1 -1]]

 [[ 1  1  1 ...  3  3  3]
  [ 1  1  1 ...  3  3  3]
  [ 1  1  1 ...  3  2  2]
  ...
  [ 1  1  1 ...  3  3  3]
  [ 1  1  2 ...  3  3  3]
  [ 1  1  1 ...  3  3  3]]]


In [81]:
M_C = np.zeros((10,50,3,3))
Accuracy = np.zeros(10)
beta = 1.0
Metrics = {}
for ii in range(0,len(mat_real_1)):
    temp = mat_real_1[ii]
    #print(temp)
    c1 = temp[temp==1] 
    c2 = temp[temp==2]
    c3 = temp[temp==3]
    #print(c1,c2,c3,c1.shape[0],c2.shape[0],c3.shape[0])
    for jj in range(0,mat_pred_1.shape[1]):
        temp_aprox = mat_pred_1[ii,jj]
        c1_aprox = temp_aprox[0:c1.shape[0]]
        c2_aprox = temp_aprox[c1.shape[0]:c1.shape[0]+c2.shape[0]]
        c3_aprox = temp_aprox[c1.shape[0]+c2.shape[0]:c3.shape[0]+c2.shape[0]+c1.shape[0]]
        c_aprox = {1:c1_aprox,2:c2_aprox,3:c3_aprox}
        # Compute of the confusition matrix
        #print(c1_aprox,c2_aprox,c3_aprox)  
        cont  = 0
        cont1 = 0
        cont2 = 0
        cont3 = 0
        # number of class in the dataset
        for i in range(1,4):
            c = temp[temp==i]
            c1_aprox = c_aprox[i]
            for j in range(0,c.shape[0]):
                if c1_aprox[j]==1:
                    cont1 = cont1+1
                elif c1_aprox[j]==2:
                     cont2 = cont2+1
                elif c1_aprox[j]==3:
                     cont3 = cont3+1
            M_C[ii,jj,i-1,0] = cont1
            M_C[ii,jj,i-1,1] = cont2
            M_C[ii,jj,i-1,2] = cont3
            cont  = 0
            cont1 = 0
            cont2 = 0
            cont3 = 0
            # we calculate the Accuracy
            error = np.array(np.where(mat_real_1[ii]!=mat_pred_1[ii,jj]))
            n = np.where(mat_real_1[0]!=-1)
            Accuracy[ii] = 1-error.shape[1]/len(n[0])

        ### Reduce of the confusition matrix for a two dimensional matrix
        ## +--
        M_c_2d = np.zeros((2,2))
        M_c_2d[0,0] = M_C[ii,jj,0,0]
        M_c_2d[1,1] = np.sum(M_C[ii,jj,1:3,1:3])
        M_c_2d[0,1] = np.sum(M_C[ii,jj,0,1:2])
        M_c_2d[1,0] = np.sum(M_C[ii,jj,1:2,0])
        ## -+-
        M_c_2d1 = np.zeros((2,2))
        M_c_2d1[0,0] = M_C[ii,jj,1,1]
        M_c_2d1[1,1] = np.sum([M_C[ii,jj,0,0],M_C[ii,jj,0,1],M_C[ii,jj,2,0],M_C[ii,jj,2,2]])
        M_c_2d1[0,1] = np.sum([M_C[ii,jj,1,0],M_C[ii,jj,1,2]])
        M_c_2d1[1,0] = np.sum([M_C[ii,jj,0,1],M_C[ii,jj,2,1]])
        ## --+
        M_c_2d2 = np.zeros((2,2))
        M_c_2d2[0,0] = M_C[ii,jj,2,2]
        M_c_2d2[1,1] = np.sum(M_C[ii,jj,0:2,0:2])
        M_c_2d2[0,1] = np.sum(M_C[ii,jj,2,0:2])
        M_c_2d2[1,0] = np.sum(M_C[ii,jj,0:2,2])
        
        mctotal = M_c_2d+ M_c_2d1+M_c_2d2
        ## computing recall and precision for F_measure
       
        recall1  =  M_c_2d[0,0]/(np.sum(M_c_2d[0,0:3]) + 0.0000001)
        precition1 = M_c_2d[0,0]/(np.sum(M_c_2d[0:3,0])+ 0.0000001)
        f_m1 =  (1.0+beta**2)*precition1*recall1/(beta**2*precition1+recall1)
        recall2  =  M_c_2d1[0,0]/(np.sum(M_c_2d1[0,0:3])+ 0.0000001 )
        precition2 = M_c_2d1[0,0]/(np.sum(M_c_2d1[0:3,0]) + 0.0000001)
        f_m2 =  (1+beta**2)*precition2*recall2/(beta**2*precition2+recall2)
        recall3  =  M_c_2d2[0,0]/(np.sum(M_c_2d2[0,0:3]) + 0.0000001)
        precition3 = M_c_2d2[0,0]/(np.sum(M_c_2d2[0:3,0]) + 0.0000001)
        f_m3 =  (1+beta**2)*precition3*recall3/(beta**2*precition3+recall3)
        
        recall0  =  mctotal[0,0]/(np.sum(mctotal[0,0:3]) + 0.0000001)
        precition0 = mctotal[0,0]/(np.sum(mctotal[0:3,0])+ 0.0000001)
        f_m0 =  (1.0+beta**2)*precition0*recall0/(beta**2*precition0+recall0)
        
        Metrics[ii,jj]= {'0':f_m1,'1':f_m2,'2':f_m3,'3':np.mean([f_m1,f_m2,f_m3]),'4':f_m0}

        

In [82]:
print(Metrics)

{(0, 0): {'0': 0.9999999800000003, '1': 0.9999999857142858, '2': 0.9999999750000006, '3': 0.9999999802380956, '4': 0.9999999937499999}, (0, 1): {'0': 0.8888888691358029, '1': 0.8571428448979593, '2': 0.8888888691358029, '3': 0.8783068610565218, '4': 0.8749999945312501}, (0, 2): {'0': 0.8888888691358029, '1': 0.7499999906250002, '2': 0.5714285551020414, '3': 0.7367724716209482, '4': 0.7499999953125}, (0, 3): {'0': 0.5714285551020414, '1': 0.7777777691358027, '2': 0.857142832653062, '3': 0.7354497189636353, '4': 0.7499999953125}, (0, 4): {'0': 0.7272727140495869, '1': 0.7142857040816327, '2': 0.857142832653062, '3': 0.7662337502614273, '4': 0.7499999953125}, (0, 5): {'0': 0.9090908925619838, '1': 0.8333333194444447, '2': 0.8888888691358029, '3': 0.8771043603807439, '4': 0.8749999945312501}, (0, 6): {'0': 0.8888888691358029, '1': 0.8235294020761247, '2': 0.6666666444444452, '3': 0.7930283052187909, '4': 0.812499994921875}, (0, 7): {'0': 0.7272727140495869, '1': 0.7692307573964497, '2': 0.

In [204]:
## +--
c_p  = M_C[0,0]
M_c_2d = np.zeros((2,2))
M_c_2d[0,0] = M_C[0,0]
M_c_2d[1,1] = np.sum(M_C[1:3,1:3])
M_c_2d[0,1] = np.sum(M_C[0,1:2])
M_c_2d[1,0] = np.sum(M_C[1:2,0])
## -+-
c_p  = M_C[0,0]
M_c_2d1 = np.zeros((2,2))
M_c_2d1[0,0] = M_C[1,1]
M_c_2d1[1,1] = np.sum([M_C[0,0],M_C[0,1],M_C[2,0],M_C[2,2]])
M_c_2d1[0,1] = np.sum([M_C[1,0],M_C[1,2]])
M_c_2d1[1,0] = np.sum([M_C[0,1],M_C[2,1]])
## --+
c_p  = M_C[0,0]
M_c_2d2 = np.zeros((2,2))
M_c_2d2[0,0] = M_C[2,2]
M_c_2d2[1,1] = np.sum(M_C[0:2,0:2])
M_c_2d2[0,1] = np.sum(M_C[2,0:2])
M_c_2d2[1,0] = np.sum(M_C[0:2,2])

## computing recall and precision for F_measure
beta = 1
recall1  =  M_c_2d[0,0]/np.sum(M_c_2d[0,0:3]) 
precition1 = M_c_2d[0,0]/np.sum(M_c_2d[0:3,0]) 
f_m1 =  (1+beta**2)*precition1*recall1/(beta**2*precition1+recall1)
print(f_m1)
recall2  =  M_c_2d1[0,0]/np.sum(M_c_2d1[0,0:3]) 
precition2 = M_c_2d1[0,0]/np.sum(M_c_2d1[0:3,0]) 
f_m2 =  (1+beta**2)*precition2*recall2/(beta**2*precition2+recall2)
print(f_m2)
recall3  =  M_c_2d2[0,0]/np.sum(M_c_2d2[0,0:3]) 
precition3 = M_c_2d2[0,0]/np.sum(M_c_2d2[0:3,0]) 
f_m3 =  (1+beta**2)*precition3*recall3/(beta**2*precition3+recall3)
print(f_m3)

1.0
0.9333333333333333
0.8571428571428571


In [205]:
M_C

array([[5., 0., 0.],
       [0., 7., 0.],
       [0., 1., 3.]])

In [184]:
print(M_c_2d)
print(M_c_2d1)
print(np.sum(M_c_2d2))
print(np.zeros(10))

[[ 5.  0.]
 [ 0. 11.]]
[[7. 0.]
 [1. 8.]]
16.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [175]:
beta = 1
recall1  =  M_c_2d[0,0]/np.sum(M_c_2d[0,0:3]) 
precition1 = M_c_2d[0,0]/np.sum(M_c_2d[0:3,0]) 
f_m1 =  (1+beta**2)*precition1*recall1/(beta**2*precition1+recall1)
print(f_m1)
recall2  =  M_c_2d1[0,0]/np.sum(M_c_2d1[0,0:3]) 
precition2 = M_c_2d1[0,0]/np.sum(M_c_2d1[0:3,0]) 
f_m2 =  (1+beta**2)*precition2*recall2/(beta**2*precition2+recall2)
print(f_m2)
recall3  =  M_c_2d2[0,0]/np.sum(M_c_2d2[0,0:3]) 
precition3 = M_c_2d2[0,0]/np.sum(M_c_2d2[0:3,0]) 
f_m3 =  (1+beta**2)*precition3*recall3/(beta**2*precition3+recall3)
print(f_m3)

1.0
0.9333333333333333
0.8571428571428571


In [149]:
error = np.array(np.where(mat_real_1[0]!=mat_pred_1[0,0]))
n = np.where(mat_real_1[0]!=-1)
Accuracy = 1-error.shape[1]/len(n[0])
print(error.shape,Accuracy)
print(len(n[0]))
error = np.where(mat_real_1[0]!=mat_pred_1[0,0])
print(error)
print(15/16)

(1, 1) 0.9375
16
(array([12]),)
0.9375


# HOUSE VOTES

In [10]:
'''
INITIAL PARAMETERS

'''
INPUT_PATH = 'data/house_votes_84/house-votes-84.tsv'
STRUCTURE_PATH = 'data/house_votes_84/metadata.yaml'
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, '\t', 'target')
random.seed(1)
k = 10
n_tree = 50

cols = []
cols = cols + (list(zip([str(j['name']) for j in obj.categorical_features], [True for i in range(len(obj.categorical_features))])))
cols = cols + (list(zip([str(j['name']) for j in obj.continuous_features], [False for i in range(len(obj.continuous_features))])))

(mat_pred_2, mat_real_2) = generate_matrix(obj, cols, k, n_tree)


____Fold0____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48____TREE49__
____Fold1____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48__

In [11]:
mat_real_2

array([[ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
 

In [53]:
mat_pred_2[0,0]

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1])

In [84]:
M_C = np.zeros((10,50,2,2))
Accuracy = np.zeros(10)
beta = 1.0
Metric = np.zeros((10,50))
for ii in range(0,len(mat_real_2)):
    temp = mat_real_2[ii]
    c1 = temp[temp==0] 
    c2 = temp[temp==1]
    #c3 = temp[temp==2]
    for jj in range(0,mat_pred_2.shape[1]):
        temp_aprox = mat_pred_2[ii,jj]
        c1_aprox = temp_aprox[0:c1.shape[0]]
        c2_aprox = temp_aprox[c1.shape[0]:c1.shape[0]+c2.shape[0]]
        #c3_aprox = temp_aprox[c2.shape[0]+c1.shape[0]:c3.shape[0]+c2.shape[0]+c1.shape[0]]
        c_aprox = {0:c1_aprox,1:c2_aprox}
        # Compute of the confusition matrix
   
        cont1 = 0
        cont2 = 0
        # number of class in the dataset
        for i in range(0,2):
            c = temp[temp==i]
            c1_aprox = c_aprox[i]
            for j in range(0,c.shape[0]):
                if c1_aprox[j]==0:
                    cont1 = cont1+1
                elif c1_aprox[j]==1:
                     cont2 = cont2+1
            M_C[ii,jj,i,0] = cont2
            M_C[ii,jj,i,1] = cont1
            
            cont1 = 0
            cont2 = 0
            # we calculate the Accuracy
            error = np.array(np.where(mat_real_1[ii]!=mat_pred_1[ii,jj]))
            n = np.where(mat_real_1[0]!=-1)
            Accuracy[ii] = 1-error.shape[1]/len(n[0])

              
        ## computing recall and precision for F_measure
       
        recall1  =  M_C[ii,jj,0,0]/(np.sum(M_C[ii,jj,0,0:2]) + 0.0000001)
        precition1 = M_C[ii,jj,0,0]/(np.sum(M_C[ii,jj,0:2,0])+ 0.0000001)
        f_m =  (1.0+beta**2)*precition1*recall1/(beta**2*precition1+recall1)
        Metric[ii,jj]= f_m

        

In [85]:
print(Metric)

[[0.74418604 0.79069767 0.76190476 0.76190476 0.76190476 0.74418604
  0.76190476 0.73170731 0.76190476 0.73170731 0.73170731 0.76190476
  0.76190476 0.74418604 0.76190476 0.72727272 0.73170731 0.76190476
  0.79069767 0.76190476 0.76190476 0.76190476 0.76190476 0.73170731
  0.76190476 0.71428571 0.76190476 0.76190476 0.71428571 0.76190476
  0.76190476 0.79069767 0.79069767 0.71428571 0.76190476 0.79069767
  0.76190476 0.76190476 0.76190476 0.76190476 0.7        0.73170731
  0.76190476 0.73170731 0.73170731 0.73170731 0.76190476 0.76190476
  0.74418604 0.74418604]
 [0.71428571 0.73170731 0.74418604 0.7        0.71428571 0.74418604
  0.79069767 0.79069767 0.79069767 0.73170731 0.7        0.74418604
  0.77272727 0.79069767 0.74418604 0.73170731 0.74418604 0.76190476
  0.72727272 0.79069767 0.66666666 0.76190476 0.68292683 0.74418604
  0.76190476 0.74418604 0.76190476 0.74418604 0.71428571 0.74418604
  0.76190476 0.73170731 0.79069767 0.76190476 0.76190476 0.76190476
  0.76190476 0.79069767

In [87]:
print(M_C[0,0])

[[16. 10.]
 [ 1. 15.]]
