In [37]:
import pandas as pd
import yaml
import numpy as np
import random
from math import log2, sqrt
from dataset import Dataset
from node_tree import Node

# Gain Ratio

In [38]:
def get_gain_ratio(df, TGT_COL='target', EVAL_COL='', CATEGORICAL=False):
    """Compute the gain ratio obtained by splitting ``df`` on ``EVAL_COL``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; must contain ``TGT_COL`` and ``EVAL_COL`` unless empty.
    TGT_COL : str
        Name of the class-label column.
    EVAL_COL : str
        Name of the candidate split column.
    CATEGORICAL : bool
        ``True``  -> one partition per distinct value of ``EVAL_COL``.
        Anything else -> binary split at the column mean
        (``< mean`` versus ``>= mean``).

    Returns
    -------
    tuple
        ``(gain_ratio, cut_point)``. ``cut_point`` is ``None`` for a
        categorical split and the column mean for a numerical one.
        An empty ``df`` yields ``(0, None)`` so callers can always unpack
        two values (the previous bare ``0`` broke tuple unpacking).
    """
    len_d_tot = len(df)

    # Nothing to split: keep the 2-tuple contract of every other return path.
    if len_d_tot == 0:
        return (0, None)

    uniq_tgts = df[TGT_COL].unique()

    def _entropy(part_targets, part_size):
        """Entropy of the class labels in one partition of ``part_size`` rows."""
        entropy = 0
        if part_size == 0:
            return entropy
        for curr_tgt in uniq_tgts:
            curr_len = int((part_targets == curr_tgt).sum())
            if curr_len != 0:
                entropy -= (curr_len / part_size) * log2(curr_len / part_size)
        return entropy

    # Entropy of the whole frame with respect to the target column.
    inf_d_tot = _entropy(df[TGT_COL], len_d_tot)

    inf_d_a = 0        # expected entropy after the split
    split_info_a = 0   # entropy of the partition sizes themselves
    sel_cut_point = None

    if CATEGORICAL is True:
        for curr_d_aj in df[EVAL_COL].unique():
            in_part = df[EVAL_COL] == curr_d_aj
            len_d_aj = int(in_part.sum())
            inf_d_a += (len_d_aj / len_d_tot) * _entropy(df.loc[in_part, TGT_COL], len_d_aj)
            if len_d_aj != 0:
                split_info_a -= (len_d_aj / len_d_tot) * log2(len_d_aj / len_d_tot)
    else:
        # The column mean is the single candidate cut point.
        sel_cut_point = df[EVAL_COL].mean()
        below = df[EVAL_COL] < sel_cut_point
        at_or_above = df[EVAL_COL] >= sel_cut_point

        len_below = int(below.sum())
        # The upper partition size is taken as the complement of the lower one.
        len_above = len_d_tot - len_below

        inf_d_a += (len_below / len_d_tot) * _entropy(df.loc[below, TGT_COL], len_below)
        inf_d_a += (len_above / len_d_tot) * _entropy(df.loc[at_or_above, TGT_COL], len_above)

        # Split info is only defined when both sides are non-empty.
        if len_above != 0 and len_above != len_d_tot:
            split_info_a -= (len_below / len_d_tot) * log2(len_below / len_d_tot)
            split_info_a -= (len_above / len_d_tot) * log2(len_above / len_d_tot)

    gain_a = inf_d_tot - inf_d_a
    # A zero split info means the split does not separate anything.
    gain_ratio_a = gain_a / split_info_a if split_info_a != 0 else 0

    return (gain_ratio_a, sel_cut_point)

In [39]:
def select_best_column(df, tgt_col='target', cols=[], m=5):
    """Pick the best split column among a random subset of ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node.
    tgt_col : str
        Name of the class-label column.
    cols : list of (str, bool)
        Candidate columns as ``[(nom_col, True), (nom_col1, True),
        (nom_col2, False) ...]`` -- ``True`` if categorical, otherwise ``False``.
    m : int
        Number of columns to sample. It is capped at ``len(cols)`` because
        ``random.sample`` raises ``ValueError`` when asked for more items
        than the population holds.

    Returns
    -------
    tuple
        ``(column_name, cut_point, gain_ratio)`` of the best sampled column.
        ``('None', 0, 0)`` is returned only when no column was sampled.
    """
    best_param = ('None', 0, 0)
    found = False
    # Cap the sample size so a shrinking column list cannot crash the sampling.
    rand_cols = random.sample(cols, k=min(m, len(cols)))
    for col in rand_cols:
        curr_entr, cut_point = get_gain_ratio(df, TGT_COL=tgt_col, EVAL_COL=col[0], CATEGORICAL=col[1])
        # The first sampled column always becomes the baseline; otherwise a
        # tiny negative gain (float rounding) on every candidate would leave
        # the 'None' placeholder in place even though columns were available.
        if not found or best_param[2] <= curr_entr:
            best_param = (col[0], cut_point, curr_entr)
            found = True
    return best_param
    

In [40]:
def build_tree(parent_node, df, tgt_col, cols, m):
    """Recursively grow the subtree below ``parent_node``.

    The best column among ``m`` randomly sampled candidates is chosen with
    ``select_best_column``. A numerical column (non-``None`` cut point)
    produces two children (``< cut`` then ``>= cut``); a categorical column
    produces one child per distinct value. The chosen column is removed from
    the candidates handed to the children.

    Parameters
    ----------
    parent_node : Node
        Node that receives the children (via ``set_children_type`` and
        ``insert_node``).
    df : pandas.DataFrame
        Training rows that reached ``parent_node``.
    tgt_col : str
        Name of the class-label column.
    cols : list of (str, bool)
        Candidate ``(column_name, is_categorical)`` pairs.
    m : int
        Number of columns sampled at each node (capped at ``len(cols)``).

    Raises
    ------
    ValueError
        If ``cols`` is empty, since there is nothing to split on.

    Notes
    -----
    A child becomes a leaf when its rows share a single target value, when
    it received no rows at all (it then answers with the majority class of
    ``df``), or when no candidate column is left for a further split.
    """
    if not cols:
        raise ValueError('build_tree needs at least one candidate column')

    def _majority(frame):
        """Most frequent target value among the rows of ``frame``."""
        labels = list(frame[tgt_col])
        return max(set(labels), key=labels.count)

    # (col_name, cut_point, score)
    best_col, cut_point, _score = select_best_column(
        df, tgt_col=tgt_col, cols=cols, m=min(m, len(cols)))
    new_cols = [c for c in cols if c[0] != best_col]

    def _add_child(df_child, value_to_check, check_less):
        """Create one child node, attach it, and recurse when it is impure."""
        data_temp = {'is_leaf': False, 'answer': None, 'column': best_col,
                     'value_to_check': value_to_check, 'check_less': check_less}
        keep_growing = False
        if len(df_child) == 0:
            # Empty partition (e.g. constant column): fall back to the
            # parent's majority class instead of crashing on max() of nothing.
            data_temp['is_leaf'] = True
            data_temp['answer'] = _majority(df)
        else:
            tgt_uniq_vals = df_child[tgt_col].unique()
            data_temp['answer'] = _majority(df_child)
            if len(tgt_uniq_vals) == 1:  # Is leaf
                data_temp['is_leaf'] = True
                data_temp['answer'] = tgt_uniq_vals[0]
            elif not new_cols:
                # Impure but nothing left to split on: answer with majority.
                data_temp['is_leaf'] = True
            else:
                keep_growing = True

        node_temp = Node(data_temp)
        parent_node.insert_node(node_temp)

        if keep_growing:
            build_tree(node_temp, df_child, tgt_col, new_cols, m)

    if cut_point is not None:
        # Numerical split: left child first, then right child.
        parent_node.set_children_type('N')
        _add_child(df[df[best_col] < cut_point], cut_point, True)
        _add_child(df[df[best_col] >= cut_point], cut_point, False)
    else:
        # Categorical split: one child per distinct value.
        parent_node.set_children_type('C')
        for val in df[best_col].unique():
            _add_child(df[df[best_col] == val], val, None)

In [41]:
def generate_matrix(obj, cols, k, n_tree):
    """Train ``n_tree`` trees per fold and collect predictions and true labels.

    Parameters
    ----------
    obj : Dataset
        Must provide ``get_n_class_k_final_folds(k)``,
        ``get_n_bootstrap(train, n_tree)`` and ``target_feature``.
    cols : list of (str, bool)
        Candidate ``(column_name, is_categorical)`` pairs.
    k : int
        Number of folds.
    n_tree : int
        Number of trees (bootstraps) per fold.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mat, mat_real_value_by_fold)`` where ``mat[i, j, t]`` is the
        prediction of tree ``j`` of fold ``i`` for its ``t``-th test row and
        ``mat_real_value_by_fold[i, t]`` is that row's true label.
        Slots that no test row fills keep the padding value ``-1``.
    """
    k_folds_strat = obj.get_n_class_k_final_folds(k)
    # Size the padding from every fold, not just the last two, so that a
    # larger earlier fold cannot overflow the matrices.
    num_test_max = max((len(k_folds_strat[i]["test"]) for i in range(k)), default=0)
    mat = np.full((k, n_tree, num_test_max), -1)
    mat_real_value_by_fold = np.full((k, num_test_max), -1)
    # Number of columns sampled per node; does not depend on fold or tree.
    num_rand_cols = int(sqrt(len(cols)))

    for i in range(k): # Iterate over k folds
        test_df = k_folds_strat[i]['test']
        # True labels depend only on the fold, so record them once per fold.
        for k_it, (_, row) in enumerate(test_df.iterrows()):
            mat_real_value_by_fold[i, k_it] = row[obj.target_feature]

        curr_boots = obj.get_n_bootstrap(k_folds_strat[i]["train"], n_tree)
        print(f'\n____Fold{i}____')
        for j in range(len(curr_boots)): # Iterate over each tree
            print(f'__TREE{j}__', end='')
            train_df = curr_boots[j]['train']
            # NOTE(review): the training label column is hard-coded as 'target'
            # while test labels use obj.target_feature -- confirm they match.
            labels = list(train_df['target'])
            # The root answers with the majority class of its bootstrap sample.
            ans = max(set(labels), key=labels.count)
            data_root = {'is_leaf': False, 'answer': ans, 'column': None, 'value_to_check': None, 'check_less': None}
            root_node = Node(data_root)
            build_tree(root_node, train_df, 'target', cols, num_rand_cols)
            for k_it, (_, row) in enumerate(test_df.iterrows()):
                mat[i, j, k_it] = root_node.get_classification(row)

    return (mat, mat_real_value_by_fold)

# WINE DATASET

In [42]:
'''
INITIAL PARAMETERS

'''
# Wine-recognition run: load the dataset, fix the seed, choose fold/tree counts.
INPUT_PATH = 'data/wine_recognition/wine-recognition.tsv'
STRUCTURE_PATH = 'data/wine_recognition/metadata.yaml'
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, '\t', 'target')
random.seed(1)
k = 10
n_tree = 50

# Candidate columns as (name, is_categorical) pairs: categorical ones first,
# continuous ones after.
cols = [(str(feat['name']), True) for feat in obj.categorical_features]
cols += [(str(feat['name']), False) for feat in obj.continuous_features]

(mat_pred_1, mat_real_1) = generate_matrix(obj, cols, k, n_tree)


____Fold0____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48____TREE49__
____Fold1____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48__

In [43]:
# True labels of the wine run, one row per fold; -1 marks padding slots that no test row filled.
mat_real_1

array([[ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1],
       [ 1,  1,  1,  1,  1,  2,  2

In [44]:
# Wine-run predictions for fold 0: one row per tree, one column per test-row slot (-1 = unfilled padding).
mat_pred_1[0]

array([[ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       ...,
       [ 2,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1]])

# HOUSE VOTES

In [45]:
'''
INITIAL PARAMETERS

'''
# House-votes-84 run: load the dataset, fix the seed, choose fold/tree counts.
INPUT_PATH = 'data/house_votes_84/house-votes-84.tsv'
STRUCTURE_PATH = 'data/house_votes_84/metadata.yaml'
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, '\t', 'target')
random.seed(1)
k = 10
n_tree = 50

# Candidate columns as (name, is_categorical) pairs: categorical ones first,
# continuous ones after.
cols = [(str(feat['name']), True) for feat in obj.categorical_features]
cols += [(str(feat['name']), False) for feat in obj.continuous_features]

(mat_pred_2, mat_real_2) = generate_matrix(obj, cols, k, n_tree)


____Fold0____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48____TREE49__
____Fold1____
__TREE0____TREE1____TREE2____TREE3____TREE4____TREE5____TREE6____TREE7____TREE8____TREE9____TREE10____TREE11____TREE12____TREE13____TREE14____TREE15____TREE16____TREE17____TREE18____TREE19____TREE20____TREE21____TREE22____TREE23____TREE24____TREE25____TREE26____TREE27____TREE28____TREE29____TREE30____TREE31____TREE32____TREE33____TREE34____TREE35____TREE36____TREE37____TREE38____TREE39____TREE40____TREE41____TREE42____TREE43____TREE44____TREE45____TREE46____TREE47____TREE48__

In [46]:
# True labels of the house-votes run, one row per fold; -1 marks padding slots that no test row filled.
mat_real_2

array([[ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
 

In [36]:
# House-votes predictions for fold 0: one row per tree, one column per test-row slot (-1 = unfilled padding).
mat_pred_2[0]

array([[ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       ...,
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1]])

In [None]:
# INPUT_PATH = 'data/own_test_benchmark/test_benchmark.csv'
# STRUCTURE_PATH = 'data/own_test_benchmark/metadata.yaml'
# obj = Dataset(INPUT_PATH, STRUCTURE_PATH, ';', 'target')
# cols = []
# cols = cols + (list(zip([j['name'] for j in obj.categorical_features], [True for i in range(len(obj.categorical_features))])))
# cols = cols + (list(zip([j['name'] for j in obj.continuous_features], [False for i in range(len(obj.continuous_features))])))
# cols