In [21]:
import pandas as pd
import yaml
import numpy as np
import random
from math import log2

In [22]:
class Node():
    
    def __init__(self, data):
        self.is_leaf = data['is_leaf']
        self.answer = data['answer']
        self.feature_check = data['column'] 
        self.children_type = '' # if categorical C value_to_check - string, otherwise if Continuous N value_to_check numeric and validate if check_less
        self.value_to_check = data['value_to_check']
        self.check_less = data['check_less']
        self.children = [] ## just two if n_type = N (numerical) [less, greater or equal]
    
    def set_children_type(self, value):
        self.children_type = value
        
    def set_is_leaf(self, value):
        self.is_leaf = value
    
    def get_children(self):
        return self.children
    
    def insert_node(self, node):
        # self.children = self.children.append(node) 
        self.children.append(node) 
    
    def get_classification(self, row):
        if self.is_leaf: 
            return self.answer
        else:
            '''Categorical'''
            if self.children_type == 'C':
                for child in self.children:
                    print(f'feature_check: {child.feature_check}')
                    if row[child.feature_check] == child.value_to_check:
                        return child.get_classification(row)
            else:
                '''Numerical'''
                if row[child.feature_check] < child[0].value_to_check:
                    return child[0].get_classification(row)
                else:
                    return child[1].get_classification(row)
    
    def print_tree(self, level=0):
        print(f'lvl:{level}, Leaf: {self.is_leaf}, type: {self.children_type}, feature_check: {self.feature_check}, check: {self.value_to_check}')
        if self.is_leaf:
            print(f'lvl:{level}, answer: {self.answer}\n\n')
        else:
            for child in self.children:
                child.print_tree(level=level+1)


In [23]:
class Dataset():
    data = pd.DataFrame()
    categorical_features = []
    continuous_features = []
    yaml_structure = {}
    target_feature = ''
    target_type = ''

    '''
    Load the CSV/TSV file, saves it in data
    '''
    def load_dataset(self, input_path, separator):
        self.data = pd.read_csv(input_path, sep=separator)
    
    
    '''
    Read the structure and get the types of the columns in two list [categorical, coninuous]
    Set target_column and target_type
    '''
    def read_structure(self, input_file, target_column):
        with open(input_file) as f:
            self.yaml_structure = yaml.load(f, Loader=yaml.FullLoader)
        self.target_feature = target_column
        self.target_type = self.yaml_structure['target']['type']
        
        self.categorical_features = []
        self.continuous_features = []
        
        for feature in self.yaml_structure['features']:
            if feature['type'] == 'continuous':
                n_min = min(self.data[feature['name']])
                n_diff = max(self.data[feature['name']]) - n_min
                if n_diff != 0:
                    self.data[feature['name']] = (self.data[feature['name']] - n_min) / n_diff
                self.continuous_features.append({'name': feature['name'], 'min': n_min, 'diff': n_diff })
                
            else:
                self.categorical_features.append({'name': feature['name']})

    
    '''
    Initialization
    '''
    def __init__(self, file_dataset_path, file_structure_path, char_separator='\t', target_column='target'):
        self.load_dataset(file_dataset_path, char_separator)
        self.read_structure(file_structure_path, target_column)

In [40]:
def get_gain_ratio(df, TGT_COL='target', EVAL_COL='', CATEGORICAL=False):
#     print(df)
    uniq_tgts = df[TGT_COL].unique()
    uniq_d_a_vals = df[EVAL_COL].unique() #uniq_evals
    len_d_tot = len(df)
    
    # Validate if len(df)>0, else return 0
    if len_d_tot == 0: return 0
    
    sel_cut_point = None
    
    inf_d_tot = 0
    for curr_tgt in uniq_tgts:
        curr_len = len(df.loc[df[TGT_COL] ==curr_tgt])
        if curr_len!=0:
            inf_d_tot -= (curr_len/len_d_tot)*log2(curr_len/len_d_tot) ## Respect to Target Column
    
    gain_ratio_a = 0
    if CATEGORICAL is True:
        inf_d_a = 0
        split_info_a = 0
        for curr_d_aj in uniq_d_a_vals: #curr_eval
            '''Gain A: Begin'''
            len_d_aj = len(df[df[EVAL_COL] == curr_d_aj]) #len_eval
            inf_d_aj = 0
            for curr_tgt in uniq_tgts:
                curr_len = len(df.loc[(df[EVAL_COL] == curr_d_aj) & (df[TGT_COL] ==curr_tgt)])
                if curr_len != 0 and len_d_aj != 0:
                    inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
    #         print(f'curr_val: {curr_eval}, value: {curr_info}')
            inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
            '''Gain A: End'''
            '''Split Info A: Begin'''
            if len_d_aj != 0 and len_d_tot != 0:
                split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)  ## Respect to column to be split
            '''Split Info A: End'''
        gain_a = inf_d_tot - inf_d_a
        if split_info_a != 0:
            gain_ratio_a = gain_a / split_info_a
        else:
            gain_ratio_a = 0
        print(f'{EVAL_COL}:   inf_d_tot: {inf_d_tot}, inf_d_a: {inf_d_a}, gain_a: {gain_a}, split_info_a: {split_info_a}, gain_ratio: {gain_ratio_a}')
    else: 
        ### Mean for cut point:
        mean_val_point = df[EVAL_COL].mean()
    
#         for curr_cut_point in qt_points:
        inf_d_a = 0
        split_info_a = 0
        '''Gain A: Begin'''
        '''    < cut_point '''
        len_d_aj = len(df[df[EVAL_COL] < mean_val_point]) #len_eval
        inf_d_aj = 0
        for curr_tgt in uniq_tgts:
            curr_len = len(df.loc[(df[EVAL_COL] < mean_val_point) & (df[TGT_COL] ==curr_tgt)])
            if curr_len != 0 and len_d_aj != 0:
                inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#         print(f'curr_val: {curr_eval}, value: {curr_info}')
        inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
        '''   >= cut_point '''
        len_d_aj =  len_d_tot - len_d_aj
        inf_d_aj = 0
        for curr_tgt in uniq_tgts:
            curr_len = len(df.loc[(df[EVAL_COL] >= mean_val_point) & (df[TGT_COL] ==curr_tgt)])
            if curr_len != 0 and len_d_aj != 0:
                inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#         print(f'curr_val: {curr_eval}, value: {curr_info}')
        inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
        '''Gain A: End'''
        '''Split Info A: Begin'''
        if len_d_aj != 0 and len_d_tot != 0:
            len_d_aj = len(df[df[EVAL_COL] < mean_val_point]) #len_eval
            split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
            len_d_aj = len_d_tot - len_d_aj
            split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
        '''Split Info A: End'''
        gain_a = inf_d_tot - inf_d_a
        if split_info_a != 0: 
            gain_ratio_a = gain_a / split_info_a
        else:
            gain_ratio_a = 0
        sel_cut_point = mean_val_point
        print(f'For cut_point: {mean_val_point}, inf_d_tot: {inf_d_tot}, inf_d_a: {inf_d_a}, gain_a: {gain_a}, split_info_a: {split_info_a}, gain_ratio: {gain_ratio_a}')
        
        
        
        ''' TRY TO FIND A POINT FOR SPLIT THE DATA '''
#         ### Possible cut points:
#         df.sort_values(by=[EVAL_COL])
#         cand_points = []
#         qt_points = []
#         last_row = None
#         # mask = (df[df[TGT_COL].shift(1) != df[TGT_COL]])
#         np_lbls = np.array(df[TGT_COL])
#         np_numb = np.array(df[EVAL_COL])
#         print(np_lbls)
#         print(np_numb)
#         for idx in range(1, np_lbls.shape[0]):
#             if np_lbls[idx] != np_lbls[idx-1]:
#                 cand_points = np.append(cand_points, (np_numb[idx]+np_numb[idx-1])/2)
#         print(cand_points)
#         if cand_points.shape[0] < 4:
#             qt_points = cand_points
#         else:
#             len_cand = cand_points.shape[0]
#             qt_points = np.append(qt_points, cand_points[(int)(len_cand/4)])
#             qt_points = np.append(qt_points, cand_points[(int)(len_cand/2)])
#             qt_points = np.append(qt_points, cand_points[(int)(3*len_cand/4)])
        
    
#         for curr_cut_point in qt_points:
#             inf_d_a = 0
#             split_info_a = 0
#             '''Gain A: Begin'''
#             '''    < cut_point '''
#             len_d_aj = len(df[df[EVAL_COL] < curr_cut_point]) #len_eval
#             inf_d_aj = 0
#             for curr_tgt in uniq_tgts:
#                 curr_len = len(df.loc[(df[EVAL_COL] < curr_cut_point) & (df[TGT_COL] ==curr_tgt)])
#                 if curr_len != 0 and len_d_aj != 0:
#                     inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#     #         print(f'curr_val: {curr_eval}, value: {curr_info}')
#             inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
#             '''   >= cut_point '''
#             len_d_aj =  len_d_tot - len_d_aj
#             inf_d_aj = 0
#             for curr_tgt in uniq_tgts:
#                 curr_len = len(df.loc[(df[EVAL_COL] >= curr_cut_point) & (df[TGT_COL] ==curr_tgt)])
#                 if curr_len != 0 and len_d_aj != 0:
#                     inf_d_aj -= (curr_len/len_d_aj)*log2(curr_len/len_d_aj)
#     #         print(f'curr_val: {curr_eval}, value: {curr_info}')
#             inf_d_a += (len_d_aj/len_d_tot)*inf_d_aj
#             '''Gain A: End'''
#             '''Split Info A: Begin'''
#             if len_d_aj != 0 and len_d_tot != 0:
#                 len_d_aj = len(df[df[EVAL_COL] < curr_cut_point]) #len_eval
#                 split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
#                 len_d_aj = len_d_tot - len_d_aj
#                 split_info_a -= (len_d_aj/len_d_tot)*log2(len_d_aj/len_d_tot)
#             '''Split Info A: End'''
#             gain_a = inf_d_tot - inf_d_a
#             gain_ratio_a_temp = gain_a / split_info_a
#             if gain_ratio_a < gain_ratio_a_temp:
#                 gain_ratio_a = gain_ratio_a_temp
#                 sel_cut_point = curr_cut_point
#             print(f'For cut_point: {curr_cut_point}, inf_d_tot: {inf_d_tot}, inf_d_a: {inf_d_a}, gain_a: {gain_a}, split_info_a: {split_info_a}, gain_ratio: {gain_ratio_a_temp}')
            
    return (gain_ratio_a, sel_cut_point)

In [41]:
'''
cols: [(nom_col, True), (nom_col1, True), (nom_col2, False) ... ]  ........ True if categorical, Otherwise False
'''
def select_best_column(df, tgt_col='target', cols=[]):
    best_param = ('None', 0, 0)
    rand_cols = random.choices(cols, k=2)
    print(f'rand_cols: {rand_cols}')
    for col in rand_cols:
        curr_entr, cut_point = get_gain_ratio(df, TGT_COL=tgt_col, EVAL_COL=col[0], CATEGORICAL=col[1])
        if best_param[2] <= curr_entr:
            best_param = (col[0], cut_point, curr_entr)
    return best_param
    

In [42]:
INPUT_PATH = 'data/own_test_benchmark/test_benchmark.csv'
STRUCTURE_PATH = 'data/own_test_benchmark/metadata.yaml'
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, ';', 'target')
obj

<__main__.Dataset at 0x7fd9f4353ef0>

In [43]:
cols = []
cols = cols + (list(zip([j['name'] for j in obj.categorical_features], [True for i in range(len(obj.categorical_features))])))
cols = cols + (list(zip([j['name'] for j in obj.continuous_features], [False for i in range(len(obj.continuous_features))])))
cols

[('Tempo', True),
 ('Temperatura', True),
 ('Umidade', True),
 ('Ventoso', True),
 ('Probabilidade', False)]

In [44]:
# del data_root, root_node
data_root = {'is_leaf': False, 'answer': None, 'column': None, 'value_to_check': None, 'check_less': None}
    
root_node = Node(data_root)

def build_tree(parent_node, df, tgt_col, cols):
#     print(df)
    # (col_name, cut_point, score)
    best_param = select_best_column(df, tgt_col=tgt_col, cols=cols)
    print(f'best_param: {best_param}')
    '''Numerical'''
    if best_param[1] is not None: # Numerical
        parent_node.set_children_type('N')
        
        new_cols = []
        for c in cols:
            if c[0] != best_param[0]:
                new_cols.append(c)
        print(f'new_cols: {new_cols}')
        
        ''' Left child '''
        df_temp = df[df[best_param[0]] < best_param[1]]
        data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': best_param[1], 'check_less': True}
        tgt_uniq_vals = df_temp[tgt_col].unique()

        if (len(tgt_uniq_vals) == 1):  # Is leaf
            data_temp['is_leaf'] = True
            data_temp['answer'] = tgt_uniq_vals[0]

        node_temp = Node(data_temp)
        parent_node.insert_node(node_temp)

        if (len(tgt_uniq_vals) > 1):
            build_tree(node_temp, df_temp, tgt_col, new_cols)
            
        ''' Right child '''
        df_temp = df[df[best_param[0]] >= best_param[1]]
        data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': best_param[1], 'check_less': False}
        tgt_uniq_vals = df_temp[tgt_col].unique()

        if (len(tgt_uniq_vals) == 1):  # Is leaf
            data_temp['is_leaf'] = True
            data_temp['answer'] = tgt_uniq_vals[0]

        node_temp = Node(data_temp)
        parent_node.insert_node(node_temp)

        if (len(tgt_uniq_vals) > 1):
            build_tree(node_temp, df_temp, tgt_col, new_cols)
    else:
        '''Categorical'''
        parent_node.set_children_type('C')
        uniq_vals = df[best_param[0]].unique()
        
        new_cols = []
        for c in cols:
            if c[0] != best_param[0]:
                new_cols.append(c)
#         print(f'new_cols: {new_cols}')
        
        for val in uniq_vals:
            df_temp = df[df[best_param[0]] == val]
            data_temp = {'is_leaf': False, 'answer': None, 'column': best_param[0], 'value_to_check': val, 'check_less': None}
            tgt_uniq_vals = df_temp[tgt_col].unique()
            
            if (len(tgt_uniq_vals) == 1):  # Is leaf
                data_temp['is_leaf'] = True
                data_temp['answer'] = tgt_uniq_vals[0]
            
            node_temp = Node(data_temp)
            parent_node.insert_node(node_temp)
            
            if (len(tgt_uniq_vals) > 1):
                build_tree(node_temp, df_temp, tgt_col, new_cols)
        
    print(f'best_param: {best_param}')


build_tree(root_node, obj.data.copy(), 'target', cols)
root_node.print_tree(level=0)

rand_cols: [('Umidade', True), ('Ventoso', True)]
Umidade:   inf_d_tot: 0.9402859586706309, inf_d_a: 0.7884504573082896, gain_a: 0.15183550136234136, split_info_a: 1.0, gain_ratio: 0.15183550136234136
Ventoso:   inf_d_tot: 0.9402859586706309, inf_d_a: 0.8921589282623617, gain_a: 0.04812703040826927, split_info_a: 0.9852281360342515, gain_ratio: 0.0488486155115206
best_param: ('Umidade', None, 0.15183550136234136)
rand_cols: [('Ventoso', True), ('Ventoso', True)]
Ventoso:   inf_d_tot: 0.9852281360342515, inf_d_a: 0.9649839288804954, gain_a: 0.020244207153756077, split_info_a: 0.9852281360342515, gain_ratio: 0.020547735507476704
Ventoso:   inf_d_tot: 0.9852281360342515, inf_d_a: 0.9649839288804954, gain_a: 0.020244207153756077, split_info_a: 0.9852281360342515, gain_ratio: 0.020547735507476704
best_param: ('Ventoso', None, 0.020547735507476704)
rand_cols: [('Temperatura', True), ('Temperatura', True)]
Temperatura:   inf_d_tot: 1.0, inf_d_a: 1.0, gain_a: 0.0, split_info_a: 1.0, gain_ratio

KeyError: 'None'

In [20]:
row = {'Tempo': 'Chuvoso', 'Temperatura': 'Amena', 'Umidade': 'Alta', 'Ventoso': 'Falso', 'Probabilidade': 16}
root_node.get_classification(row)

feature_check: Umidade
feature_check: Tempo
feature_check: Tempo
feature_check: Tempo
feature_check: Ventoso


'Sim'