In [31]:
import pandas as pd
import yaml
import numpy as np
import random
from math import log2

In [27]:
class Dataset():
    """Tabular dataset together with the column metadata read from a YAML file.

    Attributes:
        data: DataFrame read from the CSV/TSV file.
        categorical_features: names of the features whose YAML ``type`` is not
            ``'continuous'``.
        continuous_features: names of the features whose YAML ``type`` is
            ``'continuous'``.
        yaml_structure: parsed content of the YAML structure file.
        target_feature: name of the target column, as given by the caller.
        target_type: value of ``target -> type`` in the YAML structure.
    """

    # Class-level placeholders. Every method below rebinds these names on the
    # instance (it never mutates these shared objects in place), so instances
    # do not leak state into each other through them.
    data = pd.DataFrame()
    categorical_features = []
    continuous_features = []
    yaml_structure = {}
    target_feature = ''
    target_type = ''

    def load_dataset(self, input_path, separator):
        """Load the CSV/TSV file and store it in ``self.data``.

        Args:
            input_path: path (or anything ``pandas.read_csv`` accepts) of the
                delimited file.
            separator: field separator forwarded to ``pandas.read_csv``.
        """
        self.data = pd.read_csv(input_path, sep=separator)

    def read_structure(self, input_file, target_column):
        """Read the YAML structure file and classify the features by type.

        Sets ``yaml_structure``, ``target_feature``, ``target_type`` and the two
        feature-name lists ``categorical_features`` / ``continuous_features``.
        Any feature whose ``type`` is not exactly ``'continuous'`` is treated as
        categorical.

        Args:
            input_file: path of the YAML file; it must provide
                ``target -> type`` and a ``features`` list whose items have
                ``name`` and ``type`` keys.
            target_column: name to record as the target column.

        Raises:
            KeyError: if one of the keys listed above is missing.
        """
        with open(input_file) as f:
            # NOTE(review): FullLoader accepts more than plain data types. If the
            # structure file can come from an untrusted source, prefer
            # yaml.safe_load(f) here.
            self.yaml_structure = yaml.load(f, Loader=yaml.FullLoader)
        self.target_feature = target_column
        self.target_type = self.yaml_structure['target']['type']

        features = self.yaml_structure['features']
        # Fresh lists on every call, so re-reading a structure never accumulates.
        self.categorical_features = [
            feat['name'] for feat in features if feat['type'] != 'continuous'
        ]
        self.continuous_features = [
            feat['name'] for feat in features if feat['type'] == 'continuous'
        ]

    def __init__(self, file_dataset_path, file_structure_path, char_separator='\t', target_column='target'):
        """Load the dataset and its structure description.

        Args:
            file_dataset_path: path of the CSV/TSV data file.
            file_structure_path: path of the YAML structure file.
            char_separator: field separator of the data file (tab by default).
            target_column: name of the target column.
        """
        self.load_dataset(file_dataset_path, char_separator)
        self.read_structure(file_structure_path, target_column)

In [107]:
def get_entropy(df, tgt_col='target', eval_col='', categorical=False):
    """Score how much splitting on ``eval_col`` reduces the entropy of ``tgt_col``.

    Despite its name, the function returns the information gain: the entropy of
    the target column minus the size-weighted entropy of the target inside each
    distinct value of ``eval_col``. The result is also printed as
    ``total: <value>``.

    Args:
        df: DataFrame containing both columns.
        tgt_col: name of the target column.
        eval_col: name of the column being evaluated.
        categorical: accepted for the callers' sake; not used by the computation.

    Returns:
        The information gain in bits (``0`` when ``df`` has no rows).
    """
    def _entropy(counts, total):
        # Entropy in bits of a class-count distribution; empty classes add nothing.
        bits = 0
        for count in counts:
            if count != 0:
                bits -= (count/total)*log2(count/total)
        return bits

    n_rows = len(df)

    # One boolean mask per distinct target value, reused for every split below.
    tgt_masks = [df[tgt_col] == tgt_value for tgt_value in df[tgt_col].unique()]
    base_entropy = _entropy([int(mask.sum()) for mask in tgt_masks], n_rows)

    split_entropy = 0
    for eval_value in df[eval_col].unique():
        in_group = df[eval_col] == eval_value
        group_size = int(in_group.sum())
        group_counts = [int((in_group & mask).sum()) for mask in tgt_masks]
        # Weight each group's entropy by the share of rows that fall in it.
        split_entropy += (group_size/n_rows)*_entropy(group_counts, group_size)

    gain = base_entropy - split_entropy
    print(f'total: {gain}')
    return gain

In [108]:
def select_best_column(df, tgt_col='target', cols=()):
    """Return the column of ``cols`` that scores highest against ``tgt_col``.

    Each candidate is scored with ``get_entropy`` (target entropy minus the
    target entropy after splitting on the candidate). The candidate name is
    printed before it is scored, and ``get_entropy`` prints the score itself.

    Args:
        df: DataFrame holding the target and the candidate columns.
        tgt_col: name of the target column.
        cols: iterable of candidate column names.

    Returns:
        The name of the best-scoring column; the first one wins on ties.
        ``None`` when ``cols`` is empty.
    """
    best_col = None
    best_gain = None
    for col in cols:
        print(col)
        curr_gain = get_entropy(df, tgt_col=tgt_col, eval_col=col, categorical=True)
        # Strict comparison keeps the earliest column when scores are equal.
        if best_gain is None or curr_gain > best_gain:
            best_col, best_gain = col, curr_gain
    return best_col

In [109]:
# Locations of the benchmark data file and of the YAML file describing its columns.
INPUT_PATH = 'data/test_benchmark/test_benchmark.csv'
STRUCTURE_PATH = 'data/test_benchmark/metadata.yaml'
# Load both files; ';' is the field separator and 'target' the target column name.
obj = Dataset(INPUT_PATH, STRUCTURE_PATH, ';', 'target')

In [112]:
# Names of the features that the structure file does not mark as 'continuous'.
obj.categorical_features

['Tempo', 'Temperatura', 'Umidade', 'Ventoso']

In [111]:
# Score every categorical feature against 'target'; each column name and its score are printed.
select_best_column(obj.data, tgt_col='target', cols=obj.categorical_features)

Tempo
total: 0.2467498197744391
Temperatura
total: 0.029222565658954647
Umidade
total: 0.15183550136234136
Ventoso
total: 0.04812703040826927
