In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute the Shannon entropy of the dataset's target column.
def calculate_entropy(dataframe, target):
    """Return the Shannon entropy (in bits) of ``dataframe[target]``.

    Each distinct value of the target column is treated as one class whose
    probability is its row count divided by ``len(dataframe)``. Missing values
    (NaN/None) are counted as a class of their own, so a target that contains
    them no longer raises ``KeyError``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing the target column.
    target : column label
        Label of the column whose entropy is computed.

    Returns
    -------
    float
        Entropy in bits; ``0.0`` for an empty frame or a single-class target.
    """
    total_rows = len(dataframe)
    if total_rows == 0:
        return 0.0

    # Count every class once, up front, instead of once per class inside a loop.
    # dropna=False keeps missing values as a class so the probabilities sum to 1.
    class_counts = dataframe[target].value_counts(dropna=False)
    # Categorical targets report unobserved categories with a zero count;
    # drop them so 0 * log2(0) cannot turn the result into NaN.
    class_counts = class_counts[class_counts > 0]

    probabilities = class_counts.to_numpy(dtype=float) / total_rows
    # "0.0 - sum" (rather than negating the sum) avoids returning -0.0
    # when the target has a single class.
    return float(0.0 - np.sum(probabilities * np.log2(probabilities)))

# Compute the information gain obtained by splitting the dataset on one attribute.
def calculate_information_gain(dataframe, target, attribute, verbose=False):
    """Return the information gain of ``attribute`` with respect to ``target``.

    The gain is the entropy of the whole frame minus the weighted entropy of
    each partition produced by the distinct values of ``attribute``; a
    partition's weight is its share of the rows. Rows whose attribute value is
    missing (NaN/None) form a partition of their own instead of raising
    ``KeyError``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing both the attribute and the target columns.
    target : column label
        Label of the target column, passed through to ``calculate_entropy``.
    attribute : column label
        Label of the column used to partition the rows.
    verbose : bool, default False
        When true, print each partition's value, row count, probability and
        entropy, followed by the final information gain.

    Returns
    -------
    The information gain, in the numeric type produced by ``calculate_entropy``.
    """
    total_rows = len(dataframe)
    information_gain = calculate_entropy(dataframe, target)

    # Partition the rows once. sort=False keeps the order of first appearance,
    # dropna=False keeps missing values as a partition, and observed=True
    # skips unobserved categories of a categorical attribute.
    partitions = dataframe.groupby(attribute, sort=False, dropna=False, observed=True)
    for attribute_value, subset in partitions:
        attribute_value_count = len(subset)
        attribute_value_probability = attribute_value_count / total_rows
        # Computed once and reused for both the gain and the verbose output.
        subset_entropy = calculate_entropy(subset, target)
        information_gain -= attribute_value_probability * subset_entropy

        if verbose:
            print('attribute: {}'.format(attribute_value))
            print('total occurrences: {}'.format(attribute_value_count))
            print('probability: {}'.format(attribute_value_probability))
            print('entropy: {}'.format(subset_entropy))
            print('')

    if verbose:
        print('information gain: {}'.format(information_gain))

    return information_gain

In [4]:
# Load the divorce dataset; the file is comma-separated, which is read_csv's default.
divorce_dataset = pd.read_csv('dataset/divorce.csv')

# Descriptive statistics: count, mean, standard deviation, minimum, maximum and quartiles.
divorce_dataset.describe()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
count,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,...,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0
mean,1.776471,1.652941,1.764706,1.482353,1.541176,0.747059,0.494118,1.452941,1.458824,1.576471,...,2.552941,2.270588,2.741176,2.382353,2.429412,2.476471,2.517647,2.241176,2.011765,0.494118
std,1.627257,1.468654,1.415444,1.504327,1.632169,0.904046,0.898698,1.546371,1.557976,1.421529,...,1.371786,1.586841,1.137348,1.511587,1.40509,1.260238,1.476537,1.505634,1.667611,0.501442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
50%,2.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,...,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,0.0
75%,3.0,3.0,3.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0


In [6]:
# Names of the predictor attributes: every column except the last one.
attributes = divorce_dataset.columns[:-1].tolist()

# Name of the target (meta) attribute: the last column.
meta_attribute = divorce_dataset.columns[-1]

In [7]:
# Entropy of the whole dataset with respect to the target column (meta_attribute).
print(calculate_entropy(divorce_dataset, meta_attribute))

0.9999001572094884


In [30]:
# Compute the information gain of each attribute with respect to the target.
gain_list = [
    calculate_information_gain(divorce_dataset, meta_attribute, attribute)
    for attribute in attributes
]

# Rank the attributes from highest to lowest gain. Building the sorted frame in
# one expression (no inplace=True) makes the cell produce the same result on re-run.
gain_df = pd.DataFrame({'attribute': attributes, 'gain': gain_list}).sort_values(
    by='gain', ascending=False, ignore_index=True
)

# Top 10 attributes with the highest information gain.
gain_df.head(10)