In [63]:
import pandas as pd
import numpy as np

# Computes the entropy of the dataset
def calculate_entropy(dataframe, target, decimal_places=4):
    """Return the Shannon entropy, in bits, of the ``target`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose ``target`` column holds the class labels.
    target : column label
        Name of the column to measure.
    decimal_places : int, default 4
        Number of decimal places the result is rounded to.

    Returns
    -------
    number
        The rounded entropy. It is 0 when the column has no non-missing
        values or holds a single class.
    """
    # Count every class once instead of recomputing value_counts() for each
    # unique value. value_counts() drops missing labels, so a NaN in the
    # column no longer triggers a KeyError lookup; sort_index() keeps the
    # summation in ascending label order.
    class_counts = dataframe[target].value_counts().sort_index()
    # Equals len(dataframe) whenever the column has no missing values.
    labelled_rows = class_counts.sum()

    entropy = 0
    for class_count in class_counts:
        class_probability = class_count / labelled_rows
        entropy += -class_probability * np.log2(class_probability)
    return round(entropy, decimal_places)

# Computes the information gain of an attribute
def calculate_information_gain(dataframe, target, attribute, verbose=False, decimal_places=4):
    """Return the information gain obtained by splitting on ``attribute``.

    The gain is the entropy of ``target`` over the whole frame minus the
    weighted sum of the entropies of ``target`` within each subset that
    shares one value of ``attribute``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing both the ``target`` and ``attribute`` columns.
    target : column label
        Column holding the class labels.
    attribute : column label
        Column whose values define the split.
    verbose : bool, default False
        When True, print the count, probability and entropy of every
        attribute value, followed by the unrounded information gain.
    decimal_places : int, default 4
        Number of decimal places the returned gain is rounded to.

    Returns
    -------
    number
        The rounded information gain.

    Notes
    -----
    Every entropy term comes from ``calculate_entropy`` called with its
    default precision, so the terms are already rounded before they are
    combined.
    """
    information_gain = calculate_entropy(dataframe, target)

    # Hoisted out of the loop: neither value depends on the attribute value.
    total_rows = len(dataframe)
    occurrences = dataframe[attribute].value_counts()

    for attribute_value in sorted(dataframe[attribute].unique()):
        attribute_value_count = occurrences[attribute_value]
        attribute_value_probability = attribute_value_count / total_rows
        # Filter the subset and compute its entropy once; the value is reused
        # for both the gain and the verbose report.
        subset_entropy = calculate_entropy(dataframe[dataframe[attribute] == attribute_value], target)
        information_gain -= attribute_value_probability * subset_entropy

        if verbose:
            print('attribute: {}'.format(attribute_value))
            print('total occurrences: {}'.format(attribute_value_count))
            print('probability: {}'.format(attribute_value_probability))
            print('entropy: {}'.format(subset_entropy))
            print('')

    if verbose:
        print('information gain: {}'.format(information_gain))

    return round(information_gain, decimal_places)

In [64]:
# Load the divorce dataset; read_csv splits on commas by default
divorce_dataset = pd.read_csv('dataset/divorce.csv')

# Descriptive statistics: count, mean, standard deviation, minimum, maximum and the quartiles
divorce_dataset.describe()

Unnamed: 0,question_1,question_2,question_3,question_4,question_5,question_6,question_7,question_8,question_9,question_10,...,question_46,question_47,question_48,question_49,question_50,question_51,question_52,question_53,question_54,class
count,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,...,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0
mean,1.776471,1.652941,1.764706,1.482353,1.541176,0.747059,0.494118,1.452941,1.458824,1.576471,...,2.552941,2.270588,2.741176,2.382353,2.429412,2.476471,2.517647,2.241176,2.011765,0.494118
std,1.627257,1.468654,1.415444,1.504327,1.632169,0.904046,0.898698,1.546371,1.557976,1.421529,...,1.371786,1.586841,1.137348,1.511587,1.40509,1.260238,1.476537,1.505634,1.667611,0.501442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
50%,2.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,...,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,0.0
75%,3.0,3.0,3.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0


In [65]:
# Attribute names: every column except the last one
attributes_name_list = divorce_dataset.columns[:-1].tolist()

# Name of the target (meta) attribute: the last column
meta_attribute = divorce_dataset.columns[-1]

In [66]:
# Entropy of the whole dataset with respect to the target attribute
print(calculate_entropy(dataframe=divorce_dataset, target=meta_attribute))

0.9999


In [67]:
# Compute the information gain of every attribute
gain_list = [
    calculate_information_gain(divorce_dataset, meta_attribute, attribute)
    for attribute in attributes_name_list
]

# Build and rank the table in one chained expression rather than mutating it
# with inplace=True; the highest gain comes first and the index is renumbered.
gain_df = (
    pd.DataFrame({'attribute': attributes_name_list, 'gain': gain_list})
    .sort_values(by='gain', ascending=False, ignore_index=True)
)

# Top 10 attributes with the highest information gain
gain_df.head(10)

Unnamed: 0,attribute,gain
0,question_18,0.9119
1,question_20,0.897
2,question_40,0.89
3,question_17,0.8821
4,question_19,0.8715
5,question_11,0.8683
6,question_9,0.8545
7,question_16,0.8369
8,question_15,0.8232
9,question_26,0.8223


In [71]:
# Question 18: My spouse and I have similar ideas about how marriage should be.
calculate_information_gain(
    dataframe=divorce_dataset,
    target=meta_attribute,
    attribute="question_18",
    verbose=True,
)

# Keep only the question and the class label
question_18_dataset = divorce_dataset[['question_18', 'class']]

# Class distribution among the rows that answered 4 to question 18
divorce_dataset.loc[divorce_dataset['question_18'] == 4, 'class'].value_counts()

attribute: 0
total occurrences: 79
probability: 0.4647058823529412
entropy: 0.0979

attribute: 1
total occurrences: 10
probability: 0.058823529411764705
entropy: 0.7219

attribute: 2
total occurrences: 16
probability: 0.09411764705882353
entropy: 0.0

attribute: 3
total occurrences: 44
probability: 0.25882352941176473
entropy: 0.0

attribute: 4
total occurrences: 21
probability: 0.12352941176470589
entropy: 0.0

information gain: 0.9119405882352941


1    21
Name: class, dtype: int64

In [72]:
# Question 20: My spouse and I have similar values regarding trust.
calculate_information_gain(
    dataframe=divorce_dataset,
    target=meta_attribute,
    attribute="question_20",
    verbose=True,
)

# Keep only the question and the class label
question_20_dataset = divorce_dataset[['question_20', 'class']]

# Class distribution among the rows that answered 4 to question 20
divorce_dataset.loc[divorce_dataset['question_20'] == 4, 'class'].value_counts()

attribute: 0
total occurrences: 81
probability: 0.4764705882352941
entropy: 0.096

attribute: 1
total occurrences: 10
probability: 0.058823529411764705
entropy: 0.971

attribute: 2
total occurrences: 21
probability: 0.12352941176470589
entropy: 0.0

attribute: 3
total occurrences: 36
probability: 0.21176470588235294
entropy: 0.0

attribute: 4
total occurrences: 22
probability: 0.12941176470588237
entropy: 0.0

information gain: 0.8970411764705882


1    22
Name: class, dtype: int64

In [70]:
# Question 40: We're just starting a discussion before I know what's going on.
calculate_information_gain(
    dataframe=divorce_dataset,
    target=meta_attribute,
    attribute="question_40",
    verbose=True,
)

# Keep only the question and the class label
question_40_dataset = divorce_dataset[['question_40', 'class']]

# Class distribution among the rows that answered 0 to question 40
divorce_dataset.loc[divorce_dataset['question_40'] == 0, 'class'].value_counts()

attribute: 0
total occurrences: 72
probability: 0.4235294117647059
entropy: 0.1056

attribute: 1
total occurrences: 13
probability: 0.07647058823529412
entropy: 0.3912

attribute: 2
total occurrences: 6
probability: 0.03529411764705882
entropy: 1.0

attribute: 3
total occurrences: 23
probability: 0.13529411764705881
entropy: 0.0

attribute: 4
total occurrences: 56
probability: 0.32941176470588235
entropy: 0.0

information gain: 0.8899658823529413


0    71
1     1
Name: class, dtype: int64