In [18]:
import pandas as pd
import numpy as np

#calcula a entropia do conjunto de dados
def calculate_entropy(dataframe, target):
    entropy = 0
    for target_value in dataframe[target].unique():
        target_value_count = dataframe[target].value_counts()[target_value]
        target_value_probability = target_value_count / len(dataframe)
        entropy += -target_value_probability * np.log2(target_value_probability)
    return entropy

#calcula o ganho de informação de um atributo
def calculate_information_gain(dataframe, target, attribute, verbose=False):
    information_gain = calculate_entropy(dataframe, target)
    for attribute_value in dataframe[attribute].unique():
        if verbose:
            print('attribute: {}'.format(attribute_value))
        attribute_value_count = dataframe[attribute].value_counts()[attribute_value]
        if verbose:
            print('total occurrences: {}'.format(attribute_value_count))
        attribute_value_probability = attribute_value_count / len(dataframe)
        if verbose:
            print('probability: {}'.format(attribute_value_probability))
        information_gain -= attribute_value_probability * calculate_entropy(dataframe[dataframe[attribute] == attribute_value], target) 
        if verbose:
            print('entropy: {}'.format(calculate_entropy(dataframe[dataframe[attribute] == attribute_value], target)))
            print('')

    if verbose:
        print('information gain: {}'.format(information_gain))
    
    return information_gain

#descricao dos atributos
attributes_dict = {
    1: "If one of us apologizes when our discussion deteriorates, the discussion ends.",
    2: "I know we can ignore our differences, even if things get hard sometimes.",
    3: "When we need it, we can take our discussions with my spouse from the beginning and correct it.",
    4: "When I discuss with my spouse, to contact him will eventually work.",
    5: "The time I spent with my wife is special for us.",
    6: "We don't have time at home as partners.",
    7: "We are like two strangers who share the same environment at home rather than family.",
    8: "I enjoy our holidays with my wife.",
    9: "I enjoy traveling with my wife.",
    10: "Most of our goals are common to my spouse.",
    11: "I think that one day in the future, when I look back, I see that my spouse and I have been in harmony with each other.",
    12: "My spouse and I have similar values in terms of personal freedom.",
    13: "My spouse and I have similar sense of entertainment.",
    14: "Most of our goals for people (children, friends, etc.) are the same.",
    15: "Our dreams with my spouse are similar and harmonious.",
    16: "We're compatible with my spouse about what love should be.",
    17: "We share the same views about being happy in our life with my spouse",
    18: "My spouse and I have similar ideas about how marriage should be",
    19: "My spouse and I have similar ideas about how roles should be in marriage",
    20: "My spouse and I have similar values in trust.",
    21: "I know exactly what my wife likes.",
    22: "I know how my spouse wants to be taken care of when she/he sick.",
    23: "I know my spouse's favorite food.",
    24: "I can tell you what kind of stress my spouse is facing in her/his life.",
    25: "I have knowledge of my spouse's inner world.",
    26: "I know my spouse's basic anxieties.",
    27: "I know what my spouse's current sources of stress are.",
    28: "I know my spouse's hopes and wishes.",
    29: "I know my spouse very well.",
    30: "I know my spouse's friends and their social relationships.",
    31: "I feel aggressive when I argue with my spouse.",
    32: "When discussing with my spouse, I usually use expressions such as ‘you always’ or ‘you never’.",
    33: "I can use negative statements about my spouse's personality during our discussions.",
    34: "I can use offensive expressions during our discussions.",
    35: "I can insult my spouse during our discussions.",
    36: "I can be humiliating when we discussions.",
    37: "My discussion with my spouse is not calm.",
    38: "I hate my spouse's way of open a subject.",
    39: "Our discussions often occur suddenly.",
    40: "We're just starting a discussion before I know what's going on.",
    41: "When I talk to my spouse about something, my calm suddenly breaks.",
    42: "When I argue with my spouse, I only go out and I don't say a word.",
    43: "I mostly stay silent to calm the environment a little bit.",
    44: "Sometimes I think it's good for me to leave home for a while.",
    45: "I'd rather stay silent than discuss with my spouse.",
    46: "Even if I'm right in the discussion, I stay silent to hurt my spouse.",
    47: "When I discuss with my spouse, I stay silent because I am afraid of not being able to control my anger.",
    48: "I feel right in our discussions.",
    49: "I have nothing to do with what I've been accused of.",
    50: "I'm not actually the one who's guilty about what I'm accused of.",
    51: "I'm not the one who's wrong about problems at home.",
    52: "I wouldn't hesitate to tell my spouse about her/his inadequacy.",
    53: "When I discuss, I remind my spouse of her/his inadequacy.",
    54: "I'm not afraid to tell my spouse about her/his incompetence."
}

In [19]:
#carrega o dataset
divorce_dataset = pd.read_csv('dataset/divorce.csv', sep=';')
divorce_dataset.head(5) 
#Todas as respostas foram coletadas em uma escala de 5 pontos (0=Nunca, 1=Raramente, 2=Média, 3=Frequentemente, 4=Sempre).

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [20]:
# contagem de registros
len(divorce_dataset)

170

In [21]:
#lista de atributos
attributes = divorce_dataset.columns.tolist()[0:-1]

#atributo meta
meta_attribute = divorce_dataset.columns.tolist()[-1]

#entropia do conjunto de dados
print(calculate_entropy(divorce_dataset, meta_attribute))

0.9999001572094884


In [22]:
#calcula o ganho de informação de cada atributo
gain_list = []

for attribute in attributes:
    gain_list.append(calculate_information_gain(divorce_dataset,meta_attribute,attribute))

#attributes = [attributes_dict[int(attribute.replace('Atr', ''))] for attribute in attributes]

gain_df = pd.DataFrame({'attribute': attributes, 'gain': gain_list})
gain_df.sort_values(by='gain', ascending=False, inplace=True, ignore_index=True)

In [23]:
#top 10 atributos com maior ganho de informação
gain_df.head(10)

Unnamed: 0,attribute,gain
0,Atr18,0.91192
1,Atr20,0.897058
2,Atr40,0.889967
3,Atr17,0.882133
4,Atr19,0.871516
5,Atr11,0.868326
6,Atr9,0.854525
7,Atr16,0.836943
8,Atr15,0.823246
9,Atr26,0.822322


In [24]:
for attribute in gain_df.head(10)['attribute']:
    print('{}'.format(attributes_dict[int(attribute.replace('Atr', ''))]))

My spouse and I have similar ideas about how marriage should be
My spouse and I have similar values in trust.
We're just starting a discussion before I know what's going on.
We share the same views about being happy in our life with my spouse
My spouse and I have similar ideas about how roles should be in marriage
I think that one day in the future, when I look back, I see that my spouse and I have been in harmony with each other.
I enjoy traveling with my wife.
We're compatible with my spouse about what love should be.
Our dreams with my spouse are similar and harmonious.
I know my spouse's basic anxieties.


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#divisão do dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(divorce_dataset[gain_df.head(10)['attribute'].tolist()], divorce_dataset['Class'], test_size=0.2, random_state=42)

#treinamento do modelo
random_forest = RandomForestClassifier(n_estimators=10, #número de árvores na floresta
                                        criterion='entropy', #função para medir a qualidade de um divisão
                                        max_depth=5, #profundidade máxima da árvore
                                        min_samples_split=10, #número mínimo de amostras para dividir um nó
                                        min_samples_leaf=5, #número mínimo de amostras para ser um nó folha
                                        max_features='sqrt', #número de features(atributos) a serem considerados ao procurar a melhor divisão, max_features=sqrt(n_features)
                                        min_impurity_decrease=0.3, #um nó será dividido se essa divisão diminuir a impureza maior ou igual a esse valor
                                        bootstrap=True, #amostras de bootstrap são usadas para construir árvores
                                        random_state=42,
                                        max_samples= int(round(len(X_train)*0.20,0)) #número de amostras a serem usadas para treinar cada árvore
                                       ).fit(X_train, y_train)

#predição nos dados de teste
y_pred = random_forest.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)

print('Acurácia nos dados de teste: {}'.format(test_accuracy))

#precisão nos dados de teste, dos positivos dentre todas as classificações que o modelo fez como positivo quantos realmente são positivos
test_precision = precision_score(y_test, y_pred)
print('Precisão nos dados de teste: {}'.format(test_precision))

#recall nos dados de teste, dentre todos os positivos quantos o modelo classificou como positivo
test_recall = recall_score(y_test, y_pred)
print('Recall nos dados de teste: {}'.format(test_recall))

#f1 nos dados de teste, média harmônica entre precisão e recall
test_f1 = f1_score(y_test, y_pred)
print('F1 nos dados de teste: {}'.format(test_f1))

Acurácia nos dados de teste: 0.9705882352941176
Precisão nos dados de teste: 1.0
Recall nos dados de teste: 0.95
F1 nos dados de teste: 0.9743589743589743
