In [63]:
import pandas as pd
import numpy as np
import graphviz, pydot
from scipy.stats import entropy
from sklearn import tree

In [64]:
EXAMPLE_1 = '../data/lec10/Exercise1.xlsx'
new_df = pd.read_excel(EXAMPLE_1)

In [65]:
clf = tree.DecisionTreeClassifier(criterion='entropy')

In [66]:
lbl_d = {}
atts = new_df.columns[1:-1]

num_df = pd.DataFrame(index=new_df.index)

for _a in atts:
    num_df[_a], lbl_d[_a] = pd.factorize(new_df[_a], sort=True)
    
num_df['Sentiment'], lbl_d['Sentiment'] = pd.factorize(new_df['Sentiment'], sort=True)

X = num_df[atts]
y = num_df['Sentiment']

clf_app = clf.fit(X, y)

In [67]:
dot_data = tree.export_graphviz(
    clf_app, 
    out_file='tree2.dot', 
    feature_names=atts, class_names=lbl_d['Sentiment'])

In [68]:
(graph, ) = pydot.graph_from_dot_file('tree2.dot')
graph.write_png('tree2.png')

In [79]:
new_df.reset_index(inplace=True)
all_ent = entropy(new_df['Sentiment'].value_counts(), base=2)
all_ent

0.9940302114769566

In [70]:
def get_ent(indx, all_ent, _df):
    pivot = _df.pivot_table(index=indx, columns='Sentiment', values='index', aggfunc='count', fill_value=0)
    e = []
    for i, row in pivot.iterrows():
        e.append(entropy(row, base=2))

    pivot['Entropy'] = e
    pivot['sum'] = pivot['positive'] + pivot['negative']
    pivot['entropy*n'] = pivot['sum'] * pivot['Entropy']
    ent = pivot['entropy*n'].sum() / pivot['sum'].sum()
    print(pivot)
    print('Entropy by {}: {}'.format(indx, ent))
    print('Information gain by {}: {}'.format(indx, (all_ent - ent)))
    print('\n')
    return ent

In [71]:
feat = new_df.columns[1:-1]

_all_vars = [get_ent(x, all_ent, new_df) for x in feat]

_min_ent = min(_all_vars)

print('Attribute with highest information gain: {}, {}'.format(
    feat[_all_vars.index(_min_ent)], (all_ent - _min_ent)))

Sentiment  negative  positive   Entropy  sum  entropy*n
Gender                                                 
female            2         4  0.918296    6   5.509775
male              3         2  0.970951    5   4.854753
Entropy by Gender: 0.9422298160545709
Information gain by Gender: 0.05180039542238568


Sentiment  negative  positive   Entropy  sum  entropy*n
Age                                                    
old               5         3  0.954434    8   7.635472
young             0         3  0.000000    3   0.000000
Entropy by Age: 0.6941338203090655
Information gain by Age: 0.29989639116789113


Sentiment   negative  positive   Entropy  sum  entropy*n
Department                                              
A                  3         2  0.970951    5   4.854753
B                  2         4  0.918296    6   5.509775
Entropy by Department: 0.9422298160545709
Information gain by Department: 0.05180039542238568


Attribute with highest information gain: Age, 0.2998963911

In [72]:
young = new_df[new_df['Age'] == 'young']
old = new_df[new_df['Age'] == 'old']

In [73]:
_old_vars = [get_ent(x, all_ent, old) for x in feat]

_old_min_ent = min(_old_vars)
old_ent = entropy(old['Sentiment'].value_counts(), base=2)

print('Attribute with highest information gain: {}, {}'.format(
    feat[_old_vars.index(_old_min_ent)], (old_ent - _old_min_ent)))

Sentiment  negative  positive   Entropy  sum  entropy*n
Gender                                                 
female            2         2  1.000000    4   4.000000
male              3         1  0.811278    4   3.245112
Entropy by Gender: 0.9056390622295665
Information gain by Gender: 0.08839114924739011


Sentiment  negative  positive   Entropy  sum  entropy*n
Age                                                    
old               5         3  0.954434    8   7.635472
Entropy by Age: 0.954434002924965
Information gain by Age: 0.03959620855199153


Sentiment   negative  positive   Entropy  sum  entropy*n
Department                                              
A                  3         0  0.000000    3   0.000000
B                  2         3  0.970951    5   4.854753
Entropy by Department: 0.606844121534168
Information gain by Department: 0.3871860899427886


Attribute with highest information gain: Department, 0.34758988139079705


In [74]:
A = old[old['Department'] == 'A']
B = old[old['Department'] == 'B']

B

Unnamed: 0,index,Gender,Age,Department,Sentiment
3,3,female,old,B,positive
5,5,female,old,B,positive
8,8,male,old,B,negative
9,9,male,old,B,positive
10,10,female,old,B,negative


In [75]:
# A_ent = entropy(A['Sentiment'].value_counts(), base=2)

# _A_vars = [get_ent(x,  all_ent, A) for x in feat]

# _A_min_ent = min(_A_vars)

# print('Attribute with highest information gain: {}, {}'.format(
#     feat[_A_vars.index(_A_min_ent)], (A_ent - _A_min_ent)))

In [76]:
B_ent = entropy(B['Sentiment'].value_counts(), base=2)

_B_vars = [get_ent(x,  all_ent, B) for x in feat]

_B_min_ent = min(_B_vars)

print('Attribute with highest information gain: {}, {}'.format(
    feat[_B_vars.index(_B_min_ent)], (B_ent - _B_min_ent)))

Sentiment  negative  positive   Entropy  sum  entropy*n
Gender                                                 
female            1         2  0.918296    3   2.754888
male              1         1  1.000000    2   2.000000
Entropy by Gender: 0.9509775004326937
Information gain by Gender: 0.04305271104426289


Sentiment  negative  positive   Entropy  sum  entropy*n
Age                                                    
old               2         3  0.970951    5   4.854753
Entropy by Age: 0.9709505944546688
Information gain by Age: 0.02307961702228778


Sentiment   negative  positive   Entropy  sum  entropy*n
Department                                              
B                  2         3  0.970951    5   4.854753
Entropy by Department: 0.9709505944546688
Information gain by Department: 0.02307961702228778


Attribute with highest information gain: Gender, 0.019973094021975113


In [77]:
A

Unnamed: 0,index,Gender,Age,Department,Sentiment
4,4,male,old,A,negative
6,6,male,old,A,negative
7,7,female,old,A,negative


In [78]:
B

Unnamed: 0,index,Gender,Age,Department,Sentiment
3,3,female,old,B,positive
5,5,female,old,B,positive
8,8,male,old,B,negative
9,9,male,old,B,positive
10,10,female,old,B,negative
