In [61]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

# Titanic Entropy

## Prepare data

In [62]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Convert into numeric values

In [63]:
train['Sex'] = train['Sex'] == 'male'
train['Sex'] = train['Sex'].astype(int)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


Simplify the data set so we have all nominal values

In [64]:
del train['Name']
del train['PassengerId']
del train['Ticket']
del train['Cabin']
del train['Fare']
del train['Age']
del train['Embarked']
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch
0,0,3,1,1,0
1,1,1,0,1,0
2,1,3,0,0,0
3,1,1,0,1,0
4,0,3,1,0,0


## Computing Entropy

For each attribute, calculate its entropy.

In [65]:
def calculateEntropy(data, auto_base=False):
    unique, occurances = np.unique(data, return_counts=True)
    prob = np.zeros(unique.size)
    total = data.size
    base = unique.size if auto_base else 2
    
    print("Entropy info for", data.name)
    print("Classfification:", unique)
    print("Occurances: ", end='')
    
    for i in range(unique.size):
        prob[i] = occurances[i] / total
        print("{:d} ({:.1f}%)".format(occurances[i], prob[i] * 100), end=' ')
    print()
    
    result = entropy(prob, base=base)
    print("Entropy: {:.2f}".format(result), end="\n\n")
    return result;

In [67]:
calculateEntropy(train['Pclass'], True)
calculateEntropy(train['Sex'], True)
calculateEntropy(train['SibSp'], True)
calculateEntropy(train['Parch'], True)

Entropy info for Pclass
Classfification: [1 2 3]
Occurances: 216 (24.2%) 184 (20.7%) 491 (55.1%) 
Entropy: 0.91

Entropy info for Sex
Classfification: [0 1]
Occurances: 314 (35.2%) 577 (64.8%) 
Entropy: 0.94

Entropy info for SibSp
Classfification: [0 1 2 3 4 5 8]
Occurances: 608 (68.2%) 209 (23.5%) 28 (3.1%) 16 (1.8%) 18 (2.0%) 5 (0.6%) 7 (0.8%) 
Entropy: 0.48

Entropy info for Parch
Classfification: [0 1 2 3 4 5 6]
Occurances: 678 (76.1%) 118 (13.2%) 80 (9.0%) 5 (0.6%) 4 (0.4%) 5 (0.6%) 1 (0.1%) 
Entropy: 0.40



0.40192188376890242

## Computing Entropy Average

For each entropy, calculate its entropy average compared to a parent (labels)

In [178]:
def calculateEntropyAverage(data, classification, labels):
    class_unique = np.unique(classification)
    labels_unique = np.unique(labels)
    num_of_classification = class_unique.size
    num_of_labels = labels_unique.size
    total = data.size
    
    result = 0
    allCounts = np.array([])
    
    # Find out for each classificaiton, count the number of labels
    for i in range(num_of_classification):
        c = class_unique[i]
        counts =  np.array([])

        for j in range(num_of_labels):
            label = labels_unique[j]
            query = "({:s} == {:d}) & ({:s} == {:d})".format(classification.name, c, labels.name, label)
            counts[j] = data.query(query).size
        allCounts.hstack((allCounts, counts))
        
    return result

def calculateEntropyAverageFromArrays(data):
    total = data.sum()
    result = 0
    
    for i in range(data.shape[0]):
        cSize = data[i].sum()
        prob = data[i] / cSize
        childEntropy = entropy(prob, base=2)
        weight = cSize / total
        result += weight * childEntropy
        
    return result

In [177]:
# still has bug, fix later
# calculateEntropyAverage(train, train['Sex'], train['Survived'])

In [184]:
input_data = np.array([[2,3], [4,0], [3,2]])
child = calculateEntropyAverageFromArrays(input_data)
print("{:.2f}".format(child.item()))

AttributeError: 'float' object has no attribute '2f'

In [179]:
parent = calculateEntropy(train['Survived'], False)

Entropy info for Survived
Classfification: [0 1]
Occurances: 549 (61.6%) 342 (38.4%) 
Entropy: 0.96



In [181]:
info_gain = parent - child