In [41]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

# Titanic Entropy

## Prepare data

In [43]:
# Load the Titanic training set and keep only the first 20 passengers so the
# entropy figures below stay small enough to check by hand.
train = pd.read_csv("train.csv").iloc[:20]
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Encode the `Sex` column as a numeric value (1 = male, 0 = anything else)

In [45]:
# Encode Sex as an integer flag: 1 where the value is 'male', 0 otherwise.
train['Sex'] = (train['Sex'] == 'male').astype(int)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


Simplify the data set so we have all nominal values

In [12]:
# Remove the columns that are not kept for the entropy computation, deleting
# them one by one (in place) in the same order as before.
for dropped_column in ('Name', 'PassengerId', 'Ticket', 'Cabin', 'Fare', 'Age'):
    del train[dropped_column]
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked
0,0,3,male,1,0,S
1,1,1,female,1,0,C
2,1,3,female,0,0,S
3,1,1,female,1,0,S
4,0,3,male,0,0,S


## Computing Entropy

For each attribute, calculate its entropy.

In [48]:
def calculateEntropy(data, auto_base=False):
    """Print a class-frequency summary for one attribute and return its entropy.

    Parameters
    ----------
    data : pandas.Series
        Column of nominal values. Its ``name`` is printed in the header and
        its ``size`` is used as the total number of observations.
    auto_base : bool or number, optional
        Selects the logarithm base passed to ``scipy.stats.entropy``:

        * ``False`` or ``None`` (default): SciPy's default base (natural log).
        * ``True``: the number of distinct classes found in ``data``.
        * any other number: used directly as the base (``2`` gives bits).

    Returns
    -------
    float
        Entropy of the empirical class distribution of ``data``.
    """
    unique, occurances = np.unique(data, return_counts=True)
    total = data.size
    prob = occurances / total

    # Resolve the logarithm base from the argument. The previous body read an
    # undefined name `base`, which only worked when a global of that name
    # happened to exist in the kernel.
    if isinstance(auto_base, (bool, np.bool_)):
        # bool is checked first because True would otherwise be taken as base 1.
        # A single class would also mean base 1 (log(1) == 0), so fall back to
        # the default base there; the entropy is 0 regardless.
        base = unique.size if (auto_base and unique.size > 1) else None
    elif auto_base is None:
        base = None
    else:
        base = auto_base

    print("Entropy info for", data.name)
    print("Classfification:", unique)
    print("Occurances: ", end='')

    for count, share in zip(occurances, prob):
        print("{:d} ({:.1f}%)".format(count, share * 100), end=' ')
    print()

    result = entropy(prob, base=base)
    print("Entropy: {:.2f}".format(result), end="\n\n")
    return result

In [49]:
# Report the entropy of every remaining attribute, passing 2 as the second
# argument each time.
for attribute in ('Pclass', 'Sex', 'SibSp', 'Parch'):
    calculateEntropy(train[attribute], 2)
# The last call stays a bare expression so the cell still echoes its return value.
calculateEntropy(train['Embarked'], 2)

Entropy info for Pclass
Classfification: [1 2 3]
Occurances: 4 (20.0%) 3 (15.0%) 13 (65.0%) 
Entropy: 1.28

Entropy info for Sex
Classfification: [0 1]
Occurances: 11 (55.0%) 9 (45.0%) 
Entropy: 0.99

Entropy info for SibSp
Classfification: [0 1 3 4]
Occurances: 11 (55.0%) 7 (35.0%) 1 (5.0%) 1 (5.0%) 
Entropy: 1.44

Entropy info for Parch
Classfification: [0 1 2 5]
Occurances: 15 (75.0%) 3 (15.0%) 1 (5.0%) 1 (5.0%) 
Entropy: 1.15

Entropy info for Embarked
Classfification: ['C' 'Q' 'S']
Occurances: 3 (15.0%) 2 (10.0%) 15 (75.0%) 
Entropy: 1.05



1.0540157730727999