In [1]:
%matplotlib inline
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib import image as mpimg
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
def simple_tree_classify(tree, item):
    """Classify `item` by walking a nested-tuple decision tree.

    An internal node is a tuple whose element 0 is the attribute to look up
    in `item` and whose element 1 maps each attribute value to a subtree.
    Anything that is not exactly a tuple is treated as a leaf and returned
    as the classification.
    """
    node = tree
    # Descend one level per iteration until a non-tuple (leaf) is reached.
    while type(node) is tuple:
        attribute, children = node[0], node[1]
        node = children[item[attribute]]
    return node

In [3]:
def entropy(probabilities):
    """Return the base-2 entropy of an array of class probabilities.

    `probabilities` must support boolean-mask indexing (e.g. a numpy array).
    Zero entries are filtered out first so that log2 is never evaluated at 0.
    """
    nonzero = probabilities[probabilities != 0]
    contributions = nonzero * np.log2(nonzero)
    return sum(-contributions)

In [4]:
def class_probabilities(labels):
    """Return the relative frequency of each distinct value in `labels`.

    The result is a numpy float array with one entry per distinct label, in
    the order the labels are first encountered.
    """
    tally = Counter(labels)
    n_labels = len(labels)
    return np.fromiter((hits / n_labels for hits in tally.values()), dtype=float)

In [5]:
def data_entropy(labels):
    """Return the entropy of the data set described by `labels`.

    The labels are first reduced to per-class probabilities with
    `class_probabilities`, which are then passed to `entropy`.
    """
    return entropy(class_probabilities(labels))

In [6]:
def partition_by(df, part_col):
    """Split `df` into sub-frames, one per distinct value of `part_col`.

    Returns a dict that maps each unique value found in `df[part_col]` to
    the rows of `df` whose `part_col` equals that value.
    """
    return {
        value: df[df[part_col] == value]
        for value in df[part_col].unique()
    }

In [7]:
def partition_entropy(subsets, label_col):
    """Return the size-weighted entropy of a partition.

    `subsets` is iterated twice (once for the sizes, once for the
    entropies), so it must be re-iterable. Each subset contributes the
    entropy of its `label_col` column, weighted by the fraction of all rows
    that fall in that subset.
    """
    total = sum(len(part) for part in subsets)
    return sum(
        (len(part) / total) * data_entropy(part[label_col])
        for part in subsets
    )

In [8]:
def partition_entropy_by(df, part_col, label_col):
    """Return the weighted label entropy after splitting `df` on `part_col`.

    The frame is partitioned with `partition_by` and the resulting subsets
    are scored with `partition_entropy`, using `label_col` as the labels.
    """
    return partition_entropy(partition_by(df, part_col).values(), label_col)

In [9]:
def build_tree_id3(df, features, label_col):
    """Recursively build a decision tree using the ID3 strategy.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to build the (sub)tree from; must contain `label_col` and every
        column named in `features`.
    features : list
        Column names still available for splitting.
    label_col : str
        Name of the column that holds the class labels.

    Returns
    -------
    Either a leaf (a single label value) or a tuple `(feature, branches)`,
    where `branches` maps each distinct value of `feature` in `df` to a
    subtree. This is the structure `simple_tree_classify` walks.
    """
    labels = df[label_col].unique()
    if len(labels) == 1:
        # Pure node: every row carries the same label.
        return labels[0]
    elif len(features) == 0:
        # Nothing left to split on: fall back to the majority label. The
        # count must run over every row's label; counting the unique values
        # would give each label a count of 1 and pick an arbitrary one.
        label_counts = Counter(df[label_col])
        return label_counts.most_common(1)[0][0]
    else:
        # Split on the feature whose partition has the lowest weighted entropy.
        best = min(features, key=lambda k: partition_entropy_by(df, k, label_col))
        partitions = partition_by(df, best)
        new_features = [k for k in features if k != best]
        branches = {key: build_tree_id3(subset, new_features, label_col) for (key, subset) in partitions.items()}
        return (best, branches)

In [10]:
# load datasets

# Both CSV files are read with their 'PassengerId' column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('train.csv', 'test.csv')
)

In [11]:
# cleanup

# Fill missing numeric values with each frame's own column means.
# numeric_only=True is required because the frames still hold string columns
# at this point; without it DataFrame.mean() raises on pandas >= 2.0.
train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

# Drop the columns that are not used further on; errors='ignore' keeps the
# cell re-runnable once the columns are already gone.
unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

labelEncoder = LabelEncoder() # alias

# Fit the encoder once, on the training data, and reuse it for the test data
# so that both frames share the same category -> integer mapping for 'Sex'.
labelEncoder.fit(train['Sex'])
train['Sex'] = labelEncoder.transform(train['Sex'])
test['Sex'] = labelEncoder.transform(test['Sex'])

In [12]:
# tree classifier

enc = LabelEncoder()
# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the features at every split, so ties between equally good splits
# can otherwise be broken differently from one run to the next.
t2 = DecisionTreeClassifier(random_state=0)
scaler = MinMaxScaler()  # NOTE(review): not used in the visible cells

# The encoder is re-fitted on a hard-coded value list before each transform;
# transform() raises if a column contains a value outside its list.
enc.fit([0, 1])
train['iSurvived'] = enc.transform(train['Survived'])
train['iSex'] = enc.transform(train['Sex'])

enc.fit([3, 1, 2])
train['iPclass'] = enc.transform(train['Pclass'])

enc.fit([1, 0, 3, 4, 2, 5, 8])
train['iSibSp'] = enc.transform(train['SibSp'])

enc.fit([0, 1, 2, 5, 3, 4, 6])
train['iParch'] = enc.transform(train['Parch'])

# Columns the model is actually trained on (the raw columns, not the i* ones).
cols = ['Pclass', 'Sex', 'Age', 'SibSp']

t2 = t2.fit(train[cols], train['iSurvived'])
# Predictions are made on the same rows the tree was fitted on.
train['ipredicted'] = t2.predict(train[cols])

train[['iSurvived', 'ipredicted']].head(10)

Unnamed: 0_level_0,iSurvived,ipredicted
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,1,1
3,1,1
4,1,1
5,0,0
6,0,0
7,0,0
8,0,0
9,1,1
10,1,1


In [13]:
# The node labels must name the columns the model was fitted on, in the same
# order; `cols` is the list that was passed to t2.fit().
dot_data = tree.export_graphviz(t2, out_file=None, feature_names=cols)
graph = graphviz.Source(dot_data)
# render() writes the graph to disk and returns the path of the output file.
graph.render("titanic")

'titanic.pdf'