In [1]:
from __future__ import division
import numpy as np
from scipy.stats import mode
import pandas as pd

# Problem 1
def gini(labels):
    """Return the Gini impurity of an array of class labels.

    Parameters:
        labels: 1-D array of non-negative integer class labels (the values
            are counted with np.bincount).

    Returns:
        1 minus the sum of squared class frequencies. An empty array has
        nothing to be impure about, so it yields 0.0.
    """
    n = len(labels)
    if n == 0:
        # Without this guard the frequency array is empty, its sum is 0 and
        # the result would be 1 (maximum impurity) for a node with no data.
        return 0.0
    # float(n) forces true division even without `from __future__ import division`.
    return 1 - np.sum((np.bincount(labels)/float(n))**2)

# Problem 2
def split(D,L,p,x):
    """Partition the rows of D (and the matching entries of L) on feature p.

    Parameters:
        D: 2-D array of samples, one row per sample.
        L: array of labels indexed the same way as the rows of D.
        p: column index of the feature to test.
        x: threshold value.

    Returns:
        (D1, L1, D2, L2) where D1/L1 hold the rows with D[:,p] <= x and
        D2/L2 hold the rows with D[:,p] > x.
    """
    column = D[:,p]
    # Two independent comparisons (not a negated mask) so that rows which
    # satisfy neither test are left out of both halves.
    at_or_below = column <= x
    above = column > x

    return D[at_or_below], L[at_or_below], D[above], L[above]

# Problem 3
def info_gain(D,L,p,x):
    """Return the drop in Gini impurity obtained by splitting on (p, x).

    The data is partitioned with split(D, L, p, x); the gain is the impurity
    of L minus the size-weighted average impurity of the two label subsets.
    """
    _, left_labels, _, right_labels = split(D,L,p,x)
    weighted = len(left_labels)*gini(left_labels) + len(right_labels)*gini(right_labels)
    return gini(L) - weighted/len(L)

# Problem 4
def best_split(D,L):
    """Search every feature and every observed value for the best split.

    Parameters:
        D: 2-D array of samples, one row per sample.
        L: array of labels for the rows of D.

    Returns:
        A tuple (p, x) giving the column index and threshold with the
        largest info_gain, or None if no candidate reached a gain of at
        least 0 (in particular when D has no rows or no columns).
        Ties are resolved in favour of the candidate examined last.
    """
    best = 0
    best_loc = None

    # `range` rather than the Python-2-only `xrange`, so the function runs
    # unchanged on both Python 2 and Python 3.
    for p in range(D.shape[1]):
        # Each distinct value in column p is a candidate threshold.
        for x in set(D[:,p]):
            curr = info_gain(D,L,p,x)
            if best <= curr:
                best = curr
                best_loc = (p,x)
    return best_loc

# Problems 5-7
class Node(object):
    """One node of a classification tree; builds its own subtree recursively.

    Every node registers itself in tree.nodes and stores its depth and the
    Gini impurity of its labels (score). A leaf additionally stores `label`;
    a parent stores the split (`p`, `x`) and its `left` / `right` children.
    """

    def __init__(self, tree, D, L, depth, max_depth, tol):
        """Create the node and, unless it is a leaf, its children.

        Parameters:
            tree: owner object exposing a `nodes` list this node appends to.
            D: 2-D array of the samples reaching this node.
            L: array of integer labels for the rows of D.
            depth: depth of this node.
            max_depth: nodes at this depth or deeper become leaves.
            tol: nodes whose Gini impurity is below this become leaves.
        """
        tree.nodes.append(self)
        self.depth = depth
        self.score = gini(L)

        # A node without samples can never be split, whatever its score is.
        if len(L) == 0 or self.depth >= max_depth or self.score < tol:
            self._make_leaf(L)
            return

        # best_split returns None when it finds no usable candidate;
        # unpacking that directly would raise a TypeError.
        location = best_split(D, L)
        if location is None:
            self._make_leaf(L)
            return

        p, x = location
        D1, L1, D2, L2 = split(D, L, p, x)
        # A split that sends every sample to one side separates nothing and
        # would create an empty child (which predicts None or fails to split).
        if len(L1) == 0 or len(L2) == 0:
            self._make_leaf(L)
            return

        self.p, self.x = p, x
        self.left = Node(tree, D1, L1, depth+1, max_depth, tol)
        self.right = Node(tree, D2, L2, depth+1, max_depth, tol)
        self.leaf = False

    def _make_leaf(self, L):
        """Mark this node as a leaf labelled with the most common value in L."""
        # label is None when the node received no samples at all.
        self.label = np.argmax(np.bincount(L)) if len(L) != 0 else None
        self.leaf = True

    def predict(self, sample):
        """Return the label of the leaf that `sample` is routed to.

        `sample` is indexed by feature; values <= x go left, others right.
        """
        if self.leaf:
            return self.label
        elif sample[self.p] <= self.x:
            return self.left.predict(sample)
        else:
            return self.right.predict(sample)

    def __str__(self):
        """Return a multi-line, human-readable summary of this node."""
        out = "Leaf Node" if self.leaf else "Parent Node"
        out += "\n\t Gini: {}".format(self.score)
        out += "\n\t Depth: {}".format(self.depth)
        if self.leaf:
            out += "\n\t Label: {}".format(self.label)
        else:
            out += "\n\t split index (p): {}".format(self.p)
            out += "\n\t split threshold (x): {}".format(self.x)
        return out

class ClassificationTree(object):
    """A classification tree built from Node objects at construction time.

    Attributes:
        max_depth: the depth limit passed to every node.
        nodes: every Node of the tree, in the order the nodes were created.
        root: the top Node; all predictions start here.
    """

    def __init__(self, D, L, max_depth=10, tol=.2):
        """Build the whole tree from samples D and integer labels L."""
        # The list has to exist before the root is created, because each
        # Node appends itself to tree.nodes in its constructor.
        self.nodes = []
        self.max_depth = max_depth
        self.root = Node(self, D, L, 1, max_depth, tol)

    def predict(self, sample):
        """Return the predicted label for a single sample."""
        return self.root.predict(sample)

    def classify(self, data):
        """Return an array holding the predicted label of each row of data."""
        predictions = [self.predict(row) for row in data]
        return np.array(predictions)

    def accuracy(self, data, labels):
        """Return the percentage of rows of data whose prediction equals labels."""
        matches = self.classify(data) == labels
        return 100*np.mean(matches)

    def __str__(self):
        """Return the summaries of all nodes, separated by blank lines."""
        return "\n\n".join(str(node) for node in self.nodes)

In [3]:
def process_titanic_data(filename="titanic4real.csv", pclass_change=False, seed=None):
    """Load the Titanic CSV, clean it, and split it into train and test sets.

    Only the "Survived", "Pclass", "Sex" and "Age" columns are kept and rows
    with missing values are dropped. "Sex" is recoded as 0. for "female" and
    1. for "male". 40% of the remaining rows (rounded down) are drawn at
    random, without replacement, as the test set; the rest is the training set.

    Parameters:
        filename: path of the CSV file to read.
        pclass_change: if true, replace "Pclass" with two 0/1 indicator
            columns, "SecondClass" and "FirstClass".
        seed: optional seed for a private random generator so the split is
            reproducible. With None (the default) the global numpy random
            state is used.

    Returns:
        (training, testing): two DataFrames holding the original row labels;
        the test rows are ordered by index.
    """
    data = pd.read_csv(filename)
    # Work on an independent copy instead of mutating a column slice in place.
    data = data[["Survived", "Pclass", "Sex", "Age"]].dropna().copy()

    # Keep the column object-typed so numeric codes can be assigned while any
    # value other than "female"/"male" is left as it is.
    data["Sex"] = data["Sex"].astype(object)
    data.loc[data["Sex"] == "female", "Sex"] = 0.
    data.loc[data["Sex"] == "male", "Sex"] = 1.

    if pclass_change:
        pclass = data["Pclass"]
        data["SecondClass"] = (pclass == 2).astype(int)
        data["FirstClass"] = (pclass == 1).astype(int)
        data = data.drop("Pclass", axis=1)

    rng = np.random if seed is None else np.random.RandomState(seed)
    n_test = int(len(data.index)*.4)
    test_index = sorted(rng.choice(data.index, n_test, replace=False))
    # A set makes each membership test constant-time.
    held_out = set(test_index)
    train_index = [i for i in data.index if i not in held_out]

    return data.loc[train_index], data.loc[test_index]

training, testing = process_titanic_data()

# Separate the feature columns from the "Survived" label column and convert
# both to plain numpy arrays. The builtin `float` is used because the
# `np.float` alias no longer exists in current numpy releases.
test_data = np.array(testing.drop("Survived",axis=1),dtype=float)
test_labels = np.array(testing["Survived"],dtype=np.int64)

train_data = np.array(training.drop("Survived",axis=1),dtype=float)
train_labels = np.array(training["Survived"],dtype=np.int64)

tree = ClassificationTree(train_data,train_labels,max_depth=8,tol=.1)
print(tree)

# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("\n{}% Accuracy".format(tree.accuracy(test_data,test_labels)))

Parent Node
	 Gini: 0.470708750862
	 Depth: 1
	 split index (p): 1
	 split threshold (x): 0.0

Parent Node
	 Gini: 0.379761464497
	 Depth: 2
	 split index (p): 0
	 split threshold (x): 2.0

Parent Node
	 Gini: 0.131688263377
	 Depth: 3
	 split index (p): 2
	 split threshold (x): 55.0

Parent Node
	 Gini: 0.100556034145
	 Depth: 4
	 split index (p): 2
	 split threshold (x): 2.0

Parent Node
	 Gini: 0.444444444444
	 Depth: 5
	 split index (p): 0
	 split threshold (x): 1.0

Leaf Node
	 Gini: 0.0
	 Depth: 6
	 Label: 0

Leaf Node
	 Gini: 0.0
	 Depth: 6
	 Label: 1

Leaf Node
	 Gini: 0.0867768595041
	 Depth: 5
	 Label: 1

Parent Node
	 Gini: 0.336734693878
	 Depth: 4
	 split index (p): 0
	 split threshold (x): 1.0

Parent Node
	 Gini: 0.152777777778
	 Depth: 5
	 split index (p): 2
	 split threshold (x): 62.0

Leaf Node
	 Gini: 0.0
	 Depth: 6
	 Label: 1

Parent Node
	 Gini: 0.32
	 Depth: 6
	 split index (p): 2
	 split threshold (x): 63.0

Parent Node
	 Gini: 0.5
	 Depth: 7
	 split index (p): 2

In [None]:
class ForestNode(object):
    """One node of a tree that may only split on a given list of features.

    Every node stores its depth and the Gini impurity of its labels (score).
    A leaf additionally stores `label`; a parent stores the split (`p`, `x`)
    and its `left` / `right` children.
    """

    def __init__(self, D, L, features, depth, max_features, max_depth, tol):
        """Create the node and, unless it is a leaf, its children.

        Parameters:
            D: 2-D array of the samples reaching this node.
            L: array of non-negative integer labels for the rows of D.
            features: column indices this node may split on. The caller's
                sequence is never modified.
            depth: depth of this node.
            max_features: the node becomes a leaf once fewer than this many
                features are available.
            max_depth: nodes at this depth or deeper become leaves.
            tol: nodes whose Gini impurity is below this become leaves.
        """
        self.depth = depth
        self.score = gini(L)

        # A node without samples can never be split, whatever its score is.
        if (len(L) == 0 or self.depth >= max_depth or self.score < tol
                or len(features) < max_features):
            self._make_leaf(L)
            return

        # _best_split returns None when it finds no usable candidate;
        # unpacking that directly would raise a TypeError.
        location = self._best_split(D, L, features)
        if location is None:
            self._make_leaf(L)
            return

        p, x = location
        D1, L1, D2, L2 = split(D, L, p, x)
        # A split that sends every sample to one side separates nothing and
        # would only create an empty child.
        if len(L1) == 0 or len(L2) == 0:
            self._make_leaf(L)
            return

        self.p, self.x = p, x
        if len(set(D1[:,p])) == 1 or len(set(D2[:,p])) == 1:
            # Drop the feature for this node's descendants only. Removing it
            # from the shared list would also change the caller's list and
            # the features seen by unrelated branches built later.
            features = list(features)
            features.remove(p)

        self.left = ForestNode(D1, L1, features, depth+1, max_features, max_depth, tol)
        self.right = ForestNode(D2, L2, features, depth+1, max_features, max_depth, tol)
        self.leaf = False

    def _make_leaf(self, L):
        """Mark this node as a leaf labelled with the most common value in L."""
        # bincount/argmax picks the smallest of the most frequent labels and
        # does not depend on the return shape of scipy.stats.mode, which
        # differs between scipy versions. An empty node gets the label -1.
        self.label = np.argmax(np.bincount(L)) if len(L) != 0 else -1
        self.leaf = True

    @staticmethod
    def _best_split(D, L, pvalues):
        """Return the (p, x) with the largest info_gain among columns pvalues.

        Returns None if no candidate reached a gain of at least 0. Ties are
        resolved in favour of the candidate examined last.
        """
        best, best_loc = 0, None
        for p in pvalues:
            # Each distinct value in column p is a candidate threshold.
            vals = set(D[:,p])
            for x in vals:
                curr = info_gain(D, L, p, x)
                if best <= curr:
                    best = curr
                    best_loc = (p,x)
        return best_loc

    def predict(self, sample):
        """Return the label of the leaf that `sample` is routed to.

        `sample` is indexed by feature; values <= x go left, others right.
        """
        if self.leaf:
            return self.label
        elif sample[self.p] <= self.x:
            return self.left.predict(sample)
        else:
            return self.right.predict(sample)

class ForestTree(object):
    """A tree of ForestNode objects restricted to a given list of features."""

    def __init__(self, D, L, features, max_depth=10, tol=.2):
        """Build the tree from samples D, labels L and the allowed features.

        The number of features supplied is passed on as the root's
        max_features value.
        """
        feature_count = len(features)
        self.root = ForestNode(D, L, features, 1, feature_count, max_depth, tol)

    def predict(self, sample):
        """Return the predicted label for a single sample."""
        root = self.root
        return root.predict(sample)
    
