In [26]:
import numpy as np
import pandas as pd
import math
import sklearn.metrics as metrics

<h1> 1 Explore the Data

In [2]:
data = pd.read_csv("titanic.csv.bz2")
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
data['male'] = data['sex'].map({'female': 0, 'male': 1})
summary_data = data[['pclass', 'male', 'age', 'sibsp', 'parch', 'fare', 'survived']]
survived_data = summary_data[summary_data.survived == 1]
drowned_data = summary_data[summary_data.survived == 0]

missings = []
for each in summary_data:
    missings.append(sum(summary_data.isnull()[each]))

summary_table = pd.DataFrame({'variable': summary_data.columns[:-1], 
                              'survived': survived_data.mean()[:-1],
                              'drowned': drowned_data.mean()[:-1],
                              'missings': missings[:-1] 
                             })

summary_table = summary_table[['variable', 'survived', 'drowned', 'missings']]
summary_table

Unnamed: 0,variable,survived,drowned,missings
pclass,pclass,1.962,2.500618,0
male,male,0.322,0.843016,0
age,age,28.918228,30.545369,263
sibsp,sibsp,0.462,0.521632,0
parch,parch,0.476,0.328801,0
fare,fare,49.361184,23.353831,1


In [4]:
import scipy.stats as st

ranges = []
missingvalues = []
correlations = []
maxes = []
mins = []

for var in summary_data:
    maxes.append(max(summary_data[var]))
    mins.append(min(summary_data[var]))
    ranges.append(max(summary_data[var]) - min(summary_data[var]))
    missingvalues.append(sum(summary_data[var].isnull()))
    correlations.append(summary_data[var].corr(summary_data['fare']))

summary_table = pd.DataFrame({'means': summary_data.mean(),
                              'min': mins,
                              'max': maxes,
                              'range': ranges, 
                              'missing values': missingvalues, 
                              'correlation': correlations})

summary_table = summary_table[['means', 'min', 'max', 'range', 'missing values', 'correlation']]
summary_table

Unnamed: 0,means,min,max,range,missing values,correlation
pclass,2.294882,1.0,3.0,2.0,0,-0.558629
male,0.644003,0.0,1.0,1.0,0,-0.185523
age,29.881135,0.1667,80.0,79.8333,263,0.178739
sibsp,0.498854,0.0,8.0,8.0,0,0.160238
parch,0.385027,0.0,9.0,9.0,0,0.221539
fare,33.295479,0.0,512.3292,512.3292,1,1.0
survived,0.381971,0.0,1.0,1.0,0,0.244265


<h1> 2 Implement decision tree

<h3> 2.1 Create binary variables

In [9]:
data2 = pd.get_dummies(data, columns=['pclass'])
data2.head()

Unnamed: 0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,male,pclass_1,pclass_2,pclass_3
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",0,1,0,0
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1,1,0,0
2,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,1,0,0
3,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",1,1,0,0
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,1,0,0


In [10]:
dataAge = pd.cut(data2['age'], bins=[0,11,18,45,60,90], labels=["age0-10", "age11-18", "age18-45", "age45-60", "age60+"])
dataAge = pd.get_dummies(dataAge, columns=['age'])
dataAge

data2[["age0-10", "age11-18", "age18-45", "age45-60", "age60+"]] = dataAge[["age0-10", "age11-18", "age18-45", "age45-60", "age60+"]]

In [11]:
data2['parch-1+'] = data['parch'].map(lambda x: 0 if x<1 else 1)
data2.head()

Unnamed: 0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,male,pclass_1,pclass_2,pclass_3,age0-10,age11-18,age18-45,age45-60,age60+,parch-1+
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,...,0,1,0,0,0,0,1,0,0,0
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,...,1,1,0,0,1,0,0,0,0,1
2,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,...,0,1,0,0,1,0,0,0,0,1
3,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,...,1,1,0,0,0,0,1,0,0,1
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,...,0,1,0,0,0,0,1,0,0,1


In [12]:
data2['sib-2+'] = data['parch'].map(lambda x: 0 if x<2 else 1)
data2.head()

Unnamed: 0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,pclass_1,pclass_2,pclass_3,age0-10,age11-18,age18-45,age45-60,age60+,parch-1+,sib-2+
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,...,1,0,0,0,0,1,0,0,0,0
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,...,1,0,0,1,0,0,0,0,1,1
2,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,1,0,0,0,0,1,1
3,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,0,0,1,0,0,1,1
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,0,0,1,0,0,1,1


In [13]:
data2['sex'] = data2['sex'].map({'female': 0, 'male': 1})
data2.head()

Unnamed: 0,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,pclass_1,pclass_2,pclass_3,age0-10,age11-18,age18-45,age45-60,age60+,parch-1+,sib-2+
0,1,"Allen, Miss. Elisabeth Walton",0,29.0,0,0,24160,211.3375,B5,S,...,1,0,0,0,0,1,0,0,0,0
1,1,"Allison, Master. Hudson Trevor",1,0.9167,1,2,113781,151.55,C22 C26,S,...,1,0,0,1,0,0,0,0,1,1
2,0,"Allison, Miss. Helen Loraine",0,2.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,1,0,0,0,0,1,1
3,0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,0,0,1,0,0,1,1
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1,2,113781,151.55,C22 C26,S,...,1,0,0,0,0,1,0,0,1,1


In [15]:
data2 = data2[['survived', 'sex', 'pclass_1','pclass_2', 'pclass_3', 'age0-10', 'age11-18', 'age18-45', 'age45-60', 'age60+', 'parch-1+', 'sib-2+']]
data2.head()

Unnamed: 0,survived,sex,pclass_1,pclass_2,pclass_3,age0-10,age11-18,age18-45,age45-60,age60+,parch-1+,sib-2+
0,1,0,1,0,0,0,0,1,0,0,0,0
1,1,1,1,0,0,1,0,0,0,0,1,1
2,0,0,1,0,0,1,0,0,0,0,1,1
3,0,1,1,0,0,0,0,1,0,0,1,1
4,0,0,1,0,0,0,0,1,0,0,1,1


In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test = train_test_split(data2, test_size=0.20)

<h3> 2.2 Implement the decision tree with binary variables

In [17]:
def getEntropy(p1, p2):
    return (-p1*np.log2(p1)) - (p2*np.log2(p2))

train_survived = x_train[x_train['survived'] == 1]
train_drowned = x_train[x_train['survived'] == 0]

s_p = len(train_survived) / (len(train_survived) + len(train_drowned))
d_p = 1 - s_p

entropy = getEntropy(s_p, d_p)
print(entropy)

0.964595846501


In [19]:
def getEntropyDF(data):
    col_names = ['Feature', 'Entropy0', 'Entropy1', 'Weighted Avg. Entropy', 'Entropy Gain']
    summary_data = []
    for each in data.columns[1:]:
        data_0 = data[data[each] == 0]
        data_1 = data[data[each] == 1]
        
        if (len(data_0) == 0):
            s_0_p = 0
        else: 
            s_0_p = len(data_0[data_0['survived'] == 1])/len(data_0)
            
        if (len(data_1) == 0):
            s_1_p = 0
        else:
            s_1_p = len(data_1[data_1['survived'] == 1])/len(data_1)
            

        d_0_p = 1 - s_0_p
        d_1_p = 1 - s_1_p

        e_data_0 = getEntropy(s_0_p, d_0_p)
        e_data_1 = getEntropy(s_1_p, d_1_p)
    
        ew = (len(data_1)*e_data_0 + len(data_0)*e_data_0)/(len(data_1) + len(data_0))

        gain = entropy - ew

        row = [each, e_data_0, e_data_1, ew, gain]
        summary_data.append(row)


    summary_table = pd.DataFrame(summary_data, columns=col_names)
    return summary_table

summary_table = getEntropyDF(x_train)
summary_table

Unnamed: 0,Feature,Entropy0,Entropy1,Weighted Avg. Entropy,Entropy Gain
0,sex,0.83105,0.723106,0.83105,0.133546
1,pclass_1,0.897288,0.951848,0.897288,0.067308
2,pclass_2,0.955989,0.987983,0.955989,0.008607
3,pclass_3,0.995905,0.834754,0.995905,-0.031309
4,age0-10,0.954245,0.977001,0.954245,0.010351
5,age11-18,0.963099,0.980378,0.963099,0.001497
6,age18-45,0.963252,0.965735,0.963252,0.001344
7,age45-60,0.962381,0.982681,0.962381,0.002215
8,age60+,0.966443,0.828056,0.966443,-0.001847
9,parch-1+,0.927191,0.994779,0.927191,0.037405


In [21]:
feature = summary_table['Entropy Gain'].max()
print("feature= sex:", feature)
print()

female_data = x_train[x_train['sex'] == 0]
male_data = x_train[x_train['sex'] == 1]

female_s_p = len(female_data[female_data['survived'] == 1])/len(female_data)
male_s_p = len(male_data[male_data['survived'] == 1])/len(male_data)

female_d_p = 1- female_s_p
male_d_p = 1-male_s_p

print("Females:")
print("   Survived: ", female_s_p)
print("   Drowned: ", female_d_p)

print("Males:")
print("   Survived: ", male_s_p)
print("   Drowned: ", male_d_p)

feature= sex: 0.133545503076

Females:
   Survived:  0.7371273712737128
   Drowned:  0.26287262872628725
Males:
   Survived:  0.20058997050147492
   Drowned:  0.7994100294985251


In [22]:
def bestAttribute(data):
    entropyData = getEntropyDF(data)
    sortedEntropy = entropyData.sort_values(by='Entropy Gain', ascending = False)
    
    return(sortedEntropy["Feature"].iloc[0])

In [23]:
def makeTree(data):
    len_leaf_1 = len(data[data["survived"] == 1])
    len_leaf_0 = len(data[data["survived"] == 0])
    
    if(len_leaf_0 == data.shape[0] or len_leaf_0 < 5):
        return 0
    if(len_leaf_1 == data.shape[0] or len_leaf_1 < 5):
        return 0
    if(data.shape[1] == 1):
        return len_leaf_1 / len(data)
    
    best = bestAttribute(data)

    split_0 = (data[data[best] == 0].drop(best, axis = 1))
    split_1 = (data[data[best] == 1].drop(best, axis = 1))

    return [best, makeTree(split_1), makeTree(split_0)]

In [24]:
tree = makeTree(x_train)
tree

  
  


['sex',
 ['pclass_1',
  ['age18-45',
   ['pclass_2',
    0,
    ['pclass_3',
     0,
     ['age0-10',
      0,
      ['age11-18',
       0,
       ['age45-60',
        0,
        ['age60+',
         0,
         ['parch-1+',
          ['sib-2+', 0, 0],
          ['sib-2+', 0, 0.40384615384615385]]]]]]]],
   ['sib-2+',
    0,
    ['age45-60',
     ['pclass_2',
      0,
      ['pclass_3',
       0,
       ['age0-10',
        0,
        ['age11-18', 0, ['age60+', 0, ['parch-1+', 0, 0.3333333333333333]]]]]],
     ['pclass_2',
      0,
      ['pclass_3',
       0,
       ['age0-10',
        0,
        ['age11-18',
         0,
         ['parch-1+', 0, ['age60+', 0, 0.3684210526315789]]]]]]]]],
  ['parch-1+',
   ['age0-10',
    ['sib-2+',
     ['pclass_2', 0, 0],
     ['age11-18',
      0,
      ['age18-45',
       0,
       ['age45-60',
        0,
        ['age60+',
         0,
         ['pclass_2', 0, ['pclass_3', 0.47058823529411764, 0]]]]]]],
    ['pclass_3', ['age18-45', 0, 0], 0]],
   ['

In [25]:
def predict(treeData, row):
    if(isinstance(treeData, (int, float, complex))):
        if (treeData > 0.5):
            return 1
        else:
            return 0
    else:
        if (row[treeData[0]] == 1):
            return predict(treeData[1], row)
        else:
            return predict(treeData[2], row)

predict(tree, x_train.iloc[0])

0

In [27]:
predict_all = []
for i in range(len(x_train)):
    predict_all.append(predict(tree, x_train.iloc[i]))

print("accuracy", metrics.accuracy_score(x_train['survived'], predict_all))
print("recall", metrics.recall_score(x_train['survived'], predict_all))
print("precision", metrics.precision_score(x_train['survived'], predict_all))

accuracy 0.659980897803
recall 0.203431372549
precision 0.728070175439


<h1> 3 Bagging and Random Forests

<h3> 3.1 Bagging

In [31]:
B = 5
def bagging(b):
    bags = []
    for bag in range(b):
        train, test = train_test_split(data2, test_size=0.2)
        tree = makeTree(train)
        bags.append(tree)
        
    predict_all = []
    agree_total = 0
    for i in range(len(test)):
        survivalCount = 0
        for bag in range(b):
            survivalCount += predict(bags[bag], test.iloc[i])
            
        if (survivalCount > b / 2):
            predict_all.append(1)
        else: 
            predict_all.append(0)
        if (survivalCount == b or survivalCount == 0):
            agree_total += 1       
    print("# bags: ", b)
    print("accuracy", metrics.accuracy_score(test['survived'], predict_all))
    print("recall", metrics.recall_score(test['survived'], predict_all, average='weighted'))
    print("precision", metrics.precision_score(test['survived'], predict_all, average='weighted'))
    print("100% agreement rate: ", agree_total / len(test))
    print()
    
bagging(B)

  
  


# bags:  5
accuracy 0.679389312977
recall 0.679389312977
precision 0.691518986392
100% agreement rate:  0.9923664122137404



In [32]:
B = [1, 20, 100, 250]
for each in B:
    print(each)
    bagging(each)

1


  
  


# bags:  1
accuracy 0.664122137405
recall 0.664122137405
precision 0.619988453397
100% agreement rate:  1.0

20
# bags:  20
accuracy 0.671755725191
recall 0.671755725191
precision 0.714905057185
100% agreement rate:  0.8549618320610687

100
# bags:  100
accuracy 0.671755725191
recall 0.671755725191
precision 0.737286127928
100% agreement rate:  0.9236641221374046

250
# bags:  250
accuracy 0.63358778626
recall 0.63358778626
precision 0.617540145829
100% agreement rate:  0.8740458015267175

