In [None]:
from sklearn import tree # tree library
%matplotlib inline
from sklearn.tree import export_graphviz
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: read the Titanic CSV into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/Titanic.csv")
data.head()  # last expression of the cell: previews the first 5 rows


Unnamed: 0.1,Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1st,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,Southampton,2.0,,"St Louis, MO"
1,2,1st,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1st,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,4,1st,0,"Allison, Mr. Hudson Joshua Crei",male,30.0,1,2,113781,151.550003,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1st,0,"Allison, Mrs. Hudson J C (Bessi",female,25.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [None]:
# Keep only the label (survived) and the four features: pclass, sex, age, sibsp.
# The wanted columns are selected instead of dropping the unwanted ones in
# place, so this cell can be re-run safely: an in-place drop raises KeyError on
# a second execution because the listed columns no longer exist.
data = data[['pclass', 'survived', 'sex', 'age', 'sibsp']]
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp
0,1st,1,female,29.0,0
1,1st,1,male,0.9167,1
2,1st,0,female,2.0,1
3,1st,0,male,30.0,1
4,1st,0,female,25.0,1


In [None]:
# Separate the label from the features.
target = data['survived']               # label: the survived column
inputs = data.drop(columns='survived')  # features: every remaining column

In [None]:
# Label-encode the categorical features and fill in missing ages.
#
# The feature frame is rebuilt from `data` here rather than mapped onto
# itself. Series.map() turns every value that is not a key of the mapping into
# NaN, so re-running a cell that does `inputs.sex = inputs.sex.map(...)` wipes
# out the already-encoded sex and pclass columns. Starting from `data` makes
# the cell give the same result every time it is run.
sex_codes = {'male': 1, 'female': 2}  # label encoding the 2 possible values
pclass_codes = {'1st': 1, '2nd': 2, '3rd': 3}  # label encoding pclass with 3 possible values

from keras.utils import to_categorical  # NOTE(review): not used in the visible cells

inputs = data.drop(columns='survived')
inputs['sex'] = inputs['sex'].map(sex_codes)
inputs['pclass'] = inputs['pclass'].map(pclass_codes)
# Fill empty age entries with the mean age.
# NOTE(review): the mean is taken over all rows, before the train/test split,
# so the imputed value also reflects the rows that end up in the test set.
inputs['age'] = inputs['age'].fillna(inputs['age'].mean())
inputs.tail()

Unnamed: 0,pclass,sex,age,sibsp
1304,3,2,14.5,1
1305,3,2,29.881135,1
1306,3,1,26.5,0
1307,3,1,27.0,0
1308,3,1,29.0,0


In [None]:
from sklearn.model_selection import train_test_split

# Split into training and testing data 80/20 at random. A fixed random_state
# makes the shuffle repeatable, so the same rows land in each set on every run.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, test_size=.2, random_state=42
)

In [None]:
# Fit an unpruned decision tree that chooses splits by entropy.
from sklearn import tree
# random_state pins the estimator's internal randomness so the same training
# data always produces the same tree.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=42)
model = model.fit(x_train, y_train)  # fit model with training data

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(model, out_file='tree.dot', feature_names = inputs.columns, class_names=['Died','Survived'], rounded = True, proportion = False, precision = 0, filled = True)
!dot -Tpng tree.dot -o tree.png 
from IPython.display import Image   #display decision tree
Image(filename = 'tree.png')  #saves image as tree.png

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Number 3, metrics using the unpruned decision tree
from sklearn.metrics import confusion_matrix

# For binary labels, confusion_matrix(...).ravel() returns the four counts in
# the order tn, fp, fn, tp. Unpacking them as tp, fp, fn, tn swaps the two
# diagonal counts: the value printed as the true positive rate is then really
# tn/(tn + fn) and the one printed as the true negative rate is tp/(tp + fp).

# In sample (using training data)
y_pred = model.predict(x_train)  # predictions from training set
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
print('In Sample')
print(' ')
print('True Positives Rate: ', tp/(tp + fn))
print('True Negatives Rate: ', tn/(tn + fp))
print('Accuracy: ', model.score(x_train, y_train))    # accuracy with training set
print(' ')

# Out of sample (using testing data)
y_pred2 = model.predict(x_test)  # predictions from testing set
tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_pred2).ravel()
print('Out of Sample')
print(' ')
print('True Positives Rate: ', tp2/(tp2 + fn2))
print('True Negatives Rate: ', tn2/(tn2 + fp2))
print('Accuracy: ', model.score(x_test, y_test))    # accuracy with test set


In Sample
 
True Positives Rate:  0.8657342657342657
True Negatives Rate:  0.9126506024096386
Accuracy:  0.8806112702960841
 
Out of Sample
 
True Positives Rate:  0.8070175438596491
True Negatives Rate:  0.7472527472527473
Accuracy:  0.7862595419847328


In [None]:
# Number 4: cross-validate the tree over a range of max_leaf_nodes values
from sklearn.model_selection import GridSearchCV, cross_val_score

# Candidate values 2..10 for the "max_leaf_nodes" parameter, as a numpy array.
maxLeafNodes = np.array(list(range(2, 11)))
param_grid = {'max_leaf_nodes': maxLeafNodes}

# 5-fold cross-validation over every candidate (cv = number of folds).
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Mean cross-validated error rate of each candidate, scaled by the training
# set size to express it as a number of misclassified samples.
misclassifications = (1 - grid_search.cv_results_['mean_test_score']) * y_train.size
print(misclassifications)

plt.ylabel('misclassifications')
plt.xlabel('Tree Size')
plt.plot(maxLeafNodes, misclassifications)

In [None]:
#Number 5
min_misclass = int(np.amin(misclassifications)) # convert from tuple to int
min_misclassification_index = np.where(misclassifications == np.amin(misclassifications)) #gives index of minimum number of misclassifications
min_misclassification_index = min_misclassification_index[0]
best_leaf_nodes = int(maxLeafNodes[min_misclassification_index[0]]) # Gives value of max leaf nodes at the corresponding index, convert from tuple to int
print('Minimum Misclassifications: ', min_misclass)
print('Best HyperParameter: ', best_leaf_nodes) #print the best parameter

model = tree.DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=best_leaf_nodes)  #create new decision tree with best parameter
model = model.fit(x_train, y_train) #fit model

#plot new pruned tree
export_graphviz(model, out_file='pruned_tree.dot', feature_names = inputs.columns, class_names=['Died','Survived'], rounded = True, proportion = False, precision = 0, filled = True)
!dot -Tpng pruned_tree.dot -o pruned_tree.png 
from IPython.display import Image   #display decision tree
Image(filename = 'pruned_tree.png')  #saves image

In [None]:
#Number 6, results with new pruned decision tree

# confusion_matrix(...).ravel() returns the counts in the order tn, fp, fn, tp
# for binary labels, so they are unpacked in that order here. Every count the
# rates below use is assigned in this cell; unpacking into a misspelled name
# such as `tm` would leave `tn` holding the value from the unpruned-tree cell.

#In sample (using training data)
y_pred = model.predict(x_train)  # predictions from training set
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
print('In Sample')
print(' ')
print('True Positives Rate: ', tp/(tp + fn))
print('True Negatives Rate: ', tn/(tn + fp))
print('Accuracy: ', model.score(x_train, y_train))    # accuracy with training set
print(' ')
#Out of sample (using testing data)
y_pred2 = model.predict(x_test)  # predictions from testing set
tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_pred2).ravel()
print('Out of Sample')
print(' ')
print('True Positives Rate: ', tp2/(tp2 + fn2))
print('True Negatives Rate: ', tn2/(tn2 + fp2))
print('Accuracy: ', model.score(x_test, y_test))    # accuracy with test set

'''The out-of-sample accuracy is better for the pruned tree than for the larger decision tree.
   The trade-off is that the in-sample accuracy slightly decreased from the large tree to the pruned tree.'''