In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
data=pd.read_csv('Titanic.csv')

# DataFrame=pclass + sex + age + sibsp---> survived

data['pclass'].replace('1st',1,inplace=True)
data['pclass'].replace('2nd',2,inplace=True)
data['pclass'].replace('3rd',3,inplace=True)

data['sex'].replace('female',0,inplace=True)
data['sex'].replace('male',1,inplace=True)

data['age'].fillna(data['age'].median(),inplace=True)

data=data[['pclass','sex','age','sibsp','survived']]
data

Unnamed: 0,pclass,sex,age,sibsp,survived
0,1,0,29.0000,0,1
1,1,1,0.9167,1,1
2,1,0,2.0000,1,0
3,1,1,30.0000,1,0
4,1,0,25.0000,1,0
...,...,...,...,...,...
1304,3,0,14.5000,1,0
1305,3,0,28.0000,1,0
1306,3,1,26.5000,0,0
1307,3,1,27.0000,0,0


In [11]:
# Step-1
# Randomly splitting data into 80%-Trainin 20%-Test
x_data=data[['pclass','sex','age','sibsp']]
y_data=data['survived']
x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,train_size=0.8, random_state=None, shuffle=True, stratify=None)
print(np.shape(x_train))
print(np.shape(x_test))
print(np.shape(y_train))
print(np.shape(y_test))

(1047, 4)
(262, 4)
(1047,)
(262,)


In [16]:
# Step-2
ann=MLPClassifier(hidden_layer_sizes=(2,),activation='logistic',solver='adam',
                  learning_rate='constant',learning_rate_init=0.01,momentum=0.9,alpha=1e-5,max_iter=5000)
ann_fit=ann.fit(x_train,y_train)

In [17]:
ann_predict=ann_fit.predict(x_test)

In [18]:
# Step-3
cm=np.array(confusion_matrix(y_test,ann_predict,labels=[1,0]))
confusion=pd.DataFrame(cm,index=['Survived','Non-Survived'],columns=['Predicted Survived','Predicted Non-Survived'])
confusion

Unnamed: 0,Predicted Survived,Predicted Non-Survived
Survived,60,38
Non-Survived,13,151


In [19]:
TP=confusion['Predicted Survived']['Survived']
FP=confusion['Predicted Survived']['Non-Survived']
FN=confusion['Predicted Non-Survived']['Survived']
TN=confusion['Predicted Non-Survived']['Non-Survived']

# out-of-sample percent survivors correctly predicted (on test set)
ANN_Survivors_Correctly_Predicted=TP/(TP+FP)
print('out-of-sample percent survivors correctly predicted (on test set)=',ANN_Survivors_Correctly_Predicted)

# out-of-sample percent fatalities correctly predicted (on test set) 
ANN_Fatalities_Correctly_Predicted=TN/(FP+TN)
print('out-of-sample percent fatalities correctly predicted (on test set)=',ANN_Fatalities_Correctly_Predicted)

out-of-sample percent survivors correctly predicted (on test set)= 0.821917808219178
out-of-sample percent fatalities correctly predicted (on test set)= 0.9207317073170732


In [20]:
#Step-4
"""
Compare the out-of-sample accuracy (as defined in step 3) with the random forest
obtained in homework #3. (You can either use a table or plot the results of the two algorithms
in one figure). Explain any difference in accuracy
"""

RandomForest_Percent_Survivors_Correctly_Predicted= 0.8160919540229885
RandomForest_Percent_Fatalities_Correctly_Predicted= 0.8057142857142857


data={'Random Forest':[RandomForest_Percent_Survivors_Correctly_Predicted,RandomForest_Percent_Fatalities_Correctly_Predicted],
      'Neural Network':[ANN_Survivors_Correctly_Predicted,ANN_Fatalities_Correctly_Predicted]}
comparison=pd.DataFrame(data,index=['Percent Survivors Correctly Predicted','Percent Fatalities Correctly Predicted'])
comparison


Unnamed: 0,Random Forest,Neural Network
Percent Survivors Correctly Predicted,0.816092,0.821918
Percent Fatalities Correctly Predicted,0.805714,0.920732


Random Forest and Artificial Neural Network almost perform the same in terms of predicting survivors correctly, but Neural Network is slightly better. However, Artificial Neural Network outperforms in terms of predicting Fatalities correctly.