In [23]:
# Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

**Step 1: use our “titanic” dataset in homework #3, and split data in the same way you did in homework #3 – 80% as training and 20% test sets**

In [24]:
# Read the Titanic passenger records from the CSV in the working directory,
# then preview the first five rows to check the available columns.
data = pd.read_csv(filepath_or_buffer="Titanic.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1st,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.337494,B5,Southampton,2.0,,"St Louis, MO"
1,2,1st,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.550003,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1st,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,4,1st,0,"Allison, Mr. Hudson Joshua Crei",male,30.0,1,2,113781,151.550003,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1st,0,"Allison, Mrs. Hudson J C (Bessi",female,25.0,1,2,113781,151.550003,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [25]:
# Predictors required by the assignment; `survived` is the target.
# .copy() makes `x` an independent frame: later cells fill and re-encode its
# columns, and writing into a slice of `data` would otherwise raise
# SettingWithCopyWarning (hidden here by the blanket warnings filter).
x = data[['pclass', 'sex', 'age', 'sibsp']].copy()
y = data['survived']
x.head()

Unnamed: 0,pclass,sex,age,sibsp
0,1st,female,29.0,0
1,1st,male,0.9167,1
2,1st,female,2.0,1
3,1st,male,30.0,1
4,1st,female,25.0,1


In [26]:
# Count missing values per predictor before imputation.
x.isna().sum()

pclass      0
sex         0
age       263
sibsp       0
dtype: int64

In [27]:
# Replace missing values with the column mean (only `age` has gaps).
# numeric_only=True is required because `pclass` and `sex` are still strings
# at this point; pandas >= 2.0 raises TypeError when asked for their mean.
# Rebinding `x` instead of using inplace=True keeps the cell safe to re-run.
# NOTE(review): the mean is taken over all rows before the train/test split,
# so test rows influence the imputed value; the assignment asks for the
# training-set mean. Confirm whether that needs to change.
x = x.fillna(x.mean(numeric_only=True))

In [28]:
# Confirm that no missing values remain after imputation.
x.isna().sum()

pclass    0
sex       0
age       0
sibsp     0
dtype: int64

In [29]:
# Map the two string-valued predictors to numeric codes.
# The same encoder object is refit for each column in turn, so after this cell
# it holds only the categories of the last column processed ("pclass").
ordinal_encoder = OrdinalEncoder()

for column in ("sex", "pclass"):
    x.loc[:, column] = ordinal_encoder.fit_transform(x[[column]])

In [30]:
# Inspect the last five rows to verify the imputed and encoded values.
x.tail(n=5)

Unnamed: 0,pclass,sex,age,sibsp
1304,2.0,0.0,14.5,1
1305,2.0,0.0,29.881135,1
1306,2.0,1.0,26.5,0
1307,2.0,1.0,27.0,0
1308,2.0,1.0,29.0,0


In [31]:
# Hold out 20% of the rows as the test set. Stratifying on `y` keeps the
# survived/died ratio the same in both partitions, and the fixed seed makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)
print(
    'Shape of:\nx_train =', x_train.shape,
    '\ty_train =', y_train.shape,
    '\nx_test =', x_test.shape,
    '\ty_test =', y_test.shape,
)

Shape of:
x_train = (1047, 4) 	y_train = (1047,) 
x_test = (262, 4) 	y_test = (262,)


**Step 2: Fit a neural network using independent variables ‘pclass + sex + age + sibsp’ and dependent variable ‘survived’. Fill in n/a attributes with the average of the same attributes from other training examples. Use 2 hidden layers and set the activation functions for both the hidden and output layers to be the sigmoid function. Set the “solver” parameter to either SGD (stochastic gradient descent) or Adam (similar to SGD, but with performance optimized for mini-batches). You can adjust the parameter “alpha” for regularization (to control overfitting) and other parameters such as “learning rate” and “momentum” as needed.**

In [32]:
# First network: two hidden layers of 20 units each with sigmoid ('logistic')
# activation, trained with the Adam solver.
# NOTE(review): scikit-learn documents `momentum` as used only when
# solver='sgd', so the 0.3 below has no effect with Adam.
mlp_params = dict(
    hidden_layer_sizes=(20, 20),
    activation='logistic',
    solver='adam',
    alpha=0.0001,
    momentum=0.3,
    random_state=1234,
)
mlp = MLPClassifier(**mlp_params)
mlp.fit(x_train, y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(20, 20), momentum=0.3,
              random_state=1234)

In [33]:
# Predicting on the test set
# Score the fitted network on the held-out test set.
y_pred = mlp.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8587786259541985


**Step 3: Check the performance of the model with out-of-sample accuracy, defined as
out-of-sample percent survivors correctly predicted (on test set)
out-of-sample percent fatalities correctly predicted (on test set)
Please try two different network structures (i.e., number of neurons at each hidden layer) and
show their respective accuracy.**

In [34]:
# --- In-sample (training set) per-class accuracy ---
y_pred_train = mlp.predict(x_train)
confusion_matrix_train = confusion_matrix(y_train, y_pred_train)

# Rows are true labels and columns are predicted labels, so flattening the
# 2x2 matrix yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = confusion_matrix_train.ravel()
survivors_pct = TP / (TP + FN) * 100
fatalities_pct = TN / (FP + TN) * 100

print("in-sample survivors correctly predicted (on training set) = {0:.2f}%".format(survivors_pct))
print("in-sample fatalities correctly predicted (on training set) = {0:.2f}%".format(fatalities_pct))

# --- Out-of-sample (test set) per-class accuracy ---
# Reuses `y_pred`, the test-set predictions computed in the previous cell.
confusion_matrix_test = confusion_matrix(y_test, y_pred)

TN, FP, FN, TP = confusion_matrix_test.ravel()
survivors_pct = TP / (TP + FN) * 100
fatalities_pct = TN / (FP + TN) * 100

print("\nout-of-sample survivors correctly predicted (on testing set) = {0:.2f}%".format(survivors_pct))
print("out-of‐sample fatalities correctly predicted (on testing set) = {0:.2f}%".format(fatalities_pct))

in-sample survivors correctly predicted (on training set) = 64.75%
in-sample fatalities correctly predicted (on training set) = 88.72%

out-of-sample survivors correctly predicted (on testing set) = 70.00%
out-of‐sample fatalities correctly predicted (on testing set) = 95.68%


**Using another network structure: 100 neurons in each of the two hidden layers**

In [35]:
# Second network: same settings as the first, but with 100 units in each of
# the two hidden layers. This rebinds `mlp`, replacing the 20-unit model.
# NOTE(review): scikit-learn documents `momentum` as used only when
# solver='sgd', so the 0.3 below has no effect with Adam.
mlp_params = dict(
    hidden_layer_sizes=(100, 100),
    activation='logistic',
    solver='adam',
    alpha=0.0001,
    momentum=0.3,
    random_state=1234,
)
mlp = MLPClassifier(**mlp_params)
mlp.fit(x_train, y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 100),
              momentum=0.3, random_state=1234)

In [36]:
# Predicting on the test set
# Score the 100-unit network on the held-out test set.
y_pred = mlp.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8549618320610687


In [40]:
# Predicting on the train set
y_pred_train = mlp.predict(x_train)

confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
TN = confusion_matrix_train[0][0]
FN = confusion_matrix_train[1][0]
TP = confusion_matrix_train[1][1]
FP = confusion_matrix_train[0][1]

print("in-sample survivors correctly predicted (on training set) = {0:.2f}%".format(TP/(TP+FN)*100))

print("in-sample fatalities correctly predicted (on training set) = {0:.2f}%".format(TN/(FP+TN)*100))


confusion_matrix_test = confusion_matrix(y_test, y_pred)
TN = confusion_matrix_test[0][0]
FN = confusion_matrix_test[1][0]
TP = confusion_matrix_test[1][1]
FP = confusion_matrix_test[0][1]

print("\nout-of-sample survivors correctly predicted (on testing set) = {0:.2f}%".format(TP/(TP+FN)*100))

print("out-of‐sample fatalities correctly predicted (on testing set) = {0:.2f}%".format(TN/(FP+TN)*100))

in-sample survivors correctly predicted (on training set) = 65.75%
in-sample fatalities correctly predicted (on training set) = 88.10%

out-of-sample survivors correctly predicted (on testing set) = 70.00%
out-of‐sample fatalities correctly predicted (on testing set) = 95.06%


**Step 4: Compare the out-of-sample accuracy (as defined in step 3) with the random forest
obtained in homework #3. (You can either use a table or plot the results of the two algorithms
in one figure). Explain any difference in accuracy.**

In [38]:
# Side-by-side comparison of the per-class out-of-sample accuracy of the
# Homework #3 random forest and the two neural networks.
# The percentages are transcribed by hand from the printed results.
comparison_rows = [
    {
        'Parameter': 'out‐of‐sample survivors correctly predicted (on testing set)',
        'Random Forest Classifier': '71.00%',
        '2 Layer NN with 20 neurons': '70.00%',
        '2 Layer NN with 100 neurons': '70.00%',
    },
    {
        'Parameter': 'out-of‐sample fatalities correctly predicted (on testing set)',
        'Random Forest Classifier': '88.89%',
        '2 Layer NN with 20 neurons': '95.68%',
        '2 Layer NN with 100 neurons': '95.06%',
    },
]

# Build the frame in one step from the records: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is discouraged anyway.
df = pd.DataFrame(
    comparison_rows,
    columns=['Parameter', 'Random Forest Classifier', '2 Layer NN with 20 neurons', '2 Layer NN with 100 neurons'],
)

# None disables column-width truncation so the long 'Parameter' labels are
# shown in full; the former -1 sentinel is deprecated since pandas 1.0 and
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,Parameter,Random Forest Classifier,2 Layer NN with 20 neurons,2 Layer NN with 100 neurons
0,out‐of‐sample survivors correctly predicted (on testing set),71.00%,70.00%,70.00%
1,out-of‐sample fatalities correctly predicted (on testing set),88.89%,95.68%,95.06%


In [41]:
# Random Forest Results
# Percent of survivors correctly predicted (on test set): 71.0 %
# Percent of fatalities correctly predicted (on test set): 88.89 %

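**The Neural Network predicts fatalities considerably better than the Random Forest Classifier (95.68% and 95.06% vs. 88.89% on the test set), while the two are nearly identical on survivors (70.00% vs. 71.00%, with the Random Forest slightly ahead). The Neural Network's advantage therefore comes entirely from the fatalities class.**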