# Prepare data

Import data and create train/test set

In [30]:
import pandas as pd
import numpy as np
# import Titanic data
df = pd.read_csv("../data/titanicDataSet.csv")

# drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# choose only columns we may want to use in Analysis
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded']]

# choose only columns where data is complete for all features
df = df[(df['Pclass'].notnull()) & (df['Age'].notnull()) & (df['SibSp'].notnull()) & (df['Parch'].notnull()) & (df['Fare'].notnull()) & (df['Boarded'].notnull()) & (df['Sex'].notnull())]

In [31]:
# make train/test Set
df_train = df[df['Survived'].notnull()]
df_test = df[df['Survived'].isnull()]

# set features and target
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']
X_test = df_test.drop('Survived', axis=1)
y_test = df_test['Survived']
feature_column_names = X_train.columns

# get indices for train/test sets
index_values_train = X_train.index
index_values_test = X_test.index

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [32]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from tensorflow.keras.utils import to_categorical

# separate categorical and numeric data to encode categorical data
categorical_X_train = df_train[['Sex', 'Boarded']]
numeric_X_train = df_train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
categorical_X_test = df_test[['Sex', 'Boarded']]
numeric_X_test = df_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

# encode categorical data
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(categorical_X_train)
cat_encoded_X_train = ordinal_encoder.transform(categorical_X_train)
cat_encoded_X_test = ordinal_encoder.transform(categorical_X_test)

# make categorical dataframes to join with numeric dataframes
cat_encoded_X_train = pd.DataFrame(data = cat_encoded_X_train, index = index_values_train, columns = ['Sex', 'Boarded'])
cat_encoded_X_test = pd.DataFrame(data = cat_encoded_X_test, index = index_values_test, columns = ['Sex', 'Boarded'])

# join categorical encoded data with numeric data
encoded_X_train = numeric_X_train.join(cat_encoded_X_train)
encoded_X_test = numeric_X_test.join(cat_encoded_X_test)

# encode train target values
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)

# categorical one hot encoding needed for neural network
cat_y_train = to_categorical(encoded_y_train)

# scale X data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(encoded_X_train)
X_train_scaled = X_scaler.transform(encoded_X_train)
X_test_scaled = X_scaler.transform(encoded_X_test)

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
885    0.0
886    0.0
887    1.0
889    1.0
890    0.0
Name: Survived, Length: 714, dtype: float64
[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


# Train the Model



In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# make model with layers
model = Sequential()
number_inputs = len(encoded_X_train.columns)
number_hidden_nodes = number_inputs + 1
number_outputs = len(cat_y_train[0])
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_outputs, activation='softmax'))

# GridSearch (Gradient Descent) is being done in .compile or .fit)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_scaled, cat_y_train, epochs=1000,shuffle=True)

# evaluate model
model_loss, model_accuracy = model.evaluate(X_train_scaled, cat_y_train)
print(f"Train - Loss: {model_loss}, Accuracy: {model_accuracy}")

7 8 2
     Pclass   Age  SibSp  Parch     Fare  Sex  Boarded
0         3  22.0      1      0   7.2500  1.0      3.0
1         1  38.0      1      0  71.2833  0.0      1.0
2         3  26.0      0      0   7.9250  0.0      3.0
3         1  35.0      1      0  53.1000  0.0      3.0
4         3  35.0      0      0   8.0500  1.0      3.0
..      ...   ...    ...    ...      ...  ...      ...
885       3  39.0      0      5  29.1250  0.0      2.0
886       2  27.0      0      0  13.0000  1.0      3.0
887       1  19.0      0      0  30.0000  0.0      3.0
889       1  26.0      0      0  30.0000  1.0      1.0
890       3  32.0      0      0   7.7500  1.0      2.0

[714 rows x 7 columns]
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/10

In [34]:
# make dataframes with indices associated with numeric data
encoded_X_train = pd.DataFrame(data = X_train_scaled, index = index_values_train, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])
encoded_X_test = pd.DataFrame(data = X_test_scaled, index = index_values_test, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])

# combine test and train set to predict all values
encoded_X = encoded_X_train.append(encoded_X_test)
encoded_X = encoded_X.sort_index(axis=0)
y_pred = model.predict(encoded_X)

In [36]:
# output array to useful output ((a, b) = (deceased, survived))
y_out =[]
for row in y_pred:
    y_out.append(round(row[1]))

In [37]:
# add predicted column to original data
df['predicted_survival'] = y_out

In [38]:
# save results to csv
df.to_csv('neural_results.csv')