# This notebook creates a model to predict whether or not a given passenger on the Titanic survived the sinking of the ship

In [188]:
#importing required libraries
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [189]:
# Read the training CSV into `df` and display its first rows.
df = pd.read_csv('../data/train.csv')
df.head()

Number of records in training data is 891


In [190]:
# Raw columns used to build the model inputs; 'Cabin' is only kept long enough
# to derive 'Cabin_Block' and is dropped afterwards.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']
# Integer codes for the two categorical inputs.
gender_dict = {'male': 1, 'female': 2}
block_dict = {'X': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}

# Work on an explicit copy: editing a selection of `df` through chained
# `X[col].replace(..., inplace=True)` raises SettingWithCopy warnings and has
# no effect at all when pandas Copy-on-Write is enabled.
X = df[features].copy()
# `map` yields a numeric column directly (unmapped values become NaN and are
# mean-filled below) instead of relying on `replace` to downcast an object column.
X['Sex'] = X['Sex'].map(gender_dict)
# First character of the cabin string; 'X' stands in for a missing cabin.
X['Cabin_Block'] = X['Cabin'].str.slice(0, 1).fillna('X').map(block_dict)
X = X.drop(columns='Cabin')
# Fill any remaining gaps with the column mean.
X = X.fillna(X.mean())
X.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [228]:
# Keep the fitted scaler under a name instead of discarding it, so the exact
# transform learned here stays available after `X` is replaced by the scaled array.
scaler = StandardScaler()
X = scaler.fit_transform(X)

Number of rows in training set is 891


In [229]:
# Target column, plus the number of feature columns the network's input layer must accept.
y = df['Survived']
n_inputs = X.shape[1]
print('Number of rows in training set is {}'.format(len(y)))
print('number of inouts is {}'.format(n_inputs))

Number of rows in training set is 891
number of inouts is 6


In [230]:
def regModel(n_ip, n_hiddenLayers, n_hiddenLayerUnits):
    """Build and compile a fully connected Sequential model with one output unit.

    Arguments:
        n_ip: number of inputs.
        n_hiddenLayers: number of hidden layers in the model.
        n_hiddenLayerUnits: array whose i-th element is the number of units in
            the i-th hidden layer.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer and
        'mean_squared_error' loss.
    """
    # The first hidden layer is always created and also declares the input shape.
    stack = [Dense(n_hiddenLayerUnits[0], activation='relu', input_shape=(n_ip,))]
    # Remaining hidden layers; the range is empty when n_hiddenLayers <= 1.
    stack += [
        Dense(n_hiddenLayerUnits[i], activation='relu')
        for i in range(1, n_hiddenLayers)
    ]
    # Single output unit, created without an activation argument.
    stack.append(Dense(1))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [231]:
# Hold out 30% of the rows, fit a network with hidden layers of 10 and 5 units,
# and score it on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)
model = regModel(n_inputs, 2, [10, 5])
model.fit(X_train, y_train, epochs=200, verbose=0)

# Round the network output to the nearest integer label before scoring.
y_pred = np.round(model.predict(X_test, verbose=0))
mseArr = mean_squared_error(y_test, y_pred)
# Percentage of held-out rows whose rounded prediction equals the true label.
n_correct = sum(y_pred[:, 0] == y_test)
acc = (n_correct * 100) / len(y_test)


In [232]:
# Print the two scores computed by the evaluation cell, one per line.
for template, score in (('mean of mse is {}', mseArr), ('mean accuracy is {}', acc)):
    print(template.format(score))

mean of mse is 0.19776119402985073
mean accuracy is 80.22388059701493


# Adding age to input

In [233]:
# Read the test CSV into `data_pred` and count the missing values in each column.
data_pred = pd.read_csv('../data/test.csv')
data_pred.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [238]:
# The network was fitted on Pclass, Sex, Age, SibSp, Parch, Cabin_Block (in that
# order), so the prediction inputs must have the same columns, encodings and
# order. Selecting a different six columns runs without error (the width still
# matches) but feeds the model values it was never trained on.
X_pred = data_pred[features].copy()
X_pred['Sex'] = X_pred['Sex'].map(gender_dict)
# First character of the cabin string; 'X' stands in for a missing cabin.
# Any letter absent from `block_dict` becomes NaN and is mean-filled below.
X_pred['Cabin_Block'] = X_pred['Cabin'].str.slice(0, 1).fillna('X').map(block_dict)
X_pred = X_pred.drop(columns='Cabin')
X_pred = X_pred.fillna(X_pred.mean())
print('Number of rows in testing set is {}'.format(X_pred.shape[0]))



Number of rows in testing set is 418


In [239]:

# Standardise each column: subtract its mean, then divide by its standard deviation.
# NOTE(review): these statistics are computed from the test file itself rather
# than from the data the model was trained on; confirm this is intended.
X_pred = X_pred.sub(X_pred.mean()).div(X_pred.std())
y_pred = model.predict(X_pred)

In [240]:
# Turn the network output into 0/1 labels. The output unit has no activation,
# so a rounded value can fall outside {0, 1}; clip it back into range.
# ravel() flattens the single-column prediction array into a plain 1-D column.
survived = np.clip(np.round(y_pred), 0, 1).astype('int').ravel()
final_sub = pd.DataFrame({
    'PassengerId': data_pred['PassengerId'],
    'Survived': survived,
})
# index=False keeps the row index out of the file, so the CSV holds only the
# two named columns.
final_sub.to_csv('../data/submission.csv', index=False)