# Data preparation

Import data and remove unused columns

In [0]:
import pandas as pd
import numpy as np

# Both CSVs are indexed by PassengerId so the id can be recovered from the
# frame index later on.
data_train = pd.read_csv('train.csv', index_col='PassengerId')
data_test = pd.read_csv('test.csv', index_col='PassengerId')

# Feature columns removed from both frames. Name and Ticket are dropped
# because they are very noisy; Cabin and Embarked are dropped as well.
unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']

# 'Survived' is the training label; it is split off from the features.
y_train = data_train['Survived']
X_train = data_train.drop(columns=['Survived'] + unused_columns)
X_test = data_test.drop(columns=unused_columns)


Normalize data

In [76]:
# Summary statistics of the numeric training features; the differing value
# ranges per column shown here motivate the normalization that follows.
X_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [0]:
def normalize_data(data, mean, std):
  """Standardize `data` by shifting it by `mean` and scaling it by `std`.

  Args:
    data: value(s) to standardize; anything supporting `-` and `/`
      (e.g. a number or a pandas Series).
    mean: value subtracted from `data`.
    std: value the centered data is divided by.

  Returns:
    The result of `(data - mean) / std`.
  """
  centered = data - mean
  return centered / std

In [0]:
# Impute and standardize every numeric feature.
#
# All statistics (imputation value, mean, std) are computed on the TRAINING
# set only and then applied unchanged to the test set. Standardizing the test
# set with its own mean/std would put the two sets on different scales, so the
# model would see test inputs distributed differently from what it was
# trained on.
numeric_columns = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']

for column in numeric_columns:
  # Fill gaps in the training column with its own mean (a no-op for columns
  # without missing values), then take mean/std of the imputed column.
  train_filled = X_train[column].fillna(X_train[column].mean())
  train_mean = train_filled.mean()
  train_std = train_filled.std()

  # Missing test values are imputed with the training mean as well, so no NaN
  # is passed on to the model from any numeric column.
  test_filled = X_test[column].fillna(train_mean)

  X_train[column] = normalize_data(train_filled, train_mean, train_std)
  X_test[column] = normalize_data(test_filled, train_mean, train_std)

Process categorical data

In [0]:

# Encode 'Sex' as integer category codes.
#
# The category list is derived once from the training data and reused for the
# test data, so a given label always maps to the same code in both frames.
# Encoding each frame independently only yields matching codes when both
# happen to contain exactly the same set of labels.
sex_categories = pd.Categorical(X_train['Sex']).categories

X_train['Sex'] = pd.Categorical(X_train['Sex'], categories=sex_categories).codes
# A test label that never occurs in the training data is encoded as -1.
X_test['Sex'] = pd.Categorical(X_test['Sex'], categories=sex_categories).codes

# Model setup

In [0]:
from tensorflow import keras
#from sklearn.model_selection import train_test_split

def get_compiled_model():
  """Build and compile the binary classifier.

  The network is a Sequential stack of two 128-unit ReLU Dense layers followed
  by a single sigmoid output unit. It is compiled with the Adam optimizer,
  binary cross-entropy loss and accuracy as the reported metric.

  Returns:
    The compiled keras.Sequential model (not yet trained).
  """
  hidden_units = 128

  layers = [
      keras.layers.Dense(hidden_units, activation='relu')
      for _ in range(2)
  ]
  layers.append(keras.layers.Dense(1, activation='sigmoid'))

  model = keras.Sequential(layers)
  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model



Train model

In [81]:
# Create a fresh model and train it for 10 epochs on the prepared training
# features and labels (passed as plain arrays).
model = get_compiled_model()
model.fit(x=X_train.values, y=y_train.values, epochs=10)

Train on 891 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f154feddfd0>

# Predict

In [0]:
# Run the trained model on the prepared test features. Each entry of
# `predictions` is read as prediction[0] by the cell that writes the results.
predictions = model.predict(X_test.values)

In [0]:
def save_to_csv(filename, str):
  """Append one line of text to a file.

  Args:
    filename: path of the file to append to; it is opened in append mode,
      so existing content is kept and the file is created if missing.
    str: the text of the line, without a trailing newline.
  """
  line = str + "\n"
  with open(filename, 'a') as out_file:
    out_file.write(line)

In [94]:
# Write one "<PassengerId>,<0 or 1>" line per test passenger.
#
# NOTE(review): save_to_csv appends, so re-running this cell adds the rows to
# predictions.csv a second time, and no header line is written here - confirm
# whether the consumer of the file expects a header.
for prediction, passenger_id in zip(predictions, X_test.index):
  probability = prediction[0]
  # A NaN model output is treated as class 0; otherwise the output is rounded
  # to the nearest class label. The `predictions` array itself is left
  # untouched.
  survived = 0 if np.isnan(probability) else int(round(probability))
  # Named `row` rather than `str` so the builtin `str` stays usable in the
  # cells that run after this one.
  row = '{},{}'.format(passenger_id, survived)
  save_to_csv('predictions.csv', row)


892,0
893,0
894,0
895,0
896,0
897,0
898,1
899,0
900,1
901,0
902,0
903,0
904,1
905,0
906,1
907,1
908,0
909,0
910,0
911,0
912,1
913,1
914,1
915,1
916,1
917,0
918,1
919,0
920,0
921,0
922,0
923,0
924,0
925,0
926,1
927,0
928,1
929,1
930,0
931,0
932,0
933,1
934,0
935,1
936,1
937,0
938,0
939,0
940,1
941,0
942,1
943,0
944,1
945,1
946,0
947,0
948,0
949,0
950,0
951,1
952,0
953,0
954,0
955,1
956,1
957,1
958,1
959,0
960,1
961,1
962,1
963,0
964,1
965,1
966,1
967,0
968,0
969,1
970,0
971,1
972,1
973,1
974,0
975,0
976,0
977,0
978,1
979,1
980,1
981,1
982,0
983,0
984,1
985,0
986,1
987,0
988,1
989,0
990,1
991,0
992,1
993,0
994,0
995,0
996,1
997,0
998,0
999,0
1000,0
1001,0
1002,0
1003,1
1004,1
1005,1
1006,1
1007,0
1008,0
1009,1
1010,0
1011,1
1012,1
1013,0
1014,1
1015,0
1016,0
1017,1
1018,0
1019,0
1020,0
1021,0
1022,0
1023,0
1024,1
1025,0
1026,0
1027,0
1028,0
1029,0
1030,1
1031,0
1032,0
1033,1
1034,0
1035,0
1036,0
1037,0
1038,0
1039,0
1040,1
1041,0
1042,1
1043,0
1044,0
1045,0
1046,0
1047,0
1048,1
1049,1
10