In [1]:
import numpy as np # linear algebra
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
data = [train, test]
ports = {"S": 0, "C": 1, "Q": 2}
common_value = 'S'
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'travelled_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 1
    dataset['travelled_alone'] = dataset['travelled_alone'].astype(int)
    mean = train["Age"].mean()
    std = test["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = train["Age"].astype(int)
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 5)
    dataset['Title'] = dataset['Title'].replace('Mrs', 3)
    dataset['Title'] = dataset['Title'].replace(['Ms','Mlle','Mme','Miss'], 2)
    dataset['Title'] = dataset['Title'].replace('Mr', 1)
    dataset['Title'] = dataset['Title'].replace('Master', 4)
    # convert titles into numbers
    #dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to get safe
    dataset['Title'] = dataset['Title'].fillna("NA")
    dataset['Sex'].replace('female', 0,inplace=True)
    dataset['Sex'].replace('male', 1,inplace=True)
    dataset['Embarked'] = dataset['Embarked'].map(ports)

In [3]:
train_data = train[['Pclass','Sex','Age','Fare','Embarked','relatives','travelled_alone','Title']]
train_data = train_data.to_numpy()
train_label = train[['Survived']]
train_label = train_label.to_numpy()

In [4]:
test_data = test[['Pclass','Sex','Age','Fare','Embarked','relatives','travelled_alone','Title']]
#test_data = test_data.to_numpy()
test_data

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,relatives,travelled_alone,Title
0,3,1,22,7,2,0,1,1
1,3,0,38,7,0,1,0,3
2,2,1,26,9,2,0,1,1
3,3,1,35,8,0,0,1,1
4,3,0,35,12,0,2,0,3
...,...,...,...,...,...,...,...,...
413,3,1,16,8,0,0,1,1
414,1,0,44,108,1,0,1,5
415,3,1,38,7,0,0,1,1
416,3,1,34,8,0,0,1,1


# Deep learning

In [5]:
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential
from sklearn import metrics
from tensorflow.keras.callbacks import EarlyStopping

def model_create():
    model = Sequential()
    model.add(Dense(160, input_shape=(8,),activation ='relu'))
    model.add(Dense(100, activation ='relu'))
    model.add(Dense(10, activation ='relu'))
    model.add(Dense(1, activation ='sigmoid'))
    return model

In [6]:
epoch = 100

es = EarlyStopping(monitor='val_accuracy', mode='auto', verbose=2, patience=50, restore_best_weights = True)

model = model_create()
model.summary()
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics=["accuracy"])
model.fit(train_data, train_label,epochs=epoch, batch_size=8, validation_split=0.2,verbose = 2, callbacks=[es])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 160)               1440      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               16100     
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 18,561
Trainable params: 18,561
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
89/89 - 0s - loss: 0.6720 - accuracy: 0.6559 - val_loss: 0.5505 - val_accuracy: 0.7263
Epoch 2/100
89/89 - 0s - loss: 0.6769 - accuracy: 0.6784 - val_loss: 0.4824 - val_accuracy: 0.8156
Epoch 3/100
89/89 - 0s - loss: 0.6453 - accuracy: 0.

Epoch 75/100
89/89 - 0s - loss: 0.4017 - accuracy: 0.8272 - val_loss: 0.3961 - val_accuracy: 0.8324
Epoch 76/100
89/89 - 0s - loss: 0.4224 - accuracy: 0.8244 - val_loss: 0.3763 - val_accuracy: 0.8492
Epoch 77/100
89/89 - 0s - loss: 0.4176 - accuracy: 0.8174 - val_loss: 0.3364 - val_accuracy: 0.8659
Epoch 78/100
89/89 - 0s - loss: 0.4076 - accuracy: 0.8160 - val_loss: 0.3442 - val_accuracy: 0.8603
Epoch 79/100
89/89 - 0s - loss: 0.4054 - accuracy: 0.8216 - val_loss: 0.3526 - val_accuracy: 0.8547
Epoch 80/100
89/89 - 0s - loss: 0.4064 - accuracy: 0.8230 - val_loss: 0.3357 - val_accuracy: 0.8715
Epoch 81/100
89/89 - 0s - loss: 0.4031 - accuracy: 0.8230 - val_loss: 0.3636 - val_accuracy: 0.8603
Epoch 82/100
89/89 - 0s - loss: 0.4002 - accuracy: 0.8244 - val_loss: 0.3535 - val_accuracy: 0.8715
Epoch 83/100
89/89 - 0s - loss: 0.4024 - accuracy: 0.8174 - val_loss: 0.3278 - val_accuracy: 0.8715
Epoch 84/100
89/89 - 0s - loss: 0.3974 - accuracy: 0.8216 - val_loss: 0.3541 - val_accuracy: 0.8603


<tensorflow.python.keras.callbacks.History at 0x1c1c4637b48>

In [7]:
predictions = model.predict(test_data)
rounded = [int(round(x[0])) for x in predictions]
print(rounded[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [8]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': rounded})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [9]:
train_label[:10]

array([[0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1]], dtype=int64)

In [10]:
_, accuracy = model.evaluate(train_data, train_label)

