# Kaggle - Titanic

In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [29]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.read_csv('./data/gender_submission.csv')

train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [37]:
mean_age = np.nanmean(list(train["Age"]))
max_age = np.amax(list(train["Age"]))
train = train.fillna(mean_age)


mean_n_of_family = np.mean([row['SibSp'] + row['Parch'] for index, row in train.iterrows()])

In [38]:
def gender_to_int(gender):
    return 1 if gender == 'male' else 0

def embarked_to_int(embarked):
    if embarked == 'S':
        return 0
    elif embarked == 'C': 
        return 1
    else:
        return 2
    
def get_x_data(data):
    return [ [gender_to_int(row['Sex']), row['Age']/max_age, row['Pclass'], (row['SibSp'] + row['Parch'])/mean_n_of_family] for index, row in data.iterrows() ]


X = get_x_data(train)
Y = list(train["Survived"])

In [39]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

## Using a RandomForestClassifier

In [40]:
rd = RandomForestClassifier()

rd.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
rd.score(x_test, y_test)

0.8305084745762712

In [42]:
rd.feature_importances_

array([0.30862016, 0.44007973, 0.14430759, 0.10699252])

In [71]:
test = test.fillna(mean_age)
x_test_data = get_x_data(test)

y_pred = rd.predict(x_test_data)

final = pd.DataFrame(zip(test['PassengerId'], y_pred), columns=['PassengerId', 'Survived'])

In [44]:
final.to_csv('result_rd.csv', index=False)

## Using a KNeighborsClassifier

In [45]:
from sklearn.neighbors import KNeighborsClassifier

In [47]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [48]:
knn.score(x_test, y_test)

0.8067796610169492

## Using a basic neural network

In [49]:
from keras import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

In [63]:
lr = 0.001

model = Sequential()
model.add(Dense(8, input_shape=(4,), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss=binary_crossentropy,
              optimizer=Adam(lr),
             metrics=['acc'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_9 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 17        
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________


In [64]:
model.fit(np.array(x_train), y_train,
                  epochs=100,
                  batch_size=5,
                  validation_data=(np.array(x_test), y_test))

Train on 596 samples, validate on 295 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1a3ce00090>

In [72]:
def get_my_x_data(data):
    return [ list([gender_to_int(row['Sex']), row['Age']/max_age, row['Pclass'], (row['SibSp'] + row['Parch'])/mean_n_of_family] for index, row in data.iterrows()) ]
            

test = test.fillna(mean_age)
x_test_data = get_my_x_data(test)

y_pred = model.predict(x_test_data)
y_pred = [1 if x[0] > 0.5 else 0 for x in y_pred]

final = pd.DataFrame(zip(test['PassengerId'], y_pred), columns=['PassengerId', 'Survived'])
final.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [66]:
final.to_csv('result_dl_2.csv', index=False)