<a href="https://colab.research.google.com/github/mzignis/titanic/blob/master/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

sns.set()

In [45]:
HOME = '/content/drive/My Drive/ml_competition/titanic'
%cd $HOME

/content/drive/My Drive/ml_competition/titanic


In [46]:
data_dir = os.path.join(HOME, 'data')
os.listdir(data_dir)

['test.csv',
 'train.csv',
 'gender_submission.csv',
 'train_preprocessed.csv',
 'test_preprocessed.csv']

In [47]:
train_data = pd.read_csv(os.path.join(data_dir, 'train_preprocessed.csv'), index_col=0)
train_data.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
775,0.0,0.827377,1.0,-0.873136,-0.474545,-0.473674,0.076347,-0.492378,0.0,0.0,1.0
281,0.0,0.827377,1.0,-0.104637,-0.474545,-0.473674,0.076935,-0.49028,0.0,0.0,1.0
256,1.0,-1.566107,0.0,-0.104637,-0.474545,-0.473674,-0.425876,0.946246,1.0,0.0,0.0
435,1.0,-1.566107,0.0,-1.180535,0.432793,2.008933,-0.279283,1.767741,0.0,0.0,1.0
257,1.0,-1.566107,0.0,0.049062,-0.474545,-0.473674,-0.284782,1.093229,0.0,0.0,1.0
866,1.0,-0.369365,0.0,-0.181487,0.432793,-0.473674,-0.449404,-0.369389,1.0,0.0,0.0
9,1.0,-0.369365,0.0,-1.180535,0.432793,-0.473674,-0.090315,-0.042956,1.0,0.0,0.0
16,0.0,0.827377,1.0,-2.102733,3.154809,0.76763,0.13057,-0.061999,0.0,1.0,0.0
239,0.0,-0.369365,1.0,0.279612,-0.474545,-0.473674,-0.450263,-0.401268,0.0,0.0,1.0
126,0.0,0.827377,1.0,-0.104637,-0.474545,-0.473674,0.111852,-0.492378,0.0,1.0,0.0


In [48]:
test_data = pd.read_csv(os.path.join(data_dir, 'test_preprocessed.csv'), index_col=0)
test_data.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
0,0.827377,1.0,0.394887,-0.474545,-0.473674,0.051705,-0.490783,0.0,1.0,0.0
1,0.827377,0.0,1.35551,0.432793,-0.473674,0.10103,-0.507479,0.0,0.0,1.0
2,-0.369365,1.0,2.508257,-0.474545,-0.473674,-0.086444,-0.453367,0.0,1.0,0.0
3,0.827377,1.0,-0.181487,-0.474545,-0.473674,0.027687,-0.474005,0.0,0.0,1.0
4,0.827377,0.0,-0.565736,0.432793,0.76763,4.274406,-0.401017,0.0,0.0,1.0
5,0.827377,1.0,-1.180535,-0.474545,-0.473674,-0.44119,-0.462679,0.0,0.0,1.0
6,0.827377,0.0,0.049062,-0.474545,-0.473674,0.051798,-0.49481,0.0,1.0,0.0
7,-0.369365,1.0,-0.258337,0.432793,0.76763,-0.073546,-0.064516,0.0,0.0,1.0
8,0.827377,0.0,-0.873136,-0.474545,-0.473674,-0.448629,-0.502864,1.0,0.0,0.0
9,0.827377,1.0,-0.642586,1.340132,-0.473674,-0.378189,-0.162169,0.0,0.0,1.0


In [49]:
y_test_data = pd.read_csv(os.path.join(data_dir, 'gender_submission.csv'), index_col=0)

In [50]:
x, y = train_data.drop(columns=['Survived']).values, train_data['Survived'].values

x.shape, y.shape

((891, 10), (891,))

In [51]:
x_test = test_data.values
y_test = y_test_data.values

x_test.shape, y_test.shape

((418, 10), (418, 1))

In [52]:
rnd_forrest = RandomForestClassifier()
log_reg = LogisticRegression()

In [59]:
def score_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    print('======== TRAIN DATA ========')
    print(f'Accuracy: {accuracy_score(y_train, y_train_pred)*100:.2f}%')
    print(f'F1:       {f1_score(y_train, y_train_pred)*100:.2f}%')
    print(confusion_matrix(y_train, y_train_pred))
    print('\n\n')

    print('======== TEST DATA ========')
    print(f'Accuracy: {accuracy_score(y_test, y_test_pred) * 100:.2f}%')
    print(f'F1:       {f1_score(y_test, y_test_pred) * 100:.2f}%')
    print(confusion_matrix(y_test, y_test_pred))

In [60]:
score_model(log_reg, x, y, x_test, y_test)

Accuracy: 79.91%
F1:       72.84%
[[472  77]
 [102 240]]



Accuracy: 92.82%
F1:       90.32%
[[248  18]
 [ 12 140]]


In [61]:
score_model(rnd_forrest, x, y, x_test, y_test)

Accuracy: 99.78%
F1:       99.71%
[[547   2]
 [  0 342]]



Accuracy: 85.41%
F1:       80.13%
[[234  32]
 [ 29 123]]
