<a href="https://colab.research.google.com/github/mzignis/titanic/blob/master/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install eli5
!pip install xgboost



In [66]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

import eli5
from eli5.sklearn import PermutationImportance

sns.set()

In [3]:
HOME = '/content/drive/My Drive/ml_competition/titanic'
%cd $HOME

/content/drive/My Drive/ml_competition/titanic


In [4]:
data_dir = os.path.join(HOME, 'data')
os.listdir(data_dir)

['test.csv',
 'train.csv',
 'gender_submission.csv',
 'train_preprocessed.csv',
 'test_preprocessed.csv',
 'result.csv',
 'submission.csv']

In [5]:
train_data = pd.read_csv(os.path.join(data_dir, 'train_preprocessed.csv'), index_col=0)
train_data.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
797,1.0,0.827377,0.0,0.125912,-0.474545,-0.473674,0.079648,-0.473586,0.0,0.0,1.0
834,0.0,0.827377,1.0,-0.873136,-0.474545,-0.473674,-0.449291,-0.481304,0.0,0.0,1.0
593,0.0,0.827377,0.0,-0.104637,-0.474545,2.008933,0.103432,-0.492378,0.0,1.0,0.0
261,1.0,0.827377,1.0,-2.025883,3.154809,2.008933,0.076345,-0.016444,0.0,0.0,1.0
44,1.0,0.827377,0.0,-0.796286,-0.474545,-0.473674,0.051776,-0.489776,0.0,1.0,0.0
533,1.0,0.827377,0.0,-0.104637,-0.474545,2.008933,-0.448612,-0.198244,1.0,0.0,0.0
241,1.0,0.827377,0.0,-0.104637,0.432793,-0.473674,0.107063,-0.336334,0.0,1.0,0.0
591,1.0,-1.566107,0.0,1.739759,0.432793,-0.473674,-0.396363,0.927454,1.0,0.0,0.0
353,0.0,0.827377,1.0,-0.335187,0.432793,-0.473674,0.079638,-0.290024,0.0,0.0,1.0
846,0.0,0.827377,1.0,-0.104637,6.784163,2.008933,-0.449108,0.751946,0.0,0.0,1.0


In [21]:
x, y = train_data.drop(columns=['Survived']).values, train_data['Survived'].values

x.shape, y.shape

((891, 10), (891,))

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((668, 10), (668,), (223, 10), (223,))

In [44]:
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(),
    DecisionTreeClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
]

In [107]:
def score_model(model, x_train, y_train, x_test, y_test, verbose=True):
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    if verbose:
        print(f'Train accuracy: {accuracy_score(y_train, y_train_pred)*100:.2f}%')
        print(f'Test accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%')

    return accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred)


In [50]:
selected_models = []

for model in models:
    print(f'======== {model.__class__.__name__} ========')
    train_score, test_score = score_model(model, x_train, y_train, x_test, y_test)
    if train_score > 0.85 and test_score > 0.75:
        selected_models.append(model)
    print()

print('\n')
selected_models

Train accuracy: 81.44%
Test accuracy: 78.03%

Train accuracy: 85.78%
Test accuracy: 77.13%

Train accuracy: 81.44%
Test accuracy: 78.03%

Train accuracy: 84.58%
Test accuracy: 80.72%

Train accuracy: 99.70%
Test accuracy: 79.82%

Train accuracy: 77.54%
Test accuracy: 78.03%





Train accuracy: 99.70%
Test accuracy: 80.72%





[KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                      weights='uniform'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                  

In [92]:
knn = KNeighborsClassifier()
parameters = [{'n_neighbors': range(1, 21), 'weights': ['uniform', 'distance'], 'p': [1, 2], 
               'algorithm': ['ball_tree', 'kd_tree', 'brute']}]
clf = GridSearchCV(knn, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=14, p=1,
                     weights='uniform')
Train accuracy: 84.58%
Test accuracy: 78.48%


Weight,Feature
0.1363  ± 0.0352,x1
0.0717  ± 0.0454,x0
0.0233  ± 0.0192,x4
0.0170  ± 0.0243,x3
0.0143  ± 0.0036,x8
0.0117  ± 0.0185,x2
0.0117  ± 0.0091,x9
0.0045  ± 0.0098,x6
0.0018  ± 0.0122,x5
-0.0027  ± 0.0122,x7


In [109]:
best_match = None
features = [1, 0, 3, 4, 8, 2, 9, 6, 5]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

(0.8532934131736527, 0.7982062780269058, [1, 0, 3, 4, 8, 2])

In [None]:
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),

In [126]:
tree = DecisionTreeClassifier()
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'min_samples_split': [2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}
clf = GridSearchCV(tree, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=12, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
Train accuracy: 88.02%
Test accuracy: 80.72%


Weight,Feature
0.1803  ± 0.0533,x1
0.1300  ± 0.0241,x0
0.0314  ± 0.0443,x2
0.0296  ± 0.0201,x6
0.0135  ± 0.0150,x5
0.0117  ± 0.0044,x3
0.0099  ± 0.0067,x7
0  ± 0.0000,x8
0  ± 0.0000,x4
-0.0018  ± 0.0122,x9


In [127]:
best_match = None
features = [1, 0, 2, 6, 5, 3, 7]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

(0.8847305389221557, 0.8295964125560538, [1, 0, 2, 6, 5, 3, 7])

In [129]:
forest = RandomForestClassifier()
parameters = {
    'n_estimators': [1, 2, 3, 4, 5, 10, 15, 20, 25, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'min_samples_split': [2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}
clf = GridSearchCV(forest, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=19, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train accuracy: 93.26%
Test accuracy: 80.72%


Weight,Feature
0.2126  ± 0.0264,x1
0.0350  ± 0.0154,x0
0.0332  ± 0.0193,x5
0.0251  ± 0.0399,x2
0.0242  ± 0.0108,x6
0.0072  ± 0.0072,x9
0.0054  ± 0.0067,x7
0.0018  ± 0.0134,x3
-0.0009  ± 0.0036,x8
-0.0018  ± 0.0044,x4


In [133]:
best_match = None
features = [1, 0, 5, 2, 6, 9, 7]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

(0.9416167664670658, 0.8295964125560538, [1, 0, 5, 2, 6, 9])

In [134]:
test_data = pd.read_csv(os.path.join(data_dir, 'test_preprocessed.csv'), index_col=0)
test_data.sample(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
280,0.827377,0.0,-0.488887,-0.474545,-0.473674,0.027582,-0.474005,0.0,0.0,1.0
134,0.827377,1.0,1.04811,-0.474545,-0.473674,0.079621,-0.489442,0.0,0.0,1.0
38,0.827377,1.0,-0.335187,-0.474545,-0.473674,0.07435,-0.457142,0.0,0.0,1.0
413,0.827377,1.0,-0.181487,-0.474545,-0.473674,-0.447747,-0.486337,0.0,0.0,1.0
11,-1.566107,1.0,1.27866,-0.474545,-0.473674,-0.451621,-0.12492,0.0,0.0,1.0
382,0.827377,0.0,-0.181487,-0.474545,-0.473674,0.102899,-0.356469,0.0,0.0,1.0
168,-1.566107,0.0,-0.181487,-0.474545,-0.473674,-0.425594,-0.090272,1.0,0.0,0.0
181,-1.566107,1.0,0.587011,0.432793,0.76763,-0.425615,1.025945,1.0,0.0,0.0
286,0.827377,1.0,-0.181487,-0.474545,-0.473674,4.27443,-0.502445,0.0,0.0,1.0
273,0.827377,0.0,-0.181487,0.432793,-0.473674,0.111851,-0.336334,0.0,1.0,0.0


In [140]:
x_valid = test_data.values[:, [1, 0, 5, 2, 6, 9, 7]]
x_valid.shape

(418, 7)

In [142]:
model = clf.best_estimator_
predictions = model.predict(x_valid)

In [146]:
result_data = pd.read_csv(os.path.join(data_dir, 'gender_submission.csv'), index_col=0)
result_data['Survived'] = predictions.astype(int)
result_data

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [147]:
result_data.to_csv(os.path.join(data_dir, 'result.csv'))