<a href="https://colab.research.google.com/github/mzignis/titanic/blob/master/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install eli5
!pip install xgboost



In [26]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier

import eli5
from eli5.sklearn import PermutationImportance

sns.set()

In [3]:
HOME = '/content/drive/My Drive/ml_competition/titanic'
%cd $HOME

/content/drive/My Drive/ml_competition/titanic


In [4]:
data_dir = os.path.join(HOME, 'data')
os.listdir(data_dir)

['test.csv',
 'train.csv',
 'gender_submission.csv',
 'train_preprocessed.csv',
 'test_preprocessed.csv',
 'result.csv',
 'submission.csv']

In [5]:
train_data = pd.read_csv(os.path.join(data_dir, 'train_preprocessed.csv'), index_col=0)
train_data.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,C,Q,S
582,0.0,-0.369365,1.0,1.893459,-0.474545,-0.473674,-0.409386,-0.12492,0.0,0.0,1.0
32,1.0,0.827377,0.0,-0.104637,-0.474545,-0.473674,0.058969,-0.492378,0.0,1.0,0.0
168,0.0,-1.566107,1.0,-0.104637,-0.474545,-0.473674,-0.426283,-0.12643,0.0,0.0,1.0
146,1.0,0.827377,1.0,-0.181487,-0.474545,-0.473674,0.080866,-0.491456,0.0,0.0,1.0
365,0.0,0.827377,1.0,0.049062,-0.474545,-0.473674,-0.441894,-0.502445,0.0,0.0,1.0
688,0.0,0.827377,1.0,-0.873136,-0.474545,-0.473674,0.080856,-0.491456,0.0,0.0,1.0
696,0.0,0.827377,1.0,1.12496,-0.474545,-0.473674,0.101518,-0.486337,0.0,0.0,1.0
68,1.0,0.827377,0.0,-0.949986,3.154809,2.008933,4.27438,-0.488854,0.0,0.0,1.0
806,0.0,-1.566107,1.0,0.740711,-0.474545,-0.473674,-0.281889,-0.648422,0.0,0.0,1.0
420,0.0,0.827377,1.0,-0.104637,-0.474545,-0.473674,0.079664,-0.489442,1.0,0.0,0.0


In [6]:
x, y = train_data.drop(columns=['Survived']).values, train_data['Survived'].values

x.shape, y.shape

((891, 10), (891,))

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((668, 10), (668,), (223, 10), (223,))

In [8]:
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(),
    DecisionTreeClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
]

In [9]:
def score_model(model, x_train, y_train, x_test, y_test, verbose=True):
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    if verbose:
        print(f'Train accuracy: {accuracy_score(y_train, y_train_pred)*100:.2f}%')
        print(f'Test accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%')

    return accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred)


In [10]:
selected_models = []

for model in models:
    print(f'======== {model.__class__.__name__} ========')
    train_score, test_score = score_model(model, x_train, y_train, x_test, y_test)
    if train_score > 0.85 and test_score > 0.75:
        selected_models.append(model)
    print()

print('\n')
selected_models

Train accuracy: 79.34%
Test accuracy: 79.37%

Train accuracy: 85.63%
Test accuracy: 76.68%

Train accuracy: 79.94%
Test accuracy: 79.82%

Train accuracy: 83.08%
Test accuracy: 81.17%

Train accuracy: 99.85%
Test accuracy: 73.54%

Train accuracy: 76.50%
Test accuracy: 72.65%





Train accuracy: 99.85%
Test accuracy: 79.82%





[KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                      weights='uniform'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False)]

In [11]:
knn = KNeighborsClassifier()
parameters = [{'n_neighbors': range(1, 21), 'weights': ['uniform', 'distance'], 'p': [1, 2], 
               'algorithm': ['ball_tree', 'kd_tree', 'brute']}]
clf = GridSearchCV(knn, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=1,
                     weights='uniform')
Train accuracy: 82.63%
Test accuracy: 80.72%


Weight,Feature
0.1130  ± 0.0337,x1
0.0350  ± 0.0318,x0
0.0215  ± 0.0105,x2
0.0161  ± 0.0185,x3
0.0108  ± 0.0193,x4
0.0099  ± 0.0067,x7
0.0090  ± 0.0127,x5
0.0081  ± 0.0105,x9
0.0036  ± 0.0036,x8
-0.0072  ± 0.0146,x6


In [12]:
best_match = None
features = [1, 0, 3, 4, 8, 2, 9, 6, 5]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

(0.8278443113772455, 0.8161434977578476, [1, 0, 3, 4, 8, 2, 9])

In [13]:
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),

(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),)

In [14]:
tree = DecisionTreeClassifier()
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'min_samples_split': [2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}
clf = GridSearchCV(tree, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

KeyboardInterrupt: ignored

In [None]:
best_match = None
features = [1, 0, 2, 6, 5, 3, 7]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

In [39]:
def predict_data(model, features):
    valid_data = pd.read_csv(os.path.join(data_dir, 'test_preprocessed.csv'), index_col=0).values[:, features]
    result_data = pd.read_csv(os.path.join(data_dir, 'gender_submission.csv'), index_col=0)
    predictions = model.predict(valid_data)
    result_data['Survived'] = predictions.astype(int)
    result_data.to_csv(os.path.join(data_dir, 'result.csv'))

# predict_data(model)

In [None]:
forest = RandomForestClassifier()
parameters = {
    'n_estimators': [1, 2, 3, 4, 5, 10, 15, 20, 25, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'min_samples_split': [2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}
clf = GridSearchCV(forest, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

In [None]:
best_match = None
features = [1, 0, 5, 2, 6, 9, 7]

for idx, _ in enumerate(features):
    new_x_train = x_train[:, features[:idx+1]]
    new_x_test = x_test[:, features[:idx+1]]

    train_score, test_score = score_model(model, new_x_train, y_train, new_x_test, y_test, verbose=False)

    if not best_match:
        best_match = train_score, test_score, features[:idx+1]
        continue

    if train_score >= best_match[0] and test_score >= best_match[1]:
        best_match = train_score, test_score, features[:idx+1]

best_match

In [None]:
x_valid = test_data.values[:, [1, 0, 5, 2, 6, 9, 7]]
x_valid.shape

In [None]:
model = clf.best_estimator_
model

### Models

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
features = [0, 1, 2, 3, 4, 5]
x_train, x_test = x_train[:, features], x_test[:, features]

svm = SVC()
c_values = [9, 10, 11]
parameters = [
    {
        'C': c_values,
        'kernel': ['rbf'],
        'gamma': ['scale'] 
    },
]   
clf = GridSearchCV(svm, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

SVC(C=9, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Train accuracy: 84.28%
Test accuracy: 82.06%


Weight,Feature
0.2529  ± 0.0427,x1
0.0430  ± 0.0134,x0
0.0359  ± 0.0057,x3
0.0287  ± 0.0091,x2
0.0135  ± 0.0212,x4
-0.0018  ± 0.0044,x5


In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
features = [0, 1, 2, 3, 4, 5]
x_train, x_test = x_train[:, features], x_test[:, features]

forest = RandomForestClassifier()
parameters = {
    'n_estimators': range(5, 11),
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 8),
    'min_samples_split': range(10, 17, 2)
}
clf = GridSearchCV(forest, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=7,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train accuracy: 85.93%
Test accuracy: 81.17%


Weight,Feature
0.2206  ± 0.0444,x1
0.0269  ± 0.0139,x2
0.0108  ± 0.0245,x4
0.0018  ± 0.0134,x5
0.0018  ± 0.0231,x0
0.0009  ± 0.0154,x3


In [24]:
xgb = XGBClassifier()
score_model(xgb, x_train, y_train, x_test, y_test)

Train accuracy: 88.32%
Test accuracy: 83.41%


(0.8832335329341318, 0.8340807174887892)

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
features = [0, 1, 2, 3, 4, 5]
x_train, x_test = x_train[:, features], x_test[:, features]

xgb = XGBClassifier()
parameters = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7],
    'n_estimators': [10, 15, 25, 30, 50, 100],
    'learning_rate': [0.05, 0.1, 0.5, 1],
}
clf = GridSearchCV(xgb, parameters, cv=5) 
clf.fit(x_train, y_train)
model = clf.best_estimator_
print(model)
score_model(model, x_train, y_train, x_test, y_test)

perm = PermutationImportance(model).fit(x_test, y_test)
eli5.show_weights(perm)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=30, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
Train accuracy: 87.57%
Test accuracy: 83.41%


Weight,Feature
0.2018  ± 0.0336,x1
0.0700  ± 0.0434,x0
0.0673  ± 0.0098,x2
0.0296  ± 0.0146,x3
0.0063  ± 0.0257,x5
0.0036  ± 0.0067,x4


In [32]:
svc = SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)

In [33]:
forest = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                criterion='entropy', max_depth=7, max_features='auto',
                                max_leaf_nodes=None, max_samples=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=1, min_samples_split=10,
                                min_weight_fraction_leaf=0.0, n_estimators=7,
                                n_jobs=None, oob_score=False, random_state=None,
                                verbose=0, warm_start=False)

In [34]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=1, gamma=0,
                    learning_rate=0.1, max_delta_step=0, max_depth=5,
                    min_child_weight=1, missing=None, n_estimators=30, n_jobs=1,
                    nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=None, subsample=1, verbosity=1)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y)
features = [0, 1, 2, 3, 4, 5]
x_train, x_test = x_train[:, features], x_test[:, features]

model = VotingClassifier([('svc', svc), ('forest', forest), ('xgb', xgb)], voting='hard')
model.fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

print(f'Train accuracy: {accuracy_score(y_train, y_train_pred)*100:.2f}%')
print(f'Test accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}%')

Train accuracy: 85.33%
Test accuracy: 83.41%


In [43]:
predict_data(model, features)