# Titanic Survival Prediction

This notebook is intended to predict survival of Titanic passengers using Python and its libraries. Many models will be covered with fine-tuned hyperparameters.

## Import and preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from lib.data_utils import load_Titanic, create_submission
# scikit_learn
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
# Keras
'''
from keras.layers import Input, Dense, Flatten, Activation
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop
from keras.models import Model, Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as ktf
'''

'\nfrom keras.layers import Input, Dense, Flatten, Activation\nfrom keras.layers.core import Dropout\nfrom keras.layers.normalization import BatchNormalization\nfrom keras.optimizers import RMSprop\nfrom keras.models import Model, Sequential, load_model\nfrom keras.callbacks import ModelCheckpoint, EarlyStopping\nfrom keras import backend as ktf\n'

In [2]:
# read the data
x_train, y_train, x_test = load_Titanic('../data/all_tf.csv')

# preprocessing: standardize numeric, encode categorical
x_train[:,[0,2,3,5]] = scale(x_train[:,[0,2,3,5]])
x_test[:,[0,2,3,5]] = scale(x_test[:,[0,2,3,5]])
for i in [1,4,6]:
    x_train[:,i] = LabelEncoder().fit_transform(x_train[:,i])
    x_test[:,i] = LabelEncoder().fit_transform(x_test[:,i])

# see if it works well
print(x_train[:5])
print(x_test[:5])

[['0.8273772438659699' '1' '-0.221936632898316' '-0.879740569393426' '2'
  '0.3248226020114721' '2']
 ['-1.566106925889157' '0' '-0.366023089336598' '1.3612199269233827' '0'
  '0.3248226020114721' '3']
 ['0.8273772438659699' '0' '-0.273967853278807' '-0.798539974052804' '2'
  '-0.682686962816101' '1']
 ['-1.566106925889157' '0' '-0.348762732575762' '1.0620380556287148' '2'
  '0.3248226020114721' '3']
 ['0.8273772438659699' '1' '-0.348762732575762' '-0.784179243007400' '2'
  '-0.682686962816101' '2']]
[['0.873481905063612' '1' '-0.272216068494200' '-0.866889649115075' '1'
  '-0.686792910356192' '2']
 ['0.873481905063612' '0' '-0.360028904313975' '-0.969003971902478' '2'
  '0.3761220785579247' '3']
 ['-0.315819190430165' '1' '-0.419407107582584' '-0.669117059798610' '1'
  '-0.686792910356192' '2']
 ['0.873481905063612' '1' '-0.181894294508146' '-0.773508803025895' '2'
  '-0.686792910356192' '2']
 ['0.873481905063612' '0' '-0.088954498087713' '-0.443659387190805' '2'
  '1.1302722649391934



## Logistic model

We use random search to fine-tune the hyperparameters of logistic model and get the best.

In [42]:
# set the range of hyperparameters
param_distributions = {'penalty': ['l1', 'l2'],
                       'tol': np.linspace(1e-5,1e-3,10),
                       'C': np.linspace(0.1,2,10)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=LogisticRegression(),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train.astype(np.float64), y_train)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:   26.6s finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=200, n_jobs=1,
          param_distributions={'penalty': ['l1', 'l2'], 'tol': array([  1.00000e-05,   1.20000e-04,   2.30000e-04,   3.40000e-04,
         4.50000e-04,   5.60000e-04,   6.70000e-04,   7.80000e-04,
         8.90000e-04,   1.00000e-03]), 'C': array([ 0.1    ,  0.31111,  0.52222,  0.73333,  0.94444,  1.15556,
        1.36667,  1.57778,  1.78889,  2.     ])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

After the searching, we see the hyperparameters and accuracy of the best model, and keep the model.

In [44]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
logis = random_search.best_estimator_

{'penalty': 'l1', 'tol': 1.0000000000000001e-05, 'C': 0.31111111111111112}
Best accuracy:  0.7822671156
LogisticRegression(C=0.31111111111111112, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=1.0000000000000001e-05, verbose=0,
          warm_start=False)


In [46]:
print(logis.predict(x_test[:10].astype(np.float)))

[ 0.  0.  0.  0.  0.  0.  1.  0.  1.  0.]


## Lasso

In [48]:
# set the range of hyperparameters
param_distributions = {'alpha': np.linspace(1,10,100),
                       'tol': np.linspace(1e-5,1e-3,10)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=ElasticNet(fit_intercept=False, max_iter=10000),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train.astype(np.float), y_train.astype(np.float))

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    1.5s finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=False, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'tol': array([  1.00000e-05,   1.20000e-04,   2.30000e-04,   3.40000e-04,
         4.50000e-04,   5.60000e-04,   6.70000e-04,   7.80000e-04,
         8.90000e-04,   1.00000e-03]), 'alpha': array([  1.     ,   1.09091, ...,   9.90909,  10.     ])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [49]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
enet = random_search.best_estimator_

{'alpha': 1.0, 'tol': 0.00034000000000000002}
Best accuracy:  -0.413028041049
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=False, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.00034000000000000002,
      warm_start=False)


In [41]:
enet.predict(x_test.astype(np.float)[:50])

array([ 0.1559616 ,  0.2483576 ,  0.15601073,  0.18465058,  0.24848808,
        0.18466514,  0.09220014,  0.18494196,  0.19103329,  0.1848966 ,
        0.18462932,  0.18491486,  0.24896007,  0.18491486,  0.24888495,
        0.19135475,  0.15606793,  0.12727759,  0.12087459,  0.19103316,
        0.12779036,  0.05692337,  0.24871956,  0.12779865,  0.19192465,
        0.18477212,  0.06404554,  0.12727759,  0.1849545 ,  0.12753844,
        0.18491486,  0.18496254,  0.24861274,  0.24864492,  0.12778323,
        0.12727772,  0.12087817,  0.12089501,  0.18467196,  0.18510926,
        0.12742192,  0.18492005,  0.18462799,  0.24850152,  0.24884656,
        0.18463016,  0.12761631,  0.15595929,  0.19160935,  0.24854993])

## k-Nearest Neighbors

We use random search to fine-tune the hyperparameters of kNN and get the best model.

In [47]:
# set the range of hyperparameters
param_distributions = {'n_neighbors': np.array(np.linspace(1,15,15), dtype=np.int),
                       'p': np.array(np.linspace(1,5,5), dtype=np.int),
                       'leaf_size': np.array(np.linspace(10,50,40), dtype=np.int)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   48.8s finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'leaf_size': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
       44, 45, 46, 47, 48, 50]), 'p': array([1, 2, 3, 4, 5]), 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

After the searching, we see the hyperparameters and accuracy of the best model, and keep the model.

In [48]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
knn = random_search.best_estimator_

{'leaf_size': 50, 'p': 2, 'n_neighbors': 9}
Best accuracy:  0.83164983165
KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')


In [None]:
# save the model parameters
knn = KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

## Naive Bayes

Since there's no hyperparameters for Gaussian Naive Bayes, there's no need to do random search, so we just fit the data, and see the training accuracy.

In [20]:
nb = GaussianNB()
nb.fit(x_train.astype(np.float), y_train.astype(np.float))
print('Training accuracy: ', nb.score(x_train.astype(np.float), y_train.astype(np.float)))

Training accuracy:  0.768799102132


## Random Forest

Although Random Forest has been run on R with 0.78 test accuracy, we train here again with hyperparameters fine-tuned.

In [9]:
# set the range of hyperparameters
param_distributions = {'n_estimators': np.array(np.arange(10,501), dtype=np.int),
                       'max_features': np.array(np.linspace(1,7,7), dtype=np.int),
                       'min_samples_split': np.array(np.linspace(2,10,9), dtype=np.int),
                       'min_samples_leaf': np.array(np.linspace(1,10,10), dtype=np.int),
                       'min_impurity_decrease': np.linspace(0,0.1,20)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 27.7min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'min_impurity_decrease': array([ 0.     ,  0.00526,  0.01053,  0.01579,  0.02105,  0.02632,
        0.03158,  0.03684,  0.04211,  0.04737,  0.05263,  0.05789,
        0.06316,  0.06842,  0.07368,  0.07895,  0.08421,  0.08947,
        0.09474,  0.1    ]), 'n_estimators': array([ ...s_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': array([1, 2, 3, 4, 5, 6, 7])},
          pre_dispatch='2*n_jobs', ra

After the searching, we see the hyperparameters and accuracy of the best model, and keep the model.

In [10]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
rf = random_search.best_estimator_

{'min_impurity_decrease': 0.0, 'n_estimators': 265, 'max_features': 5, 'min_samples_split': 2, 'min_samples_leaf': 9}
Best accuracy:  0.838383838384
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=265, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [None]:
# save the model parameters
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=265, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## SVM

We use random search to fine-tune the hyperparameters of SVM and get the best model.

In [62]:
# set the range of hyperparameters
param_distributions = {'C': np.linspace(0.1,1,20),
                       'gamma': np.linspace(0.1,1,10),
                       'tol': np.linspace(1e-4,1e-2,10)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=SVC(probability=True),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.3min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'C': array([ 0.1    ,  0.14737,  0.19474,  0.24211,  0.28947,  0.33684,
        0.38421,  0.43158,  0.47895,  0.52632,  0.57368,  0.62105,
        0.66842,  0.71579,  0.76316,  0.81053,  0.85789,  0.90526,
        0.95263,  1.     ]), 'tol': array([ 0.0001,  0.0012,  0.0023,  0.0034,  0.0045,  0.0056,  0.0067,
        0.0078,  0.0089,  0.01  ]), 'gamma': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

After the searching, we see the hyperparameters and accuracy of the best model, and keep the model.

In [64]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
svm = random_search.best_estimator_

{'C': 0.90526315789473688, 'tol': 0.0067000000000000002, 'gamma': 0.59999999999999998}
Best accuracy:  0.836139169473
SVC(C=0.90526315789473688, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.59999999999999998,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.0067000000000000002, verbose=False)


In [None]:
# save the model parameters
svm = SVC(C=0.90526315789473688, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.59999999999999998,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.0067000000000000002, verbose=False)

Then we try grid search.

In [3]:
# set the range of hyperparameters
param_distributions = {'C': np.linspace(0.1,1,10),
                       'gamma': np.linspace(0.1,1,10),
                       'tol': np.linspace(1e-4,1e-2,10)}
# initialize the random search
grid_search = GridSearchCV(estimator=SVC(probability=True),
                           param_grid=param_distributions,
                           cv=None,
                           verbose=1)
# start searching
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=1)]: Done 3000 out of 3000 | elapsed: 12.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]), 'gamma': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]), 'tol': array([ 0.0001,  0.0012,  0.0023,  0.0034,  0.0045,  0.0056,  0.0067,
        0.0078,  0.0089,  0.01  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [5]:
print(grid_search.best_params_)
print('Best accuracy: ', grid_search.best_score_)
print(grid_search.best_estimator_)
svm = grid_search.best_estimator_

{'C': 0.80000000000000004, 'gamma': 0.10000000000000001, 'tol': 0.0001}
Best accuracy:  0.829405162738
SVC(C=0.80000000000000004, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.10000000000000001,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.0001, verbose=False)


Grid search doesn't show significant advantage on training accuracy, and it takes much longer time.

## MLP in scikit_learn

In [66]:
# set the range of hyperparameters
param_distributions = {'hidden_layer_sizes': [(50,25),(25,25),(25,10),(50,),(25,),(10,)],
                       'alpha': np.linspace(0.00005,0.05,20),
                       'tol': np.linspace(1e-4,1e-2,10)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=MLPClassifier(solver='lbfgs'),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train.astype(np.float), y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.4min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'hidden_layer_sizes': [(50, 25), (25, 25), (25, 10), (50,), (25,), (10,)], 'tol': array([ 0.0001,  0.0012,  0.0023,  0.0034,  0.0045,  0.0056,  0.0067,
        0.0078,  0.0089,  0.01  ]), 'alpha': array([  5.00000e-05,   2.67895e-03,   5.30789e-03,   7.93684e-03,
         1.0565...,   3.68553e-02,   3.94842e-02,
         4.21132e-02,   4.47421e-02,   4.73711e-02,   5.00000e-02])},
          pre_dispatch='

In [67]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
mlp = random_search.best_estimator_

{'hidden_layer_sizes': (25,), 'tol': 0.0078000000000000005, 'alpha': 0.010565789473684211}
Best accuracy:  0.826038159371
MLPClassifier(activation='relu', alpha=0.010565789473684211,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(25,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0078000000000000005,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [None]:
# save the model parameters
mlp = MLPClassifier(activation='relu', alpha=0.010565789473684211,
       batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(25,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0078000000000000005,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Gradient Boosting

In [23]:
# set the range of hyperparameters
param_distributions = {'learning_rate': np.linspace(1e-3,1,100),
                       'n_estimators': np.around(np.linspace(50,1000,100)).astype(np.int),
                       'max_depth': np.linspace(2,10,9),
                       'min_samples_split': np.linspace(2,40,39).astype(np.int),
                       'max_features': np.linspace(1,7,7).astype(np.int),
                       'subsample': np.linspace(0.1,1,20)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(x_train.astype(np.float), y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 11.6min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'max_depth': array([  2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.]), 'learning_rate': array([ 0.001  ,  0.01109, ...,  0.98991,  1.     ]), 'subsample': array([ 0.1    ,  0.14737,  0.19474,  0.24211,  0.28947,  0.33684,
        0.38421,  0.43158,  0.47895,  0.52632,  0.57...    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38,

In [24]:
# see the best
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
gbm = random_search.best_estimator_

{'max_depth': 6.0, 'learning_rate': 0.011090909090909092, 'subsample': 0.14736842105263159, 'n_estimators': 587, 'max_features': 7, 'min_samples_split': 25}
Best accuracy:  0.832772166105
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.011090909090909092, loss='deviance',
              max_depth=6.0, max_features=7, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=25,
              min_weight_fraction_leaf=0.0, n_estimators=587,
              presort='auto', random_state=None,
              subsample=0.14736842105263159, verbose=0, warm_start=False)


In [None]:
# save the model parameters
gbm = GradientBoostingClassifier(learning_rate=0.011090909090909092,
                                 max_depth=6,
                                 max_features=7,
                                 min_samples_split=25,
                                 n_estimators=587,
                                 subsample=0.14736842105263159)

## Ensembling

In [25]:
# Have the collections of models
mlp = MLPClassifier(alpha=0.010565789473684211,
                    hidden_layer_sizes=(25,),
                    solver='lbfgs',
                    tol=0.0078000000000000005)
svm = SVC(C=0.90526315789473688,
          gamma=0.59999999999999998,
          probability=True,
          tol=0.0067000000000000002)
rf = RandomForestClassifier(max_features=5,
                            min_impurity_decrease=0.0,
                            min_samples_leaf=9, 
                            min_samples_split=2,
                            n_estimators=265)
knn = KNeighborsClassifier(leaf_size=50,
                           n_neighbors=9,
                           p=2)
gbm = GradientBoostingClassifier(learning_rate=0.011090909090909092,
                                 max_depth=6,
                                 max_features=7,
                                 min_samples_split=25,
                                 n_estimators=587,
                                 subsample=0.14736842105263159)
log = LogisticRegression(C=0.31111111111111112, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=1.0000000000000001e-05, verbose=0,
          warm_start=False)
nb = GaussianNB()

# fit and produce predictions
classifiers = [mlp, svm, rf, knn, gbm, nb, log]
pred = []
for clf in classifiers:
    clf.fit(x_train.astype(np.float), y_train)
    pred.append(clf.predict(x_train.astype(np.float)).astype(np.float))
pred = np.array(pred).T

See the correlation of prediction results using Chi-square test.

In [26]:
print(pred.shape)
print(np.corrcoef(pred, rowvar=False))

(891, 7)
[[ 1.          0.81258949  0.82598103  0.78985123  0.81125225  0.72023638
   0.77886047]
 [ 0.81258949  1.          0.8646752   0.86801047  0.85391864  0.68270761
   0.70145759]
 [ 0.82598103  0.8646752   1.          0.86046002  0.90641598  0.68855527
   0.73401199]
 [ 0.78985123  0.86801047  0.86046002  1.          0.85207741  0.68607654
   0.66199908]
 [ 0.81125225  0.85391864  0.90641598  0.85207741  1.          0.72705634
   0.7263311 ]
 [ 0.72023638  0.68270761  0.68855527  0.68607654  0.72705634  1.
   0.7804306 ]
 [ 0.77886047  0.70145759  0.73401199  0.66199908  0.7263311   0.7804306
   1.        ]]


Here, we combine some of the best models above to work together and vote. Specificially, we would use trained **Random Forest**, **SVM**, **k-nearest neighbors** and **Multilayer Perceptrons** to form an ensemble.

In [69]:
ens = VotingClassifier(estimators=[('RF',rf),('SVM',svm),('kNN',knn),('mlp',mlp)], 
                       voting='soft')
ens.fit(x_train.astype(np.float), y_train)

VotingClassifier(estimators=[('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_...bfgs', tol=0.0078000000000000005,
       validation_fraction=0.1, verbose=False, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [70]:
print(ens.predict(x_test.astype(np.float)[:10]))

['0' '1' '0' '0' '1' '0' '1' '0' '1' '0']


## Stacking

Use the prediction results as features to train another classifier again.

In [29]:
# use random forest as second level classifier, again random search
# set the range of hyperparameters
param_distributions = {'n_estimators': np.array(np.arange(10,501), dtype=np.int),
                       'max_features': np.array(np.linspace(1,5,5), dtype=np.int),
                       'min_samples_split': np.array(np.linspace(2,10,9), dtype=np.int),
                       'min_samples_leaf': np.array(np.linspace(1,10,10), dtype=np.int),
                       'min_impurity_decrease': np.linspace(0,0.1,20)}
# initialize the random search
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_distributions,
                                   n_iter=30,
                                   cv=10,
                                   verbose=1)
# start searching
random_search.fit(pred, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  8.0min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), 'n_estimators': array([ 10,  11, ..., 499, 500]), 'max_features': array([1, 2, 3, 4, 5]), 'min_impurity_decrease': array([ 0.     ,  0.00526,  0.01053,  0.01579,  0.02105,  0.02632,
        0.03158,  0.03684,  0.04211,  0.04737,  0.05263,  0.05789,
        0.06316,  0.06842,  0.07368,  0.07895,  0.08421,  0.08947,
        0.09474,  0.1    ]), 'min_sample

In [30]:
print(random_search.best_params_)
print('Best accuracy: ', random_search.best_score_)
print(random_search.best_estimator_)
stack_2 = random_search.best_estimator_

{'min_samples_leaf': 8, 'n_estimators': 304, 'min_samples_split': 10, 'min_impurity_decrease': 0.094736842105263161, 'max_features': 3}
Best accuracy:  0.881032547699
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.094736842105263161,
            min_impurity_split=None, min_samples_leaf=8,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=304, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


## Neural Network using Keras

To have more flexibility, we use Keras instead of Multilayer Perceptron in scikit_learn.

In concern with model selection, since the data only has 7 features and is not time series, both CNN and RNN are not quite suitable, therefore only MLP(Multilayer Perceptron) is appropriate.

First, define the mlp model.

In [21]:
def mlp_model(hidden_dims, dropout=0.5, lr=0.001):
    # build the model
    mlp = Sequential()
    mlp.add(Flatten(input_shape=(7,1)))
    for hidden_dim in hidden_dims:
        mlp.add(Dense(units=hidden_dim, activation='relu'))
        if dropout:
            mlp.add(Dropout(rate=dropout))
    mlp.add(Dense(units=2, activation='softmax'))
    
    # set the optimizer and loss
    rmsprop = RMSprop(lr=lr, decay=0.99)
    mlp.compile(loss='sparse_categorical_crossentropy',
                       optimizer=rmsprop, metrics=['accuracy'])
    return mlp

Then set the early stopping and train the model. 91 observations are used for monitoring, and 800 observations are used to train. Here we just try usual hyperparameters to train and see the result.

In [38]:
earlystop = EarlyStopping(min_delta=0.0001, patience=5)

# use test data to monitor early stopping
model = mlp_model(hidden_dims=[50,25],
                  dropout=0.2,
                  lr=0.001)
model.fit(x_train[:800].reshape(-1,7,1), y_train[:800].reshape(-1,1),
               batch_size=32,
               epochs=100,
               validation_data=(x_train[800:].reshape(-1,7,1), y_train[800:].reshape(-1,1)),
               callbacks=[earlystop],
               initial_epoch=0)

Train on 800 samples, validate on 91 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Ep

<keras.callbacks.History at 0x7fbd7346d940>

The result above is actually not satisfied, so we use random search to find the best set of hyperparameters.

In [None]:
best_parameters={'lr': 0.001, 'dropout': 0.5, 'hidden_dims': []}
lowest_err = 1000
lr_range = (0.1,0.0001); dropout_range = (0.3,0.8); dense_range = (512,2048)
while True:
    lr = np.random.uniform(lr_range[0], lr_range[1])
    dropout = np.random.uniform(dropout_range[0], dropout_range[1])
    dense_dim = int(np.random.uniform(dense_range[0], dense_range[1]))
    drop_conv, avgpool = np.random.binomial(1,0.5,2)
    ktf.clear_session()
    test_err, test_acc = simpleCNN_model(lr, dropout, dense_dim, drop_conv, avgpool)
    if test_err < lowest_err:
        print('new lowest: ', round(test_err,2), round(test_acc,2), 
              (round(lr,4), round(dropout,2), dense_dim, bool(drop_conv), bool(avgpool)))
        lowest_err = test_err
        best_parameters['lr'] = lr
        best_parameters['dropout'] = dropout
        best_parameters['dense_dim'] = dense_dim

## Create submission

This part is used to generate submission file for Kaggle competition using trained models.

In [72]:
# create submission file
create_submission(ens, x_test.astype(np.float), '../submission/submission_ens2_new_tf.csv')

In [36]:
# for stacking
classifiers = [mlp, svm, rf, knn, gbm]
pred = []
for clf in classifiers:
    pred.append(clf.predict(x_test.astype(np.float)).astype(np.float))
    
pred = np.array(pred).T
print(pred.shape)
create_submission(stack_2, pred.astype(np.float), '../submission/submission_stack2_new_tf.csv')

(418, 5)
