In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
import scipy.stats as sci

from titanic import *

# Build the model

First we will load the training data and extract the training labels

In [2]:
titanicData = pd.read_csv("train.csv")
y = titanicData["Survived"].values

## Preparing the data

Several columns contain missing data

In [3]:
titanicData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We will with each column differently. The easiest column to deal with is the `Embarked` data. Here we will simply fill in missing values with the most frequent departure point.

In [4]:
mostEmbarked = titanicData['Embarked'].mode()[0]
titanicData.at[pd.isnull(titanicData['Embarked']), "Embarked" ] = mostEmbarked

The next column that definitely needs to be filled in is `Age`. As the ages are spread over a fairly wide range we may need to be slightly more clever than replacing missing values with the average.

A simple way to improve estimate of age is to consider the honourific of each passenger. As an example the title of *Master.* is only applied to male children thus they are very likely to fall below the average of all ages. By collecting all honourifics and the mean age for each we can then fill in the missing ages based on these values.

In [5]:
means = titleMeans( titanicData )

inferAge(titanicData, means)

In a similar vein, we will later need to fill in fare data. To this end it will be useful to know the average fare for each class.

In [6]:
avgFirst  = titanicData['Fare'][ titanicData['Pclass'] == 1 ].mean()
avgSecond = titanicData['Fare'][ titanicData['Pclass'] == 2 ].mean()
avgThird  = titanicData['Fare'][ titanicData['Pclass'] == 3 ].mean()

print(f"Average first class: {avgFirst}\nAverage second class: {avgSecond}\nAverage third class: {avgThird}")

avgFares = { 1 : avgFirst, 2 : avgSecond, 3 : avgThird }

Average first class: 84.1546875
Average second class: 20.662183152173913
Average third class: 13.675550101832993


To help or model to find patterns we will attempt to add a column to the data that indicates which family a given passenger belongs to. For our purposes are people who are part of a family must have either a non-zero `Parch` or `SibSp` entry and there must be more than some `minSize` number of people in the training set who share their surname.

The parameter `minSize` can be tuned to avoid adding to many classes for small families, ease it is set to 3 by default. Another issue is families with members who have different surnames.

In [7]:
titanicData["FamSize"] = titanicData["Parch"] + titanicData["SibSp"]

families = findFamilies( titanicData, minSize = 1 )
addFamily( titanicData, families )

Finally we will do some basic data preparation operations: one-hot encode, scale, and the like.

In [8]:
titanicData['Sex'] = pd.Categorical(titanicData['Sex'], categories = ["male", "female"])
titanicData['Sex'] = titanicData['Sex'].cat.codes

titanicData['Embarked'] = pd.Categorical(titanicData['Embarked'], categories = ["S", "Q", "C"])
titanicData['Embarked'] = titanicData['Embarked'].cat.codes

titanicData['Family'] = pd.Categorical(titanicData['Family'], categories = [ "***", *sorted(families.keys()) ])
titanicData['Family'] = titanicData['Family'].cat.codes

In [9]:
numeric = Pipeline( [ ("select", DataFrameSelector(["Age", "FamSize", "Fare"])), #"SibSp", "Parch"
                     ("scale", MinMaxScaler()) ])

sex = Pipeline([ ("select", DataFrameSelector(["Sex"])) ])

pclass = Pipeline([ ("select", DataFrameSelector(["Pclass"])),
                   ("onehot", OneHotEncoder(sparse = False)) ])

embark = Pipeline([ ("select", DataFrameSelector(["Embarked"])),
                   ("onehot", OneHotEncoder(sparse = False)) ])

family = Pipeline([ ("select", DataFrameSelector(["Family"])),
                    ("onehot", OneHotEncoder(sparse = False)) ])

dataPrep = FeatureUnion( transformer_list=[ ("numeric", numeric),
                                            ("sex", sex),
                                            ("pclass", pclass),
                                            ("embark", embark),
                                            ("family", family)] )

In [10]:
trainData = dataPrep.fit_transform( titanicData )

## Fitting the model

In [11]:
paramsKNN = { "weights" : ["uniform", "distance"],
              "n_neighbors": [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ] }

knn = KNeighborsClassifier()

gridKNN = GridSearchCV( knn, paramsKNN, cv = 10, verbose = 1, n_jobs = -1 )
gridKNN.fit(trainData, y)

Fitting 10 folds for each of 26 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    8.9s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [12]:
gridKNN.best_score_

0.8260381593714927

In [13]:
gridKNN.best_params_

{'n_neighbors': 11, 'weights': 'distance'}

In [14]:
d = list(range(1,31))
d.append(None)

paramsForest = { "n_estimators" : range(1,31),
                 "max_features" : [2, 3, 4],
                 "max_depth"    : d,
                 'bootstrap': [False, True] }

forest = RandomForestClassifier()

gridForest = GridSearchCV( forest, paramsForest, cv = 10, verbose = 1, n_jobs = -1 )
gridForest.fit(trainData, y)

Fitting 10 folds for each of 5580 candidates, totalling 55800 fits


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 2368 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 5868 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 10768 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 17040 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 20932 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 25482 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 30732 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 36682 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 43332 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 50682 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 55800 out of 55800 | elapsed:  6.9min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': range(1, 31), 'max_features': [2, 3, 4], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, None], 'bootstrap': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [15]:
gridForest.best_score_

0.8372615039281706

In [16]:
gridForest.best_params_

{'bootstrap': True, 'max_depth': 28, 'max_features': 3, 'n_estimators': 19}

In [17]:
from sklearn.linear_model import SGDClassifier

paramsSGD = { "alpha" : sci.expon( scale = 1.0 ),
              "l1_ratio" : sci.uniform() }
sgd = SGDClassifier(  loss = "log", penalty = "elasticnet", tol = 1.0E-06, max_iter = 10000 )

randSGD = RandomizedSearchCV( sgd, paramsSGD, cv = 10, n_iter = 2000,
                              verbose = 1, n_jobs = -1 )
randSGD.fit(trainData, y)

Fitting 10 folds for each of 2000 candidates, totalling 20000 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 421 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 1021 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done 1484 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2048 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2698 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3545 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 4395 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 5402 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 6494 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 7748 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 9182 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 10896 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 12408 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 14132 tasks      | elapsed: 

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=10000, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=1e-06, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=2000, n_jobs=-1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fef9c79f710>, 'l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fef9c79f588>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [18]:
randSGD.best_score_

0.8058361391694725

In [19]:
randSGD.best_params_

{'alpha': 0.0022283397720714343, 'l1_ratio': 0.05734028411346026}

In [20]:
from sklearn.svm import SVR

params = { "kernel" : [ "linear", "rbf" ],
            "C" : sci.uniform(1, 500),
            "gamma" : sci.expon(scale=1.0)
         }

svc = SVC( probability = True )

rndSVC = RandomizedSearchCV( svc, param_distributions = params,
                                n_iter = 50, cv = 5, verbose = 1, n_jobs = -1 )

rndSVC.fit(trainData, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

In [None]:
rndSVC.best_score_

In [None]:
rndSVC.best_params_

In [None]:
knnBest = KNeighborsClassifier( **gridKNN.best_params_ )
forestBest = RandomForestClassifier( **gridForest.best_params_ )
sgdBest = SGDClassifier( loss = "log", penalty = "elasticnet", tol = 1.0E-06,
                         max_iter = 10000, **randSGD.best_params_ )
svcBest = SVC( probability = True, **rndSVC.best_params_ )


vote = VotingClassifier( estimators = [ ( 'knn', knnBest ),
                                        ( 'forest', forestBest ),
                                        ( "sgd", sgdBest ),
                                        ( "svc", svcBest ) ] )

knnBest.fit(trainData, y)
forestBest.fit(trainData, y)
sgdBest.fit(trainData, y)
svcBest.fit(trainData, y)

weights = []

for _ in range(2000):
    tmp = np.random.dirichlet( [1,1,1,1], size = 1 )[0]
    weights.append(tmp)
    
gridVote = GridSearchCV( vote, { "weights" : weights, "voting" : ["hard", "soft"] },
                         cv = 10, verbose = 1, n_jobs = -1 )

gridVote.fit(trainData, y)

In [None]:
gridVote.best_score_

In [None]:
gridVote.best_params_

In [None]:
bestVote = VotingClassifier( estimators = [ ( 'knn', knnBest ),
                                        ( 'forest', forestBest ),
                                        ( "sgd", sgdBest ),
                                        ( "svc", svcBest ) ],
                             **gridVote.best_params_ )
bestVote.fit(trainData, y)

pred = bestVote.predict( trainData )
accuracy_score( y, pred)

# Test set

In [None]:
testData = pd.read_csv("test.csv")
testData.info()

In [None]:
testData["FamSize"] = testData["Parch"] + testData["SibSp"]

inferFares( testData, avgFares )
inferAge( testData, means )
addFamilyTest( testData, families.keys() )

In [None]:
testData['Sex'] = pd.Categorical(testData['Sex'], categories = ["male", "female"])
testData['Sex'] = testData['Sex'].cat.codes

testData['Embarked'] = pd.Categorical(testData['Embarked'], categories = ["S", "Q", "C"])
testData['Embarked'] = testData['Embarked'].cat.codes

testData['Family'] = pd.Categorical(testData['Family'], categories = [ "***", *sorted(families.keys()) ])
testData['Family'] = testData['Family'].cat.codes

testData.info()

In [None]:
test = dataPrep.transform( testData )
testPred = gridVote.predict( test )

In [None]:
output = pd.DataFrame()
output["PassengerId"] = testData["PassengerId"]
output["Survived"] = testPred

output.to_csv( "res.csv", index = False )

In [None]:
np.sum(testPred)