# Modelling

In [4]:
import numpy as np
import pandas as pd
import warnings

from pathlib import Path

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import RFECV, SelectKBest, chi2
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

In [5]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while the
# models below are being fitted (e.g. convergence problems) — consider
# narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In this notebook I will try a few different techniques to find the best model. The steps I will take are:

1. Produce a baseline model using all variables.
2. Use univariate selection to create a subset of features to improve my baseline model.
3. Use recursive feature elimination to create another subset of features to train a model with.
4. Train a number of different models with my final subset of features to find the most appropriate model.
5. Tune parameters in final model.

In [6]:
# Load every processed CSV from the shared data directory in one pass.
DATA_PATH = Path('../data/processed/')
(
    training_no_dummies,   # training set before dummy encoding
    test_no_dummies,       # test set before dummy encoding
    training_df,           # training set with dummy variables
    test_df,               # test set with dummy variables
    test_ids,              # identifiers used when writing the predictions
) = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in (
        'final_training.csv',
        'final_test.csv',
        'final_training_w_dummies.csv',
        'final_test_w_dummies.csv',
        'test_ids.csv',
    )
)

In [7]:
# Print the column names, non-null counts and dtypes of the dummy-encoded
# training set as a quick sanity check before modelling.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Deck           891 non-null int64
Relatives      891 non-null int64
Child          891 non-null int64
Sex_male       891 non-null int64
Embarked_Q     891 non-null int64
Embarked_S     891 non-null int64
Title_Miss     891 non-null int64
Title_Mr       891 non-null int64
Title_Mrs      891 non-null int64
Title_Other    891 non-null int64
dtypes: float64(2), int64(14)
memory usage: 111.5 KB


First I will define a helper function that prints all the desired metrics.

In [8]:
def generate_metrics(model, X, y_true):
    """Print accuracy, F1 score and a confusion table for a fitted classifier.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Features passed to ``model.predict``.
    y_true : array-like
        True labels, in the same row order as ``X``.

    Returns
    -------
    None
        The metrics are printed rather than returned.
    """
    predictions = model.predict(X)
    print(f'Accuracy score: {accuracy_score(y_true, predictions)}')
    print(f'F1 score: {f1_score(y_true, predictions)}')
    print('\n')
    # pd.crosstab aligns Series on their index. The predictions get a fresh
    # 0..n-1 index, so a y_true carrying any other index (e.g. after filtering
    # or a train/validation split) would be paired with the wrong rows or
    # silently dropped. Strip the index from both so they pair by position.
    predicted = pd.Series(np.asarray(predictions), name='Predicted')
    actual = pd.Series(np.asarray(y_true), name='Actual')
    print(pd.crosstab(predicted, actual))

## Baseline Model

I will start by creating a baseline model using logistic regression. As our test set doesn't include the target variable, I will use `LogisticRegressionCV`, which uses 10-fold cross validation on the training set to choose the regularisation strength. Note that the metrics printed below are computed on the training data itself, so they are an optimistic estimate of how the model will perform on unseen data.

In [9]:
# Create predictor set and target variable
X_train = training_df.drop('Survived', axis=1)
y_train = training_df['Survived']

In [10]:
# Baseline: logistic regression on every predictor, with 10-fold CV used
# internally by LogisticRegressionCV.
baseline_lr = LogisticRegressionCV(cv=10)
baseline_lr.fit(X_train, y_train)
# The metrics below are computed on the same data the model was fitted on.
generate_metrics(baseline_lr, X=X_train, y_true=y_train)

Accuracy score: 0.8338945005611672
F1 score: 0.7791044776119402


Actual       0    1
Predicted          
0          482   81
1           67  261


## Univariate Feature Selection

To compare the variables in our model using a chi-squared test, we need to convert any continuous data to discrete data by putting it into bins, and encode any nominal data using discrete values, so that every column is a valid input for sklearn's chi2.

In [11]:
def continuous_to_discrete(dataframe, bins=20):
    """Return a copy of ``dataframe`` with its float64 columns binned.

    Every ``float64`` column is replaced by the result of ``pd.cut`` with
    ``bins`` equal-width intervals. The remaining columns keep their original
    order and the binned columns are appended after them, in their original
    relative order. The input dataframe is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing zero or more ``float64`` columns.
    bins : int, default 20
        Number of equal-width bins passed to ``pd.cut`` for each column.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the continuous columns discretised.
    """
    continuous = dataframe.select_dtypes(include=[np.float64]).columns
    untouched = dataframe.drop(columns=continuous)
    binned = [pd.cut(dataframe[column], bins) for column in continuous]
    # Concatenate once instead of growing the frame inside a loop.
    return pd.concat([untouched, *binned], axis=1)

In [12]:
def ordinal_encoder(dataframe):
    """Encode every column of ``dataframe`` with sklearn's ``OrdinalEncoder``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose columns are all treated as categorical.

    Returns
    -------
    pandas.DataFrame
        The encoded values, with the same column labels and the same index as
        the input so the result stays aligned with the original rows.
    """
    # Use a distinct local name; the original shadowed this function's own name.
    encoded_values = OrdinalEncoder().fit_transform(dataframe)
    return pd.DataFrame(
        encoded_values,
        columns=dataframe.columns,
        # Keep the caller's index: dropping it would misalign the result with
        # any frame that does not use a default 0..n-1 index.
        index=dataframe.index,
    )

Now we transform our data to a form that the chi2 function can use.

In [13]:
def chi2_form(dataframe):
    """Prepare a dataframe for sklearn's chi2.

    Continuous columns are binned first, then every column is
    ordinal-encoded.
    """
    return ordinal_encoder(continuous_to_discrete(dataframe))

In [14]:
# Discretise and encode the un-dummied training data, then split off the target.
training_chi2 = chi2_form(training_no_dummies)
y_chi2 = training_chi2['Survived']
X_chi2 = training_chi2.drop(columns='Survived')

Now using sklearn's chi2 and SelectKBest functions we can find the most associated features with survival.

In [15]:
# Score each feature against the target with chi-squared and keep the best 8.
feature_selector = SelectKBest(score_func=chi2, k=8)
feature_selector.fit(X_chi2, y_chi2)

Let's display our features in order of the p values obtained by performing the chi squared test on each variable with the target.

In [16]:
# One row per candidate feature with its chi-squared p value, smallest first.
selected_features_df = pd.DataFrame({
    'Feature': X_chi2.columns.tolist(),
    'P Value': feature_selector.pvalues_,
})
selected_features_df.sort_values('P Value')

Unnamed: 0,Feature,P Value
10,Fare,3.2461859999999997e-48
1,Sex,6.077838e-22
5,Deck,1.038247e-13
0,Pclass,1.581715e-13
4,Embarked,0.001402485
3,Parch,0.001484707
9,Age,0.009063848
8,Child,0.02517566
6,Relatives,0.214191
2,SibSp,0.2662355


Now we can train our model using the K best features. I will use the continuous data again, where applicable, and convert any categorical columns to dummy variables.

In [17]:
# Take the selected columns from the original (continuous) data and
# dummy-encode any categorical ones, dropping the first level of each.
X_chi2_selected = (
    training_no_dummies
    .loc[:, X_chi2.columns[feature_selector.get_support()]]
    .pipe(pd.get_dummies, drop_first=True)
)

In [18]:
# Logistic regression on the chi-squared-selected features only.
chi_selected_lr = LogisticRegressionCV(cv=10)
chi_selected_lr.fit(X_chi2_selected, y_chi2)
# Metrics are computed on the data the model was fitted on.
generate_metrics(chi_selected_lr, X=X_chi2_selected, y_true=y_chi2)

Accuracy score: 0.8013468013468014
F1 score: 0.7338345864661654


Actual     0.0  1.0
Predicted          
0.0        470   98
1.0         79  244


This hasn't improved our model, which indicates that using more features could be useful for training our model.

## Recursive Feature Elimination

We can use recursive feature elimination to find an optimal subset of features to train on. First we will scale our variables.

In [19]:
# Fit the scaler on the training predictors and standardise them in one step.
# The fitted scaler is kept so the test set can be transformed the same way.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)

Now we need to train our recursive feature eliminator. We will use a logistic regression model as our estimator to assess the accuracy at each iteration.

In [20]:
# Recursive feature elimination: remove one feature per step and score each
# subset by 10-fold cross-validated accuracy of a logistic regression.
estimator = LogisticRegression()
selector = RFECV(estimator=estimator, step=1, cv=10, scoring='accuracy')
selector.fit(scaled_X_train, y_train)

In [21]:
# Number of features retained by the fitted RFECV selector.
selector.n_features_

13

By doing our recursive feature elimination we have found an optimal subset of features to train on.

In [22]:
# Keep only the RFECV-selected (scaled) features, labelled with their
# original column names.
RFE_selected_X = pd.DataFrame(
    selector.transform(scaled_X_train),
    columns=X_train.columns[selector.get_support()],
)
rfe_lr = LogisticRegressionCV(cv=10)
rfe_lr.fit(RFE_selected_X, y_train)
# Metrics are computed on the data the model was fitted on.
generate_metrics(rfe_lr, X=RFE_selected_X, y_true=y_train)

Accuracy score: 0.8372615039281706
F1 score: 0.7826086956521738


Actual       0    1
Predicted          
0          485   81
1           64  261


This gives us a slight improvement over our baseline model. Let's train a selection of models using this subset of features to see if we can improve on the accuracy of the model.

## Model Training

As we don't have a test set to evaluate our models with, we will use cross validation to estimate the accuracy of our models, summarising the fold scores by their mean and standard deviation (from which confidence intervals based on the t distribution can be constructed).

In [23]:
def cross_val_mean_std(model, X, y, score_func, cv=10):
    """Print the mean and standard deviation of cross-validated scores.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X : array-like
        Features.
    y : array-like
        Target values.
    score_func : str
        Value passed to ``cross_val_score`` as ``scoring`` (e.g. 'accuracy').
    cv : int, default 10
        Value passed to ``cross_val_score`` as ``cv``. The default keeps the
        original 10-fold behaviour.

    Returns
    -------
    None
        The summary is printed rather than returned.
    """
    scores = cross_val_score(model, X, y, scoring=score_func, cv=cv)
    mean = round(scores.mean(), 3)
    # numpy's default std is the population standard deviation (ddof=0).
    standard_deviation = round(scores.std(), 3)
    print(f'Scoring function: {score_func}')
    print(f'Mean score: {mean}')
    print(f'Standard deviation of scores: {standard_deviation}')

### Decision Tree

In [24]:
# Seed the tree so that it, and therefore the cross-validated score, is
# reproducible on re-run, consistent with the seeded random forest below.
dtc = DecisionTreeClassifier(random_state=0).fit(RFE_selected_X, y_train)
cross_val_mean_std(dtc, RFE_selected_X, y_train, 'accuracy')

Scoring function: accuracy
Mean score: 0.8
Standard deviation of scores: 0.05


### Random Forest

In [25]:
# Seeded random forest, scored with 10-fold cross-validated accuracy.
forest = RandomForestClassifier(random_state=0)
forest.fit(RFE_selected_X, y_train)
cross_val_mean_std(forest, X=RFE_selected_X, y=y_train, score_func='accuracy')

Scoring function: accuracy
Mean score: 0.813
Standard deviation of scores: 0.047


### SVC

In [26]:
# Support vector classifier with gamma='auto', scored with 10-fold
# cross-validated accuracy.
svc = SVC(gamma='auto')
svc.fit(RFE_selected_X, y_train)
cross_val_mean_std(svc, X=RFE_selected_X, y=y_train, score_func='accuracy')

Scoring function: accuracy
Mean score: 0.836
Standard deviation of scores: 0.036


### K Nearest Neighbours

In [118]:
# K nearest neighbours with default settings, scored with 10-fold
# cross-validated accuracy.
KNN = KNeighborsClassifier()
KNN.fit(RFE_selected_X, y_train)
cross_val_mean_std(KNN, X=RFE_selected_X, y=y_train, score_func='accuracy')

Scoring function: accuracy
Mean score: 0.823
Standard deviation of scores: 0.032


It seems SVC may be the best method to use for this problem. Now we should try parameter tuning to create the best model feasible.

## Parameter Tuning

We will use a grid search to find the optimal parameters to produce the best results.

In [101]:
# Two sub-grids for the SVC search: the rbf and sigmoid kernels share a wider
# range of C, while the poly kernel is searched over a shorter one.
search_parameters = [
    dict(
        C=[0.001, 0.01, 0.1, 1, 10, 20, 50, 100],
        gamma=[1, 0.1, 0.01, 0.001],
        kernel=['rbf', 'sigmoid'],
    ),
    dict(
        C=[0.001, 0.01, 0.1, 1, 10],
        gamma=[1, 0.1, 0.01, 0.001],
        kernel=['poly'],
    ),
]

In [102]:
# Exhaustive 5-fold grid search over the SVC parameter grids, using all
# available cores.
svc_parameter_search = GridSearchCV(
    estimator=SVC(),
    param_grid=search_parameters,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
svc_parameter_search.fit(RFE_selected_X, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:   23.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 20, 50, 100],
                          'gamma': [1, 0.1, 0.01, 0.001],
                          'kernel': ['rbf', 'sigmoid']},
                         {'C': [0.001, 0.01, 0.1, 1, 10],
                          'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [103]:
# Parameter combination chosen by the grid search.
svc_parameter_search.best_params_

{'C': 20, 'gamma': 0.01, 'kernel': 'rbf'}

Now that we have found the optimal parameters we can estimate the accuracy of the model.

In [119]:
# Re-estimate the tuned SVC's accuracy with the 10-fold helper.
# NOTE(review): the grid search chose these parameters using this same data,
# so the estimate may be optimistic.
cross_val_mean_std(svc_parameter_search.best_estimator_, RFE_selected_X, y_train, 'accuracy')

Scoring function: accuracy
Mean score: 0.834
Standard deviation of scores: 0.038


In [105]:
# Accuracy, F1 and confusion table of the tuned SVC on the data it was
# fitted on (training metrics, not a held-out estimate).
generate_metrics(svc_parameter_search.best_estimator_, RFE_selected_X, y_train)

Accuracy score: 0.8406285072951739
F1 score: 0.7835365853658537


Actual       0    1
Predicted          
0          492   85
1           57  257


Now we can use the trained model to predict the survival of the test data.

In [106]:
# Apply the training-time preprocessing to the test set.
# The test columns are explicitly put in the training column order first:
# the scaler and selector work by column position, so a test CSV with the same
# columns in a different order would otherwise give wrong predictions without
# raising. A missing column raises a KeyError here.
scaled_test = pd.DataFrame(
    scaler.transform(test_df[X_train.columns]),
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    selector.transform(scaled_test),
    columns=scaled_test.columns[selector.get_support()],
)
# Pair each test identifier with the tuned SVC's prediction.
test_predictions = pd.DataFrame({
    'PassengerId': test_ids.values.T.flatten(),
    'Survived': svc_parameter_search.best_estimator_.predict(X_test)
})

In [107]:
# Write the predictions to disk without the dataframe index.
PRED_DATA_PATH = Path('../data/predictions/')
# Create the output directory if needed; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
PRED_DATA_PATH.mkdir(parents=True, exist_ok=True)
test_predictions.to_csv(PRED_DATA_PATH / 'predictions.csv', index=False)