In [38]:
from datasets import titanic_data

import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

import pandas as pd

from tools import roc

import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [39]:
# Load the Titanic data through the project-local `titanic_data` helper; its six
# return values are unpacked as the full feature/target pair plus train and test pairs.
# NOTE(review): the cells visible here only use original_X / original_y; the
# train/test pairs are presumably intended for held-out evaluation — confirm.
original_X, original_y, train_X, train_y, test_X, test_y = titanic_data()

# Forward selection

## Use 5-fold cross validation to estimate best number of features

In [40]:
# Sweep the number of forward-selected features and record the mean cross-validated
# score of an unpenalised logistic regression fitted on each selected subset.
# cross_val_score falls back to the estimator's own `score`, which for
# LogisticRegression is classification accuracy (not R^2), so it is labelled as such.
# NOTE(review): the features are selected on all rows of original_X before the
# cross-validation below, so these scores are optimistic; wrapping selector and
# model in a Pipeline would remove that leakage.
accuracy_means = []
number_of_features = original_X.shape[1]

# SequentialFeatureSelector only accepts integers in [1, n_features - 1], so the
# sweep stops one short of the full feature set.
feature_counts = list(range(1, number_of_features))

for i in feature_counts:
    selection_forward = SequentialFeatureSelector(
        LogisticRegression(penalty='none'),
        n_features_to_select=i,
        direction='forward').fit(original_X, original_y)

    selected_features = original_X.columns[selection_forward.get_support()]
    fold_scores = cross_val_score(
        LogisticRegression(penalty='none'),
        original_X[selected_features],
        original_y,
        scoring='accuracy',  # explicit: same metric as the classifier default
        cv=5)                # explicit: 5-fold, matching the section heading
    accuracy_means.append(fold_scores.mean())

# Dedicated name so the statsmodels `result` fitted further down is not shadowed.
forward_cv_scores = pd.DataFrame(
    {'n of features': feature_counts, 'Accuracy (mean)': accuracy_means})
px.line(forward_cv_scores, x='n of features', y='Accuracy (mean)').show()

## Select features

In [41]:
# Forward sequential selection with an unpenalised logistic regression,
# fixed at 4 features.
selection_forward = SequentialFeatureSelector(
    estimator=LogisticRegression(penalty='none'),
    n_features_to_select=4,
    direction='forward',
)
# fit() returns the selector itself; assigning keeps the cell free of output.
selection_forward = selection_forward.fit(original_X, original_y)

In [42]:
# Translate the selector's support mask into the corresponding column names.
support_mask = selection_forward.get_support()
selected_features = original_X.columns[support_mask]
selected_features

Index(['pclass', 'sex', 'sibsp', 'parch'], dtype='object')

## Fit Logistic Regression

In [43]:
# statsmodels does not add an intercept on its own, so prepend a constant column
# to the selected features before fitting.
original_X_const = sm.add_constant(original_X.loc[:, selected_features])

model = sm.Logit(endog=original_y, exog=original_X_const)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.459658
         Iterations 6


In [44]:
# Show the statsmodels summary table for the Logit fit (coefficients, std err, p-values).
result.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,891.0
Model:,Logit,Df Residuals:,886.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 16 Jun 2021",Pseudo R-squ.:,0.3097
Time:,14:31:43,Log-Likelihood:,-409.56
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,2.8540000000000004e-78

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6390,0.089,-7.188,0.000,-0.813,-0.465
pclass,-0.7848,0.089,-8.815,0.000,-0.959,-0.610
sex,1.3196,0.093,14.141,0.000,1.137,1.503
sibsp,-0.2579,0.109,-2.359,0.018,-0.472,-0.044
parch,-0.0405,0.089,-0.455,0.649,-0.215,0.134


## Confusion matrix and ROC

In [45]:
# Refit the unpenalised logistic regression with scikit-learn on the selected
# columns; `model` now refers to the scikit-learn estimator.
model = LogisticRegression(penalty='none').fit(original_X[selected_features], original_y)

In [46]:
# Predict on the same rows the model was fitted on, then flatten the confusion
# matrix into its four cells.
predicted = model.predict(original_X[selected_features]).ravel()
matrix = confusion_matrix(original_y, predicted)
tn, fp, fn, tp = matrix.ravel()

f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 230, TN: 483, FP: 66, FN: 112'

In [47]:
# Call the project-local `roc` helper and show what it returns (presumably an ROC figure — confirm in tools).
# NOTE(review): the inputs are the same rows the model was fitted on, so this is an in-sample evaluation.
roc(model, original_X[selected_features], original_y).show()

# Backward selection

## Use 5-fold cross validation to estimate best number of features

In [48]:
# Sweep the number of backward-selected features and record the mean cross-validated
# score of an unpenalised logistic regression fitted on each selected subset.
# cross_val_score falls back to the estimator's own `score`, which for
# LogisticRegression is classification accuracy (not R^2), so it is labelled as such.
# NOTE(review): the features are selected on all rows of original_X before the
# cross-validation below, so these scores are optimistic; wrapping selector and
# model in a Pipeline would remove that leakage.
accuracy_means = []
number_of_features = original_X.shape[1]

# SequentialFeatureSelector only accepts integers in [1, n_features - 1], so the
# sweep stops one short of the full feature set.
feature_counts = list(range(1, number_of_features))

for i in feature_counts:
    selection_backward = SequentialFeatureSelector(
        LogisticRegression(penalty='none'),
        n_features_to_select=i,
        direction='backward').fit(original_X, original_y)

    selected_features = original_X.columns[selection_backward.get_support()]
    fold_scores = cross_val_score(
        LogisticRegression(penalty='none'),
        original_X[selected_features],
        original_y,
        scoring='accuracy',  # explicit: same metric as the classifier default
        cv=5)                # explicit: 5-fold, matching the section heading
    accuracy_means.append(fold_scores.mean())

# Dedicated name so the statsmodels `result` fitted further down is not shadowed.
backward_cv_scores = pd.DataFrame(
    {'n of features': feature_counts, 'Accuracy (mean)': accuracy_means})
px.line(backward_cv_scores, x='n of features', y='Accuracy (mean)').show()

## Select features

In [49]:
# Backward sequential selection with an unpenalised logistic regression,
# fixed at 5 features.
selection_backward = SequentialFeatureSelector(
    estimator=LogisticRegression(penalty='none'),
    n_features_to_select=5,
    direction='backward',
)
# fit() returns the selector itself; assigning keeps the cell free of output.
selection_backward = selection_backward.fit(original_X, original_y)

In [50]:
# Translate the selector's support mask into the corresponding column names.
support_mask = selection_backward.get_support()
selected_features = original_X.columns[support_mask]
selected_features

Index(['pclass', 'sex', 'age', 'sibsp', 'embarked_S'], dtype='object')

## Fit Logistic Regression

In [51]:
# statsmodels does not add an intercept on its own, so prepend a constant column
# to the selected features before fitting.
original_X_const = sm.add_constant(original_X.loc[:, selected_features])

model = sm.Logit(endog=original_y, exog=original_X_const)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.441442
         Iterations 6


In [52]:
# Show the statsmodels summary table for the Logit fit (coefficients, std err, p-values).
result.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,891.0
Model:,Logit,Df Residuals:,885.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 16 Jun 2021",Pseudo R-squ.:,0.3371
Time:,14:31:49,Log-Likelihood:,-393.32
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,2.9570000000000003e-84

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6585,0.091,-7.225,0.000,-0.837,-0.480
pclass,-0.9752,0.101,-9.684,0.000,-1.173,-0.778
sex,1.2908,0.093,13.917,0.000,1.109,1.473
age,-0.5100,0.101,-5.039,0.000,-0.708,-0.312
sibsp,-0.3649,0.114,-3.210,0.001,-0.588,-0.142
embarked_S,-0.1899,0.089,-2.145,0.032,-0.363,-0.016


## Confusion matrix and ROC

In [53]:
# Refit the unpenalised logistic regression with scikit-learn on the selected
# columns; `model` now refers to the scikit-learn estimator.
model = LogisticRegression(penalty='none').fit(original_X[selected_features], original_y)

In [54]:
# Predict on the same rows the model was fitted on, then flatten the confusion
# matrix into its four cells.
predicted = model.predict(original_X[selected_features]).ravel()
matrix = confusion_matrix(original_y, predicted)
tn, fp, fn, tp = matrix.ravel()

f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 247, TN: 461, FP: 88, FN: 95'

In [55]:
# Call the project-local `roc` helper and show what it returns (presumably an ROC figure — confirm in tools).
# NOTE(review): the inputs are the same rows the model was fitted on, so this is an in-sample evaluation.
roc(model, original_X[selected_features], original_y).show()