In [23]:
from datasets import titanic_data

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from tools import roc

import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
np.set_printoptions(suppress=True)

In [9]:
original_X, original_y, train_X, train_y, test_X, test_y = titanic_data()

## Choose number of components

In [10]:
pca = PCA(n_components=original_X.shape[1])
pca = pca.fit(original_X)

In [11]:
pca.explained_variance_ratio_

array([0.23949963, 0.19635608, 0.18352062, 0.11329848, 0.09189104,
       0.07476993, 0.06099501, 0.03966921, 0.        ])

## PCA plot

In [12]:
pca = PCA(n_components=3)
pca = pca.fit(original_X)

In [13]:
transformed_original_X = pca.transform(original_X)

In [16]:
transformed_df = pd.DataFrame(transformed_original_X, columns=['x','y','z'])
transformed_df['target'] = original_y.values
px.scatter_3d(transformed_df, x='x', y='y', z='z',
              color='target').show()

## PCA Classification

In [18]:
pca = PCA(n_components=5)
pca = pca.fit(original_X)

In [19]:
transformed_original_X = pca.transform(original_X)

## Estimate test error by 5-fold cross validation

In [25]:
model = LogisticRegression(penalty='none')
cross_val_score(model, transformed_original_X, original_y).mean()

0.7878789780930262

## Fit Logistic Regression model

In [26]:
# add constant, since statsmodels does not add it by default
original_X_const = sm.add_constant(transformed_original_X)
model = sm.Logit(original_y, original_X_const)

result = model.fit()

Optimization terminated successfully.
         Current function value: 0.461362
         Iterations 6


In [27]:
result.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,891.0
Model:,Logit,Df Residuals:,885.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 16 Jun 2021",Pseudo R-squ.:,0.3072
Time:,14:48:47,Log-Likelihood:,-411.07
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,1.316e-76

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6131,0.088,-6.944,0.000,-0.786,-0.440
x1,0.6975,0.067,10.415,0.000,0.566,0.829
x2,0.2831,0.069,4.129,0.000,0.149,0.417
x3,-0.0293,0.068,-0.433,0.665,-0.162,0.103
x4,0.5350,0.083,6.455,0.000,0.373,0.698
x5,-1.1504,0.108,-10.667,0.000,-1.362,-0.939


## Confusion matrix and ROC

In [29]:
model = LogisticRegression(penalty='none')
model = model.fit(transformed_original_X, original_y)

In [30]:
tn, fp, fn, tp = confusion_matrix(original_y, model.predict(transformed_original_X).ravel()).ravel()

f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 234, TN: 477, FP: 72, FN: 108'

In [31]:
roc(model, transformed_original_X, original_y).show()