In [1]:
from datasets import titanic_data

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.special import expit
import pandas as pd
import numpy as np

from tools import roc, polynomial_features

import plotly.express as px
from plotly.offline import init_notebook_mode
# Enable plotly's offline notebook rendering for figures displayed in this notebook.
# NOTE(review): connected=True is documented to load plotly.js from a CDN, so
# rendering presumably needs network access — confirm if this must work offline.
init_notebook_mode(connected=True)

In [2]:
# Load the Titanic data via the project helper `titanic_data`. Only the full set
# (original_X, original_y) is used in the cells visible in this section; the
# train_*/test_* values are unpacked but not referenced here.
original_X, original_y, train_X, train_y, test_X, test_y = titanic_data()

## Degree 2

In [14]:
# Expand the base features with polynomial terms up to degree 2 (project helper
# from `tools`). Judging by the column preview further down, this appends one
# pure power per column (e.g. age^2) rather than cross-feature interaction terms.
original_X_2 = polynomial_features(original_X, 2)

In [15]:
# Standardise every column to zero mean / unit variance.
# By default fit_transform returns a bare ndarray, so the DataFrame is rebuilt
# with both the column names AND the original row index; without `index=` the
# row labels would silently reset to 0..n-1 and could stop lining up with
# original_y in any index-aware pandas operation.
original_X_2 = pd.DataFrame(
    StandardScaler().fit_transform(original_X_2),
    columns=original_X_2.columns,
    index=original_X_2.index,
)

In [16]:
# Preview the first rows of the standardised degree-2 feature matrix.
original_X_2.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,pclass^2,sex^2,age^2,sibsp^2,parch^2,fare^2,embarked_C^2,embarked_Q^2,embarked_S^2
0,0.827377,-0.737695,-0.565736,0.432793,-0.473674,-0.502445,-0.482043,-0.307562,0.615838,-0.371647,-0.737695,-0.393776,-0.182761,-0.226598,-0.125992,-0.482043,-0.307562,-0.615838
1,-1.566107,1.355574,0.663861,0.432793,-0.473674,0.786845,2.074505,-0.307562,-1.623803,1.711501,1.355574,-0.323902,-0.182761,-0.226598,-0.064192,2.074505,-0.307562,1.623803
2,0.827377,1.355574,-0.258337,-0.474545,-0.473674,-0.488854,-0.482043,-0.307562,0.615838,-0.371647,1.355574,-0.540481,-0.174241,-0.226598,-0.128262,-0.482043,-0.307562,-0.615838
3,-1.566107,1.355574,0.433312,0.432793,-0.473674,0.42073,-0.482043,-0.307562,0.615838,1.711501,1.355574,-0.470394,-0.182761,-0.226598,-0.138706,-0.482043,-0.307562,-0.615838
4,0.827377,-0.737695,0.433312,-0.474545,-0.473674,-0.486337,-0.482043,-0.307562,0.615838,-0.371647,-0.737695,-0.470394,-0.174241,-0.226598,-0.128676,-0.482043,-0.307562,-0.615838


### Estimate test accuracy by 5-fold cross-validation

In [17]:
# Unpenalised logistic regression scored by 5-fold cross-validation; the value
# displayed is the mean of the per-fold scores (accuracy for a classifier).
#
# penalty='none' was deprecated in scikit-learn 1.2 and removed in 1.4.
# C=np.inf makes the regularisation strength (1/C) zero, which is the same
# unpenalised fit and is accepted by old and new releases alike.
# cv=5 is spelled out so the fold count matches the heading regardless of the
# installed version's default.
# NOTE(review): the scaler was fit on all rows before this split; move it into
# a Pipeline if a penalty is ever introduced.
model = LogisticRegression(C=np.inf)
cross_val_score(model, original_X_2, original_y, cv=5).mean()

0.8114493754315486

### Confusion matrix and ROC

In [18]:
# Refit the unpenalised model on the full degree-2 data set; the following
# cells use it for in-sample diagnostics.
# penalty='none' was deprecated in scikit-learn 1.2 and removed in 1.4;
# C=np.inf (regularisation strength 1/C = 0) is the version-independent
# way to request no penalty.
model = LogisticRegression(C=np.inf)
model = model.fit(original_X_2, original_y)

In [19]:
# Confusion-matrix counts for the degree-2 model. The predictions are made on
# the same rows the model was fit on, so these are in-sample figures.
predicted_2 = model.predict(original_X_2).ravel()
counts_2 = confusion_matrix(original_y, predicted_2)
# Flattening the 2x2 matrix row by row yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = counts_2.ravel()

f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 248, TN: 478, FP: 71, FN: 94'

In [20]:
# Draw the ROC curve with the project helper `roc` and display the result.
# It is given the same data the model was fit on, so the curve is in-sample.
roc(model, original_X_2, original_y).show()

## Degree 3

In [21]:
# Expand the base features with polynomial terms up to degree 3 (project helper
# from `tools`). Judging by the column preview further down, this adds pure
# powers per column (e.g. age^2, age^3) rather than interaction terms.
original_X_3 = polynomial_features(original_X, 3)

In [22]:
# Standardise every column to zero mean / unit variance.
# By default fit_transform returns a bare ndarray, so the DataFrame is rebuilt
# with both the column names AND the original row index; without `index=` the
# row labels would silently reset to 0..n-1 and could stop lining up with
# original_y in any index-aware pandas operation.
original_X_3 = pd.DataFrame(
    StandardScaler().fit_transform(original_X_3),
    columns=original_X_3.columns,
    index=original_X_3.index,
)

In [23]:
# Preview the first rows of the standardised degree-3 feature matrix.
original_X_3.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,pclass^2,...,embarked_S^2,pclass^3,sex^3,age^3,sibsp^3,parch^3,fare^3,embarked_C^3,embarked_Q^3,embarked_S^3
0,0.827377,-0.737695,-0.565736,0.432793,-0.473674,-0.502445,-0.482043,-0.307562,0.615838,-0.371647,...,-0.615838,0.652601,-0.737695,-0.138944,-0.127566,-0.148157,-0.092039,-0.482043,-0.307562,0.615838
1,-1.566107,1.355574,0.663861,0.432793,-0.473674,0.786845,2.074505,-0.307562,-1.623803,1.711501,...,1.623803,-1.752661,1.355574,-0.043631,-0.127566,-0.148157,-0.08052,2.074505,-0.307562,-1.623803
2,0.827377,1.355574,-0.258337,-0.474545,-0.473674,-0.488854,-0.482043,-0.307562,0.615838,-0.371647,...,-0.615838,0.652601,1.355574,-0.105976,-0.134211,-0.148157,-0.091851,-0.482043,-0.307562,0.615838
3,-1.566107,1.355574,0.433312,0.432793,-0.473674,0.42073,-0.482043,-0.307562,0.615838,1.711501,...,-0.615838,-1.752661,1.355574,-0.086134,-0.127566,-0.148157,-0.088262,-0.482043,-0.307562,0.615838
4,0.827377,-0.737695,0.433312,-0.474545,-0.473674,-0.486337,-0.482043,-0.307562,0.615838,-0.371647,...,-0.615838,0.652601,-0.737695,-0.086134,-0.134211,-0.148157,-0.091818,-0.482043,-0.307562,0.615838


### Estimate test accuracy by 5-fold cross-validation

In [25]:
# Unpenalised logistic regression on the degree-3 features, scored by 5-fold
# cross-validation; the value displayed is the mean of the per-fold scores
# (accuracy for a classifier).
#
# penalty='none' was deprecated in scikit-learn 1.2 and removed in 1.4.
# C=np.inf makes the regularisation strength (1/C) zero, which is the same
# unpenalised fit and is accepted by old and new releases alike.
# max_iter is raised well above the default to give the solver room to converge.
# cv=5 is spelled out so the fold count matches the heading regardless of the
# installed version's default.
# NOTE(review): the scaler was fit on all rows before this split; move it into
# a Pipeline if a penalty is ever introduced.
model = LogisticRegression(C=np.inf, max_iter=2000)
cross_val_score(model, original_X_3, original_y, cv=5).mean()

0.8013809553700332

### Confusion matrix and ROC

In [26]:
# Refit the unpenalised model on the full degree-3 data set; the following
# cells use it for in-sample diagnostics.
# penalty='none' was deprecated in scikit-learn 1.2 and removed in 1.4;
# C=np.inf (regularisation strength 1/C = 0) is the version-independent
# way to request no penalty.
model = LogisticRegression(C=np.inf, max_iter=2000)
model = model.fit(original_X_3, original_y)

In [28]:
# Confusion-matrix counts for the degree-3 model. The predictions are made on
# the same rows the model was fit on, so these are in-sample figures.
predicted_3 = model.predict(original_X_3).ravel()
counts_3 = confusion_matrix(original_y, predicted_3)
# Flattening the 2x2 matrix row by row yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = counts_3.ravel()

f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}"

'TP: 248, TN: 477, FP: 72, FN: 94'

In [29]:
# Draw the ROC curve with the project helper `roc` and display the result.
# It is given the same data the model was fit on, so the curve is in-sample.
roc(model, original_X_3, original_y).show()