#### **Logistic Regression** in Sklearn

In [68]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [69]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support

In [70]:
# Load the passenger table and derive a boolean `male` feature from `Sex`.
df = pd.read_csv('titanic.csv')
df['male'] = df['Sex'].eq('male')

# Six predictor columns as an array, plus the `Survived` target.
X = df[[
    'Pclass',
    'male',
    'Age',
    'Siblings/Spouses',
    'Parents/Children',
    'Fare',
]].values
y = df['Survived'].values

# Hold out a test split (default split sizes); the fixed random_state keeps
# the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27)

# Fit on the training split only, then predict labels for the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [71]:
# Report the (features, target) shapes of the full data and of each split.
for label, features, target in (
    ('whole dataset: ', X, y),
    ('training set: ', X_train, y_train),
    ('test set: ', X_test, y_test),
):
    print(label, features.shape, target.shape)

whole dataset:  (887, 6) (887,)
training set:  (665, 6) (665,)
test set:  (222, 6) (222,)


<br/>

**Logistic Regression Threshold:** <br/>
We can choose any threshold between 0 and 1; `model.predict` uses 0.5 by default. <br/>
If we make the threshold higher, we'll have fewer positive predictions, but the positive predictions we do make are more likely to be correct. This means that the **precision** would be higher and the **recall** lower. On the other hand, if we make the threshold lower, we'll have more positive predictions, so we're more likely to catch all the positive cases. This means that the **recall** would be higher and the **precision** lower.

In [72]:
# Sensitivity is another name for recall, so alias sklearn's implementation.
sensitivity_score = recall_score

# Score the current `y_pred` against the held-out labels, one metric per line.
for label, metric in (
    ('accuracy: ', accuracy_score),
    ('precision: ', precision_score),
    ('recall: ', recall_score),
    ('f1 score: ', f1_score),
    # Confusion matrix layout:
    #   row 1: actual negative, column 1: predicted negative
    #   row 2: actual positive, column 2: predicted positive
    ('confusion_matrix: ', confusion_matrix),
    ('sensitivity score: ', sensitivity_score),
    # Tuple of per-class arrays: (precision, recall, f-score, support).
    ('precision_recall_fscore_support: ', precision_recall_fscore_support),
):
    print(label, metric(y_test, y_pred))

accuracy:  0.7882882882882883
precision:  0.7368421052631579
recall:  0.6746987951807228
f1 score:  0.7044025157232704
confusion_matrix:  [[119  20]
 [ 27  56]]
sensitivity score:  0.6746987951807228
precision_recall_fscore_support:  (array([0.81506849, 0.73684211]), array([0.85611511, 0.6746988 ]), array([0.83508772, 0.70440252]), array([139,  83]))


In [73]:
def specificity_score(y_true, y_pred, negative_label=0):
    """Return the specificity (true-negative rate) of a set of predictions.

    Specificity is the recall of the negative class. The negative label is
    requested explicitly instead of taking element 0 of the per-class recall
    array: that array only covers labels that actually occur, so when the
    negative label is absent from both inputs, element 0 is the positive
    class's recall (i.e. sensitivity), not specificity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    negative_label : optional
        Label that denotes the negative class. Defaults to 0, which matches
        the 0/1 `Survived` target used in this notebook.

    Returns
    -------
    float
        Fraction of actual negatives that were predicted negative.
    """
    return recall_score(y_true, y_pred, pos_label=negative_label)

print('specificity_score: ',specificity_score(y_test, y_pred))

specificity_score:  0.8561151079136691


In [74]:
# Preview the class probabilities for the first five test rows. The result has
# one column per class (two for this model), so only the row axis is sliced;
# a column slice of `:5` would be a no-op that wrongly suggests five columns.
print('predict proba')
print(model.predict_proba(X_test)[:5])

predict proba
[[0.53171991 0.46828009]
 [0.31779192 0.68220808]
 [0.48898444 0.51101556]
 [0.53143654 0.46856346]
 [0.24206161 0.75793839]]


In [78]:
# Apply a stricter decision rule than `predict`: label a row positive only when
# the probability in column 1 of `predict_proba` exceeds 0.75.
# NOTE: this rebinds `y_pred`, so any metric cell re-run after this one scores
# the thresholded predictions rather than the output of `model.predict`.
positive_proba = model.predict_proba(X_test)[:, 1]
y_pred = positive_proba > 0.75
for label, metric in (
    ('precision: ', precision_score),
    ('recall: ', recall_score),
):
    print(label, metric(y_test, y_pred))

precision:  0.9714285714285714
recall:  0.40963855421686746
