### Logistic Regression

- Example from Wikipedia: https://en.wikipedia.org/wiki/Logistic_regression
- Binary classification 

In [29]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import  train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load the study-hours CSV from GitHub into a DataFrame (needs network access)

url = 'https://github.com/prasertcbs/basic-dataset/raw/master/study_hours.csv' # hosted in Aj. Prasert's GitHub repository
df = pd.read_csv(url)
df.head()

Unnamed: 0,Hours,Pass
0,0.5,0
1,0.75,0
2,1.0,0
3,1.25,0
4,1.5,0


In [3]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Hours,0.5,0.75,1.0,1.25,1.5,1.75,1.75,2.0,2.25,2.5,2.75,3.0,3.25,3.5,4.0,4.25,4.5,4.75,5.0,5.5
Pass,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


### Scikit-learn: Logistic Regression

In [4]:
# Features as a one-column DataFrame (2-D); target as a Series (1-D)
X = df[['Hours']]
y = df['Pass']

In [5]:
X.head()

Unnamed: 0,Hours
0,0.5
1,0.75
2,1.0
3,1.25
4,1.5


In [6]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Pass, dtype: int64

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14, 1), (6, 1), (14,), (6,))

In [9]:
len(X)

20

In [12]:
# Realised train/test proportions, derived from the actual split sizes
# instead of numbers copied by hand from earlier outputs
len(X_train) / len(X), len(X_test) / len(X)

(0.7, 0.3)

In [13]:
# Estimator created with scikit-learn's default settings
model = LogisticRegression()
# Alternative kept for reference: a very large C makes the regularization penalty negligible
# model = LogisticRegression(C = 1e6) # disable regularization

model

In [14]:
model.fit(X_train, y_train) # train the model on the training split only

In [16]:
model.score(X_train, y_train) # accuracy of the fitted model on the training split

0.7142857142857143

In [17]:
model.intercept_

array([-2.09991962])

In [18]:
model.coef_

array([[0.92276914]])

In [19]:
X_test

Unnamed: 0,Hours
1,0.75
17,4.75
2,1.0
5,1.75
11,3.0
0,0.5


In [20]:
# Predicted class labels (0 or 1) for the held-out test rows
predicted = model.predict(X_test)
predicted

array([0, 1, 0, 0, 1, 0])

In [21]:
y_test

1     0
17    1
2     0
5     0
11    0
0     0
Name: Pass, dtype: int64

In [22]:
y_test.values

array([0, 1, 0, 0, 0, 0])

In [23]:
model.score(X_test, y_test) # accuracy on the held-out test split

0.8333333333333334

In [24]:
# Accuracy by hand: share of test rows whose predicted label equals the true label
(predicted == y_test.values).mean()

0.8333333333333334

In [25]:
# One row per test sample; columns are P(Pass = 0) and P(Pass = 1)
model.predict_proba(X_test)

array([[0.80342547, 0.19657453],
       [0.09252108, 0.90747892],
       [0.76443507, 0.23556493],
       [0.61894522, 0.38105478],
       [0.33885793, 0.66114207],
       [0.8373355 , 0.1626645 ]])

In [27]:
# create a function 

def sp(intercept, coef, x):
    """Return the logistic (sigmoid) value 1 / (1 + exp(-(intercept + x * coef))).

    Parameters
    ----------
    intercept : float or array-like
        Bias term of the linear model (e.g. ``model.intercept_``).
    coef : float or array-like
        Weight multiplied with ``x`` (e.g. ``model.coef_``).
    x : float or array-like
        Input value(s). A plain list or tuple is converted to a NumPy array
        so that several inputs can be scored in one call.

    Returns
    -------
    float or array-like
        Probability in [0, 1]; the shape follows NumPy broadcasting of the
        three arguments (scalars in, scalar out).
    """
    # A plain Python sequence cannot be multiplied by a float, so promote it first.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x, dtype=float)

    z = intercept + x * coef

    # exp(-z) overflows to inf for very negative z. The quotient below still
    # evaluates to the correct limit 0.0, so only the overflow warning is silenced.
    with np.errstate(over="ignore"):
        ex = np.exp(-z)
    return 1 / (1 + ex)

In [28]:
# Sanity check: recompute P(Pass = 1) for Hours = 0.75 by hand with sp;
# it should match the second column of predict_proba for the first test row

sp(model.intercept_, model.coef_, 0.75)

array([[0.19657453]])

In [30]:
X_test['Hours'].values

array([0.75, 4.75, 1.  , 1.75, 3.  , 0.5 ])

In [31]:
y_test.values

array([0, 1, 0, 0, 0, 0])

In [33]:
# Same prediction call as earlier, repeated ahead of the metrics that use it
predicted = model.predict(X_test)
predicted

array([0, 1, 0, 0, 1, 0])

### Scikit-learn: Confusion Matrix

In [35]:
# Rows are the actual classes, columns the predicted classes (class 0 first)
metrics.confusion_matrix(y_test, predicted)

array([[4, 1],
       [0, 1]])

In [37]:
# Pin the label order so the matrix is always 2x2, even when one class is absent
# from both y_test and predicted; ravel() then always yields tn, fp, fn, tp.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predicted, labels=[0, 1]).ravel()

In [38]:
# Confusion-matrix cells, one per line:
# true negative, false positive, false negative, true positive
print(f'tn = {tn}', f'fp = {fp}', f'fn = {fn}', f'tp = {tp}', sep='\n')

tn = 4
fp = 1
fn = 0
tp = 1


In [39]:
# Accuracy from the confusion matrix: correct predictions over all predictions
(tn + tp) / (tn + fp + fn + tp)

0.8333333333333334

In [40]:
model.score(X_test, y_test)

0.8333333333333334

In [42]:
metrics.accuracy_score(y_test, predicted)

0.8333333333333334

In [41]:
tp/(tp + fp) # Precision (class = 1): share of predicted positives that are truly positive

0.5

In [44]:
metrics.precision_score(y_test, predicted)

0.5

In [45]:
tp/(tp + fn) # Recall (class = 1): share of actual positives that were predicted positive

1.0

In [47]:
metrics.recall_score(y_test, predicted)

1.0

In [48]:
# F1 = 2 x ((precision x recall) / (precision + recall)), from the confusion-matrix counts
precision = tp / (tp + fp)
recall = tp / (tp + fn)
2 * ((precision * recall) / (precision + recall))

0.6666666666666666

In [49]:
# Same F1 formula, fed with scikit-learn's precision and recall (each computed once)
precision_sk = metrics.precision_score(y_test, predicted)
recall_sk = metrics.recall_score(y_test, predicted)
2 * (precision_sk * recall_sk / (precision_sk + recall_sk))

0.6666666666666666

In [50]:
metrics.f1_score(y_test, predicted)

0.6666666666666666

In [51]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.50      1.00      0.67         1

    accuracy                           0.83         6
   macro avg       0.75      0.90      0.78         6
weighted avg       0.92      0.83      0.85         6



In [52]:
y_test.value_counts() # support: number of test rows in each class

0    5
1    1
Name: Pass, dtype: int64