# Demo Logistic Regression

## Data Preparation

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import importlib
import sys
sys.path.append('..')  
from utils.ROCanalysis import calculate_roc_metrics

# Load the scikit-learn breast-cancer dataset and keep only its leading columns.
data = load_breast_cancer()
n_features = 4
y = data.target  # 0 = malignant, 1 = benign
X = data.data[:, :n_features]  # first n_features numeric columns
feature_names = data.feature_names[:n_features].tolist()

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Training
### Model training with sklearn

In [2]:
from sklearn.linear_model import LogisticRegression

# Unpenalised fit; the generous max_iter gives the solver room to converge.
log_reg = LogisticRegression(penalty=None, max_iter=10000)
log_reg.fit(X_train_scaled, y_train)

# Hold-out accuracy: share of test labels predicted correctly.
y_pred = log_reg.predict(X_test_scaled)
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy:{accuracy}")

# ROC-style metrics computed from the second predict_proba column.
proba = log_reg.predict_proba(X_test_scaled)[:, 1]
roc_df = pd.DataFrame({'p': proba, 'y': y_test, 'w': np.ones(len(y_test))})
roc_metrics = calculate_roc_metrics(roc_df, 'p', 'y')
print(roc_metrics)

# Fitted parameters in one table: feature weights first, intercept last.
all_params = np.concatenate((log_reg.coef_.ravel(), log_reg.intercept_))
coef_df = pd.DataFrame(
    {
        "feature": [*feature_names, 'intercept'],
        "coefficient": all_params,
    }
)
print(coef_df)

Accuracy:0.9298245614035088
{'auc': 98.347, 'gini': 96.693, 'ks': 85.913}
          feature  coefficient
0     mean radius    29.469177
1    mean texture    -0.986754
2  mean perimeter   -26.738366
3       mean area    -8.509443
4       intercept    -0.222692


### Model training from scratch with NumPy (gradient descent)

Let $X\in \mathbb{R}^{n\times (K+1)}$ be the feature matrix of all $n$ samples, including the all-ones column for the intercept, and let $\mathbf{y}\in \{0,1\}^n$ be the labels. The model output for all samples is

$$
\mathbf{p} = f(X;\mathbf{\beta}) = \sigma(X\mathbf{\beta}), \qquad \sigma(z) = \frac{1}{1+e^{-z}} \text{ applied element-wise}.
$$

The loss is

$$
l(\mathbf{\beta}) = -\frac{1}{n}\sum_{i=1}^n \left[y_i\ln p_i + (1-y_i)\ln(1-p_i)\right].
$$

whose gradient is 

$$
\frac{\partial}{\partial \mathbf{\beta}}l(\mathbf{\beta}) = \frac{1}{n} X^T(\mathbf{p}-\mathbf{y}).
$$


In Python, `beta` is a `(K+1,)` np.array and `X` is an `(n, K)` np.array. Appending an all-ones column to `X` gives `X_new`, an `(n, K+1)` np.array:

`X_new = np.hstack([X, np.ones((X.shape[0], 1))])`

The probabilities of all samples, `p`, an `(n,)` np.array, are calculated as

`p = expit(X_new @ beta)` (equivalently `expit((X_new*beta).sum(axis=-1))`)

The loss is calculated as

`
loss = -(y * np.log(p) + (1 - y) * np.log(1-p)).mean()
`

The gradient is calculated as

`
grad = X_new.T @ (p - y)/n
`

The model is trained by repeating the gradient-descent update with learning rate `lr`:

`
logreg.beta -= lr * (X_new.T @ (p - y)/n)
`

In [3]:
import numpy as np
from scipy.special import expit
class LogisticReg:
    """Binary logistic regression with an explicit loss and gradient.

    The parameter vector ``beta`` has ``dim + 1`` entries: one weight per
    feature column followed by the intercept, which multiplies the all-ones
    column that ``_with_intercept`` appends on the right of ``X``.

    Parameters
    ----------
    dim : int
        Number of feature columns (excluding the intercept).
    eps : float, default 1e-8
        Probabilities are clipped to ``[eps, 1 - eps]`` inside ``loss`` so
        that ``log`` never receives 0.
    seed : int, default 42
        Seed for the random initialisation of ``beta``.
    """

    def __init__(self, dim, eps=0.00000001, seed=42):
        # A private legacy RandomState produces the same draws as
        # ``np.random.seed(seed); np.random.rand(dim + 1)`` but leaves
        # NumPy's global random state untouched.
        rng = np.random.RandomState(seed)
        self.beta = rng.rand(dim+1)
        self.eps = eps

    def _with_intercept(self, X):
        """Return ``X`` with a column of ones appended as the last column."""
        return np.hstack([X, np.ones((X.shape[0], 1))])

    def __call__(self, X):
        """Return the predicted probabilities ``expit(X_new @ beta)`` for ``X``."""
        X_new = self._with_intercept(X)
        p = expit(X_new@self.beta)
        return p

    def loss(self, X, y):
        """Return the mean binary cross-entropy of the predictions on ``(X, y)``."""
        eps = self.eps
        # Clip so that log(p) and log(1 - p) stay finite for saturated outputs.
        p = np.clip(self(X), eps, 1-eps)
        loss = -(y * np.log(p) + (1 - y) * np.log(1-p)).mean()
        return loss

    def loss_grad(self, X, y):
        """Return the gradient of the mean cross-entropy with respect to ``beta``."""
        n = X.shape[0]
        # Build the augmented matrix once and reuse it for both the forward
        # pass and the gradient, instead of rebuilding it via self(X).
        X_new = self._with_intercept(X)
        p = expit(X_new@self.beta)
        grad = X_new.T @ (p - y)/n # no clipping needed for backprop
        return grad

# Full-batch gradient descent on the scaled training set, reporting the
# training loss and current parameters at a fixed interval.
N_STEPS = 500000
REPORT_EVERY = 50000

logreg = LogisticReg(X_train.shape[1])
lr = 0.2
for i in range(N_STEPS):
    update = lr * logreg.loss_grad(X_train_scaled, y_train)
    logreg.beta -= update
    if (i + 1) % REPORT_EVERY != 0:
        continue
    print(f'loss = {logreg.loss(X_train_scaled,y_train)}')
    print(logreg.beta)

loss = 0.19940263619168946
[ 10.50456862  -0.93770551 -16.74856317   2.19618139   0.78184272]
loss = 0.19316770270187206
[ 16.62015457  -0.95284609 -20.84730212  -0.25071182   0.5395708 ]
loss = 0.19070757760042098
[ 20.51474481  -0.95873387 -22.76249723  -2.57044082   0.322716  ]
loss = 0.18955236136765827
[ 23.16369384  -0.96396168 -23.93887765  -4.30242174   0.16288009]
loss = 0.1889879085791541
[ 25.01007901  -0.96906387 -24.75170823  -5.528848     0.05013951]
loss = 0.18870760975528852
[ 26.3108033   -0.97360294 -25.33553041  -6.38688082  -0.02865227]
loss = 0.18856705241345517
[ 27.23225113  -0.97734585 -25.75865795  -6.98753106  -0.08380213]
loss = 0.1884960934569378
[ 27.88725537  -0.98028518 -26.06528264  -7.40973587  -0.12257355]
loss = 0.18846009818172804
[ 28.35393681  -0.98252351 -26.28698322  -7.70783135  -0.14995336]
loss = 0.1884417757171696
[ 28.68698228  -0.98419474 -26.44691479  -7.91910061  -0.1693617 ]
