# Logistic Regression (from scratch)

In [1]:
def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    pass
import warnings
# Rebind the module attribute to the no-op above: code that looks up
# ``warnings.warn`` after this point calls ``warn`` and emits nothing.
warnings.warn = warn

In [2]:
import numpy as np

In [3]:
class LogRegression:
    """Binary logistic regression trained with batch gradient descent.

    The model is ``p(y = 1 | x) = sigmoid(x . w)``.  No intercept is added
    automatically: include a constant column in ``X`` if a bias term is
    wanted.

    Parameters
    ----------
    w : array-like, optional
        Initial weight vector.  Useful for predicting with known weights;
        ``fit`` always restarts from zeros.  Defaults to an empty vector.
    eta : float
        Gradient-descent step size (learning rate).
    eps : float
        Convergence tolerance: training stops once the Euclidean norm of a
        weight update is ``<= eps``.  Because an update equals
        ``eta * gradient``, this is the same as ``||gradient|| <= eps / eta``.
    max_iter : int or float
        Upper bound on the number of gradient steps.
    t : float
        Probability threshold; ``predict`` returns 1 where the predicted
        probability is strictly greater than ``t``.
    """

    def __init__(self, w=None, eta=1e-2, eps=1e-2, max_iter=1e5, t=0.5):
        # ``None`` replaces the former mutable ``[]`` default; both give an
        # empty weight vector.
        self.w = np.array([] if w is None else w)
        self.eta = eta
        self.eps = eps
        self.max_iter = max_iter
        self.t = t

    def reset(self, X):
        """Set the weights to zeros, one per column of ``X``."""
        self.w = np.zeros(X.shape[1])

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        # For very negative z, exp(-z) overflows to inf and the result is
        # correctly 0.0; silence only that floating-point overflow notice.
        with np.errstate(over="ignore"):
            return 1 / (1 + np.exp(-z))

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        return self.sigmoid(np.dot(X, self.w))

    def predict(self, X):
        """Return 0/1 class labels for each row of ``X``.

        A row is labelled 1 when its predicted probability exceeds ``self.t``.
        """
        # Use this instance's own model; the previous code referenced a
        # module-level variable named ``logreg``.
        return (self.predict_proba(X) > self.t).astype(int)

    def log_loss(self, X, y):
        """Return the mean negative log-likelihood of labels ``y`` given ``X``.

        Computed as ``mean(log(1 + exp(z)) - y * z)`` with ``z = X . w``,
        which is algebraically equal to
        ``-mean(y * log(p) + (1 - y) * log(1 - p))`` but stays finite when
        the sigmoid saturates to exactly 0 or 1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        z = np.dot(X, self.w)
        n_samples = X.shape[0]
        return (1 / n_samples) * np.sum(np.logaddexp(0, z) - y * z)

    def grad_log_loss(self, X, y):
        """Return the gradient of ``log_loss`` with respect to ``self.w``.

        Component ``j`` is ``mean((p - y) * X[:, j])``; all components are
        obtained at once as ``X.T . (p - y) / n_samples``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        residual = self.sigmoid(np.dot(X, self.w)) - y
        n_samples = X.shape[0]
        return (1 / n_samples) * np.dot(X.T, residual)

    def fit(self, X, y):
        """Fit the weights to ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Starts from zero weights and repeats ``w <- w - eta * gradient`` until
        the update norm is ``<= eps`` or ``max_iter`` steps have been taken.
        The result is stored in ``self.w``.
        """
        X = np.array(X)
        y = np.array(y)
        self.reset(X)
        weight_dist = np.inf
        iter_num = 0
        while (weight_dist > self.eps) and (iter_num < self.max_iter):
            # grad_log_loss reads self.w, so self.w must be updated on every
            # step; otherwise the gradient is always taken at the start point.
            w_new = self.w - self.eta * self.grad_log_loss(X, y)
            weight_dist = np.linalg.norm(w_new - self.w)
            self.w = w_new
            iter_num += 1

# Algorithm validation

In [4]:
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

### Let's generate a toy dataset for a binary classification task

In [5]:
# Draw 1000 samples of a two-class problem described by two informative features.
X, y = datasets.make_classification(
    n_samples=1000,
    n_classes=2,
    n_features=2,
    n_informative=2,
    n_redundant=0,
)
# Column of ones: LogRegression adds no bias itself, so the weight on this
# column plays the role of the intercept.
feature_ones = np.ones((X.shape[0], 1))
X = np.hstack((feature_ones, X))  # the constant feature becomes X[:, 0]

### Let's train the model and calculate the accuracy and ROC-AUC scores

In [6]:
# Train the from-scratch model with its default hyper-parameters.
logreg = LogRegression()
logreg.fit(X, y)

# Hard labels feed the accuracy metric; class-1 probabilities feed ROC-AUC.
y_pred = logreg.predict(X)
y_proba = logreg.predict_proba(X)

accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_proba)
print('Accuracy score: {}'.format(accuracy))
print('ROC-AUC score: {}'.format(roc_auc))

Accuracy score: 0.915
ROC-AUC score: 0.9595478381913527


### Let's compare it with the standard Scikit-learn Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
# Reference model: scikit-learn's implementation with default settings,
# trained and scored on the same data as the from-scratch model.
logreg_sklearn = LogisticRegression()
logreg_sklearn.fit(X, y)

# Hard labels for accuracy; column 1 of predict_proba for ROC-AUC.
y_pred = logreg_sklearn.predict(X)
class_probabilities = logreg_sklearn.predict_proba(X)
y_proba = class_probabilities[:, 1]

accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_proba)
print('Accuracy score: {}'.format(accuracy))
print('ROC-AUC score: {}'.format(roc_auc))

Accuracy score: 0.926
ROC-AUC score: 0.9602718410873643


### Pretty good!