# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
#import utils

In [3]:
%%capture 
%run nihil_ml_utils.ipynb

In [4]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    z may be a scalar or any array-like that NumPy can negate and
    exponentiate. In this notebook it is called with the per-row margins
    produced by calc_z and with a single dot product per row in
    GD.predict, so its length is not tied to the number of features.

    Returns a value (or container) of probabilities in [0, 1] with the
    same shape as z.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

In [5]:
def calc_z(w, x, y):
    """Return the signed margins y * (x @ w), one value per row of x.

    w is the weights vector (one entry per feature)
    x is the feature matrix (one row per object, one column per feature)
    y is the target vector (one entry per row of x)
    """
    # Use the argument `x`; the previous version read a global `X`, which
    # ignored the caller's data and failed when no such global existed.
    return y * np.dot(x, w)

In [6]:
class GD(object):
    """Logistic regression trained by gradient steps with an optional
    L2-style shrinkage term.

    Relies on the notebook-level helpers sigmoid and calc_z.
    The margin formula y * (X @ w) presumes targets encoded as -1 / +1
    (as in the dataset loaded in this notebook) - confirm for other data.
    """

    def __init__(self, k=0.1, c=0.0, eps=1.0e-5, max_iter=10000):
        """k is the step size, default=0.1
        c is the regularization coef, default=0.0 (no regularization)
        eps is the convergence threshold on the euclidean distance between
            consecutive weight vectors, default=1e-5
        max_iter is the maximum number of weight updates before fit gives
            up, default=10000"""
        self.k = k
        self.c = c
        self.eps = eps
        self.max_iter = max_iter
        # learned weights; stays None until fit() converges
        self.w = None

    def __repr__(self):
        return f"GD(k={self.k}, c={self.c}, eps={self.eps})"

    def update_weights(self, w, X, y):
        """Perform one pass of weight updates, modifying w in place.

        w is a mutable float vector of weights
        X is a DataFrame of features (columns are accessed with .iloc)
        y is the target vector
        Returns None.
        """
        # NOTE(review): w is updated coordinate by coordinate, so the margins
        # for coordinate i already see the new values of coordinates < i.
        # A textbook gradient step would compute every coordinate from the
        # same old w. Kept as-is so existing results stay reproducible.
        for i in range(len(w)):
            v = y * X.iloc[:, i] * (1 - sigmoid(calc_z(w, X, y)))
            w[i] = w[i] + self.k * v.mean() - self.k * self.c * w[i]

    def fit(self, X, y, init_weights=None):
        """Fit the weights on (X, y) and store them in self.w.

        X is a DataFrame of features, y is the target vector.
        init_weights is an optional starting vector with one entry per
        column of X; it is copied, so the caller's object is never mutated.
        Defaults to all zeros.

        Prints the number of iterations on convergence.
        Raises ValueError if init_weights has the wrong shape and
        RuntimeError if max_iter updates do not reach convergence.
        """
        n_features = X.shape[1]
        if init_weights is None:
            # `not array` is ambiguous for NumPy arrays, hence the explicit
            # None check
            w = np.zeros(n_features)
        else:
            # float copy: protects the caller's array and avoids integer
            # truncation during the in-place updates
            w = np.atleast_1d(np.array(init_weights, dtype=float))
            if w.ndim != 1 or w.shape[0] != n_features:
                got = w.shape[0] if w.ndim == 1 else w.shape
                raise ValueError(f'Bad init weight shape {got}. '
                                 f'Expected {n_features}')
        for num_iter in range(1, self.max_iter + 1):
            old_w = w.copy()
            self.update_weights(w, X, y)
            # check euclidean distance for convergence
            if np.linalg.norm(old_w - w) <= self.eps:
                print(f'Converged in {num_iter} iterations')
                break
        else:
            # loop exhausted without hitting the convergence break
            raise RuntimeError('Convergence failed')
        # store result
        self.w = w

    def predict(self, X):
        """Return sigmoid(X @ w) for every row of the DataFrame X.

        The result is a Series indexed like X.
        Raises RuntimeError if the model has not been fitted.
        """
        if self.w is None:
            raise RuntimeError('GD is not fitted yet; call fit() first')
        return sigmoid(X.dot(self.w))

In [7]:
# Load the CSV (no header row), then split it: column 0 is used as the
# target and columns 1 and 2 as the features.
# to_path is not defined in this cell; presumably it comes from the
# nihil_ml_utils.ipynb notebook run above - confirm.
data = pd.read_csv(to_path('data-logistic.csv'), header=None)
y = data[0]
X = data.loc[:, [1, 2]]
data

Unnamed: 0,0,1,2
0,-1,-0.663827,-0.138526
1,1,1.994596,2.468025
2,-1,-1.247395,0.749425
3,1,2.309374,1.899836
4,1,0.849143,2.407750
5,1,1.454271,-0.665416
6,1,2.254227,2.263786
7,-1,-0.067580,1.469141
8,-1,-0.861961,-0.824856
9,1,0.699179,2.032488


In [8]:
# Fit the model with the default settings (c=0.0, i.e. no regularization)
clf = GD()
clf.fit(X=X, y=y)
# Score the fitted model on the training data with ROC AUC
y_clf = clf.predict(X)
clf_score = roc_auc_score(y_true=y, y_score=y_clf)

Converged in 236 iterations


In [9]:
# Fit a second model with the regularization coefficient set to 10
clf_reg = GD(c=10.0)
clf_reg.fit(X=X, y=y)
# Score the regularized model on the training data with ROC AUC
y_clf_reg = clf_reg.predict(X)
clf_reg_score = roc_auc_score(y_true=y, y_score=y_clf_reg)

Converged in 6 iterations


In [15]:
# Report both ROC AUC scores on one line, rounded to three decimals
print(
    f'ROC AUC for simple clf - {clf_score:.3f},',
    f'regularized - {clf_reg_score:.3f}',
    sep=' ',
)

ROC AUC for simple clf - 0.927, regularized - 0.936
