In [1]:
import sys
sys.path.append('../../pyutils')

import numpy as np
import scipy.linalg
import torch

import metrics
import utils
from sklearn.linear_model import LogisticRegression

np.random.seed(12)

# Binary Logistic Regression

Let $X$ be the training input matrix of size $n \times p$.  
It contains $n$ examples, each with $p$ features.  
Let $y$ be the training target vector of size $n$.  
Each input $X_i$, a vector of size $p$, is associated with its target $y_i$, which is $0$ or $1$.  
Logistic regression fits a linear model to predict the target $y$ of a new input vector $x$.

The predictions of the model are denoted $\hat{y}$.
$$o_i = X_i\beta = \sum_{j=1}^{p} X_{ij}\beta_j$$
$$P(y_i = 1 | X_i) = \hat{y_i} = \sigma(o_i)$$
$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

## Cross Entropy

The cost function is the cross-entropy.  
$$J(\beta) = - \sum_{i=1}^n \left( y_i \log(\hat{y_i}) + (1 - y_i) \log(1 - \hat{y_i}) \right)$$

$$\frac{\partial J(\beta)}{\partial \hat{y_i}} = \frac{\hat{y_i} - y_i}{\hat{y_i}(1 - \hat{y_i})}$$
$$\frac{\partial J(\beta)}{\partial \hat{y}} = \frac{\hat{y} - y}{\hat{y}(1 - \hat{y})}$$

In [2]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The value is computed in a numerically stable way: the exponential is
    only ever taken of a non-positive number, so logits of large magnitude
    saturate to 0 or 1 without raising an overflow warning.

    Args:
        x: scalar or array-like of real-valued logits.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``. Scalar input
        yields a NumPy scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can never overflow.
    e = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + exp(-x)); for x < 0 use the algebraically
    # identical exp(x) / (1 + exp(x)).
    out = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Unwrap 0-d results so scalar callers still get a scalar back.
    return out if out.ndim else out[()]

# Random logits first, random binary labels second: the global RNG stream is
# consumed in the same order as before.
y_out = np.random.randn(13).astype(np.float32)
y_true = np.random.randint(0, 2, (13)).astype(np.float32)
y_pred = sigmoid(y_out)

# NumPy reference: summed binary cross-entropy evaluated on probabilities.
log_lik = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
j = -np.sum(log_lik)

# Same loss through PyTorch. ty_pred requires grad, so backward() stores
# dJ/dy_pred in ty_pred.grad for the next cell to read.
ty_pred = torch.tensor(y_pred, requires_grad=True)
ty_true = torch.tensor(y_true, requires_grad=False)
criterion = torch.nn.BCELoss(reduction='sum')
tj = criterion(ty_pred, ty_true)
tj.backward()

# metrics.tdist is a project helper; presumably a distance between the two
# values (TODO confirm against pyutils/metrics).
tj_np = tj.data.numpy()
print(j)
print(tj_np)
print(metrics.tdist(j, tj_np))

10.678722
10.678722
0.0


In [3]:
# Analytic dJ/dy_pred of the summed binary cross-entropy:
# (y_pred - y) / (y_pred * (1 - y_pred)).
residual = y_pred - y_true
denominator = y_pred * (1 - y_pred)
dy_pred = residual / denominator

# Gradient that autograd stored on ty_pred during the previous cell's backward().
tdy_pred_sol = ty_pred.grad.data.numpy()

print(dy_pred)
print(tdy_pred_sol)
print(metrics.tdist(dy_pred, tdy_pred_sol))

[-1.6231388 -2.9766939  2.274354  -6.4779763 -1.4708843  1.2155157
 -1.9948862  1.8867183  1.4462028 18.669147   1.5500078 -1.6234685
 -1.3342199]
[-1.6231389 -2.976694   2.274354  -6.477976  -1.4708843  1.2155157
 -1.9948862  1.8867184  1.4462028 18.669147   1.5500077 -1.6234685
 -1.3342199]
5.717077e-07


$$\frac{\partial J(\beta)}{\partial o_i} = \hat{y_i} - y_i$$
$$\frac{\partial J(\beta)}{\partial o} = \hat{y} - y$$

In [4]:
# Fresh random logits and labels (logits drawn first, labels second, as before).
y_out = np.random.randn(13).astype(np.float32)
y_true = np.random.randint(0, 2, (13)).astype(np.float32)
y_pred = sigmoid(y_out)

# NumPy reference: summed binary cross-entropy.
log_lik = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
j = -np.sum(log_lik)

# PyTorch version working directly on the logits, so that backward() yields
# dJ/do in ty_out.grad.
ty_out = torch.tensor(y_out, requires_grad=True)
ty_true = torch.tensor(y_true, requires_grad=False)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
tj = criterion(ty_out, ty_true)
tj.backward()

tj_np = tj.data.numpy()
print(j)
print(tj_np)
print(metrics.tdist(j, tj_np))

10.849605
10.849605
0.0


In [5]:
# Analytic gradient w.r.t. the logits: dJ/do = y_pred - y.
dy_out = np.subtract(y_pred, y_true)

# Gradient that autograd stored on ty_out in the previous cell.
dy_out_sol = ty_out.grad.data.numpy()

print(dy_out)
print(dy_out_sol)
print(metrics.tdist(dy_out, dy_out_sol))

[-0.7712122   0.5310385  -0.7378207  -0.13447696  0.20648097  0.28622478
 -0.7465389   0.5608791   0.53383535 -0.75912154 -0.4418677   0.6848638
  0.35961235]
[-0.7712122   0.5310385  -0.7378207  -0.13447696  0.20648097  0.28622478
 -0.7465389   0.5608791   0.53383535 -0.75912154 -0.4418677   0.6848638
  0.35961235]
0.0


The model can be trained with gradient descent, using the gradient $\frac{\partial J(\beta)}{\partial \beta} = X^T (\hat{y} - y)$.

In [6]:
def log_reg_sk(X, y):
    """Fit scikit-learn's LogisticRegression without an intercept.

    Args:
        X: training inputs, passed straight to ``LogisticRegression.fit``.
        y: training targets (class labels), passed straight to ``fit``.

    Returns:
        The fitted estimator's ``coef_`` array.

    NOTE(review): apart from ``fit_intercept=False`` the estimator uses
    scikit-learn's defaults, which presumably include regularisation, so the
    result need not match an unregularised fit exactly -- confirm.
    """
    classifier = LogisticRegression(fit_intercept=False).fit(X, y)
    return classifier.coef_

def get_error(X, y, w):
    """Summed binary cross-entropy of the linear model ``w`` on ``(X, y)``.

    Args:
        X: input matrix; ``X @ w`` gives one logit per example.
        y: targets, combined element-wise with the predicted probabilities.
        w: weight vector.

    Returns:
        ``-sum(y * log(p) + (1 - y) * log(1 - p))`` with ``p = sigmoid(X @ w)``.
    """
    probs = sigmoid(X @ w)
    log_likelihood = y * np.log(probs) + (1 - y) * np.log(1 - probs)
    return -np.sum(log_likelihood)

def log_reg(X, y, n_epochs=10000, lr=0.001, log_every=100):
    """Fit binary logistic-regression weights by full-batch gradient descent.

    The weights start from a standard-normal draw (global NumPy RNG) and are
    updated with the gradient of the summed cross-entropy,
    ``X.T @ (sigmoid(X @ w) - y)``.

    Args:
        X: 2-D input matrix of shape (n_samples, n_features).
        y: targets, one per row of ``X``.
        n_epochs: number of gradient steps (default 10000).
        lr: learning rate applied to each step (default 0.001).
        log_every: print the current loss every ``log_every`` epochs; pass
            ``0`` or ``None`` to disable printing (default 100).

    Returns:
        The learned weight vector of length ``X.shape[1]``.
    """
    w = np.random.randn(X.shape[1])

    for epoch in range(n_epochs):
        # dJ/do = y_pred - y, back-propagated through o = X @ w.
        y_pred = sigmoid(X @ w)
        dy_out = y_pred - y
        dw = X.T @ dy_out

        w -= lr * dw

        # The label says "SGD" but every step uses the whole batch.
        if log_every and epoch % log_every == 0:
            err = get_error(X, y, w)
            print('SGD Error = {}'.format(err))

    return w
    


# Synthetic binary problem: 73 examples, 4 features, random labels.
# X is drawn before y so the global RNG stream is consumed as before.
n_samples, n_features = 73, 4
X = np.random.randn(n_samples, n_features).astype(np.float32)
y = np.random.randint(0, 2, n_samples).astype(np.float32)

# For a binary fit, coef_ holds a single row; take it as the weight vector.
w1 = log_reg_sk(X, y)[0]
w2 = log_reg(X, y)

sk_error = get_error(X, y, w1)
gd_error = get_error(X, y, w2)
print('SK Error = {}'.format(sk_error))
print('SGD Error = {}'.format(gd_error))
print(w1)
print(w2)

SGD Error = 71.14744133609668
SGD Error = 49.65028785288255
SGD Error = 48.91772028291884
SGD Error = 48.888462052036814
SGD Error = 48.88680421514018
SGD Error = 48.88669058552164
SGD Error = 48.88668168135676
SGD Error = 48.886680916022215
SGD Error = 48.88668084643879
SGD Error = 48.88668083991474
SGD Error = 48.886680839293305
SGD Error = 48.88668083923365
SGD Error = 48.8866808392279
SGD Error = 48.886680839227346
SGD Error = 48.88668083922729
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922729
SGD Error = 48.88668083922729
SGD Error = 48.88668083922729
SGD Error = 48.88668083922729
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922729
SGD Error = 48.88668083922728
SGD Error = 48.88668083922729
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Err



SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error = 48.88668083922728
SGD Error 

## Multiclass Logistic Regression

In [7]:
def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (which would turn the whole row into NaN) for large logits.

    Args:
        x: array of shape (n_samples, n_classes).

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # After the shift the largest entry of each row is exactly 0.
    shifted = x - np.max(x, axis=1, keepdims=True)
    x_e = np.exp(shifted)
    return x_e / np.sum(x_e, axis=1, keepdims=True)

In [8]:
# Random logits for 93 examples and 4 classes, then one random one-hot target
# per row (logits are drawn before the labels, as before).
y_out = np.random.randn(93, 4).astype(np.float32)
y_true = np.zeros((93, 4), dtype=np.float32)
n_classes = y_true.shape[1]
for i in range(len(y_true)):
    y_true[i, np.random.randint(0, n_classes)] = 1
y_pred = softmax(y_out)

# NumPy reference: summed categorical cross-entropy.
j = -np.sum(y_true * np.log(y_pred))

# CrossEntropyLoss takes class indices, so the one-hot rows are collapsed
# with argmax before being handed to it.
ty_true = torch.argmax(torch.tensor(y_true, requires_grad=False), dim=1)
ty_out = torch.tensor(y_out, requires_grad=True)

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
tj = criterion(ty_out, ty_true)
tj.backward()

tj_np = tj.data.numpy()
print(j)
print(tj_np)
print(metrics.tdist(j, tj_np))

148.84998
148.85
3.0517578e-05


In [9]:
# Small 7x4 problem: random logits, then one random one-hot target per row.
y_out = np.random.randn(7, 4).astype(np.float32)
y_true = np.zeros((7, 4), dtype=np.float32)
n_classes = y_true.shape[1]
for i in range(len(y_true)):
    y_true[i, np.random.randint(0, n_classes)] = 1
y_pred = softmax(y_out)

# NumPy reference: summed categorical cross-entropy.
j = -np.sum(y_true * np.log(y_pred))

# The same formula written with torch ops on the probabilities, so that
# backward() stores dJ/dy_pred in ty_pred.grad.
ty_true = torch.tensor(y_true, requires_grad=False)
ty_pred = torch.tensor(y_pred, requires_grad=True)
tj = -torch.sum(ty_true * torch.log(ty_pred))
tj.backward()

tj_np = tj.data.numpy()
print(j)
print(tj_np)
print(metrics.tdist(j, tj_np))

14.296462
14.296461
9.536743e-07


In [10]:
# Analytic dJ/dy_pred of the summed cross-entropy: -y / y_pred.
# The numerator is negated first, matching the original operator precedence.
dy_pred = np.negative(y_true) / y_pred

# Gradient that autograd stored on ty_pred in the previous cell.
dy_pred_sol = ty_pred.grad.data.numpy()

print(dy_pred)
print(dy_pred_sol)
print(metrics.tdist(dy_pred, dy_pred_sol))

[[ -0.        -10.283339   -0.         -0.       ]
 [-10.58094    -0.         -0.         -0.       ]
 [ -0.         -0.         -2.7528124  -0.       ]
 [-46.90987    -0.         -0.         -0.       ]
 [ -0.         -0.         -1.3170731  -0.       ]
 [ -7.9531765  -0.         -0.         -0.       ]
 [ -0.        -10.990683   -0.         -0.       ]]
[[ -0.        -10.283339   -0.         -0.       ]
 [-10.58094    -0.         -0.         -0.       ]
 [ -0.         -0.         -2.7528124  -0.       ]
 [-46.90987    -0.         -0.         -0.       ]
 [ -0.         -0.         -1.3170731  -0.       ]
 [ -7.9531765  -0.         -0.         -0.       ]
 [ -0.        -10.990683   -0.         -0.       ]]
0.0


$$\frac{\partial J(\beta)}{\partial o_{ij}} = \hat{y_{ij}} - y_{ij}$$
$$\frac{\partial J(\beta)}{\partial o} = \hat{y} - y$$

In [11]:
# Small 7x4 problem: random logits, then one random one-hot target per row.
y_out = np.random.randn(7, 4).astype(np.float32)
y_true = np.zeros((7, 4), dtype=np.float32)
n_classes = y_true.shape[1]
for i in range(len(y_true)):
    y_true[i, np.random.randint(0, n_classes)] = 1
y_pred = softmax(y_out)

# NumPy reference: summed categorical cross-entropy.
j = -np.sum(y_true * np.log(y_pred))

# CrossEntropyLoss works on the logits and class indices; backward() then
# stores dJ/do in ty_out.grad for the next cell.
ty_true = torch.argmax(torch.tensor(y_true, requires_grad=False), dim=1)
ty_out = torch.tensor(y_out, requires_grad=True)

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
tj = criterion(ty_out, ty_true)
tj.backward()

tj_np = tj.data.numpy()
print(j)
print(tj_np)
print(metrics.tdist(j, tj_np))

12.387552
12.387553
9.536743e-07


In [12]:
# Analytic gradient w.r.t. the logits: dJ/do = y_pred - y.
dy_out = np.subtract(y_pred, y_true)

# Gradient that autograd stored on ty_out in the previous cell.
dy_out_sol = ty_out.grad.data.numpy()

print(dy_out)
print(dy_out_sol)
print(metrics.tdist(dy_out, dy_out_sol))

[[-0.71088123  0.25399554  0.31700996  0.13987577]
 [ 0.02140404  0.3097546   0.29681578 -0.6279745 ]
 [ 0.60384715  0.03253903  0.0066169  -0.6430031 ]
 [ 0.22169167 -0.88766754  0.03120301  0.63477284]
 [ 0.05100057 -0.38170385  0.10363309  0.22707026]
 [ 0.02778155  0.6928965  -0.8194856   0.09880757]
 [ 0.03780703  0.9247614   0.02876937 -0.99133784]]
[[-0.71088123  0.2539955   0.31700993  0.13987575]
 [ 0.02140405  0.30975467  0.29681584 -0.6279744 ]
 [ 0.60384715  0.03253903  0.0066169  -0.6430031 ]
 [ 0.22169165 -0.88766754  0.03120301  0.6347728 ]
 [ 0.05100057 -0.38170385  0.10363309  0.22707026]
 [ 0.02778155  0.6928965  -0.8194856   0.09880759]
 [ 0.03780702  0.9247613   0.02876936 -0.99133784]]
2.0499465e-07


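The multiclass model can also be trained with gradient descent, using the gradient $\frac{\partial J(\beta)}{\partial \beta} = X^T (\hat{y} - y)$.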

In [13]:
def get_error_multi(X, y, w):
    """Summed categorical cross-entropy of the linear model ``w`` on ``(X, y)``.

    Args:
        X: input matrix; ``X @ w`` gives one row of logits per example.
        y: targets with the same shape as ``softmax(X @ w)`` (the callers in
            this notebook pass one-hot rows).
        w: weight matrix.

    Returns:
        ``-sum(y * log(softmax(X @ w)))``.
    """
    probs = softmax(X @ w)
    return -np.sum(y * np.log(probs))


def multilog_reg(X, y, n_epochs=10000, lr=0.001, log_every=100):
    """Fit multiclass logistic-regression weights by full-batch gradient descent.

    The weights start from a standard-normal draw (global NumPy RNG) and are
    updated with the gradient of the summed cross-entropy,
    ``X.T @ (softmax(X @ w) - y)``.

    Args:
        X: 2-D input matrix of shape (n_samples, n_features).
        y: 2-D target matrix of shape (n_samples, n_classes); the caller in
            this notebook passes one-hot rows.
        n_epochs: number of gradient steps (default 10000).
        lr: learning rate applied to each step (default 0.001).
        log_every: print the current loss every ``log_every`` epochs; pass
            ``0`` or ``None`` to disable printing (default 100).

    Returns:
        The learned weight matrix of shape (n_features, n_classes).
    """
    w = np.random.randn(X.shape[1], y.shape[1])

    for epoch in range(n_epochs):
        # dJ/do = y_pred - y, back-propagated through o = X @ w.
        y_pred = softmax(X @ w)
        dy_out = y_pred - y
        dw = X.T @ dy_out

        w -= lr * dw

        # The label says "SGD" but every step uses the whole batch.
        if log_every and epoch % log_every == 0:
            err = get_error_multi(X, y, w)
            print('SGD Error = {}'.format(err))

    return w

    
# Synthetic 4-class problem: 93 examples, 4 features, one random one-hot
# target per row. sklearn is given the class indices instead.
X = np.random.randn(93, 4).astype(np.float32)
y_true = np.zeros((93, 4)).astype(np.float32)
for i in range(y_true.shape[0]):
    y_true[i][np.random.randint(0, y_true.shape[1])] = 1
y_true_sk = np.argmax(y_true, axis=1)

# scikit-learn lays coef_ out as (n_classes, n_features), whereas
# get_error_multi and multilog_reg use (n_features, n_classes) weights.
# Both dimensions are 4 here, so a missing transpose would not raise; it
# would silently score the wrong model.
w1 = log_reg_sk(X, y_true_sk).T
w2 = multilog_reg(X, y_true)
print('SK Error = {}'.format(get_error_multi(X, y_true, w1)))
print('SGD Error = {}'.format(get_error_multi(X, y_true, w2)))
print(w1)
print(w2)

SGD Error = 264.5967568728954
SGD Error = 124.52928999771657
SGD Error = 120.69338069535253
SGD Error = 120.60511291188504
SGD Error = 120.60208822782775
SGD Error = 120.60195961583351
SGD Error = 120.60195360857097
SGD Error = 120.60195331813674
SGD Error = 120.60195330392729
SGD Error = 120.60195330322918
SGD Error = 120.60195330319483
SGD Error = 120.60195330319314
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error



SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Error = 120.60195330319306
SGD Erro