# Logistic Regression

In [310]:
import numpy as np
import scipy.optimize
import sklearn.datasets
def load_iris_binary():
    """Return the iris dataset restricted to two classes.

    The dataset is loaded once; samples with target 0 (setosa) are dropped and
    the remaining target 2 (virginica) is relabelled as 0, so the returned
    labels are 1 for versicolor and 0 for virginica.

    Returns:
        D: feature matrix with one sample per column (features x samples).
        L: 1-D array of the corresponding 0/1 labels.
    """
    iris = sklearn.datasets.load_iris()  # load once instead of once per field
    D, L = iris['data'].T, iris['target']
    keep = L != 0  # mask of the non-setosa samples
    D = D[:, keep]
    L = L[keep]  # boolean indexing copies, so the relabelling below is local
    L[L == 2] = 0  # virginica (was label 2) becomes label 0
    return D, L

## Implementation of L-BFGS algorithm

I define an example function f(x)

In [311]:
def f(x):
    """Evaluate the example objective (y+3)^2 + sin(y) + (z+1)^2.

    `x` is an indexable pair whose first entry is y and second entry is z.
    """
    y, z = x[0], x[1]
    quadratic_y = (y+3)**2
    quadratic_z = (z+1)**2
    return quadratic_y + np.sin(y) + quadratic_z

Now I call `scipy.optimize.fmin_l_bfgs_b` to apply the L-BFGS algorithm. Passing `approx_grad=True` tells SciPy to estimate the gradient automatically with the finite-differences method.

In [312]:
# Minimise f starting from the origin; approx_grad=True lets SciPy estimate the gradient numerically.
# NOTE(review): the unpacking rebinds the name `f` (the objective defined above) to the
# minimum value, so this cell cannot be re-run without redefining f first.
x, f, d = scipy.optimize.fmin_l_bfgs_b(func=f, x0=np.zeros(2), approx_grad = True, iprint = 0)

In [313]:
# x is the estimated position of the minimum returned by fmin_l_bfgs_b.
print('Point of minimum: %s'%(x))

Point of minimum: [-2.57747138 -0.99999927]


In [314]:
# Here f is the objective value at the minimum (second item returned by fmin_l_bfgs_b).
print('Value of the minimum: %s'%(f))

Value of the minimum: -0.3561430123647649


In [315]:
# NOTE(review): d['funcalls'] is the number of objective evaluations, not iterations;
# SciPy reports the iteration count separately under d['nit'].
print('Number of iterations: %s'%(d['funcalls']))

Number of iterations: 21


By passing the exact (analytical) gradient explicitly instead of letting SciPy approximate it:

In [316]:
def f_grad(x):
    """Return the example objective and its analytical gradient at `x`.

    `x` is an indexable pair (y, z). The result is a tuple
    (value, gradient) where gradient is a NumPy array [df/dy, df/dz],
    which is the form fmin_l_bfgs_b expects when approx_grad is False.
    """
    y, z = x[0], x[1]
    value = (y+3)**2 + np.sin(y) + (z+1)**2
    # Partial derivatives of the expression above.
    d_dy = 2*(y+3) + np.cos(y)
    d_dz = 2*(z+1)
    gradient = np.array([d_dy, d_dz])
    return value, gradient

In [317]:
# With approx_grad=False the callable must return (value, gradient), which f_grad does.
# As before, the result is unpacked into x (minimiser), f (minimum value) and d (info dict).
x, f, d = scipy.optimize.fmin_l_bfgs_b(func=f_grad, x0=np.zeros(2), approx_grad = False, iprint = 0)

In [318]:
# Minimiser found when the analytical gradient is supplied.
print('Point of minimum: %s'%(x))

Point of minimum: [-2.57747137 -0.99999927]


In [319]:
# Objective value at that minimiser.
print('Value of the minimum: %s'%(f))

Value of the minimum: -0.3561430123647611


In [320]:
# NOTE(review): 'funcalls' counts calls to f_grad (function evaluations); iterations are in d['nit'].
print('Number of iterations: %s'%(d['funcalls']))

Number of iterations: 7


With this method the algorithm needs only 7 function evaluations (the `funcalls` value printed above) instead of 21, and it reaches the same minimum!

## Binary Logistic Regression

In [321]:
def split_db_2to1(D, L, seed=0):
    """Randomly split a dataset into 2/3 training and 1/3 validation samples.

    Args:
        D: data matrix with one sample per column.
        L: 1-D label array aligned with the columns of D.
        seed: value used to seed NumPy's global random generator, which
            makes the shuffle reproducible.

    Returns:
        ((DTR, LTR), (DTE, LTE)): training data/labels followed by
        validation data/labels.
    """
    n_samples = D.shape[1]
    nTrain = int(n_samples*2.0/3.0)  # size of the training part (rounded down)

    # Shuffle the column indices reproducibly.
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    # The first nTrain shuffled indices train, the rest validate.
    train_idx, test_idx = shuffled[:nTrain], shuffled[nTrain:]
    train_set = (D[:, train_idx], L[train_idx])
    test_set = (D[:, test_idx], L[test_idx])
    return train_set, test_set

In [322]:
# Load the two-class iris data (samples as columns) and its 0/1 labels.
D, L = load_iris_binary()
# Split with the default seed: 2/3 of the samples for training, the rest for evaluation.
(DTR, LTR), (DTE, LTE) = split_db_2to1(D, L)

In the dataset there are only samples belonging to class virginica and versicolor

We want to minimize the function $J(w,b) = \frac{\lambda}{2}\|w\|^2 + \frac{1}{n}\sum_{i=1}^{n} \log\left(1+e^{-z_i (w^T x_i + b)}\right)$ with 
 $ z_i =
  \begin{cases}
    1       & \quad \text{if } c_i = 1 \\
    -1  & \quad \text{if } c_i = 0
  \end{cases}
 $

In [323]:
class logRegClass:
    """Regularised binary logistic-regression objective bound to a training set.

    Attributes are set by the constructor:
        DTR: training data, one sample per column (features x samples).
        LTR: training labels, 0 or 1, one per column of DTR.
        l: regularisation coefficient lambda.
    """

    def __init__(self, DTR, LTR, l):
        self.DTR = DTR
        self.LTR = LTR
        self.l = l

    def __compute_zi(self, ci):
        """Map class labels {0, 1} to signs {-1, +1}; works on scalars and arrays."""
        return 2*ci-1

    def logreg_obj(self, v):
        """Evaluate J(w, b) = lambda/2 * ||w||^2 + mean(log(1 + exp(-z_i (w^T x_i + b)))).

        Args:
            v: 1-D array holding the weights w followed by the bias b as the
                last entry.

        Returns:
            The objective value as a scalar.
        """
        w, b = v[0:-1], v[-1]
        # Scores of all training samples at once: w^T x_i + b for every column.
        scores = np.dot(w, self.DTR) + b
        z = self.__compute_zi(np.asarray(self.LTR))
        # logaddexp(0, a) = log(1 + e^a), evaluated without overflow.
        avg_loss = np.logaddexp(0, -z * scores).mean()
        # Use the instance's own lambda rather than a module-level global.
        return self.l / 2 * np.dot(w, w) + avg_loss
        

In [324]:
# NOTE(review): this rebinds D, which previously held the data matrix returned by
# load_iris_binary(); the value 4 is not used anywhere in this cell.
D = 4 # dimensionality of the feature space
l = 10**-6 # regularisation coefficient lambda
logRegObj = logRegClass(DTR, LTR, l)
# The starting point has one entry per feature (the weights w) plus one for the bias b;
# the gradient is estimated numerically because logreg_obj only returns the objective value.
x, f, d = scipy.optimize.fmin_l_bfgs_b(func=logRegObj.logreg_obj, 
                                       x0=np.zeros(DTR.shape[0]+1), 
                                       approx_grad = True, 
                                       iprint = 0)

In [325]:
# x holds the fitted parameters: the weights w followed by the bias b as the last entry.
print('Point of minimum: %s'%(x))

Point of minimum: [ 14.87051718  -9.39021566 -29.49339455 -88.83860813 231.12771345]


In [326]:
# Value of the logistic-regression objective J at the returned parameters.
print('Value of the minimum: %s'%(f))

Value of the minimum: [0.0075415]


In [327]:
# NOTE(review): 'funcalls' is the number of objective evaluations (inflated here by the
# finite-difference gradient); the number of L-BFGS iterations is d['nit'].
print('Number of iterations: %s'%(d['funcalls']))

Number of iterations: 372


Now we can compute the predictions according to the model parameters w,b that we have obtained:

In [328]:
# Split the optimiser result into the weights and the bias (last entry).
w, b = x[0:-1], x[-1]
# Score every evaluation sample at once: s_i = w^T x_i + b for each column of DTE.
# This avoids storing a 1-element array into a scalar slot, which NumPy >= 1.25 deprecates.
S = np.dot(w, DTE) + b
# A positive score predicts class 1, otherwise class 0.
LP = S > 0
acc = np.mean(LP == LTE)
err = 1 - acc
print('Error rate with lambda=%f is: %s%%'%(l,err*100))

Error rate with lambda=0.000001 is: 11.764705882352944%


## Multiclass Logistic Regression