In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize

# Data loading and cleaning

In [2]:
# Print the dataset description; the context manager closes the file handle
# as soon as the text has been read (the original left it open).
with open("bank-note/data-desc.txt", "r") as f:
    print(f.read())

https://archive.ics.uci.edu/ml/datasets/banknote+authentication

Data Set Information:

Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x 400 pixels. Due to the object lens and distance to the investigated object gray-scale pictures with a resolution of about 660 dpi were gained. Wavelet Transform tool were used to extract features from images.


We use 4 attributions (the first 4 columns)

1. variance of Wavelet Transformed image (continuous) 
2. skewness of Wavelet Transformed image (continuous) 
3. curtosis of Wavelet Transformed image (continuous) 
4. entropy of image (continuous) 

The label is the last column: genuine or forged





In [3]:
# The CSV is read with explicit column names: four features followed by the class label.
df = pd.read_csv(
    "bank-note/train.csv",
    names=['variance', 'skewness', 'curtosis', 'entropy', 'label'],
)
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,label
0,3.8481,10.1539,-3.8561,-4.2228,0
1,4.0047,0.45937,1.3621,1.6181,0
2,-0.048008,-1.6037,8.4756,0.75558,0
3,-1.2667,2.8183,-2.426,-1.8862,1
4,2.2034,5.9947,0.53009,0.84998,0


In [4]:
# Read the training CSV as a float matrix, then prepend a column of ones so
# that the first weight acts as the bias term.
data_t = np.genfromtxt("bank-note/train.csv", delimiter=',', dtype=float)
data_t = np.hstack(
    (np.ones((data_t.shape[0], 1)), data_t)
)
data_t.shape

(872, 6)

In [5]:
# Last column is the class label (cast to int); everything before it
# (bias column + features) is the design matrix.
label_train = data_t[:, -1].astype(int)
data_train = data_t[:, :-1]
data_train.shape, label_train.shape

((872, 5), (872,))

In [6]:
# Read the test CSV as a float matrix and prepend the same bias column of
# ones that is added to the training matrix.
data_test = np.genfromtxt("bank-note/test.csv", delimiter=',', dtype=float)
data_test = np.hstack(
    (np.ones((data_test.shape[0], 1)), data_test)
)
data_test.shape

(500, 6)

In [7]:
# Last column is the class label (cast to int); everything before it
# (bias column + features) is the test design matrix.
label = data_test[:, -1].astype(int)
test = data_test[:, :-1]
test.shape, label.shape

((500, 5), (500,))

# Defining functions 

In [8]:
rg = 1 #prior scale

In [9]:
#sigmoid = lambda z : 1/(1+np.exp(-z)) 
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input score(s). Python scalars and lists are accepted as well as
        NumPy arrays (the input is converted with ``np.asarray``).

    Returns
    -------
    numpy.ndarray
        Values in [0, 1] with the same shape as ``z`` (0-d for scalar input).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow for large |z|.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^{-z});   z < 0: e^{z} / (1 + e^{z}).
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    

def neg_log_likelihood(w, data, label, rg = rg):
    """Negative log-likelihood of logistic regression plus an L2 penalty.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Weight vector.
    data : ndarray, shape (n, d)
        Design matrix; the column count must match ``w``.
    label : ndarray, shape (n,)
        Class labels. Rows whose label equals 1 are the positive class;
        every other value is treated as the negative class.
    rg : float
        Weight of the penalty ``0.5 * rg * w.w``. The default is whatever
        the module-level ``rg`` held when this function was defined.

    Returns
    -------
    float
        ``sum_i -log p(label_i | x_i, w) + 0.5 * rg * ||w||^2``.
    """
    score = data @ w
    # Flip the score of negative-class rows so every row contributes
    # -log(sigmoid(signed_score)).
    signed_score = np.where(label == 1, score, -score)
    # -log(sigmoid(t)) == log(1 + exp(-t)) == logaddexp(0, -t). This is exact
    # and finite for any t, so no epsilon is needed inside the log, and the
    # value is the exact antiderivative of the analytic gradient (p - y) @ X.
    nll = np.logaddexp(0.0, -signed_score).sum()
    return nll + .5 * rg * np.dot(w, w)


def grad_neg_log_likelihood(w, data, label, rg = 1):
    """Gradient of the L2-penalised logistic negative log-likelihood.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Weight vector.
    data : ndarray, shape (n, d)
        Design matrix.
    label : ndarray, shape (n,)
        Class labels; entries equal to 1 are positive, all others negative.
    rg : float
        Weight of the L2 penalty; contributes ``rg * w`` to the gradient.

    Returns
    -------
    ndarray, shape (d,)
        ``sum_i (sigmoid(x_i.w) - y_i) x_i + rg * w``.
    """
    # Evaluate the sigmoid once; the original evaluated it twice per call.
    prob = sigmoid(data @ w)
    target = (label == 1).astype(int)
    # y*(1-p) - (1-y)*p collapses to (y - p), so the negated data term is
    # simply (p - y) @ X.
    return (prob - target) @ data + rg * w

def Hessian_(w, data, label, rg = rg):
    """Hessian of the L2-penalised logistic negative log-likelihood.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Weight vector.
    data : ndarray, shape (n, d)
        Design matrix.
    label : ndarray
        Accepted for signature parity with the loss and gradient; the
        logistic Hessian does not depend on the labels, so it is not read.
    rg : float
        Weight of the L2 penalty; contributes ``rg * I``. The default is
        whatever the module-level ``rg`` held when this function was defined.

    Returns
    -------
    ndarray, shape (d, d)
        ``X^T diag(p * (1 - p)) X + rg * I`` with ``p = sigmoid(X w)``.
    """
    # Evaluate the sigmoid once; the original evaluated it twice per call.
    prob = sigmoid(data @ w)
    curvature = prob * (1 - prob)
    # Scale each row of X by its curvature before forming X^T (.) X.
    weighted = curvature.reshape(-1, 1) * data
    return data.T @ weighted + rg * np.eye(data.shape[1])

def accuracy(w, data, label, h = None):
    """Fraction of rows whose thresholded prediction matches ``label``.

    ``h`` maps scores ``data @ w`` to probabilities; when it is omitted the
    notebook's ``sigmoid`` is used. A probability of at least 0.5 is
    predicted as class 1, anything lower as class 0.
    """
    link = sigmoid if h is None else h
    probabilities = link(data @ w)
    predicted = (probabilities >= 0.5).astype(int)
    hits = predicted == label
    return hits.mean()

In [10]:
rg = 1


def grad(w):
    """Gradient of the penalised logistic loss on the training set."""
    return grad_neg_log_likelihood(w, data_train, label_train, rg = rg)


def loss(w):
    """Penalised logistic loss on the training set."""
    return neg_log_likelihood(w, data_train, label_train, rg = rg)


def Hessian(w):
    """Hessian of the penalised logistic loss on the training set."""
    return Hessian_(w, data_train, label_train, rg = rg)

In [11]:
def num_grad(w, loss_function = loss):
    """Central finite-difference estimate of the gradient of ``loss_function``.

    Parameters
    ----------
    w : array-like, shape (d,)
        Point at which the gradient is estimated.
    loss_function : callable
        Maps a weight vector to a scalar loss.

    Returns
    -------
    ndarray, shape (d,)
        One estimate per component of ``w``. The original looped over
        ``range(4)`` and therefore silently dropped the last component of
        the 5-dimensional weight vectors used in this notebook.
    """
    # Work on a float copy so an integer-typed ``w`` cannot truncate the step.
    w = np.asarray(w, dtype=float)
    step = 1e-6
    estimates = []
    for i in range(w.shape[0]):
        lower = w.copy()
        upper = w.copy()
        lower[i] -= step
        upper[i] += step
        estimates.append((loss_function(upper) - loss_function(lower)) / (2 * step))
    return np.array(estimates)

In [12]:
def num_Hessian(w, grad_function = grad):
    """Central finite-difference estimate of the Hessian from a gradient.

    Row ``i`` of the result is the change of ``grad_function`` when component
    ``i`` of ``w`` is moved by +/- 1e-6, divided by 2e-6.
    """
    rows = []
    for idx in range(w.shape[0]):
        minus, plus = w.copy(), w.copy()
        minus[idx] -= 1e-6
        plus[idx] += 1e-6
        difference = grad_function(plus) - grad_function(minus)
        rows.append(difference / 2e-6)
    return np.array(rows)

In [13]:
def Newton_Raphson(w, grad=grad, Hessian = Hessian, max_it = 100, tolerance = 1e-5, unit_disck = False):
    """Minimise a function with Newton-Raphson iterations.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Starting point. It is not modified in place.
    grad : callable
        Maps a weight vector to the gradient, shape (d,).
    Hessian : callable
        Maps a weight vector to the Hessian, shape (d, d).
    max_it : int
        Upper bound on the number of iterations (at least one is always run).
    tolerance : float
        Stop once the Euclidean distance between consecutive iterates drops
        below this value.
    unit_disck : bool
        When true, any iterate whose norm exceeds 1 is rescaled to norm 1
        after each step. (Parameter spelling kept for existing callers.)

    Returns
    -------
    (ndarray, int)
        The final iterate and the number of iterations performed.
    """
    jitter = 1e-10  # small diagonal shift added to the Hessian before solving
    identity = np.eye(w.shape[0])
    n_iter = 0
    while True:
        # ``w`` is rebound below, never mutated, so no copy is required.
        w_prev = w
        # Solve (H + jitter*I) step = g directly instead of forming the
        # explicit inverse: same Newton step, cheaper and better conditioned.
        step = np.linalg.solve(Hessian(w) + jitter * identity, grad(w))
        w = w - step

        if unit_disck:
            w_norm = np.linalg.norm(w)
            if w_norm > 1:
                w = w / w_norm

        n_iter += 1
        if n_iter >= max_it or np.linalg.norm(w_prev - w) < tolerance:
            return w, n_iter

## Check that the Hessian is implemented correctly

In [14]:
w_0 = np.random.normal(loc=0.0, scale=1.0, size=5)
num_Hessian(w_0, grad_function = grad)

array([[  36.06574604,   -2.12106636,   -2.54285214,   52.5418622 ,
         -25.61763969],
       [  -2.12106646,  219.12390355, -239.16958253,  125.36382314,
          89.32705785],
       [  -2.54285189, -239.16958207,  319.0960042 , -224.73712971,
         -98.11838768],
       [  52.5418622 ,  125.36382314, -224.73712943,  443.83065335,
          19.71979327],
       [ -25.61763969,   89.32705782,  -98.11838754,   19.71979333,
         104.40391429]])

In [15]:
Hessian(w_0)

array([[  36.06574608,   -2.12106646,   -2.54285188,   52.54186219,
         -25.61763969],
       [  -2.12106646,  219.12390339, -239.1695821 ,  125.36382315,
          89.32705789],
       [  -2.54285188, -239.1695821 ,  319.09600461, -224.73712973,
         -98.11838764],
       [  52.54186219,  125.36382315, -224.73712973,  443.83065318,
          19.71979332],
       [ -25.61763969,   89.32705789,  -98.11838764,   19.71979332,
         104.40391434]])

# Training using  Newton Raphson Method

## 1. set all the weights to be zero

In [16]:
w = np.zeros(5)
w_MAP, c_ = Newton_Raphson(w.copy())
w_MAP, c_

(array([ 2.85594015, -2.69321743, -1.59105632, -1.89926156, -0.17689773]), 10)

In [17]:
accuracy(w_MAP, data_train, label_train), accuracy(w_MAP, test, label)

(0.9908256880733946, 0.99)

In [18]:
loss(w), loss(w_MAP)

(604.4243412738722, 34.900646722038424)

## 2. random initialization

In [19]:
# NOTE(review): the seed below is commented out, so the starting point, and
# therefore the result of this cell, changes on every run.
#np.random.seed(19)
# Standard-normal starting point (not scaled down), no unit-ball projection.
w = 1*np.random.normal(loc=0.0, scale=1.0, size=5)
w_MAP, c_ = Newton_Raphson(w.copy())
w_MAP, c_

(array([ -112.00280204,  -947.3944766 , -3283.84446633,  1427.65680292,
          613.46207925]),
 100)

In [20]:
accuracy(w_MAP, data_train, label_train), accuracy(w_MAP, test, label)

(0.6364678899082569, 0.634)

### 3. Initialize w randomly near zero

In [21]:
w = 1e-5*np.random.normal(loc=0.0, scale=1.0, size=5)
w_MAP, c_ = Newton_Raphson(w.copy(), unit_disck = True)
w_MAP, c_

(array([ 0.81996897, -0.43668681, -0.22649189, -0.29186132,  0.02176976]), 11)

In [22]:
accuracy(w_MAP, data_train, label_train), accuracy(w_MAP, test, label)

(0.9827981651376146, 0.984)

### 4. Finding $w$ only in Unit Ball!!

In [23]:
w = np.random.normal(loc=0.0, scale=1.0, size=5)
w_MAP, c_ = Newton_Raphson(w.copy(), unit_disck = True)
w_MAP, c_

(array([ 0.81996649, -0.43668688, -0.22649644, -0.29186447,  0.02177198]), 16)

In [24]:
accuracy(w_MAP, data_train, label_train), accuracy(w_MAP, test, label)

(0.9827981651376146, 0.984)

# Training using scipy.optimize

In [25]:
w = 1e-5* np.random.normal(loc=0.0, scale=1.0, size=5)
r = minimize(loss, w, method = 'L-BFGS-B', jac= grad, options={'disp': False, 'maxiter': 100}, tol= 1e-5)

In [26]:
r.x

array([ 2.85591159, -2.69301687, -1.59109426, -1.89922607, -0.17701738])

In [27]:
accuracy(r.x, data_train, label_train), accuracy(r.x, test, label)

(0.9908256880733946, 0.99)

In [28]:
w_0 = 1e-5*np.random.normal(loc=0.0, scale=1.0, size=5)
model =  minimize(loss, w_0, jac= grad, method='BFGS', tol= 1e-5, options={'disp': True, 'maxiter': 100})

Optimization terminated successfully.
         Current function value: 34.900647
         Iterations: 34
         Function evaluations: 37
         Gradient evaluations: 37


In [29]:
model.x

array([ 2.85594009, -2.69321744, -1.59105633, -1.89926156, -0.17689774])

In [30]:
# Evaluate the BFGS solution (`model`) on both splits; the test accuracy was
# previously computed from `r.x`, the L-BFGS-B result of an earlier cell.
accuracy(model.x, data_train, label_train), accuracy(model.x, test, label)

(0.9908256880733946, 0.99)

# (b) Implement MAP estimation algorithm for Probit regression model

In [31]:
from scipy.stats import norm

In [32]:
def NLL_Probit_loss(w, data, label, rg = 1):
    """Negative log-likelihood of the probit model plus an L2 penalty.

    The class-1 probability is the standard normal CDF of ``data @ w``.
    Rows with ``label == 1`` are the positive class, all other rows the
    negative class. A constant 1e-10 is added inside each logarithm, and
    ``0.5 * rg * w.w`` is added as the penalty.
    """
    prob_pos = norm.cdf(data @ w)
    is_pos = (label == 1).astype(int)
    log_lik_pos = is_pos * np.log(prob_pos + 1e-10)
    log_lik_neg = (1 - is_pos) * np.log(1 - prob_pos + 1e-10)
    penalty = .5 * rg * np.dot(w, w)
    return penalty - (log_lik_pos + log_lik_neg).sum()

In [33]:
eps = 1e-10
def grad_NLL_Probit_loss(w, data, label, rg = 1):
    """Gradient of the L2-penalised probit negative log-likelihood.

    Uses the module-level ``eps`` as the constant added to each denominator.
    Rows with ``label == 1`` are the positive class, all other rows the
    negative class; the penalty contributes ``rg * w``.
    """
    score = data @ w
    density = norm.pdf(score)
    cumulative = norm.cdf(score)
    is_pos = (label == 1).astype(int)
    # Per-row derivative of the loss with respect to the score.
    per_row = (-is_pos/(eps+cumulative) + (1-is_pos)/(eps+1-cumulative)) * density
    return per_row @ data + rg * w

In [34]:
def Probit_loss(w):
    """Penalised probit loss on the training set (rg fixed to 1)."""
    return NLL_Probit_loss(w, data_train, label_train, rg = 1)


def grad_Probit(w):
    """Gradient of the penalised probit loss on the training set (rg fixed to 1)."""
    return grad_NLL_Probit_loss(w, data_train, label_train, rg = 1)

# Using BFGS method

### 1. set all the weights to be zero

In [35]:
w_0 = np.zeros(5)
model_0 = minimize(Probit_loss, w_0, method='BFGS', jac =grad_Probit, 
                   tol= 1e-6, options={'disp': True, 'maxiter': 100})

Optimization terminated successfully.
         Current function value: 25.111541
         Iterations: 38
         Function evaluations: 41
         Gradient evaluations: 41


In [36]:
model_0.x

array([ 2.0787747 , -2.01663088, -1.18786756, -1.43311194, -0.13853574])

In [37]:
accuracy(model_0.x, data_train, label_train, h=norm.cdf), accuracy(model_0.x, test, label, h=norm.cdf)

(0.9908256880733946, 0.988)

### 2. random initialization

In [38]:
w_0 = np.random.normal(loc=0.0, scale=1.0, size=5)
model_p = minimize(Probit_loss, w_0, method='BFGS', jac =grad_Probit, tol= 1e-5, options={'disp': True, 'maxiter': 100})

Optimization terminated successfully.
         Current function value: 25.111541
         Iterations: 35
         Function evaluations: 38
         Gradient evaluations: 38


In [39]:
model_p.x

array([ 2.07877472, -2.01663087, -1.18786756, -1.43311195, -0.13853576])

In [40]:
accuracy(model_p.x, data_train, label_train, h=norm.cdf), accuracy(model_p.x, test, label, h=norm.cdf)

(0.9908256880733946, 0.988)

# (c) Using Newton Raphson Method

In [41]:
def Hessian_NLL_Probit_loss(w, data, label, rg = 1):
    """Hessian of the L2-penalised probit negative log-likelihood.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Weight vector.
    data : ndarray, shape (n, d)
        Design matrix.
    label : ndarray, shape (n,)
        Class labels; entries equal to 1 are positive, all others negative.
    rg : float
        Weight of the L2 penalty; contributes ``rg * I``. The original
        always added the plain identity and ignored this argument.

    Returns
    -------
    ndarray, shape (d, d)
        ``X^T diag(c) X + rg * I``, where ``c_i`` is the second derivative of
        row i's loss with respect to its score ``z_i = x_i.w``.
    """
    eps = 1e-6  # keeps the denominators away from zero for saturated rows
    score = data @ w
    pdf = norm.pdf(score)
    cdf = norm.cdf(score)
    is_pos = (label == 1).astype(int)

    # Reciprocal of the probability assigned to the observed class, kept in
    # separate positive/negative parts so the signs below stay explicit.
    inv_pos = is_pos / (cdf + eps)
    inv_neg = (1 - is_pos) / (1 - cdf + eps)

    # (pdf / P)^2 part: always non-negative.
    term_1 = (inv_pos**2 + inv_neg**2) * pdf**2
    # Part coming from d(pdf)/dz = -z * pdf:
    #   +z*pdf/Phi for positive rows, -z*pdf/(1-Phi) for negative rows.
    # The original multiplied by (y - label), which has the opposite sign.
    term_2 = (inv_pos - inv_neg) * pdf * score

    return data.T @ ((term_1 + term_2).reshape(-1, 1) * data) + rg * np.eye(data.shape[1])

In [42]:
def Hessian_NLL_Probit_loss_new(w, data, label, rg = 1): # More numerically stable implementation
    """Hessian of the L2-penalised probit negative log-likelihood.

    Parameters
    ----------
    w : ndarray, shape (d,)
        Weight vector.
    data : ndarray, shape (n, d)
        Design matrix.
    label : ndarray, shape (n,)
        Class labels; entries equal to 1 are positive, all others are
        treated as negative (the convention the probit loss and gradient
        in this notebook use).
    rg : float
        Weight of the L2 penalty; contributes ``rg * I``. The original
        always added the plain identity and ignored this argument.

    Returns
    -------
    ndarray, shape (d, d)
        ``X^T diag(c) X + rg * I``, where ``c_i`` is the second derivative of
        row i's loss with respect to its score ``z_i = x_i.w``.
    """
    eps = 1e-12  # keeps the denominators away from zero for saturated rows
    score = data @ w
    pdf = norm.pdf(score)
    is_pos = (label == 1)

    # Probability of the observed class, each taken from the CDF tail that
    # needs no subtraction: Phi(z) for positives, Phi(-z) == 1 - Phi(z) for
    # negatives. The original selected its branch with ``s_1 >= 0`` (a pdf,
    # hence always true), so the second branch never ran and 1 - Phi was
    # formed by cancellation.
    prob_observed = np.where(is_pos, norm.cdf(score), norm.cdf(-score))
    ratio = pdf / (prob_observed + eps)

    # (pdf / P)^2 part: always non-negative.
    term_1 = ratio**2
    # Part coming from d(pdf)/dz = -z * pdf: +z*ratio for positive rows,
    # -z*ratio for negative rows.
    term_2 = np.where(is_pos, 1.0, -1.0) * ratio * score

    return data.T @ ((term_1 + term_2).reshape(-1, 1) * data) + rg * np.eye(data.shape[1])

In [43]:
def Hessian_NLL(w):
    """Hessian of the penalised probit loss on the training set (rg fixed to 1)."""
    return Hessian_NLL_Probit_loss_new(w, data_train, label_train, rg = 1)

In [44]:
# NOTE(review): w_1 is not read by any later cell visible in this notebook.
w_1 = np.zeros(5)
# Scratch check: the standard normal CDF evaluated at -40 (displayed below).
y = norm.cdf(-40)
y

0.0

In [45]:
w_0 = 1e-3*np.random.normal(loc=0.0, scale=1.0, size=5)
H_1 = Hessian_NLL(w_0)
H_1

array([[  557.02490148,   255.75643042,  1080.60236096,   737.50059716,
         -649.33056046],
       [  255.75643042,  4420.61233036,  2983.64907329, -2157.75278719,
          504.84464663],
       [ 1080.60236096,  2983.64907329, 20852.66627844, -9318.5950864 ,
        -4721.96316601],
       [  737.50059716, -2157.75278719, -9318.5950864 , 11060.63241168,
          681.72658334],
       [ -649.33056046,   504.84464663, -4721.96316601,   681.72658334,
         3094.09619535]])

In [46]:
H_2 = num_Hessian(w_0, grad_function = grad_Probit)
H_2

array([[  557.02490126,   255.75643031,  1080.60236062,   737.50059681,
         -649.3305602 ],
       [  255.75643033,  4420.61232866,  2983.64907201, -2157.7527863 ,
          504.84464643],
       [ 1080.60236054,  2983.64907201, 20852.66627023, -9318.59508259,
        -4721.96316414],
       [  737.50059686, -2157.75278639, -9318.59508285, 11060.63240718,
          681.72658307],
       [ -649.33056019,   504.84464623, -4721.963164  ,   681.72658303,
         3094.0961941 ]])

### Initializing with random weights

In [47]:
w_0 = 1e-3*np.random.normal(loc=0.0, scale=1.0, size=5)
w_pp, c = Newton_Raphson(w_0, grad=grad_Probit, Hessian = Hessian_NLL, max_it = 100, tolerance = 1e-5)
w_pp.shape, c

((5,), 11)

In [48]:
accuracy(w_pp, data_train, label_train, h=norm.cdf), accuracy(w_pp, test, label, h=norm.cdf)

(0.9908256880733946, 0.988)

In [49]:
w_pp, model_p.x

(array([ 2.0787747 , -2.01663088, -1.18786756, -1.43311194, -0.13853574]),
 array([ 2.07877472, -2.01663087, -1.18786756, -1.43311195, -0.13853576]))

### Initializing with zero weights

In [50]:
w_0 = np.zeros(5)
w_pp, c = Newton_Raphson(w_0, grad=grad_Probit, Hessian = Hessian_NLL, max_it = 100, tolerance = 1e-5)
w_pp.shape, c

((5,), 11)

In [51]:
accuracy(w_pp, data_train, label_train, h=norm.cdf), accuracy(w_pp, test, label, h=norm.cdf)

(0.9908256880733946, 0.988)