In [1]:
import pandas as pd
import numpy as np
from scipy.stats import logistic, norm, multivariate_normal
from scipy.optimize import minimize

In [2]:
'''
This file contains sample code about how to use Gauss–Hermite quadrature to compute a 
specific type of integral numerically.

The general form of this type of integral is:( see https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature 
for more details)

F = int_{ -inf}^{+inf} e^{-x*x) f(x) dx,  (1)

in which we're calculating the integral of f(x) in the range ( -inf, +inf) weighted by e^(-x*x ).
Note that for f(x) being polynomial function, this integral is guaranteed to converge. But for some others 
convergence is not guaranteed.
'''

import numpy as np


def gass_hermite_quad(f, degree):
    '''
    Calculate the integral (1) numerically.
    :param f: target function, takes a array as input x = [x0, x1,...,xn], and return a array of function 
    values f(x) = [f(x0),f(x1), ..., f(xn)]
    :param degree: integer, >=1, number of points
    :return:
    '''

    points, weights = np.polynomial.hermite.hermgauss(degree)

    #function values at given points
    f_x = f(points)

    #weighted sum of function values
    F = np.sum(f_x  * weights)

    return F

In [3]:
f = open("data/bank-note/data-desc.txt", "r")
print(f.read())

https://archive.ics.uci.edu/ml/datasets/banknote+authentication

Data Set Information:

Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x 400 pixels. Due to the object lens and distance to the investigated object gray-scale pictures with a resolution of about 660 dpi were gained. Wavelet Transform tool were used to extract features from images.


We use 4 attributions (the first 4 columns)

1. variance of Wavelet Transformed image (continuous) 
2. skewness of Wavelet Transformed image (continuous) 
3. curtosis of Wavelet Transformed image (continuous) 
4. entropy of image (continuous) 

The label is the last column: genuine or forged





In [4]:
df = pd.read_csv("data/bank-note/train.csv", 
                 names = ['variance', 'skewness', 'curtosis', 'entropy', 'label'])
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,label
0,3.8481,10.1539,-3.8561,-4.2228,0
1,4.0047,0.45937,1.3621,1.6181,0
2,-0.048008,-1.6037,8.4756,0.75558,0
3,-1.2667,2.8183,-2.426,-1.8862,1
4,2.2034,5.9947,0.53009,0.84998,0


In [5]:
data = np.genfromtxt("data/bank-note/train.csv", dtype=float, delimiter=',')
data = np.hstack((np.ones((data.shape[0],1)), data))
data.shape

(872, 6)

In [6]:
data_train = data[:,:-1]
label_train = data[:,-1].astype(int)
data_train.shape, label_train.shape

((872, 5), (872,))

In [7]:
data_test = np.genfromtxt("data/bank-note/test.csv", dtype=float, delimiter=',')
data_test = np.hstack((np.ones((data_test.shape[0],1)), data_test))
data_test.shape

(500, 6)

In [8]:
test = data_test[:,:-1]
label = data_test[:,-1].astype(int)
test.shape, label.shape

((500, 5), (500,))

### Part (a).

In [9]:
from scipy.stats import logistic, norm, multivariate_normal
from scipy.optimize import minimize

In [10]:
def log_poterior_(x, t, w):
    y = logistic.cdf(x@w)
    y[t==0] = 1 - y[t==0]
    y = np.log(y)
    return -np.dot(w,w)/2 + y.sum()

In [11]:
def d_log_poterior_(x, t, w):
    y = t - logistic.cdf(x@w)
    return -w + x.T @ y

In [12]:
def dd_log_poterior_(x, t, w):
    y = logistic.cdf(x@w)
    R = y*(1-y)
    #print(-np.eye(w.shape[0])+x.T @ np.diag(R) @ x)
    return -np.eye(w.shape[0]) - x.T @ (R.reshape(-1,1)*x)

In [13]:
neg_log_poterior = lambda w: -log_poterior_(data_train, label_train, w)

In [14]:
d_neg_log_poterior = lambda w: -d_log_poterior_(data_train, label_train, w)

In [15]:
dd_neg_log_poterior = lambda w: -dd_log_poterior_(data_train, label_train, w)

### Numerical Check

In [16]:
w = np.random.normal(0, 1, size = (5))
dw = np.zeros(5)
for i in range(5):
    x = w.copy()
    y = w.copy()
    x[i] += 1e-5
    y[i] -= 1e-5
    dw[i] = (neg_log_poterior(x)-neg_log_poterior(y))/2e-5
dw-d_neg_log_poterior(w)

array([-0.00873432,  0.01461048, -0.0098306 ,  0.00354918,  0.01437763])

In [17]:
ddw = np.zeros((5,5))
for i in range(5):
    x = w.copy()
    y = w.copy()
    x[i] += 1e-5
    y[i] -= 1e-5
    ddw[i] = (d_neg_log_poterior(x)-d_neg_log_poterior(y))/2e-5
ddw-dd_neg_log_poterior(w)

array([[-5.38022960e-11,  4.27239755e-09, -3.96971700e-09,
        -4.68628514e-09, -2.03490913e-09],
       [ 4.09476186e-09, -2.41204816e-08,  3.78045399e-08,
         7.12670811e-09, -1.66438099e-08],
       [-2.72399916e-08,  1.06016643e-07, -3.02742137e-07,
        -6.05508035e-08,  1.10671351e-07],
       [-6.77573553e-11,  1.44236978e-09, -3.70738462e-09,
        -1.06653388e-08,  6.24646646e-09],
       [ 9.84897497e-10, -1.66438383e-08,  1.11953682e-08,
         5.62124569e-10, -3.28384431e-10]])

In [18]:
w = np.random.normal(0, 1, size = (5))
model_Q2a = minimize(neg_log_poterior, w, method = 'L-BFGS-B', jac = d_neg_log_poterior)

In [19]:
#model_Q2a = minimize(neg_log_poterior, w, method = 'L-BFGS-B')

In [20]:
#model_Q2a.x

### Laplace approximation for the posterior distribution

In [21]:
mu_Laplace = model_Q2a.x
mu_Laplace

array([ 2.85593266, -2.69321621, -1.59105559, -1.89926044, -0.17689869])

In [22]:
Cov_inv_Laplace = dd_neg_log_poterior(mu_Laplace)

In [23]:
Cov_Laplace = np.linalg.inv(Cov_inv_Laplace)

In [24]:
Cov_Laplace

array([[ 0.13679694, -0.0563484 , -0.02708617, -0.0406639 ,  0.02068524],
       [-0.0563484 ,  0.11964223,  0.05318781,  0.07181364,  0.00963916],
       [-0.02708617,  0.05318781,  0.05011853,  0.05299426,  0.0231902 ],
       [-0.0406639 ,  0.07181364,  0.05299426,  0.06216994,  0.01966918],
       [ 0.02068524,  0.00963916,  0.0231902 ,  0.01966918,  0.03822871]])

In [25]:
Laplace_dis = multivariate_normal(mean = mu_Laplace, cov = Cov_Laplace)

In [26]:
def predict(x,t,w):
    y = logistic.cdf(x@w)
    l_pred = (y>=0.5).astype(int)
    return (t==l_pred).mean()

### Accuracy baced on w_MAP

In [27]:
predict(data_train, label_train, mu_Laplace)

0.9908256880733946

In [28]:
predict(test, label, mu_Laplace)

0.99

### The average predictive likelihood

In [29]:
def predictive_likelihood(x, mu = mu_Laplace, Cov = Cov_Laplace):
    m = np.dot(x,mu)
    s = x.reshape(1,-1)@ Cov @x.reshape(-1,1)
    f = lambda z: logistic.cdf(np.sqrt(2)*s*z+m) 
    return 1/np.sqrt(np.pi)*gass_hermite_quad(f,200)

In [30]:
def accuracy_predictive_likelihood(x, t, mu = mu_Laplace, Cov = Cov_Laplace):
    n= x.shape[0]
    average_predictive_likelihood = np.zeros(n)
    for i in range(n):
        xi = test[i]
        average_predictive_likelihood[i] = predictive_likelihood(xi, mu, Cov)
    P = (average_predictive_likelihood>=0.5).astype(int)
    return np.mean(P == t), average_predictive_likelihood

In [31]:
n= test.shape[0]
average_predictive_likelihood = np.zeros(n)
for i in range(n):
    x = test[i]
    average_predictive_likelihood[i] = predictive_likelihood(x)
P = (average_predictive_likelihood>=0.5).astype(int)
np.mean(P == label)

0.99

In [32]:
accuracy_a_p_laplace, average_predictive_likelihood_laplace = accuracy_predictive_likelihood(test, label)

In [33]:
accuracy_a_p_laplace

0.99

In [34]:
average_predictive_likelihood_laplace[label == 0] = 1- average_predictive_likelihood_laplace[label == 0]
average_predictive_likelihood_laplace.mean()

0.9743241483386471

###  Laplace approximation with the diagonal Hessian

In [35]:
Cov_diag = np.eye(Cov_Laplace.shape[0])*Cov_Laplace

In [36]:
accuracy_a_p_diag, average_predictive_likelihood_diag = accuracy_predictive_likelihood(x= test, t = label, 
                                                                                       mu = mu_Laplace, Cov = Cov_diag)

In [37]:
accuracy_a_p_diag

0.99

In [38]:
average_predictive_likelihood_diag[label == 0] = 1- average_predictive_likelihood_diag[label == 0]
average_predictive_likelihood_diag.mean()

0.9139301859825464

### (c) Variational logistic regression

In [39]:
def Lambda(x):
    return (logistic.cdf(x)- 1/2)/(2*x)

In [40]:
def E_update(m_0,s_0,x,t,xi):
    s_0_inv = np.linalg.inv(s_0)
    lambda_xi = Lambda(xi)
    
    s_N_inv = s_0_inv + 2 * x.T @ (lambda_xi.reshape(-1,1)*x) #needs to review
    
    s_N = np.linalg.inv(s_N_inv)
    m_N = s_N @(s_0_inv@m_0 + (t-0.5) @ x)
    
    return s_N, m_N

In [41]:
def M_update(x, s_N, m_N):
    xi_squared = np.diag(x @ (s_N + m_N.reshape(-1,1) @ m_N.reshape(1,-1)) @ x.T )
    return np.sqrt(xi_squared)

In [42]:
def EM_vlr(m_0,s_0,x,t,xi_start):
    while True:
        s_N, m_N = E_update(m_0, s_0, x, t, xi_start)
        xi = M_update(x, s_N, m_N)
        if np.linalg.norm(xi - xi_start)> 1e-5:
            xi_start = xi
        else:
            return m_N, s_N

In [43]:
n, d = data_train.shape
xi_0 = np.abs(np.random.normal(0,1, size = (n)))

In [44]:
mu_vlr, s_vlr = EM_vlr(m_0 = np.zeros(d), s_0 = np.eye(d), x = data_train,t =label_train, xi_start = xi_0)

In [45]:
predict(test, label, mu_vlr)

0.99

In [46]:
mu_vlr,s_vlr

(array([ 2.89882616, -2.77086021, -1.63751078, -1.95377589, -0.19364377]),
 array([[ 0.0287931 , -0.00270207, -0.00196961, -0.0041901 ,  0.00504233],
        [-0.00270207,  0.00360754, -0.00021745,  0.00078054, -0.001883  ],
        [-0.00196961, -0.00021745,  0.00189673,  0.00157568,  0.00167853],
        [-0.0041901 ,  0.00078054,  0.00157568,  0.0022857 ,  0.00045613],
        [ 0.00504233, -0.001883  ,  0.00167853,  0.00045613,  0.00640488]]))

In [47]:
accuracy_a_p_vlr, average_pred_likelihood_vlr  = accuracy_predictive_likelihood(x = test, t = label, 
                                                                                mu = mu_vlr, Cov = s_vlr)

In [48]:
accuracy_a_p_vlr

0.99

In [49]:
average_pred_likelihood_vlr[label==0] = 1 - average_pred_likelihood_vlr[label==0]
average_pred_likelihood_vlr.mean()

0.9770066184572976

### Variational logistic regression via  fully factorized posterior

In [50]:
def E_update_Q_2d_v(x, t, mu, sigma, xi):
    d = mu.shape[0]
    l_xi = Lambda(xi)
    temp = ((x*x)*l_xi.reshape(-1,1)).sum(axis = 0)
    sigma_squared =1/(1+2*temp)
        
    R1 = 2*l_xi.reshape(-1,1)*(x @ mu.reshape(-1,1) - x * mu.reshape(1,-1))
    R2 = t.reshape(-1,1)  - 1/2 -  R1
    R3 = (R2 * x).sum(axis = 0)
    R4 = sigma_squared * R3
         
    return R4, np.sqrt(sigma_squared)

In [51]:
def E_update_Q_2d(x, t, mu, sigma, xi):
    d = mu.shape[0]
    l_xi = Lambda(xi)
    temp = ((x*x)*l_xi.reshape(-1,1)).sum(axis = 0)
    sigma_squared =1/(1+2*temp)
    
    for i in range(d):
        temp1 = 2*l_xi * (x @ mu -  (mu[i]*x[:,i]))
        temp2 = t  - 1/2 - temp1 
        temp3 = np.dot(x[:,i], temp2)
        mu[i] = sigma_squared[i]*temp3
         
    return mu, np.sqrt(sigma_squared)

In [52]:
def M_update_Q_2d(x, mu, sigma):
    B = (x*x) @ (sigma**2) + (x @ mu)**2
    return np.sqrt(B)

In [53]:
def Em_Q_2d(x,t):
    c = 0
    n, d = x.shape
    mu = np.zeros(d)
    sigma = np.ones(d)
    xi_0 = np.abs(np.random.normal(0,1, size = (n)))
    while True:
        mu, sigma = E_update_Q_2d(x,t,mu,sigma,xi_0)
        xi = M_update_Q_2d(x, mu, sigma)
        if np.linalg.norm(xi_0-xi)<1e-8 and c>3000:
            return mu, sigma
        else:
            c += 1
            xi_0 = xi

In [54]:
mu, sigma = Em_Q_2d(x= data_train, t = label_train)

In [55]:
mu , sigma**2

(array([ 2.89537268, -2.76699462, -1.63499389, -1.94911109, -0.19317358]),
 array([0.01532219, 0.00247973, 0.00047451, 0.00058978, 0.00290673]))

In [56]:
mu_vlr, np.diag(s_vlr)

(array([ 2.89882616, -2.77086021, -1.63751078, -1.95377589, -0.19364377]),
 array([0.0287931 , 0.00360754, 0.00189673, 0.0022857 , 0.00640488]))

In [57]:
mu , sigma

(array([ 2.89537268, -2.76699462, -1.63499389, -1.94911109, -0.19317358]),
 array([0.12378284, 0.04979686, 0.02178334, 0.02428546, 0.0539141 ]))

In [58]:
predict(test, label, mu)

0.99

In [59]:
d = test.shape[1]
Cov =sigma*sigma* np.eye(d)

In [60]:
accuracy_a_p_vlr, average_pred_likelihood_vlr  = accuracy_predictive_likelihood(x = test, t = label, 
                                                                                mu = mu, Cov = Cov)

In [61]:
accuracy_a_p_vlr

0.99

In [62]:
average_pred_likelihood_vlr[label==0] = 1-average_pred_likelihood_vlr[label==0]
average_pred_likelihood_vlr.mean()

0.9769986162278478