In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sc

In [2]:
data = pd.read_csv('diabetes.txt', sep = " ")

In [3]:
# This is the good data
data.shape

(442, 11)

In [4]:
# Define X and Y arrays
X = data.iloc[:,:data.shape[1]-1].as_matrix()
Y = data.iloc[:,data.shape[1]-1].ravel()

In [5]:
# We center the Y
Y_tilde = Y - Y.mean(axis=0)

## 1] Empirical Bayes by Marginal Maximum Likelihood

The Gibbs sampler used here uses the following full conditional distributions:

- The full conditional for $\beta$ is:

<h3 align="center"> $\mathcal{N}_p(A^{-1}X^T\tilde{y}, \sigma^2A^{-1})$ where $A = X^TX+D^{-1}_\tau$ and $D_\tau = diag(\tau^2_1,...,\tau^2_p)$ </h3>

- The full conditional for $\sigma^2$ is:

<h3 align="center"> $\mathcal{I}nverse\mathcal{G}amma(\frac{n-1+p}{2}, \frac{(\tilde{y}-X\beta)^T(\tilde{y}-X\beta) + \beta^TD^{-1}_\tau\beta}{2})$ </h3>

- $\tau^2_1, ..., \tau^2_p$ are conditionnaly independent and $\frac{1}{\tau^2_j}$ has as conditional distribution:

<h3 align="center"> $\mathcal{I}nverse\mathcal{G}ausian(\sqrt{\frac{\lambda^2\sigma^2}{\beta^2_j}}, \lambda^2)$ </h3>

In [8]:
# Initialization
beta = np.random.uniform(size = X.shape[1])
sigma_sq = np.random.uniform()
tau_sq = np.random.uniform(size = X.shape[1])

beta_ = []
sigma_sq_ = []
tau_sq_ = []

#### Full conditional for $\beta$

In [9]:
D_tau = np.diag(tau_sq)
A = X.transpose().dot(X) + np.linalg.inv(D_tau)
multi_norm_mean = np.linalg.inv(A).dot(X.transpose()).dot(Y_tilde)
multi_norm_cov = sigma_sq * np.linalg.inv(A)
beta_.append(np.random.multivariate_normal(multi_norm_mean, multi_norm_cov))

#### Full conditional for $\sigma^2$

In [10]:
shape = (X.shape[0]-1+X.shape[1])/2
scale = ((Y_tilde - X.dot(beta)).dot((Y_tilde - X.dot(beta))) + beta.transpose().dot(np.linalg.inv(D_tau)).dot(beta))/2
sigma_sq_.append(sc.invgamma.rvs(a = shape, scale = scale))

#### Full conditional for $\tau^2_1, ..., \tau^2_p$

In [11]:
###############
# Say lambda 0 is equal to 1
lambda_ = 2

In [32]:
mean = np.sqrt(lambda_**2*sigma_sq/beta**2)
scale = lambda_**2
tau_sq_.append(np.random.wald(mean, scale))

In [42]:
t = []
for i in range(len(tau_sq)):
    t.append(np.random.wald(mean[i], scale))
tau_sq_.append(np.array(t))

#### Gibbs sampler

## 1] Empirical Bayes by Marginal Maximum Likelihood

The Gibbs sampler used here uses the following full conditional distributions:

- The full conditional for $\beta$ is:

<h3 align="center"> $\mathcal{N}_p(A^{-1}X^T\tilde{y}, \sigma^2A^{-1})$ where $A = X^TX+D^{-1}_\tau$ and $D_\tau = diag(\tau^2_1,...,\tau^2_p)$ </h3>

- The full conditional for $\sigma^2$ is:

<h3 align="center"> $\mathcal{I}nverse\mathcal{G}amma(\frac{n-1+p}{2}, \frac{(\tilde{y}-X\beta)^T(\tilde{y}-X\beta) + \beta^TD^{-1}_\tau\beta}{2})$ </h3>

- $\tau^2_1, ..., \tau^2_p$ are conditionnaly independent and $\frac{1}{\tau^2_j}$ has as conditional distribution:

<h3 align="center"> $\mathcal{I}nverse\mathcal{G}ausian(\sqrt{\frac{\lambda^2\sigma^2}{\beta^2_j}}, \lambda^2)$ </h3>

In [6]:
def Gibbs_sampler(n, lambda_):
    # Initialization
    beta = [np.random.uniform(size = X.shape[1])]
    sigma_sq = [np.random.uniform()]
    tau_sq = [np.random.uniform(size = X.shape[1])]
    for i in range(n):
        # Full conditional for beta
        D_tau = np.diag(tau_sq[i])
        A = X.transpose().dot(X) + np.linalg.inv(D_tau)
        multi_norm_mean = np.linalg.inv(A).dot(X.transpose()).dot(Y_tilde)
        multi_norm_cov = sigma_sq[i] * np.linalg.inv(A)
        beta.append(np.random.multivariate_normal(multi_norm_mean, multi_norm_cov))
        # Full conditional for sigma_sq
        shape = (X.shape[0]-1+X.shape[1])/2
        scale = ((Y_tilde - X.dot(beta[i+1])).dot((Y_tilde - X.dot(beta[i+1]))) + beta[i+1].transpose().dot(np.linalg.inv(D_tau)).dot(beta[i+1]))/2
        sigma_sq.append(sc.invgamma.rvs(a = shape, scale = scale))
        # Full conditional for tau_1,...,tau_p
        mean = np.sqrt(lambda_**2*sigma_sq[i+1]/beta[i+1]**2)
        scale = np.repeat(lambda_**2, X.shape[1])
        tau_sq.append(1/np.random.wald(mean, scale))
    return tau_sq[int(n/2):]

#### Empirical Bayes by Marginal Maximum Likelihood

For the Bayesian Lasso, each iteration of the algorithm involves running the Gibbs sampler using a $\lambda$ value estimated from the sample of the previous iteration. Specifically, iteration $k$ uses the Gibbs sampler of Section 2 with hyperparameter $\lambda^{(k-1)}$ (i.e., the estimate from iteration $k-1$) to approximate the ideal updated estimate:

<h3 align="center"> $\lambda^{(k)} = \sqrt{\frac{2p}{\sum\limits^p_{j=1}E_{\lambda^{(k-1)}}[\tau^2_j|\tilde{y}]}}$ </h3>

by replacing the conditional expectations with averages from the Gibbs sample. We suggest the initial value:

<h3 align="center"> $\lambda^{(0)} = \frac{p\sqrt{\hat{\sigma}^2_{LS}}}{\sum\limits^p_{j=1}|\hat{\beta}^{LS}_j|}$ </h3>

where $\hat{\sigma}^2_{LS}$ and $\hat{\beta}^{LS}_j$ are estimates from the usual least squares procedure.

In [7]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Lambda_init
lambda_init = (X.shape[1]*np.sqrt((np.sum((Y - lm.predict(X))**2))/(X.shape[0]-X.shape[1])))/np.sum(np.abs(lm.coef_))

In [9]:
# This will be used to find the next k lambda
for i in range(50):
    if i==0:
        lambda_ = np.sqrt(2*X.shape[1]/sum(np.mean(Gibbs_sampler(1000, lambda_init), axis=0)))
    else:
        lambda_ = np.sqrt(2*X.shape[1]/sum(np.mean(Gibbs_sampler(1000, lambda_), axis=0)))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [10]:
lambda_

0.23828477705791679

#### Hyperpriors for the Lasso Parameter

We need to modify the previous Gibbs sampler by introducing the full conditional distribution of $\lambda^2$, which is:

<h3 align="center"> $\mathcal{G}amma(p+r, \frac{\sum\limits_{j=1}^p\tau^2_j}{2}+\delta)$ </h3>

In [36]:
def Gibbs_sampler_bis(n, r, delta):
    # Initialization
    beta = [np.random.uniform(size = X.shape[1])]
    sigma_sq = [np.random.uniform()]
    tau_sq = [np.random.uniform(size = X.shape[1])]
    lambda_sq = [np.random.uniform()]
    for i in range(n):
        # Full conditional for beta
        D_tau = np.diag(tau_sq[i])
        A = X.transpose().dot(X) + np.linalg.inv(D_tau)
        multi_norm_mean = np.linalg.inv(A).dot(X.transpose()).dot(Y_tilde)
        multi_norm_cov = sigma_sq[i] * np.linalg.inv(A)
        beta.append(np.random.multivariate_normal(multi_norm_mean, multi_norm_cov))
        # Full conditional for sigma_sq
        shape = (X.shape[0]-1+X.shape[1])/2
        scale = ((Y_tilde - X.dot(beta[i+1])).dot((Y_tilde - X.dot(beta[i+1]))) + beta[i+1].transpose().dot(np.linalg.inv(D_tau)).dot(beta[i+1]))/2
        sigma_sq.append(sc.invgamma.rvs(a = shape, scale = scale))
        # Full conditional for tau_1,...,tau_p
        mean = np.sqrt(lambda_sq[i]*sigma_sq[i+1]/beta[i+1]**2)
        scale = np.repeat(lambda_sq[i], X.shape[1])
        tau_sq.append(1/np.random.wald(mean, scale))
        # Full conditional for lambda_sq
        shape = X.shape[1] + r
        rate = sum(tau_sq[i+1])/2+delta
        lambda_sq.append(np.random.gamma(shape, rate))
    return lambda_sq

In [38]:
lambda_sq_bis = Gibbs_sampler_bis(100, 1, 1.78)

In [39]:
[np.sqrt(i) for i in lambda_sq_bis]

[0.62062817810426596,
 16.157373770097731,
 5.4679394269839383,
 4.6901907931653808,
 5.9815623561679434,
 6.0032726704971004,
 7.3349773707512362,
 5.9764369899816199,
 5.7204033158312786,
 5.2999992693004812,
 6.3926540857261029,
 5.2718478726361901,
 5.7601066453121863,
 6.4975610349059627,
 4.3161302478534447,
 7.1059933549703924,
 5.3413023322325781,
 5.3164966535817166,
 5.2041865342265403,
 4.5205408645188045,
 5.8711032104448533,
 5.9786193301490691,
 5.2453268891217322,
 7.295675667999741,
 4.090849351159318,
 5.2372853503111658,
 6.3915260509538845,
 6.6300395221264274,
 6.6110112554705349,
 7.5782867749974638,
 5.0147225234389214,
 5.575158055009064,
 6.3113603799692921,
 6.7273973067649084,
 5.5573052374052203,
 4.7786952064296386,
 5.603501250681405,
 4.8152826031754019,
 7.0857585810376973,
 5.0926067255852479,
 7.237634206677928,
 4.3497960419842636,
 6.5437604054940959,
 5.5881298940489845,
 5.8539812384697631,
 4.9370015661326896,
 5.1679562805266013,
 5.32577146507855

# 2] Test on largest dataset

In [2]:
train = pd.read_csv('digits_train.csv')

In [3]:
train.shape

(42000, 785)