In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import torch
import pyro
from tqdm.notebook import tqdm
#import sklearn.datasets
print(pyro.__version__)

Create synthetic data

In [None]:
# Three-armed spiral toy problem: every class is one arm of the spiral.
N = 100  # samples per class
D = 2    # input dimensionality
K = 3    # number of classes

# The radius profile is shared by all arms; only the (noisy) angle differs.
radius = np.linspace(0.0, 0.5, N)

X = np.zeros((N * K, D))                      # data matrix, one example per row
y = np.repeat(np.arange(K, dtype='int'), N)   # class labels: N zeros, N ones, ...
for label in range(K):
    rows = slice(N * label, N * (label + 1))
    # Each arm sweeps 4 radians, jittered with Gaussian noise (std 0.2).
    theta = np.linspace(label * 4, (label + 1) * 4, N) + np.random.randn(N) * 0.2
    X[rows, 0] = radius * np.sin(theta)
    X[rows, 1] = radius * np.cos(theta)

In [None]:
# Alternative data set: X, y = sklearn.datasets.make_moons(200, noise=0.2)

# Regular grid over the data's bounding box (margin 0.5, resolution 0.01).
# xx / yy are module-level and read by the decision-surface plots.
pad, step = 0.5, 0.01
(x_min, y_min), (x_max, y_max) = X.min(axis=0) - pad, X.max(axis=0) + pad
xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

# Raw data, coloured by class label.
fig, ax = plt.subplots(figsize=(6, 3))
ax.scatter(*X.T, c=y, cmap=plt.cm.Spectral_r, alpha=0.5);

### Artificial neural network

- Artificial Neuron: Linear regressor plus activation function (e.g. sigmoid -> Logistic regression)

$$
\hat y = \mathcal{S}(WX + B)\\ \mathcal{S}(z) = \frac{1}{1+e^{-z}}
$$
- Fully connected layer: Several artificial neurons with the same input
- Multilayer perceptron: Several fully connected layers

Neural network model in pytorch
- class that inherits from `torch.nn.Module`
- `__init__(self, args):` Define layers, e.g. fully-connected (`Linear`), convolutional (`Conv1D`, `Conv2D`)
- `forward(self, x):` Define how layers are connected

In [None]:
class NNet_classifier(torch.nn.Module):
    """Multilayer perceptron with two hidden ReLU layers that outputs class logits.

    Args:
        num_hidden: number of units in each of the two hidden layers.
        num_inputs: dimensionality of one input example (default 2, the
            toy data used in this notebook).
        num_classes: number of output logits (default 3).
    """

    def __init__(self, num_hidden=10, num_inputs=2, num_classes=3):
        super().__init__()
        self.layer1 = torch.nn.Linear(num_inputs, num_hidden)
        self.layer2 = torch.nn.Linear(num_hidden, num_hidden)
        self.layer3 = torch.nn.Linear(num_hidden, num_classes)
        self.activation = torch.nn.ReLU()

    def forward(self, x):
        """Return unnormalised class scores (logits); no softmax is applied here."""
        z = self.activation(self.layer1(x))
        z = self.activation(self.layer2(z))
        return self.layer3(z)

### Training a neural network for classification

- Categorical label $y \in \{0, 1, 2, \ldots, K-1\}$
- A softmax activation is used in the last layer
$$
\hat y_k = \frac{e^{f_k}}{\sum_{j=0}^{K-1} e^{f_j}}, \quad k=0,\ldots, K-1
$$
- The network is trained by minimizing
$$
L(\hat y, y) = - \log \frac{e^{f_y}}{\sum_{j=0}^{K-1} e^{f_j}} = - f_y + \log \sum_{j=0}^{K-1} e^{f_j}
$$
the negative log likelihood (categorical)



#### In Pytorch

Define
- `criterion`: Cost function to be minimized 
- `optimizer`: Optimization algorithm, typically based on stochastic gradient descent ($\eta$ is the learning rate)
$$
w_{t+1} = w_{t} - \eta \nabla_w L(w_t)
$$

Training is performed by
1. Evaluating the network using `forward`
1. Calculating the error/loss selected in `criterion`
1. Computing the derivatives of the loss by calling its `backward` method
1. Updating parameters according to `optimizer`

In [None]:
# Live training figure: left = current decision regions, right = loss curve.
fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)
line2 = ax[1].plot([], [])

def update_plot(k, model):
    """Redraw the live training figure.

    Args:
        k: number of leading entries of the global ``epoch_loss`` to show
            on the right panel.
        model: network mapping an (n_points, 2) float32 tensor to class
            scores of shape (n_points, n_classes).

    Reads the module-level ``xx``, ``yy``, ``X``, ``y``, ``epoch_loss``,
    ``fig``, ``ax`` and ``line2``.
    """
    ax[0].cla()
    grid_points = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()].astype('float32'))
    # Plotting only: skip autograd bookkeeping for the (large) grid.
    # argmax of the logits equals argmax of their softmax, so no softmax is needed.
    with torch.no_grad():
        zz = model(grid_points).argmax(dim=1).numpy().reshape(xx.shape)
    ax[0].pcolormesh(xx, yy, zz, cmap=plt.cm.Set1, alpha=0.75)
    for i, m in enumerate(['o', 'x', 'd']):
        ax[0].scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

    line2[0].set_xdata(range(k))
    line2[0].set_ydata(epoch_loss[:k])
    for ax_ in ax:
        ax_.relim()
        ax_.autoscale_view()
    fig.canvas.draw()

In [None]:
model = NNet_classifier(num_hidden=10)
display(model)
# reduction='sum': the loss is the total (not mean) negative log-likelihood of the batch.
criterion = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_one_epoch(x, y, phase='train'):
    """Run one full-batch pass and return the scalar loss.

    Args:
        x: input tensor passed to the module-level ``model``.
        y: target class indices passed to the module-level ``criterion``.
        phase: ``'train'`` performs a gradient step with the module-level
            ``optimizer``; any other value only evaluates the loss.

    Returns:
        The loss as a Python float.
    """
    training = phase == 'train'
    # Gradients are only tracked when they will actually be used.
    with torch.set_grad_enabled(training):
        haty = model(x)            # evaluate the model (through __call__, so hooks run)
        loss = criterion(haty, y)  # calculate errors
    if training:
        optimizer.zero_grad()
        loss.backward()   # compute derivatives
        optimizer.step()  # update parameters
    return loss.item()

x_train = torch.from_numpy(X.astype('float32'))
# CrossEntropyLoss expects int64 class indices; NumPy's 'int' can be 32-bit
# on some platforms, so cast explicitly (no-op when already int64).
y_train = torch.from_numpy(y).long()
epoch_loss = np.zeros(shape=(6000,))

for k in tqdm(range(len(epoch_loss))):
    epoch_loss[k] = train_one_epoch(x_train, y_train)
    if k % 100 == 0:
        # k + 1 losses have been recorded so far; include the newest in the plot.
        update_plot(k + 1, model)

In [None]:
# Final result of the deterministic network:
# left = predicted class per grid point, right = entropy of the softmax output.
fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)

grid_points = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()].astype('float32'))
Z = torch.softmax(model.forward(grid_points), dim=1)  # class probabilities per grid point

predicted_class = Z.argmax(dim=1).detach().numpy().reshape(xx.shape)
# Entropy of the predicted distribution; the tiny offset guards log(0).
zz = -(Z*(Z+1e-32).log()).sum(dim=1).reshape(xx.shape).detach().numpy()

ax[0].pcolormesh(xx, yy, predicted_class, cmap=plt.cm.Set1, alpha=0.75)
# The colour scale tops out at log(3), the entropy of a uniform 3-class distribution.
cf = ax[1].contourf(xx, yy, zz, cmap=plt.cm.Blues, alpha=0.75, vmin=0., vmax=np.log(3))
fig.colorbar(cf, ax=ax[1])

# Overlay the training points (one marker per class) on both panels.
for panel in ax:
    for i, m in enumerate(['o', 'x', 'd']):
        panel.scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

### Bayesian neural network

Instead of training via maximum likelihood we place a prior on the parameters $p(w)$ and aim for the  posterior

$$
p(w| \mathcal{D}) = \frac{p(\mathcal{D}|w)p(w)}{\int p(\mathcal{D}|w) p(w) \,dw}
$$


#### Problem:  No closed form for the evidence for hierarchical/non-linear models

#### Solution for models with few parameters: MCMC

#### Solution for models with millions of parameters: Variational inference

Propose an approximate (simple) posterior $q_\phi(w)$ and optimize so that it looks similar to the actual posterior

We do this by maximizing a lower bound on the evidence

$$
\mathcal{L}(\phi) = \mathbb{E}_{q_\phi(w)}[ \log p(\mathcal{D}|w)] - \text{KL}[q_\phi(w)\,\|\,p(w)]
$$

And we use $q_\phi(w)$ as our replacement for $p(w|\mathcal{D})$ to calculate the **posterior predictive distribution**

$$
p(\mathbf{y}|\mathbf{x}, \mathcal{D}) = \int p(\mathbf{y}|\mathbf{x}, w) p(w| \mathcal{D}) \,dw
$$



In [None]:
import pyro.distributions as dist

class BayesianNNet_classifier(pyro.nn.PyroModule):
    """Bayesian MLP (2 inputs, two tanh hidden layers, 3 classes).

    Every weight and bias is a Pyro sample site with an independent
    zero-mean Normal prior of standard deviation ``prior_std``.

    Args:
        num_hidden: number of units in each hidden layer.
        prior_std: standard deviation of the Normal prior.
    """

    def __init__(self, num_hidden=10, prior_std=1.):
        super().__init__()
        prior = dist.Normal(0, prior_std)

        def bayesian_linear(n_in, n_out):
            # Linear layer whose weight and bias are drawn from `prior`.
            layer = pyro.nn.PyroModule[torch.nn.Linear](n_in, n_out)
            layer.weight = pyro.nn.PyroSample(prior.expand([n_out, n_in]).to_event(2))
            layer.bias = pyro.nn.PyroSample(prior.expand([n_out]).to_event(1))
            return layer

        self.layer1 = bayesian_linear(2, num_hidden)
        self.layer2 = bayesian_linear(num_hidden, num_hidden)
        self.layer3 = bayesian_linear(num_hidden, 3)
        self.activation = torch.nn.Tanh()

    def forward(self, x, y=None):
        """Sample/score the labels and return the class logits.

        Args:
            x: inputs; ``x.shape[0]`` is used as the size of the data plate.
            y: observed labels, or ``None`` to sample them ("obs" site).
        """
        hidden = self.activation(self.layer1(x))
        hidden = self.activation(self.layer2(hidden))
        # squeeze(1) only acts when dimension 1 has size 1; for 2-D inputs
        # the logits are (n, 3) and it leaves them untouched.
        logits = self.layer3(hidden).squeeze(1)
        with pyro.plate("data", size=x.shape[0], dim=-1):
            pyro.sample("obs", dist.Categorical(logits=logits), obs=y)
        return logits

In [None]:
# Live figure for the Bayesian network: left = majority-vote class map, right = loss curve.
fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)
line2 = ax[1].plot([], [])

def update_plot(k, samples):
    """Redraw the live training figure from posterior predictive draws.

    Args:
        k: number of leading entries of the global ``epoch_loss`` to plot.
        samples: mapping whose ``"obs"`` entry holds integer class draws;
            the first axis indexes the draws and the remaining entries must
            match the evaluation grid ``xx`` after reshaping.
    """
    class_panel, loss_line = ax[0], line2[0]

    class_panel.cla()
    # Count, per grid point, how many draws voted for each of the 3 classes.
    votes = torch.nn.functional.one_hot(samples["obs"], num_classes=3).sum(dim=0)
    zz = votes.argmax(dim=1).reshape(xx.shape).detach().numpy()
    class_panel.pcolormesh(xx, yy, zz, cmap=plt.cm.Set1, alpha=0.75)
    for i, m in enumerate(['o', 'x', 'd']):
        class_panel.scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

    loss_line.set_xdata(range(k))
    loss_line.set_ydata(epoch_loss[:k])
    for ax_ in ax:
        ax_.relim()
        ax_.autoscale_view()
    fig.canvas.draw()

In [None]:
pyro.enable_validation(True)
pyro.clear_param_store()
model = BayesianNNet_classifier(num_hidden=10, prior_std=10.)
# Print the shapes of all sample sites as a sanity check of the model definition.
print(pyro.poutine.trace(model).get_trace(x_train, y_train).format_shapes())

from pyro.infer.autoguide import AutoDiagonalNormal
# Mean-field (diagonal Normal) variational posterior over all weights.
guide = AutoDiagonalNormal(model)

svi = pyro.infer.SVI(model, 
                     guide, 
                     optim=pyro.optim.ClippedAdam({'lr':1e-3}),
                     loss=pyro.infer.Trace_ELBO())

# Loop-invariant objects for the live plot: build them once, not every 100 steps.
# Predictive only keeps references to model and guide, so each call uses the
# guide's current parameters.
grid_points = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()].astype('float32'))
predictive = pyro.infer.Predictive(model, guide=guide, num_samples=10)

epoch_loss = np.zeros(shape=(10000,))
for k in tqdm(range(len(epoch_loss))):
    epoch_loss[k] = svi.step(x_train, y_train)
    if k % 100 == 0:
        samples = predictive(grid_points)
        # k + 1 losses have been recorded so far; include the newest in the plot.
        update_plot(k + 1, samples)

In [None]:
# Draw 100 posterior predictive class labels for every point of the evaluation grid.
grid_points = torch.from_numpy(np.column_stack([xx.ravel(), yy.ravel()]).astype('float32'))
predictive = pyro.infer.Predictive(model, guide=guide, num_samples=100)
samples = predictive(grid_points)

In [None]:
# Decision regions of the first four posterior draws, one panel each.
fig, ax = plt.subplots(1, 4, figsize=(9, 2), tight_layout=True)

for panel, draw in zip(ax, samples["obs"][:4]):
    zz = draw.reshape(xx.shape).detach().numpy()
    panel.pcolormesh(xx, yy, zz, cmap=plt.cm.Set1)
    for i, m in enumerate(['o', 'x', 'd']):
        panel.scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

In [None]:
# Posterior predictive summary: left = most voted class, right = entropy of the vote frequencies.
fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)

# Votes per grid point and class, accumulated over the posterior draws (first axis of "obs").
p = torch.nn.functional.one_hot(samples["obs"], num_classes=3).sum(dim=0)
# Normalise by the actual number of draws instead of a hard-coded 100, so the
# entropy stays correct if num_samples is changed where `samples` is produced.
num_draws = samples["obs"].shape[0]
class_freq = p / float(num_draws)

zz = p.argmax(dim=1).reshape(xx.shape).detach().numpy()
ax[0].pcolormesh(xx, yy, zz, cmap=plt.cm.Set1, alpha=0.75)
for i, m in enumerate(['o', 'x', 'd']):
    ax[0].scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

# The tiny offset guards log(0) for classes that received no votes.
zz = -(class_freq*(class_freq+1e-32).log()).sum(dim=1).reshape(xx.shape).detach().numpy()
cf = ax[1].contourf(xx, yy, zz, cmap=plt.cm.Blues, alpha=0.75)
fig.colorbar(cf, ax=ax[1])
for i, m in enumerate(['o', 'x', 'd']):
    ax[1].scatter(X[y==i, 0], X[y==i, 1], c='k', marker=m, s=20, alpha=0.25)

### Bayesian neural network for MNIST

In [None]:
import torchvision
# MNIST split selected with train=False, served in shuffled mini-batches of 128.
# No download is requested here, so the files are expected to exist under `root`.
to_tensor = torchvision.transforms.ToTensor()
mnist_test = torchvision.datasets.MNIST(root='~/datasets', train=False, transform=to_tensor)
mnist_loader = torch.utils.data.DataLoader(mnist_test, batch_size=128, shuffle=True)

In [None]:
import pyro.distributions as dist

class BayesianNNet_classifier(pyro.nn.PyroModule):
    """Bayesian classifier with one softplus hidden layer.

    Every weight and bias is a Pyro sample site with an independent
    zero-mean Normal prior of standard deviation ``prior_std``.

    Args:
        ninput: number of input features (default 28*28, a flattened MNIST image).
        num_hidden: number of hidden units.
        prior_std: standard deviation of the Normal prior.
        num_classes: number of output classes (default 10).
    """

    def __init__(self, ninput=28*28, num_hidden=10, prior_std=1., num_classes=10):
        super().__init__()
        prior = dist.Normal(0, prior_std)

        self.layer1 = pyro.nn.PyroModule[torch.nn.Linear](ninput, num_hidden)
        self.layer1.weight = pyro.nn.PyroSample(prior.expand([num_hidden, ninput]).to_event(2))
        self.layer1.bias = pyro.nn.PyroSample(prior.expand([num_hidden]).to_event(1))

        self.layer2 = pyro.nn.PyroModule[torch.nn.Linear](num_hidden, num_classes)
        self.layer2.weight = pyro.nn.PyroSample(prior.expand([num_classes, num_hidden]).to_event(2))
        self.layer2.bias = pyro.nn.PyroSample(prior.expand([num_classes]).to_event(1))

        self.activation = torch.nn.Softplus()

    def forward(self, x, y=None):
        """Sample/score the labels and return the class logits.

        Args:
            x: inputs; ``x.shape[0]`` is used as the size of the data plate.
            y: observed labels, or ``None`` to sample them ("obs" site).
        """
        # squeeze(1) only acts when dimension 1 has size 1; for 2-D inputs
        # the logits are (n, num_classes) and it leaves them untouched.
        p = self.layer2(self.activation(self.layer1(x))).squeeze(1)
        with pyro.plate("data", size=x.shape[0], dim=-1):
            pyro.sample("obs", dist.Categorical(logits=p), obs=y)
        return p

In [None]:
pyro.enable_validation(True)
pyro.clear_param_store()
model = BayesianNNet_classifier(num_hidden=100)

from pyro.infer.autoguide import AutoDiagonalNormal
guide = AutoDiagonalNormal(model)

svi = pyro.infer.SVI(model, 
                     guide, 
                     optim=pyro.optim.ClippedAdam({'lr':1e-2}),
                     loss=pyro.infer.Trace_ELBO())


fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)
line2 = ax[1].plot([], [])

# NOTE(review): the model sizes its data plate by the mini-batch, so every step
# treats one batch as the complete data set -- confirm this weighting of the
# prior term is intended.
epoch_loss = np.zeros(shape=(100,))
for k in tqdm(range(len(epoch_loss))):
    for images, labels in mnist_loader:
        # calculate the loss and take a gradient step
        epoch_loss[k] += svi.step(images.reshape(-1, 28*28), labels)

    # Refresh the loss curve after every epoch, including the epoch just
    # finished (k + 1 values are available at this point).
    line2[0].set_xdata(range(k + 1))
    line2[0].set_ydata(epoch_loss[:k + 1])
    for ax_ in ax:
        ax_.relim()
        ax_.autoscale_view()
    fig.canvas.draw()

Getting the mode and entropy from the posterior predictive distribution

In [None]:
# One constant drives both the number of posterior draws and the normalisation
# of the vote counts, so the two cannot drift apart.
num_samples = 100
predictive = pyro.infer.Predictive(model, 
                                   guide=guide, 
                                   num_samples=num_samples)
# Raw uint8 pixels are flattened and rescaled to [0, 1].
samples = predictive(mnist_test.data.reshape(-1, 28*28)/255.)
# Votes per image and class, accumulated over the posterior draws.
p = torch.nn.functional.one_hot(samples["obs"], num_classes=10).sum(dim=0)
mode = p.argmax(dim=1)
class_freq = p / float(num_samples)
# Entropy of the vote frequencies; the tiny offset guards log(0).
entropy = -(class_freq*(class_freq+1e-32).log()).sum(dim=1)

Finding the most uncertain digits

In [None]:
import ipywidgets as widgets

fig, ax = plt.subplots(1, 2, figsize=(5, 3), tight_layout=True)

digit = 4
mask = mnist_test.targets == digit
# Positions (within the selected digit) ordered from highest to lowest predictive entropy.
idx = np.argsort(entropy[mask].numpy())[::-1]
k = 0

def update(x, fig=fig, ax=ax, mask=mask, idx=idx):
    """Show the next most uncertain example of the selected digit.

    Left panel: the image. Right panel: histogram of the posterior predictive
    class draws, titled with the predictive mode and entropy.

    Args:
        x: the clicked button (unused; required by ``Button.on_click``).
        fig, ax, mask, idx: bound at definition time so that later cells
            reassigning these module-level names cannot break the callback.

    The global counter ``k`` wraps around after the last example instead of
    running past the end of ``idx``.
    """
    global k
    pick = idx[k % len(idx)]
    for ax_ in ax:
        ax_.cla()
    ax[0].imshow(mnist_test.data[mask][pick], cmap=plt.cm.Greys_r)
    ax[1].hist(samples['obs'][:, mask][:, pick], range=(0, 10))
    ax[1].set_title("%d %0.4f" %(mode[mask][pick], entropy[mask][pick]))
    ax[1].set_xticks(range(10))
    # Push the new content to the figure.
    fig.canvas.draw()
    k += 1

bnext = widgets.Button(description='next')
bnext.on_click(update)
bnext

### Variational Autoencoder in pyro



In [None]:
class EncoderDual(torch.nn.Module):
    """Two-headed MLP encoder producing the location and scale of q(z|x).

    Args:
        latent_dim: size of the latent vector.
        input_dim: size of one flattened input (default 28*28).
        hidden_dim: width of the two hidden layers.
    """

    def __init__(self, latent_dim, input_dim=28*28, hidden_dim=128):
        super().__init__()
        self.hidden1 = torch.nn.Linear(input_dim, hidden_dim)
        self.hidden2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.z_loc = torch.nn.Linear(hidden_dim, latent_dim)
        self.z_scale = torch.nn.Linear(hidden_dim, latent_dim)
        self.activation = torch.nn.Softplus()

    def forward(self, x):
        """Return ``(loc, scale)``; the scale head is exponentiated, hence positive."""
        features = x
        for layer in (self.hidden1, self.hidden2):
            features = self.activation(layer(features))
        loc = self.z_loc(features)
        scale = self.z_scale(features).exp()
        return loc, scale
    
class Decoder(torch.nn.Module):
    """MLP decoder mapping a latent vector to per-pixel logits.

    Args:
        latent_dim: size of the latent vector.
        output_dim: number of outputs (default 28*28).
        hidden_dim: width of the two hidden layers.
    """

    def __init__(self, latent_dim, output_dim=28*28, hidden_dim=128):
        super().__init__()
        self.hidden1 = torch.nn.Linear(latent_dim, hidden_dim)
        self.hidden2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.output = torch.nn.Linear(hidden_dim, output_dim)
        self.activation = torch.nn.Softplus()

    def forward(self, z):
        """Return the raw output of the last linear layer (no activation applied)."""
        first = self.activation(self.hidden1(z))
        second = self.activation(self.hidden2(first))
        logits = self.output(second)
        return logits

In [None]:
from pyro.distributions import Bernoulli, Normal

class VariationalAutoEncoder(torch.nn.Module):
    """VAE expressed as a Pyro model/guide pair for 28x28 images.

    Args:
        latent_dim: size of the latent vector z.
        hidden_dim: hidden width passed to both encoder and decoder.
    """

    def __init__(self, latent_dim, hidden_dim=128):
        super().__init__()
        self.encoder = EncoderDual(latent_dim, hidden_dim=hidden_dim)
        self.decoder = Decoder(latent_dim, hidden_dim=hidden_dim)
        self.latent_dim = latent_dim

    def model(self, x):
        """Generative model: z ~ N(0, I), pixels ~ Bernoulli(logits=decoder(z))."""
        pyro.module("decoder", self.decoder)
        batch_size = x.shape[0]
        prior_shape = (batch_size, self.latent_dim)
        with pyro.plate("data", size=batch_size):
            # p(z): standard Normal prior, created on the same device as x
            prior = Normal(torch.zeros(prior_shape, device=x.device),
                           torch.ones(prior_shape, device=x.device)).to_event(1)
            z = pyro.sample("latent", prior)
            # p(x|z): argument validation is switched off for this likelihood
            likelihood = Bernoulli(logits=self.decoder.forward(z), validate_args=False).to_event(1)
            pyro.sample("observed", likelihood, obs=x.reshape(-1, 28*28))

    def guide(self, x):
        """Variational posterior q(z|x): Normal with parameters from the encoder."""
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", size=x.shape[0]):
            posterior_params = self.encoder.forward(x.reshape(-1, 28*28))
            pyro.sample("latent", Normal(*posterior_params).to_event(1))

In [None]:
from torch.utils.data import DataLoader, SubsetRandomSampler


mnist_train = torchvision.datasets.MNIST(root='~/datasets', train=True,
                                         transform=torchvision.transforms.ToTensor())

# Reproducible 70/30 train/validation split of the training set indices.
np.random.seed(0)
idx = list(range(len(mnist_train)))
np.random.shuffle(idx)
split = int(0.7*len(idx))
train_idx, valid_idx = idx[:split], idx[split:]

# Train and validation loaders share the batch size and drop incomplete batches.
loader_options = dict(batch_size=128, drop_last=True)
train_loader = DataLoader(mnist_train, sampler=SubsetRandomSampler(train_idx), **loader_options)
valid_loader = DataLoader(mnist_train, sampler=SubsetRandomSampler(valid_idx), **loader_options)

# The test set is served in order and in full.
test_loader = DataLoader(mnist_test, batch_size=1024, drop_last=False, shuffle=False)

In [None]:
# NOTE(review): the original author marked this line "BUG?". The model disables
# argument validation on its Bernoulli likelihood (validate_args=False),
# presumably because the pixel targets are not binary -- confirm.
pyro.enable_validation(True)
pyro.clear_param_store()

vae = VariationalAutoEncoder(latent_dim=2)

use_gpu = False
if use_gpu:
    vae = vae.cuda()
    
svi = pyro.infer.SVI(model=vae.model, 
                     guide=vae.guide, 
                     optim=pyro.optim.Adam({"lr": 1e-2}), 
                     loss=pyro.infer.Trace_ELBO())

fig, ax = plt.subplots()
for nepoch in tqdm(range(10)):
    # Plot latent space on the fly. Encoding is for display only, so autograd
    # is disabled and the batches are concatenated once at the end.
    encoded = []
    with torch.no_grad():
        for x, label in test_loader:
            if use_gpu:
                x = x.cuda()
            # Columns: [loc_0, loc_1, scale_0, scale_1] for the 2-D latent space.
            encoded.append(torch.cat(vae.encoder(x.reshape(-1, 28*28)), dim=1))
    Z = torch.cat(encoded, dim=0).cpu().numpy()
    ax.cla()
    for digit in range(10):
        mask = mnist_test.targets == digit
        ax.errorbar(x=Z[mask, 0], y=Z[mask, 1], 
                    xerr=Z[mask, 2], yerr=Z[mask, 3],
                    fmt='none', alpha=0.5, label=str(digit))
    # Attach the legend to this figure's axes instead of pyplot's "current" axes.
    ax.legend()
    fig.canvas.draw()
    
    # Actual training
    epoch_loss = 0.0
    for x, label in train_loader:
        if use_gpu:
            x = x.cuda()
        epoch_loss += svi.step(x)
    print("%d %f" %(nepoch, epoch_loss))

In [None]:
if use_gpu:
    vae = vae.cpu()
    
output_activation = torch.nn.Sigmoid()
fig, ax = plt.subplots(2, 10, figsize=(8, 2), tight_layout=True)

# Top row: ten training images; bottom row: their reconstructions decoded from
# the posterior location (no sampling).
x, label = next(iter(train_loader))
# Encode and decode the batch once, outside the plotting loop and without autograd.
with torch.no_grad():
    z_loc, z_scale = vae.encoder(x.reshape(-1, 28*28))
    reconstructions_mean = output_activation(vae.decoder(z_loc)).reshape(-1, 28, 28).numpy()
originals = x.numpy()

for i in range(10):
    ax[0, i].imshow(originals[i, 0, :, :], cmap=plt.cm.Greys_r)
    ax[0, i].axis('off')
    ax[1, i].imshow(reconstructions_mean[i], cmap=plt.cm.Greys_r)
    ax[1, i].axis('off')

In [None]:
# Decode a regular M x M grid of latent points into one big mosaic image.
M = 30
z_plot = np.linspace(-3, 3, num=M)

# Tile (i, j) shows the latent point (z_plot[j], z_plot[M-1-i]): the first
# coordinate grows to the right, the second one grows upwards.
z_first, z_second = np.meshgrid(z_plot, z_plot[::-1])
z_grid = torch.tensor(np.stack([z_first.ravel(), z_second.ravel()], axis=1), dtype=torch.float32)

# A single batched decoder call replaces M*M single-point calls.
with torch.no_grad():
    tiles = output_activation(vae.decoder(z_grid)).reshape(M, M, 28, 28)
# (row tile, col tile, pixel row, pixel col) -> (row tile, pixel row, col tile, pixel col)
big_imag = tiles.permute(0, 2, 1, 3).reshape(28*M, 28*M).numpy()

# Tiles are centred on the z_plot values, so the image spans half a grid step
# beyond the first and last value; this keeps the tick labels equal to the
# latent coordinates that were actually decoded.
half_step = (z_plot[1] - z_plot[0]) / 2
z_lo, z_hi = z_plot[0] - half_step, z_plot[-1] + half_step

fig, ax = plt.subplots(figsize=(9, 9), tight_layout=True)
Z_plot1, Z_plot2 = np.meshgrid(z_plot, z_plot)
ax.matshow(big_imag, vmin=0.0, vmax=1.0, cmap=plt.cm.gray, extent=[z_lo, z_hi, z_lo, z_hi])
#H, xedge, yedge = np.histogram2d(Z[:, 0], Z[:, 1], bins=30, range=[[-4, 4], [-4, 4]])
#ax.contour(Z_plot1, Z_plot2, H.T, linewidths=3, levels=[1], cmap=plt.cm.Reds);

### Neural Networks and irregular sampling other than phased-lstm?

- https://arxiv.org/abs/1806.07366v5, https://arxiv.org/abs/1907.03907
- http://proceedings.mlr.press/v80/binkowski18a.html
- https://openreview.net/forum?id=r1efr3C9Ym, https://papers.nips.cc/paper/6475-a-scalable-end-to-end-gaussian-process-adapter-for-irregularly-sampled-time-series-classification.pdf
