In [None]:
import torch
from torch import nn

from torch.utils import data
import torch.optim as optim

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.datasets import make_moons

import jupyter_black

jupyter_black.load()

import numpy as np

## Normalizing Flows

Normalizing flows are a class of models that can be used to represent complex densities. They are based on the following idea: given a simple density $p_0(z)$, we can transform it into a more complex density $p_k(z)$ by applying a sequence of invertible transformations $f_k = f \circ f_{k-1} \circ \dots \circ f_1$:
The change of variables formula gives us: 
$$ p_k(z) = p_0(f_k^{-1}(z)) \left| \det \frac{\partial f_k^{-1}(z)}{\partial z} \right| \qquad \qquad (1)$$ 
where $\left| \det \frac{\partial f_k^{-1}(z)}{\partial z} \right|$ is the absolute value of the Jacobian determinant of the inverse transformation $f_k^{-1}$.

Unfortunately, naively applying this idea is not practical, as the Jacobian determinant of a general transformation is very expensive to compute. Yet, as it turns out, we can specifically design the transformations $f_k$ such that the Jacobian determinant is easy to compute.

### Affine transformations

The simplest transformation is an affine transformation:
$$ f(z) = \mu + \sigma \odot z $$
where $\mu$ and $\sigma$ (with $\sigma_i \neq 0$) are learnable parameters and $\odot$ is the element-wise product. The Jacobian determinant of this transformation is:

$$ \left| \det \frac{\partial f(z)}{\partial z} \right| = \left| \det \operatorname{diag}(\sigma) \right| = \prod_{i=1}^D \left| \sigma_i \right| $$
where $D$ is the dimensionality of $z$.

And the inverse transformation is:
$$ f^{-1}(z) = \frac{z - \mu}{\sigma} $$

So what is the Jacobian determinant $\left| \det \frac{\partial f^{-1}(z)}{\partial z} \right|$ of the inverse transformation (the quantity we actually need in formula (1))?

So let's implement this transformation and its inverse.

*Task 1*:

Complete the following `affine_bijector` function and its inverse, `affine_bijector_inv_and_log_det`.

In [None]:
def affine_bijector(params, x):
    """
    transforms x with the element-wise affine transformation
    y = shift + exp(log_scale) * x

    args:
        params: shape (batch, dim, 2); params[..., 0] holds the shift and
            params[..., 1] holds the scale in log space
        x: shape (batch, dim)
    return:
        y: shape (batch, dim)
    """
    shift = params[..., 0]
    log_scale = params[..., 1]
    scale = torch.exp(log_scale)  # make the scale positive
    return shift + scale * x


def affine_bijector_inv_and_log_det(params, y):
    """
    transforms y with the inverse affine transformation
    x = (y - shift) / exp(log_scale)

    args:
        params: shape (batch, dim, 2); params[..., 0] holds the shift and
            params[..., 1] holds the scale in log space
        y: shape (batch, dim)
    return: x, log_det
        x: shape (batch, dim)
        log_det: shape (batch,), log |det| of the Jacobian of the inverse
    """
    shift = params[..., 0]
    log_scale = params[..., 1]
    scale = torch.exp(log_scale)  # make the scale positive

    x = (y - shift) / scale
    # The inverse Jacobian is diag(1 / scale), so its log-determinant is the
    # negative sum of the log-scales over the transformed dimensions.
    log_det = -log_scale.sum(dim=-1)
    return x, log_det

With the following three cells you can check if your implementation is correct. 

Be aware that we implemented the `params` argument in such a way that each `x` has its own parameters (the batch dimension).

In [None]:
# Round-trip check of the affine bijector on a small hand-written batch.

# a batch with 3 elements, each of dimension 2
x_test = torch.tensor([[1.0, 2.0], [3.0, 4.0], [3.0, 7.0]])

# every data point needs its own parameter set, so the (2, 2) parameter
# matrix is repeated along a new leading axis:
# the result has shape (batch, 2, 2) with batch = 3
params = torch.tensor([[1.0, 3.0], [2.0, 1.0]]).repeat(3, 1, 1)

# forward affine transformation
ys = affine_bijector(params, x_test)

# inverse affine transformation and its log determinant
xs_rec, log_det = affine_bijector_inv_and_log_det(params, ys)

# the inverse applied to the forward output should give back x_test
print("Recovered tensors are the same:", torch.isclose(xs_rec, x_test).all())
print("log_det:", log_det)

Is the `log_det` correct? How can you check this on the example above? 

*Hint:* 
To transform the scales to positive numbers, we take the $\exp$. 

### Check on one dimensional distribution
To have a closer look at the `log_det` we will test the transformation on a one dimensional distribution.

For this we will check if the transformed samples follow the transformed pdf.

In [None]:
# Check the affine bijector on random one dimensional data: the density
# obtained via the change of variables formula should match the histogram
# of the transformed samples.

# define base distribution p0 (standard normal)
p0 = torch.distributions.Normal(torch.tensor(0.0), torch.tensor([1.0]))

# define number of samples
n = 2_000

# if we want to have a fixed transformation for the whole batch,
# we need to repeat the parameters n times
params = torch.tensor([[2.0, 1.0]]).repeat(
    n, 1, 1
)  # shift and scale. Remember: scale is in log space!

# sample from base distribution
xs = p0.sample((n,))
# calculate log probability of samples
log_prob_xs = p0.log_prob(xs)

# transform samples
ys = affine_bijector(params, xs)

# inverse transform and compute log determinant
xs_rec, log_det = affine_bijector_inv_and_log_det(params, ys)

# calculate log probability of transformed samples:
# base log-density at the recovered point plus the log determinant of the inverse
log_prob_ys = p0.log_prob(xs_rec).squeeze() + log_det.squeeze()
# Can you explain why we need to add the log determinant to the log_prob?


# We can now plot samples from the transformed distribution as well as
# the transformed density function to check if the log_det is correct
s1 = plt.plot(
    xs.squeeze(),
    torch.exp(log_prob_xs.squeeze()).numpy(),
    "o",
    label="p0: Base distribution",
    color="red",
)

s2 = plt.plot(
    ys.squeeze(),
    torch.exp(log_prob_ys.squeeze()).numpy(),
    "o",
    label="p1: Transformed distribution",
    color="blue",
)
# plot normalized histograms of the samples; the density points above
# should trace the outline of the histogram of the same colour
_ = plt.hist(xs.numpy(), bins=20, density=True, alpha=0.5, color="red")
_ = plt.hist(ys.numpy(), bins=50, density=True, alpha=0.5, color="blue")
plt.legend()

# the inverse applied to the forward output should give back the base samples
print("Recovered tensors are the same:", torch.isclose(xs_rec, xs, atol=1e-3).all())

Let's see what a two-dimensional transformed distribution might look like:

In [None]:
# Visual check of the affine bijector on two dimensional data

# define base distribution p0 as a multivariate normal distribution
# with zero mean and identity covariance
p0 = torch.distributions.MultivariateNormal(torch.zeros(2), torch.eye(2))

# sample from the base distribution
n = 2_000
xs = p0.sample((n,))

# define parameters for affine transformation (and again repeat them for the whole batch)
params = torch.tensor([[3.0, 0.1], [-4.0, 2.0]]).repeat(n, 1, 1)

# transform samples and calculate log probability of samples
ys = affine_bijector(params, xs)
log_prob_xs = p0.log_prob(xs)

# inverse transform and log probs
xs_rec, log_det = affine_bijector_inv_and_log_det(params, ys)

# change of variables: base log-density at the recovered point plus log_det
log_prob_ys = p0.log_prob(xs_rec) + log_det

# Plot a two dimensional kernel density estimation (kde)
# of the base samples (red) and the transformed samples (blue)

sns.kdeplot(
    x=xs[:, 0].numpy(),
    y=xs[:, 1].numpy(),
    fill=False,
    cmap="Reds",
    color="red",
    label="p0: Base distribution",
    vmin=0,
)
sns.kdeplot(
    x=ys[:, 0].numpy(),
    y=ys[:, 1].numpy(),
    fill=False,
    cmap="Blues",
    label="p1: Transformed distribution",
)

# let's add colored patches to the legend, one per distribution
handles = [
    mpatches.Patch(facecolor=plt.cm.Reds(100), label="p0: Base distribution"),
    mpatches.Patch(facecolor=plt.cm.Blues(100), label="p1: Transformed distribution"),
]
plt.legend(handles=handles)

plt.xlim(-10, 10)
plt.ylim(-10, 10)
plt.xlabel("x1")
plt.ylabel("x2")


# the inverse applied to the forward output should give back the base samples
print("Recovered tensors are the same:", torch.isclose(xs_rec, xs, atol=1e-5).all())

Nice! We can now already transform base distributions with an affine transformation.

The downside is: the transformed distribution is still a Gaussian....

So what to do...????

Well, we could transform some dimensions with a function that depends on the other dimensions, so called 

## Coupling layers

Problematically affine transformations alone won't help us to represent complex densities. An affine transformation of a Gaussian is still a Gaussian. To overcome this problem, we can use **coupling layers**. The idea is to split the input $z$ into two parts $z_1$ and $z_2$ and only transform $z_2$:

$$ \begin{align*}
z_1, z_2 &= \text{split}(z)\\
z_1' &= z_1 \\
z_2' &= z_2 \odot \exp(g_{\phi_2}(z_1)) + g_{\phi_1}(z_1)  
\end{align*} $$

The Jacobian determinant of this transformation still has a simple form:
$$ \left| \det \frac{\partial f(z)}{\partial z} \right| = \exp \left( \sum_{i=1}^{D_2} g_{\phi_2}^i(z_1) \right) $$
where $D_2$ is the dimensionality of $z_2$; you can easily verify this yourself.

Further we can easily compute the inverse transformation:
$$ \begin{align*}
z_1', z_2' &= \text{split}(z')\\
z_1 &= z_1' \\
z_2 &= (z_2' - g_{\phi_1}(z_1')) \odot \exp(-g_{\phi_2}(z_1'))
\end{align*} $$

Take a close look: These are the same formulas as above, but we replaced the constant shift and scale by the functions 
 $g_{\phi_1}$  and $\exp(g_{\phi_2})$.

We can now parameterize $g_{\phi_1}$ and $g_{\phi_2}$ using neural networks. This allows us to learn complex transformations $f$ that can be used to represent complex densities.

Notably, with a single coupling layer we do not change the distribution of $z_1$ and only transform $z_2$. Therefore we will stack several layers later on. 

#### Important:
To use coupling layers for **conditional** density estimation, the functions $g_i$ will additionally take a context variable as input.

*Task*:

Complete the code below to implement one single coupling layer. 

In [None]:

class CouplingNet(nn.Module):
    """One coupling layer.

    A random half of the input dimensions is transformed by a bijector whose
    parameters are predicted by a small MLP from the remaining (unchanged)
    dimensions and the context.
    """

    def __init__(
        self,
        input_dim,
        context_dim,
        bijector_dim,
        bijector_fn,
        bijector_inverse_and_log_det_fn,
        hidden_dims=[50, 50],
        act=nn.ReLU(),
    ):
        """
        implements a neural network which parameterizes the bijector.
        args:
            input_dim: dimension of the input
            context_dim: dimension of the context
            bijector_dim: number of bijector parameters per transformed dimension
            bijector_fn: function which implements the forward pass of the bijector,
                called as bijector_fn(params, x)
            bijector_inverse_and_log_det_fn: function which implements the inverse
                pass of the bijector, called as fn(params, y) and returning (x, log_det)
            hidden_dims: list of hidden dimensions
            act: activation function
        """
        super().__init__()

        self.bijector_fn = bijector_fn
        self.bijector_inverse_and_log_det_fn = bijector_inverse_and_log_det_fn
        self.bijector_dim = bijector_dim

        # Split the input dimensions at random into a part that is transformed
        # and a part that is passed through unchanged.
        self.n_change = input_dim // 2  # number of dimensions to modify
        self.n_unchange = input_dim - self.n_change  # number of dimensions to keep
        random_dims = torch.randperm(input_dim)  # random permutation of the dimensions
        # The split is part of the layer's definition: register it as buffers so
        # it is stored in the state_dict and follows the module across devices.
        self.register_buffer("changedim", random_dims[: self.n_change].clone())
        self.register_buffer("unchangedim", random_dims[self.n_change :].clone())

        # The network sees the unchanged dimensions plus the context and emits
        # bijector_dim parameters for every dimension that is transformed.
        net_input_dim = self.n_unchange + context_dim
        output_dim = self.n_change * bijector_dim
        block = [nn.Linear(net_input_dim, hidden_dims[0]), act]
        for i in range(1, len(hidden_dims)):
            block += [nn.Linear(hidden_dims[i - 1], hidden_dims[i]), act]
        block += [nn.Linear(hidden_dims[-1], output_dim)]
        self.net = nn.Sequential(*block)

    def _bijector_params(self, unchanged, context):
        """Predict bijector parameters of shape (batch, n_change, bijector_dim)."""
        net_input = torch.cat([unchanged, context], dim=-1)
        return self.net(net_input).reshape(-1, self.n_change, self.bijector_dim)

    def forward(self, x, context):
        """Forward pass of the coupling layer.

        args:
            x: shape (batch, input_dim)
            context: shape (batch, context_dim)
        return:
            y: shape (batch, input_dim); a new tensor, x is left untouched
        """
        # split the input into the dimensions to change and the dimensions to keep unchanged
        x1 = x[:, self.unchangedim]
        x2 = x[:, self.changedim]
        # calculate the parameters of the bijector,
        # dependent on the dimensions to keep unchanged and the context
        bijector_params = self._bijector_params(x1, context)
        # apply the bijector
        y2 = self.bijector_fn(bijector_params, x2)
        # write the transformed dimensions into a copy so the caller's tensor
        # is not modified behind its back
        y = x.clone()
        y[:, self.changedim] = y2
        return y

    def inverse(self, y, context):
        """Inverse pass of the coupling layer.

        args:
            y: shape (batch, input_dim)
            context: shape (batch, context_dim)
        return: x, log_det
            x: shape (batch, input_dim); a new tensor, y is left untouched
            log_det: whatever bijector_inverse_and_log_det_fn returns as log_det
        """
        # the unchanged dimensions are identical in x and y, so the bijector
        # parameters can be recomputed from y alone
        y1 = y[:, self.unchangedim]
        y2 = y[:, self.changedim]
        bijector_params = self._bijector_params(y1, context)
        x2, log_det = self.bijector_inverse_and_log_det_fn(bijector_params, y2)
        x = y.clone()
        x[:, self.changedim] = x2

        return x, log_det

In [None]:
# Let's test the CouplingNet for some random data
input_dim = 10
context_dim = 2
n = 10  # number of samples
# create random data
x = torch.randn(n, input_dim)
# create some random context (same number of samples as x)
context = torch.randn(n, context_dim)

# initialise a CouplingNet
coupling_net = CouplingNet(
    input_dim, context_dim, 2, affine_bijector, affine_bijector_inv_and_log_det
)  # bijector dim is 2 (shift and scale)

# Pass copies into the layer: a coupling layer that writes the transformed
# dimensions into the tensor it receives would otherwise make x, y and x_rec
# the very same tensor, and the check below would be true by construction.

# apply the forward pass
y = coupling_net(x.clone(), context)

# apply the inverse pass
x_rec, log_det = coupling_net.inverse(y.clone(), context)

# check if x and x_rec are the same
print("The inverse is working for all x: ", torch.allclose(x, x_rec, atol=1e-5))

### Normalizing flows with coupling layers

We can now use coupling layers to construct a normalizing flow. We start with a simple density $p_0(z)$ and then apply a sequence of coupling layers to obtain a more complex density $p_k(z)$.
To address the problem that only $z_2$ is transformed, we can simply choose different dimensions to transform in each layer. 

In [None]:
# TODO: comment the code below to explain what is happening


class NormalizingFlow(nn.Module):
    """Conditional normalizing flow built from a stack of coupling layers."""

    def __init__(
        self,
        input_dim,
        context_dim,
        bijector_dim,
        bijector_fn=affine_bijector,
        bijector_inverse_and_log_det_fn=affine_bijector_inv_and_log_det,
        num_layers=5,
        hidden_dims=[50, 50],
        act=nn.ReLU(),
    ):
        """
        Implements a normalizing flow with coupling layers.
        args:
            input_dim: dimension of the input
            context_dim: dimension of the context
            bijector_dim: needed dimensions for the bijector
            bijector_fn: function which implements the forward pass of the bijector
            bijector_inverse_and_log_det_fn: function which implements the inverse pass of the bijector
            num_layers: number of coupling layers
            hidden_dims: list of hidden dimensions per coupling layer
            act: activation function
        """
        super().__init__()
        # One independently initialised coupling layer per flow step. Each layer
        # draws its own random split of the dimensions, so different layers
        # transform different dimensions.
        self.bijectors = nn.ModuleList(
            [
                CouplingNet(
                    input_dim,
                    context_dim,
                    bijector_dim,
                    bijector_fn,
                    bijector_inverse_and_log_det_fn,
                    hidden_dims,
                    act,
                )
                for _ in range(num_layers)
            ]
        )
        # Base density: standard normal per dimension; Independent(..., 1) makes
        # log_prob sum over the last axis, giving one value per sample.
        self.base_dist = torch.distributions.Independent(
            torch.distributions.Normal(torch.zeros(input_dim), torch.ones(input_dim)), 1
        )

    def forward(self, x, context):
        """Push base-space points x through all coupling layers, in order.

        args:
            x: shape (batch, input_dim)
            context: shape (batch, context_dim)
        return:
            y: shape (batch, input_dim)
        """
        # Work on a copy: the layers may write their result into the tensor
        # they are given, which would silently overwrite the caller's x.
        y = x.clone()
        for bijector in self.bijectors:
            y = bijector(y, context)
        return y

    def log_prob(self, y, context):
        """Log-density of y given context via the change of variables formula."""
        x, log_det = self.inverse(y, context)
        # base log-density at the recovered point plus the accumulated
        # log-determinant of the inverse transformation
        return self.base_dist.log_prob(x) + log_det

    def sample(self, num_samples, context):
        """samples from flow distribution for one context"""
        # sample from base distribution
        y = self.base_dist.sample((num_samples,))
        # repeat the context s.t. it also has a batch dimension
        context = torch.broadcast_to(context, y.shape[:-1] + context.shape[-1:])
        # transforming base samples is exactly the forward pass
        return self.forward(y, context)

    def inverse(self, y, context):
        """Map data-space points y back to base space.

        return: x, log_det
            x: shape (batch, input_dim)
            log_det: sum of the log_det terms returned by the individual layers
        """
        # Work on a copy for the same reason as in forward.
        x = y.clone()
        log_det = 0
        for bijector in reversed(
            self.bijectors
        ):  # we need to reverse the order of the bijectors!!!
            x, log_det_ = bijector.inverse(x, context)
            # out-of-place add, so a tensor returned by a layer is never modified
            log_det = log_det + log_det_
        return x, log_det

In [None]:
# dimensions of a small flow used for the round-trip check
input_dim = 7
context_dim = 3
bijector_dim = 2  # shift and (log-)scale
net = NormalizingFlow(
    input_dim=input_dim,
    context_dim=context_dim,
    bijector_dim=bijector_dim,
    bijector_fn=affine_bijector,
    num_layers=5,
)

In [None]:
x = torch.randn(10, input_dim)
context = torch.randn(10, context_dim)
# Pass copies into the flow: if a layer writes into the tensor it receives,
# x, y and x_rec would all be the same tensor and the check below would be
# true by construction.
y = net(x.clone(), context)
x_rec, log_det = net.inverse(y.clone(), context)

# check if x and x_rec are the same
print("The inverse is working for all x: ", torch.allclose(x, x_rec, atol=1e-5))

## Conditional Flows on Two Moons dataset

We can now use normalizing flows to construct expressive conditional density estimators $q_\phi(\theta|x)$. Let's look at the same example of the two moons:

In [None]:
def get_moons(n, noise=0.1, mode="fixed", shift=torch.tensor([[0.0, 0.0], [0.0, 0.0]])):
    """Returns moons dataset with two moons, labels and the applied shift.
    Args:
        n: number of samples
        noise: noise level, passed on to make_moons
        mode: "fixed" adds shift[0] to the samples of moon 0 and shift[1] to
            the samples of moon 1; "random" ignores `shift` and draws an
            individual shift per sample, uniformly from [-5, 5) per coordinate
        shift: shift of the moons, shape (2, 2). Only used for mode="fixed"
    Returns:
        X: shape (n, 2)
        ids: shape (n, 1)
        shift: the shift that was applied; the given (2, 2) tensor for
            mode="fixed", the sampled (n, 2) tensor for mode="random"
    Raises:
        ValueError: if mode is neither "fixed" nor "random"
    """
    # fail early instead of silently returning unshifted data for a typo'd mode
    if mode not in ("fixed", "random"):
        raise ValueError(f'mode must be "fixed" or "random", got {mode!r}')

    X, ids = make_moons(n_samples=n, noise=noise)
    X = torch.tensor(X, dtype=torch.float)
    ids = torch.tensor(ids, dtype=torch.float)

    # shift the moons
    if mode == "fixed":
        X[ids == 0] += shift[0]
        X[ids == 1] += shift[1]
    else:
        shift = (torch.rand(n, 2) - 0.5) * 10
        X += shift

    return X, ids.unsqueeze(-1), shift

In [None]:
# move moon 0 by (-2, 1) and leave moon 1 where it is
shift = torch.tensor([[-2.0, 1.0], [0.0, 0.0]])

# the moon id of every sample serves as the context
X, context, shift = get_moons(1_000, mode="fixed", shift=shift)

In [None]:
# scatter plot of the data, one series per moon id
for moon_id in (0, 1):
    moon_mask = context.squeeze() == moon_id
    plt.plot(
        X[moon_mask, 0],
        X[moon_mask, 1],
        ".",
        alpha=0.5,
        label=f"context {moon_id}",
    )

plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Two Moons Dataset")
plt.xlim(-4, 4)
plt.ylim(-3, 3)

# optional: mark the shifts
# plt.plot(shift[:, 0], shift[:, 1], "x", color="red", label="shifts")

plt.legend()

In [None]:
# wrap samples and contexts in a dataset of (x, context) pairs
dataset = data.TensorDataset(X, context)
# iterate over it in mini-batches of 50
train_loader = data.DataLoader(dataset, batch_size=50)

In [None]:
# two dimensional samples, conditioned on a one dimensional context
input_dim = 2
context_dim = 1
bijector_dim = 2

flow = NormalizingFlow(
    input_dim=input_dim,
    context_dim=context_dim,
    bijector_dim=bijector_dim,
    bijector_fn=affine_bijector,
    num_layers=5,
)
# standard optimizer used across a lot of neural network tasks
opt = optim.Adam(flow.parameters(), lr=0.001)

# training loop: minimise the mean negative log-likelihood per mini-batch
for e in range(100):
    for x_batch, context_batch in train_loader:
        opt.zero_grad()
        loss = -flow.log_prob(x_batch, context_batch).mean()
        loss.backward()
        opt.step()
    # report the loss of the last mini-batch every 10 epochs
    if e % 10 == 0:
        print("epoch {}: loss {}".format(e, loss.item()))

In [None]:
# draw samples from the learned conditional density for context 1
context_gt = torch.ones((1, 1))
posterior_samples = flow.sample(num_samples=500, context=context_gt).detach()

In [None]:
# evaluate the pdf on a grid
x, y = torch.meshgrid(
    torch.arange(-4, 4, 0.01), torch.arange(-3, 3, 0.01), indexing="xy"
)
# one row per grid point: (x-coordinate, y-coordinate)
xy = torch.stack((x.flatten(), y.flatten()), dim=1)

# Pure evaluation: disable autograd so no computation graph is built (and
# kept in memory) for the several hundred thousand grid points.
with torch.no_grad():
    probs = torch.exp(flow.log_prob(xy, context_gt.repeat(xy.shape[0], 1)))
# treat undefined densities as zero so they do not break the plot
probs[torch.isnan(probs)] = 0.0

# back to the grid layout for plotting
probs_np = probs.reshape(x.shape).numpy()

In [None]:
# plot the data together with the learned posterior

# switches for the two posterior visualisations
plot_posterior_samples = False
plot_posterior_pdf = True

# training data, one series per moon id
for moon_id in (0, 1):
    moon_mask = context.squeeze() == moon_id
    plt.plot(
        X[moon_mask, 0],
        X[moon_mask, 1],
        ".",
        alpha=0.4,
        label=f"context {moon_id}",
    )

# samples drawn from the flow
if plot_posterior_samples:
    plt.plot(
        posterior_samples[:, 0],
        posterior_samples[:, 1],
        ".",
        label=f"Posterior Samples for context {context_gt.item()}",
        color="red",
        alpha=0.5,
    )

# density evaluated on the grid
if plot_posterior_pdf:
    plt.pcolormesh(x[0], y[:, 0], probs_np[:-1, :-1], label="posterior pdf")

plt.xlabel("x1")
plt.ylabel("x2")
plt.xlim(-4, 4)
plt.ylim(-3, 3)
plt.legend()

In [None]:
# the posterior density on its own, without the data on top
if plot_posterior_pdf:
    plt.pcolormesh(x[0], y[:, 0], probs_np[:-1, :-1], label="posterior pdf")

### Now let's test this with different shifts

In [None]:
n = 10_000

# every sample gets its own random shift
X, ids, shift = get_moons(n, mode="random")
# context per sample: (moon id, shift in x1, shift in x2) -> shape (n, 3)
context = torch.cat([ids, shift], dim=1)

In [None]:
# wrap samples and contexts in a dataset of (x, context) pairs
dataset = data.TensorDataset(X, context)
# iterate over it in mini-batches of 100
train_loader = data.DataLoader(dataset, batch_size=100)

In [None]:
# two dimensional samples, conditioned on a three dimensional context
input_dim = 2
context_dim = 3
bijector_dim = 2

flow = NormalizingFlow(
    input_dim=input_dim,
    context_dim=context_dim,
    bijector_dim=bijector_dim,
    bijector_fn=affine_bijector,
    num_layers=5,
)
# standard optimizer used across a lot of neural network tasks
opt = optim.Adam(flow.parameters(), lr=0.001)

# training loop: minimise the mean negative log-likelihood per mini-batch
for e in range(100):
    for x_batch, context_batch in train_loader:
        opt.zero_grad()
        loss = -flow.log_prob(x_batch, context_batch).mean()
        loss.backward()
        opt.step()
    # report the loss of the last mini-batch every 10 epochs
    if e % 10 == 0:
        print("epoch {}: loss {}".format(e, loss.item()))

In [None]:
# define one observation
# id and shift
id_gt = 0
# Build the context as float: the flow is trained on float contexts, and an
# integer tensor here would only work through implicit type promotion.
context_gt = torch.tensor([id_gt, -2.0, 5.0]).unsqueeze(0)

# sample from posterior
posterior_samples = flow.sample(10000, context_gt).detach()

# get ground truth samples with fixed context

# the observed moon gets the observed shift,
# the other moon (which is not needed) just gets [0, 0]
n = 1000
moon_shift = [context_gt[0, 1].item(), context_gt[0, 2].item()]
if id_gt == 0:
    shift = torch.tensor([moon_shift, [0.0, 0.0]])
elif id_gt == 1:
    shift = torch.tensor([[0.0, 0.0], moon_shift])
else:
    # there are only two moons; anything else would silently reuse a stale shift
    raise ValueError(f"id_gt must be 0 or 1, got {id_gt!r}")

X, ids, shift = get_moons(n, mode="fixed", shift=shift)

In [None]:
# ground truth samples, one series per moon id
for moon_id in (0, 1):
    moon_mask = ids.squeeze() == moon_id
    plt.plot(
        X[moon_mask, 0],
        X[moon_mask, 1],
        ".",
        alpha=0.5,
        label=f"context {moon_id}",
    )

# samples drawn from the flow for the chosen observation
plt.plot(
    posterior_samples[:, 0],
    posterior_samples[:, 1],
    ".",
    label=f"Posterior Samples for context {context_gt}",
    color="red",
    alpha=0.1,
)


plt.xlabel("x1")
plt.ylabel("x2")
plt.xlim(-5, 5)
plt.ylim(-5, 8)
plt.legend()

*Bonus question:*

Can you come up with an example 2d dataset, for which the normalizing flow fails or has difficulties? 