## Week 10 Lecture 1 - Gaussian Processes

McElreath's lectures for today: https://www.youtube.com/watch?v=PIuqxOBJqLU&list=PLDcUM9US4XdMROZ57-OIRtIK0aOynbgZN&index=16

McElreath's lectures for the whole book are available here: https://github.com/rmcelreath/stat_rethinking_2022

An R/Stan repo of code is available here: https://vincentarelbundock.github.io/rethinking2/

Dustin Stansbury has some lovely PyMC Code available here: https://github.com/dustinstansbury/statistical-rethinking-2023

You are encouraged to work through both of these versions to reinforce what we're doing in class.

In [None]:
# Import python packages
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp 
import random as rd
import pdb
import pymc as pm
import arviz as az
import networkx as nx
from matplotlib import pyplot as plt
import dataframe_image as dfi

# Helper functions
def stdize(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation.

    The mean and standard deviation are computed with ``np.mean`` and
    ``np.std`` using their default arguments.
    """
    centre = np.mean(x)
    spread = np.std(x)
    return (x - centre) / spread


def indexall(L):
    """Map the entries of ``L`` onto integer codes.

    Returns a tuple ``(uniques, codes)`` where ``uniques`` lists the distinct
    entries of ``L`` in order of first appearance and ``codes`` is a NumPy
    array giving, for every entry of ``L``, its position in ``uniques``.
    """
    uniques = []
    for item in L:
        if item in uniques:
            continue
        uniques.append(item)
    positions = [uniques.index(item) for item in L]
    return uniques, np.array(positions)

def logit(p):
    """Return the log-odds of ``p``, computed as ``log(p) - log(1 - p)``."""
    log_success = np.log(p)
    log_failure = np.log(1 - p)
    return log_success - log_failure

def invlogit(p):
    """Return the inverse logit (logistic function) of ``p``: 1 / (1 + exp(-p)).

    Uses ``scipy.special.expit`` instead of ``exp(p) / (1 + exp(p))``: the
    naive form overflows to ``inf / inf = nan`` for large positive ``p``,
    whereas ``expit`` saturates cleanly at 0 and 1.

    Parameters
    ----------
    p : scalar or array-like
        Value(s) on the logit (log-odds) scale.

    Returns
    -------
    Float scalar or ndarray of values in [0, 1].
    """
    # Local import keeps this helper self-contained
    from scipy.special import expit

    return expit(p)


from matplotlib.patches import Ellipse
from scipy.stats import chi2


def Gauss2d(mu, cov, ci, ax=None, ec='k'):
    """Draw probability ellipses of a 2-D Gaussian (adapted from statsmodels).

    Parameters
    ----------
    mu : sequence
        Mean vector; its first two entries are used as the ellipse centre.
    cov : array-like
        Covariance matrix, decomposed with ``np.linalg.eigh``.
    ci : float or iterable of float
        Probability level(s); one ellipse is drawn per level.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 6x6 inch figure is created when omitted.
    ec : matplotlib colour, optional
        Edge colour of the ellipses.

    Returns
    -------
    The axes the ellipses were added to.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    v_, w = np.linalg.eigh(cov)
    # NOTE(review): eigh returns eigenvectors as columns, while w[0] is the
    # first row. Kept exactly as in the statsmodels original - confirm that
    # the resulting orientation is as intended.
    u = w[0] / np.linalg.norm(w[0])
    angle = np.arctan(u[1]/u[0])
    angle = 180 * angle / np.pi # convert to degrees

    # Accept a single level as well as a collection of levels
    for level in np.atleast_1d(ci):
        v = 2 * np.sqrt(v_ * chi2.ppf(level, 2)) #get size corresponding to level
        # `angle` is passed by keyword: newer matplotlib releases no longer
        # accept it positionally. The alpha is fixed at 0.5, which is what
        # the previous code ended up with after its set_alpha(0.5) call.
        ell = Ellipse(mu[:2], v[0], v[1], angle=180 + angle, facecolor='None',
                      edgecolor=ec,
                      alpha=0.5,
                      lw=1.5)
        ell.set_clip_box(ax.bbox)
        ax.add_artist(ell)

    return ax

# Gaussian processes

While linear models have got us a long way, and are typically fit for purpose in many contexts, we might also profitably build models that need a bit more wiggle to them. This can be in terms of both the mean function and aspects related to covariances. We saw how splines were built early on, but in terms of non-linearities we might be able to do a bit better, and estimate regularized functions of the data that don't break down into linear model components with arbitrary numbers of knots and basis functions. 

One particularly Bayesian way to do this is through the use of [Gaussian Processes](https://blog.dominodatalab.com/fitting-gaussian-process-models-python/) (GPs), which are, in essence, distributions over functions. This might seem weird, but all we're doing in putting a GP prior in place is drawing the values for function f(x) from a multivariate normal distribution. PyMC is particularly good for fitting GP means or covariances, with [great built in functions](https://www.pymc.io/projects/docs/en/stable/api/gp.html).

So what are we talking about, well stealing [this example from the PyMC-learn site](https://pymc-learn.readthedocs.io/en/latest/notebooks/GaussianProcessRegression.html), we can simulate some data

In [None]:
# Simulate noisy observations of one random function drawn from a zero-mean GP
n = 150 # The number of data points
X = np.linspace(start = 0, stop = 10, num = n)[:, None] # The inputs to the GP, they must be arranged as a column vector

# Define the true covariance function and its parameters
# NOTE: despite its name, `signal_variance_true` is squared below, so it acts as the signal standard deviation (amplitude)
length_scale_true = 1.0
signal_variance_true = 3.0
cov_func = signal_variance_true**2 * pm.gp.cov.ExpQuad(1, length_scale_true)

# A mean function that is zero everywhere
mean_func = pm.gp.mean.Zero()

# The latent function values are one sample from a multivariate normal
# Note that we have to call `eval()` because PyMC is built on top of a symbolic tensor backend (Aesara/PyTensor)
# The 1e-8 added to the diagonal is a small jitter on the covariance matrix
f_true = np.random.multivariate_normal(mean_func(X).eval(),
                                       cov_func(X).eval() + 1e-8*np.eye(n), 1).flatten()

# The observed data is the latent function plus Gaussian distributed noise
# NOTE: despite its name, `noise_variance_true` multiplies standard-normal draws, so it is the noise standard deviation
noise_variance_true = 2.0
y = f_true + noise_variance_true * np.random.randn(n)

## Plot the data and the unobserved latent function, then save the figure to 'simgp.jpg'
fig = plt.figure()
ax = fig.gca()
ax.plot(X, f_true, "dodgerblue", lw=3, label="True f");
ax.plot(X, y, 'ok', ms=3, label="Data");
ax.set_xlabel("X"); ax.set_ylabel("y"); plt.legend()
plt.savefig('simgp.jpg',dpi=300);

This is a pretty wiggly generating function, whose shape we could use a spline to estimate. But let's see what a GP would do with this

In [None]:
# Marginal GP regression of y on X with a Matern-3/2 covariance
with pm.Model() as gp_fit:
    # Matern function parameters for covariance of adjacent points
    # ρ is passed to Matern32 as its second argument; η multiplies the whole kernel
    ρ = pm.Gamma('ρ', 1, 1)
    η = pm.Gamma('η', 1, 1)
    K = η * pm.gp.cov.Matern32(1, ρ)
    
    # Mean function - in this case zero, but could be a linear model etc
    M = pm.gp.mean.Zero()
    
    # Gaussian process prior - M can be omitted because default is zero, and not much gained by adding non-zero fn
    gp = pm.gp.Marginal(mean_func=M, cov_func=K)

    
    # Data likelihood: observation noise σ is handed to the marginal likelihood
    # NOTE(review): newer PyMC releases appear to prefer `sigma=` over `noise=` here - confirm against the installed version
    σ = pm.HalfCauchy('σ', 2.5)
    y_obs = gp.marginal_likelihood('y_obs', X=X, y=y, noise=σ)

So what's going on here? Well, the Matern function characterizes how quickly correlations between adjacent observations decline, and that gets put into something called a GP Marginal distribution. So what's that? Well, it's the prior distribution on the wiggly functions. Let's fit it to see what this looks like

In [None]:
# Sample the posterior of the GP model; the result is kept in `trace` for the cells below
# (n_init is forwarded to pm.sample - presumably only relevant to some initialisation methods; verify)
with gp_fit:
    trace = pm.sample(2000, n_init=20000)

In [None]:
# Trace plots for the kernel parameters (ρ, η) and the observation noise (σ)
pm.plot_trace(trace, var_names=['ρ', 'σ', 'η']);

In [None]:
# New input locations: 100 points on [-2, 12], as a column vector
Z = np.linspace(-2, 12, 100).reshape(-1, 1)
with gp_fit:
    # GP conditional at the new locations Z, registered in the model as 'fp'
    fp = gp.conditional('fp', Z)
    # Draw 'fp' for the posterior samples stored in `trace`
    ppc = pm.sample_posterior_predictive(trace, var_names=['fp'])

In [None]:
# Draws of 'fp' from the first chain, one grey curve per draw
fp_draws = ppc.posterior_predictive['fp'].values[0]
plt.plot(Z, fp_draws.T, c='grey', alpha=0.1)
# Overlay the observations in red
plt.scatter(X, y, c='red')
plt.xlim(-2, 12)
plt.savefig('gps.jpg', dpi=300);

Looking at draws from the posterior, these are draws from a population of **functions**, rather than from parameters. It's insane, but dammit it works well. Let's have a look then at two examples from Rethinking.

# Oceanic tools again

The oceanic tools example has, sitting behind it, the sea-going distances between islands, meaning that tool complexity is very likely impacted by how far each society is from other societies, implying a spatial dependence. There are many ways to deal with spatial structure but GPs are an increasingly common way to handle them, primarily because we can allow for spatial decay in relatedness among locations with functions on their covariances. Let's see how to do this with the `Kline2.csv` data

In [None]:
# Import Oceanic Tools data
# Import Oceanic Tools data
kdata = pd.read_csv('Kline2.csv')
# Save the table as an image ('kdata.jpg'), then display it
dfi.export(kdata, 'kdata.jpg')
kdata

In [None]:
# Import Distance matrix for Islands
# Import Distance matrix for Islands (first CSV column is used as the row index)
Dmat = pd.read_csv('islandsDistMatrix.csv', index_col=0)
# Grab island names
SocNames = Dmat.columns.values
# Take a look: round to one decimal, save as an image ('dmat.jpg') and display
tmp = Dmat.round(1)
dfi.export(tmp, 'dmat.jpg')
tmp

The tools data itself we've already seen, however we now also have a distance matrix (in 1000's of km's) of how far each island is from the others. This is what we'll use for the covariance part of our variance-covariance (aka covariance) matrix.

The model that incorporates this spatial structure builds on the 'scientific model' from Week 7, but now with the multivariate normal structure in there to include the distance matrix information. Repeated here from Week 8:

*Scientific model*

Based on domain knowledge - i.e. our understanding of the system under study - it should make sense that innovation in tool development increases with population size but with diminishing returns; eventually each additional person will add less innovation. Also cultures tend to discard tools over time, replacing them with new ones. These two processes can be represented by

$$
\Delta T = \alpha P^{\beta} - \gamma T
$$

where the change in tool number per time step ($\Delta T$) is equal to some increase ($\alpha$) proportional to population size, with diminishing returns ($\beta$), and the per-unit-time rate of tool loss ($\gamma$). If we then set $\Delta T=0$, we'll get the equilibrium tool set size


$$
\hat{T} = \frac{\alpha P^{\beta}}{\gamma}.
$$

We can encode this into a statistical model, along with a new intercept for each island $k_i$ that is estimated from the distance matrix:

$$
\begin{align}
T_i \sim & Poisson(\lambda_i) \\
\lambda_i = & e^{k_i}\frac{\alpha P^{\beta}}{\gamma} \\
\left[\begin{array}{c}
k_{1} \\
k_{2} \\
... \\
k_{10}
\end{array}\right] \sim & MvN \left(\left[\begin{array}{c}
0 \\
0 \\
... \\
0 
\end{array}\right], K \right) \\
K_{ij} = & \eta^2 e^{-\rho^2 D^{2}_{ij}} + \delta_{ij}\sigma^2
\end{align}
$$

The critical developments come at the bottom, where average values for $k$ are $e^0=1$, meaning nothing changes in the scientific model function at the average. Where $k<0$, $\lambda$ decreases and where $k>0$, $\lambda$ increases. 

Values for $k$ on each island are then altered by the covariance matrix $K$, which itself is defined by two additive parts. The second part $\delta_{ij}\sigma^2$ is just the variance within each society along the matrix diagonal, and zero elsewhere. The first bit however is the covariance part, with correlation $\rho$ and distance matrix $D$, which says that the covariance between any two places, $i$ and $j$, declines exponentially with the square of the distance between them. Why the square? Because it makes a nice shape in terms of decay that allows for more rapid decline as distance increases than would be the linear case. If you think about setting off in a boat, things get far more difficult quite quickly as you get further from your home island. You can see the difference for yourself

In [None]:
# Initialize plot
# Single square panel comparing two shapes of distance decay
_, ax = plt.subplots(figsize=(5, 5))

# Distance range
xrange = np.linspace(0, 4, 100)

# (curve, line format, legend label): exp(-d) dashed, exp(-d^2) solid
decay_curves = (
    (np.exp(-1*xrange), 'k--', 'Linear'),
    (np.exp(-1*xrange**2), 'k', 'Squared'),
)
for curve, fmt, name in decay_curves:
    ax.plot(xrange, curve, fmt, label=name)

ax.set_xlabel('distance')
ax.set_ylabel('correlation')
plt.legend()
plt.savefig('decay.jpg', dpi=300);

With these elements in place, we can add our GP for exponential decline into our scientific model

In [None]:
# Total tools - response
# Total tools - response
T = kdata.total_tools.values
# Number of islands
nsoc = len(T)

# log-Population size (the `logpop` column of kdata)
P = kdata.logpop.values
# Contact levels: C holds the distinct values in order of appearance, Ic the integer index of each row's value
C,Ic = indexall(kdata.contact.values)

# Distance matrix as a plain array
# NOTE(review): assumes the rows of kdata are in the same society order as the distance matrix - confirm
Dmat_ = Dmat.values
# Squared distance matrix
Dmatsq = np.power(Dmat_, 2)

# Society index (0 .. nsoc-1, one entry per row)
Is = np.arange(nsoc)

In [None]:
# Scientific tools model with a distance-based multivariate-normal effect per society
with pm.Model(coords={'Contact':C}) as SciToolsGP:
    # Innovation rate
    α = pm.Exponential('iRate(α)', 1)
    # Diminishing returns
    β = pm.Exponential('dReturns(β)', 1)
    # Tool loss rate
    γ = pm.Exponential('lRate(γ)', 1)
    
    # Maximum covariance
    etasq = pm.Exponential('etasq', 2)
    # Rate of decline of covariance with squared distance
    rhosq = pm.Exponential('rhosq', 0.5)
    # Variance-Covariance: squared-exponential decay plus 0.01 on the diagonal
    # NOTE: the diagonal 0.01 sits inside the parentheses, so it is scaled by etasq as well
    Kij = etasq*(pm.math.exp(-rhosq*Dmatsq)+np.diag([.01]*nsoc))
    
    # Distance-based effects: one zero-mean value per society with covariance Kij
    k = pm.MvNormal('k', mu=np.zeros(nsoc), cov=Kij, shape=nsoc)
    
    # Scientific distance model: exp(k) multiplies the equilibrium tool count α*P^β/γ
    # (P is the log-population array prepared above)
    λ = pm.math.exp(k[Is])*(α*P**β/γ)

    # Likelihood
    Yi = pm.Poisson('TotalTools', λ, observed=T)

In [None]:
with SciToolsGP:
    trace_t = pm.sample(1000, tune=1000)

In [None]:
# Posterior summary table; saved as a colour-graded image ('scitoolsgp.jpg') and displayed
tmp = pm.summary(trace_t)
dfi.export(tmp.style.background_gradient(), 'scitoolsgp.jpg')
tmp

Ok, things sampled ok (we can likely do better by non-centering k) but for now we'll run with it and take a look at our decay function and see how our priors and posteriors compare

In [None]:
# Initialize plot
# Initialize plot: prior covariance functions on the left, posterior on the right
_, ax = plt.subplots(1, 2, figsize=(10, 5))
xrange = np.linspace(0, 10, 200)

# The model's priors are etasq ~ Exponential(rate=2) and rhosq ~ Exponential(rate=0.5).
# PyMC parameterises the exponential by its rate, but numpy by its scale (= 1/rate),
# so the rates are inverted before sampling to draw from the same priors.
etasq_rate, rhosq_rate = 2, 0.5
prior_etasq = np.random.exponential(1 / etasq_rate, 100)[:, None]
prior_rhosq = np.random.exponential(1 / rhosq_rate, 100)[:, None]

# Plot priors: the red curve uses the prior medians (ln(2)/rate), mirroring the
# posterior-median curve in the right-hand panel
prior_etasq_median = np.log(2) / etasq_rate
prior_rhosq_median = np.log(2) / rhosq_rate
ax[0].plot(xrange, prior_etasq_median * np.exp(-prior_rhosq_median * xrange**2), c='red')
ax[0].plot(xrange, (prior_etasq * np.exp(-prior_rhosq * xrange**2)).T, 'k', alpha=.1)
ax[0].set_ylim(0, 2)
ax[0].set_xlabel('Distance (1000 km)')
ax[0].set_ylabel('Covariance')
ax[0].set_title('Prior')


# Grab posteriors (first chain only)
post_etasq = trace_t.posterior['etasq'].values[0]
post_rhosq = trace_t.posterior['rhosq'].values[0]

# Plot posteriors: median curve in red, first 100 draws in translucent black
# (the colour is given once; a 'k' format string together with c='red' is redundant)
ax[1].plot(xrange, np.median(post_etasq) * np.exp(-np.median(post_rhosq) * xrange**2), c='red')
ax[1].plot(xrange, (post_etasq[:100][:, None] * np.exp(-post_rhosq[:100][:, None] * xrange**2)).T, 'k', alpha=.1)
ax[1].set_ylim(0, 2)
ax[1].set_xlabel('Distance (1000 km)')
ax[1].set_ylabel('')
ax[1].set_title('Posterior')
plt.savefig('postdecay.jpg',dpi=300);

You can see from this that the level of prior covariance has shrunk considerably, but this is difficult to visualize. To make this more concrete we can re-constitute the correlation matrix among societies

In [None]:
# Posterior median covariance among islands
# Posterior median covariance among islands (same functional form as Kij in the model)
K_post = np.median(post_etasq) * (np.exp(-np.median(post_rhosq)*Dmatsq) + np.diag([.01]*nsoc))
# Standard deviations (square roots of the diagonal variances)
sigma_post = np.sqrt(np.diag(K_post))
# Correlation matrix: scale rows and columns of K_post by 1/sd, labelled by society name
Rho = pd.DataFrame(np.diag(sigma_post**-1).dot(K_post.dot(np.diag(sigma_post**-1))), index=SocNames, columns=SocNames)
# Round for display, save as a colour-graded image ('rho.jpg') and show
tmp = Rho.round(2)
dfi.export(tmp.style.background_gradient(), 'rho.jpg')
tmp

This is the distance-based correlation matrix, and to see exactly how it is working, we can plot these correlations in space, to see how spatially correlated things are, and we can plot tools against population, to see how these correlations play out for the full model

In [None]:
# Scale point size to logpop
# Scale point size to logpop: rescale to a maximum of 1, then exponentiate
logpop = P.copy()
logpop /= P.max()
psize = np.exp(logpop*5.5)

# Calculate posterior relationship between log-population and tools, ignoring distance
# NOTE: Nsamp is not used anywhere in this cell
Nsamp, Nbin = 1000, 30
# Grid of log-population values to predict over
log_pop_seq = np.linspace(6, 14, Nbin)
# First-chain draws as column vectors so they broadcast against the grid
a_post = trace_t.posterior['iRate(α)'].values[0][:, None]
b_post = trace_t.posterior['dReturns(β)'].values[0][:, None]
g_post = trace_t.posterior['lRate(γ)'].values[0][:, None]
# One row per posterior draw, one column per grid value
lambda_post = a_post*log_pop_seq**b_post/g_post

In [None]:
# Set up plot
# Set up plot: societies in space (left) and tools against population (right)
_, ax = plt.subplots(1, 2, figsize=(10, 5))

# Plain arrays give positional indexing that does not depend on kdata's index labels
lon = kdata['lon2'].values
lat = kdata['lat'].values
tools = kdata['total_tools'].values
# Grab names
labels = kdata['culture'].values

# Every unordered pair of societies with the opacity of its connecting line
# (squared posterior correlation). The number of societies comes from the data
# instead of being hard-coded.
n_soc = len(labels)
pairs = [(i, j, Rho.iloc[i, j]**2) for i in range(n_soc) for j in range(i + 1, n_soc)]

#### Plot societies in space
ax[0].scatter(lon, lat, psize)
# Iterate over islands
for i, itext in enumerate(labels):
    ax[0].text(lon[i]+1, lat[i]+1, itext)

# Add lines shaded by Rho
for i, j, shade in pairs:
    ax[0].plot([lon[i], lon[j]], [lat[i], lat[j]], 'k-', alpha=shade, lw=2.5)
ax[0].set_xlabel('Longitude')
ax[0].set_ylabel('Latitude')


#### Plot tools against population
# Posterior predictions: median curve and HDI band
ax[1].plot(log_pop_seq, np.median(lambda_post, axis=0), '--', color='k')
cix = 0.8
az.plot_hdi(log_pop_seq, lambda_post, hdi_prob=cix, color='k', fill_kwargs={'alpha':cix*.5}, ax=ax[1])

# plot raw data and labels
ax[1].scatter(P, tools, psize)
for i, itext in enumerate(labels):
    ax[1].text(P[i]+.1, tools[i]-2.5, itext)

# Add correlations
for i, j, shade in pairs:
    ax[1].plot([P[i], P[j]], [tools[i], tools[j]], 'k-', alpha=shade, lw=2.5)

ax[1].set_xlabel('log(population)')
ax[1].set_ylabel('Total tools')
ax[1].set_xlim(6.8, 12.8)
ax[1].set_ylim(10, 73)
plt.savefig('map.jpg',dpi=300);

# Phylogenetic distance

Among the many recent obsessions of a small corner of the ecological world is the relationship between phylogeny and morphology - how various body parts, brain sizes etc vary with phylogenetic relatedness. Phylogeny is essentially like a distance between islands, but rather through evolutionary distance, and so we can develop a similar covariance structure to add 'known' phylogenetic distances into a model. 

For this example we'll look at the causal influence of group size on brain size, plus phylogeny, on primates, outlined on p.477 in Rethinking. The difference between this model and the two other GPs above is that we'll have both a covariance matrix and a linear model. What we're looking for, in effect, is if phylogeny has much to say about brain size (B) beyond group size (G) and body size (M) among the various species. What's special is - you guessed it - that there is a phylogenetic distance matrix in there to account for correlations among species. In notation the model is 


$$
\begin{align}
B_i \sim & MvN(\mu_i, K) \\
\mu_i = & \beta_0 + \beta_1M_i + \beta_2G_i \\
K_{ij} = & \eta^2 \exp(-\rho^2D^2_{ij}) \\
\beta_0 \sim & N(0, 1) \\
\beta_1,\beta_2 \sim & N(0, 0.5) \\
\eta^2 \sim & Exp(1) \\
\rho^2 \sim & HalfNorm(3, 0.25)
\end{align}
$$

With nearly the same distance matrix function for $K_{ij}$ seen in the Oceanic tools example. Note the prior for $\eta^2$ is different than in McElreath's text (p483); the $HalfNormal(1, 0.25)$ he uses is very strong.

First let's import the various datasets

In [None]:
# Grab primates data
# Grab primates data (semicolon-separated file)
Primates301 = pd.read_csv("Primates301.csv", sep=";")
# Drop rows with a missing group size, body mass or brain size
pdata = Primates301.dropna(subset=["group_size", "body", "brain"])
# Keep track of spp names
spp_names = pdata["name"]
# Total number of species
nspp = len(spp_names)
# Take a look
pdata

In [None]:
# Brain size
# Log-transform and standardise the three variables in one pass:
# B = brain size, M = body mass, G = group size
B, M, G = (
    stdize(np.log(pdata[column])).values
    for column in ("brain", "body", "group_size")
)

Next we can import the matrix of phylogenetic distances

In [None]:
# Import distance matrix
# Import distance matrix
# NOTE: this rebinds Dmat (and Dmatsq below), which held the island distances earlier in the notebook
Dmat = pd.read_csv("Primates301_distance_matrix.csv", index_col=0)
# Reorder rows and columns to match the species names kept in pdata
Dmat_spp = Dmat.loc[spp_names, spp_names]
# Scale to 0-1 by the single largest distance. DataFrame.max() on its own returns
# per-column maxima, which would rescale every column separately and is only
# equivalent when all columns happen to share the same maximum.
Dmat_ord = (Dmat_spp / np.nanmax(Dmat_spp.values)).values
# Save the first rows as an image ('D.jpg')
tmp = pd.DataFrame(Dmat_ord)
dfi.export(tmp.head(10), 'D.jpg',max_cols=10,)
# Squared distances used by the covariance function
Dmatsq = np.power(Dmat_ord, 2)

With all these elements in place, we can code this into PyMC:

In [None]:
# Brain size as a linear function of body mass and group size, plus a latent
# effect per species whose covariance decays with squared phylogenetic distance
with pm.Model() as Phylo:
    # Linear model priors
    β0 = pm.Normal("Intercept", 0.0, 1.0)
    β1 = pm.Normal("BodyMass", 0.0, 0.5)
    β2 = pm.Normal("GroupSize", 0.0, 0.5)
    
    # Linear model
    mu = β0+β1*M+β2*G

    # Kernel priors. Both parameters must be positive for K to be a valid
    # covariance, so positive-support priors replace the unbounded Normals:
    # etasq ~ Exponential(1) and rhosq ~ Normal(3, 0.25) truncated at zero.
    η2 = pm.Exponential("etasq", 1.0)
    ρ2 = pm.TruncatedNormal("rhosq", mu=3.0, sigma=0.25, lower=0)
    
    # Covariance function: squared-exponential decay in distance (Dmatsq holds
    # squared distances; an OU kernel would use the unsquared distance instead)
    K = η2*(pm.math.exp(-ρ2*Dmatsq))
            
    # Latent species-level means: multivariate normal around the linear model
    # (from here on `mu` refers to this latent vector)
    mu = pm.MvNormal('g', mu=mu, cov=K, shape=nspp)

    # Observation noise and likelihood
    σ = pm.Exponential("sigma", 1.0)
    Yi = pm.Normal("Yi", mu, σ, observed=B)

In [None]:
with Phylo:
    trace_p = pm.sample()

In [None]:
# Posterior summary of the regression and kernel parameters (the latent 'g' vector is left out);
# saved as a colour-graded image ('phylotable.jpg') and displayed
tmp = pm.summary(trace_p, var_names=['Intercept','BodyMass','GroupSize','etasq','rhosq'])
dfi.export(tmp.style.background_gradient(), 'phylotable.jpg')
tmp

In [None]:
# Grid of (scaled) phylogenetic distances to evaluate the covariance function on
d_seq = np.linspace(0, Dmat_ord.max(), 100)
# Pool all chains and draws into a single 'sample' dimension
post = trace_p.posterior.stack(sample=("chain", "draw"))

_, ax = plt.subplots(1, 1, figsize=(7, 5))

# The model's covariance is etasq * exp(-rhosq * D^2) (it is built from the
# squared distance matrix), so the curves below use the squared distance too.
d_sq = d_seq[:, None]**2

# prior mean and 89% interval
eta = np.random.exponential(1.0, 1000)
rho = np.random.normal(3.0, 0.25, 1000)
# Rows index distances, columns index prior draws
Kx = eta[None, :] * np.exp(-rho[None, :] * d_sq)

ax.plot(d_seq, Kx.mean(1), "k", alpha=.8, lw=2, label='Prior')
# hdi_prob is given explicitly so the band really is the 89% interval
az.plot_hdi(d_seq, Kx.T, hdi_prob=0.89, color="k", fill_kwargs={"alpha": .3}, ax=ax)

# posterior mean and 89% interval; rows index distances, columns index posterior samples
Kp = post["etasq"].values[None, :] * np.exp(-post["rhosq"].values[None, :] * d_sq)

ax.plot(d_seq, Kp.mean(1), "b", alpha=.8, lw=2, label='Posterior')
az.plot_hdi(d_seq, Kp.T, hdi_prob=0.89, color="b", fill_kwargs={"alpha": .3}, ax=ax)


ax.set(xlabel="Phylogenetic distance", ylabel="Covariance")
plt.legend()
plt.savefig('PriorPost.jpg');