In [1]:
import multiprocessing as mp
import os
import time

import emcee
import matplotlib.pyplot as plt
import numpy as np
import scipy.special
import scipy.stats
import seaborn as sns
from tqdm import tqdm

import utils

# Apply the project's matplotlib style; the helper returns two objects kept as
# `cor` and `pal` (presumably colors and a palette — TODO confirm in utils)
cor, pal = utils.matplotlib_style()


# Set number of threads for other non-pool processes to one to avoid conflicts
# NOTE(review): this assignment runs after numpy/scipy were imported above.
# OpenMP/BLAS runtimes commonly read OMP_NUM_THREADS when they are first
# loaded, so this may only affect libraries or processes started later —
# confirm, or move the assignment before the numerical imports.
os.environ["OMP_NUM_THREADS"] = "1"

# The two-state negative binomial approximation

In this notebook, we numerically explore when the two-state promoter model is
well approximated by a negative binomial distribution. As described before,
the steady-state mRNA/UMI distribution for a two-state promoter is given by
$$
{\scriptstyle
\pi(u \mid \hat{\theta}) = 
\frac{1}{\Gamma(u + 1)}
\frac{
    \Gamma
    \left(
        \hat{k}^{(p)}_{\text{on}} + u
    \right)
}{
    \Gamma
    \left(
        \hat{k}^{(p)}_{\text{on}}
    \right)
}
\frac{
    \Gamma
    \left(
        \hat{k}^{(p)}_{\text{on}} + \hat{k}^{(p)}_{\text{off}}
    \right)
}{
    \Gamma
    \left(
        \hat{k}^{(p)}_{\text{on}} + \hat{k}^{(p)}_{\text{off}} + u
    \right)
}
\left( \hat{r}_u \right)^u \\
\times {}_1F_1 
\left(
    \hat{k}^{(p)}_{\text{on}} + u,
    \hat{k}^{(p)}_{\text{on}} + \hat{k}^{(p)}_{\text{off}} + u,
    - \hat{r}_u
\right),
}
\tag{1}
$$
where $\hat{\theta} = \left\{ \hat{k}^{(p)}_{\text{on}},
\hat{k}^{(p)}_{\text{off}}, \hat{r}_u \right\}$ are the parameters of the model
rescaled by the mRNA degradation rate $\gamma_m$, i.e.,
$$
\hat{x} = \frac{x}{\gamma_m}.
\tag{2}
$$

We have also shown that, in the regime $\hat{k}^{(p)}_{\text{off}} \gg 1$, Eq.
(1) reduces to
$$
\pi(u \mid \hat{\theta}) \approx
{
u + \hat{k}^{(p)}_{\text{on}} - 1
\choose
u
}
\left( 
    \frac{
        \frac{\hat{r}_u}{\hat{k}^{(p)}_{\text{off}}} 
    }{
        1 + \frac{\hat{r}_u}{\hat{k}^{(p)}_{\text{off}}}
    }
\right)^u
\left(
    \frac{
        1
    }{
        1 + \frac{\hat{r}_u}{\hat{k}^{(p)}_{\text{off}}}
    }
\right)^{\hat{k}^{(p)}_{\text{on}}},
\tag{3}
$$
i.e., the negative binomial PMF. The numerical question is then at what values
of $\hat{k}^{(p)}_{\text{off}}$ does Eq. (3) become a good approximation of Eq.
(1)?

To answer this question, we will numerically explore the two-state promoter
model for different parameter values. We already implemented the two-state
promoter model, including the integral approximation of the confluent
hypergeometric function in our `two_state_log_probability` function. Let's write
the equivalent function for Eq. (3).

In [2]:
def two_state_neg_binom_log_probability(
    mRNA_values,
    kp_on,
    kp_off,
    rm,
    gm=1,
):
    """
    Compute the log probability of the negative binomial approximation of the
    two-state promoter model.

    The approximation is a negative binomial distribution with
    ``kp_on / gm`` as its (possibly non-integer) size parameter and success
    probability ``1 / (1 + rm / kp_off)``.

    Parameters
    ----------
    mRNA_values : array-like
        The mRNA (UMI) counts at which to evaluate the log probability.
    kp_on : float
        Rate of activation of the promoter.
    kp_off : float
        Rate of deactivation of the promoter.
    rm : float
        Production rate of the mRNA.
    gm : float, optional
        mRNA degradation rate used to rescale the rates. Default is 1, i.e.,
        the rates are assumed to be already expressed in units of the
        degradation rate.

    Returns
    -------
    numpy.ndarray
        The computed log probabilities, one per entry of `mRNA_values`.

    Notes
    -----
    This function uses the negative binomial approximation of the two-state
    promoter model, which is valid when kp_off >> 1 (in units of `gm`).
    """
    # Local import keeps the function self-contained: `scipy.stats` is a
    # separate submodule and is not made available by `import scipy.special`.
    from scipy.stats import nbinom

    # Size parameter of the negative binomial: the rescaled activation rate.
    size = kp_on / gm
    # Success probability. Only the ratio rm / kp_off enters, and that ratio
    # is unchanged when both rates are divided by gm, so no rescaling is
    # needed here.
    p_success = 1 / (1 + rm / kp_off)
    # scipy's parametrization: pmf(u) = C(u + size - 1, u) * p**size * (1 - p)**u
    return nbinom.logpmf(mRNA_values, size, p_success)

Let's now test this function by comparing the two-state promoter model and the
negative binomial approximation.

In [None]:
# Kinetic parameters for this example
k_on, k_off, r_u = 4.0, 18.0, 100.0

# UMI counts on which both PMFs are evaluated
u_range = np.arange(75)

# Log-PMF of the full two-state model and of its negative binomial approximation
logP_two_state = utils.two_state_log_probability(u_range, k_on, k_off, r_u)
logP_neg_binom = two_state_neg_binom_log_probability(u_range, k_on, k_off, r_u)

# Single-panel figure
fig, ax = plt.subplots(figsize=(1.75, 1.5))

# Draw each distribution as a step curve, converting log-probability to
# probability first
for curve_label, curve_logP in (
    ("Two-state promoter", logP_two_state),
    ("Negative binomial", logP_neg_binom),
):
    ax.step(u_range, np.exp(curve_logP), label=curve_label)

# Legend positioned relative to the axes point (1, 0.5)
ax.legend(bbox_to_anchor=(1, 0.5))

# Axis labels
ax.set_xlabel("UMI counts")
ax.set_ylabel("probability")

For this parameter regime, the approximation is not very good. Let's look at
examples in the regime where it is expected to hold, i.e.,
$\hat{k}^{(p)}_{\text{off}} \gg 1$.

In [None]:
# Kinetic parameters for this example (large deactivation rate)
k_on, k_off, r_u = 10.0, 200.0, 2000.0

# UMI counts on which both PMFs are evaluated
u_range = np.arange(400)

# Log-PMF of the full two-state model and of its negative binomial approximation
logP_two_state = utils.two_state_log_probability(u_range, k_on, k_off, r_u)
logP_neg_binom = two_state_neg_binom_log_probability(u_range, k_on, k_off, r_u)

# Single-panel figure
fig, ax = plt.subplots(figsize=(1.75, 1.5))

# Draw each distribution as a step curve, converting log-probability to
# probability first
for curve_label, curve_logP in {
    "Two-state promoter": logP_two_state,
    "Negative binomial": logP_neg_binom,
}.items():
    ax.step(u_range, np.exp(curve_logP), label=curve_label)

# Legend positioned relative to the axes point (1, 0.5)
ax.legend(bbox_to_anchor=(1, 0.5))

# Axis labels
ax.set_xlabel("UMI counts")
ax.set_ylabel("probability")

In [None]:
# Kinetic parameters for this example (large deactivation rate, low production)
k_on, k_off, r_u = 10.0, 200.0, 20.0

# UMI counts on which both PMFs are evaluated
u_range = np.arange(40)

# Log-PMF of the full two-state model and of its negative binomial approximation
logP_two_state = utils.two_state_log_probability(u_range, k_on, k_off, r_u)
logP_neg_binom = two_state_neg_binom_log_probability(u_range, k_on, k_off, r_u)

# Single-panel figure
fig, ax = plt.subplots(figsize=(1.75, 1.5))

# Convert log-probabilities to probabilities, then draw one step curve per model
prob_two_state = np.exp(logP_two_state)
prob_neg_binom = np.exp(logP_neg_binom)
ax.step(u_range, prob_two_state, label="Two-state promoter")
ax.step(u_range, prob_neg_binom, label="Negative binomial")

# Legend positioned relative to the axes point (1, 0.5)
ax.legend(bbox_to_anchor=(1, 0.5))

# Axis labels
ax.set_xlabel("UMI counts")
ax.set_ylabel("probability")

In this parameter regime the approximation is much better.

In [None]:
# Define parameters (floats, consistent with the other comparison cells)
k_on = 1.0
k_off = 20.0
r_u = 100.0

# Define range of UMI counts
u_range = np.arange(0, 4000)

# Evaluate the log probability under the two-state model and under the
# negative binomial approximation
logP_two_state = utils.two_state_log_probability(u_range, k_on, k_off, r_u)

logP_neg_binom = two_state_neg_binom_log_probability(u_range, k_on, k_off, r_u)

# Initialize figure
fig, ax = plt.subplots(1, 1, figsize=(1.75, 1.5))

# Plot the probability
ax.step(u_range, np.exp(logP_two_state), label="Two-state promoter")
ax.step(u_range, np.exp(logP_neg_binom), label="Negative binomial")

# The count range starts at u = 0, which a pure logarithmic axis cannot
# display (non-positive values are masked or clipped). A symmetric-log axis is
# linear below `linthresh` and logarithmic above it, so the u = 0 bin remains
# visible while the long tail is still compressed.
ax.set_xscale("symlog", linthresh=1)
ax.set_xlim(left=0)

# Add legend
ax.legend(bbox_to_anchor=(1, 0.5))

# Label axis
ax.set_xlabel("UMI counts")
ax.set_ylabel("probability")

## KL divergence as a similarity measure

To systematically explore the parameter regime where the approximation is
valid, we will compute the Kullback-Leibler (KL) divergence
$D_{KL}(P \,\|\, Q)$, where $P$ is the two-state promoter distribution and $Q$
is its negative binomial approximation. The KL divergence is zero when the two
distributions are identical and grows as they differ, so smaller values
indicate a better approximation. We can use the `scipy.stats.entropy` function
to compute this quantity.

In [None]:
# Define range of kp_on values
kp_on_range = np.logspace(0, 2, 25)
# Define range of r_u values
r_u_range = np.logspace(0, 3, 25)

# Define range of UMI counts
u_range = np.arange(0, 2_000)

# Define single kp_off value
kp_off = 100.0

# Initialize KL divergence matrix (rows index kp_on, columns index r_u)
kl_divergence = np.zeros((len(kp_on_range), len(r_u_range)))

# Count the number of iterations
total_iterations = len(kp_on_range) * len(r_u_range)
# Initialize progress bar
with tqdm(total=total_iterations, desc="Computing KL divergence") as pbar:
    # Loop over all combinations of kp_on and r_u
    for i, kp_on in enumerate(kp_on_range):
        for j, r_u in enumerate(r_u_range):
            # Both model functions return LOG probabilities, whereas
            # scipy.stats.entropy expects probabilities. Exponentiate first;
            # passing the log values directly does not yield a KL divergence.
            prob_two_state = np.exp(
                utils.two_state_log_probability(u_range, kp_on, kp_off, r_u)
            )
            prob_neg_binom = np.exp(
                two_state_neg_binom_log_probability(
                    u_range, kp_on, kp_off, r_u
                )
            )
            # KL(two-state || negative binomial). entropy() renormalizes both
            # inputs to sum to one over the truncated support `u_range`.
            kl_divergence[i, j] = scipy.stats.entropy(
                prob_two_state,
                qk=prob_neg_binom,
            )
            # Update progress bar
            pbar.update(1)

Let's plot the KL divergence matrix as a heatmap.

In [None]:
# Initialize figure
fig, ax = plt.subplots(1, 1, figsize=(1.75, 1.5))

# Plot the KL divergence matrix: rows index kp_on, columns index r_u
sns.heatmap(
    kl_divergence,
    ax=ax,
    cmap="viridis",
    cbar_kws={"label": "KL divergence"},
)

# seaborn draws the cell with index i over the interval [i, i + 1], so cell
# centers sit at i + 0.5. Put the ticks at the centers of the first and last
# cells so the labels line up with the values they report.
ax.set_xticks([0.5, len(r_u_range) - 0.5])
ax.set_xticklabels([f"{r_u_range[0]:.0f}", f"{r_u_range[-1]:.0f}"])

# Same placement for the y-axis (first and last kp_on values)
ax.set_yticks([0.5, len(kp_on_range) - 0.5])
ax.set_yticklabels([f"{kp_on_range[0]:.0f}", f"{kp_on_range[-1]:.0f}"])

# Label axis
ax.set_xlabel(r"$r_u$")
ax.set_ylabel(r"$k^{(p)}_{on}$")

# Show figure
plt.show()