In [None]:
"""Sandbox module."""
import matplotlib.pyplot as plt
import numpy as np
from misc import (
    cdf_from_density,
    density_from_qd,
    l2_norm,
    norm_pdf,
    riemann_sum_arrays,
    trunc_norm_pdf,
)
from scipy.stats import norm, truncnorm

%matplotlib inline

The integral $\int_0^1 f(t)\,dt$ is then replaced by the Riemann sum $\frac{1}{m} \sum_{j=1}^m f(s_j)$, where the $s_j$ are the grid points and $m$ is the number of grid points.

# Functional Data Analysis

Ideas for simulation
- Uni- vs. Multivariate case
- Simulate different normal distributions
- Vary parameters of (generalized) Beta distribution, so principal components can be interpreted as varying parameters

$X(t) = \sum_{k=1}^n \eta_k \phi_k(t)$


## Transformation Method Paper (Petersen & Müller 2016)

In [None]:
# 1. Generate synthetic functional data
n = 200  # number of simulated distributions
gridnum = 1000  # number of grid points per function
truncation_point = 5  # densities live on [-truncation_point, truncation_point]

# Per-sample grids: np.linspace with array-valued endpoints returns one column per
# sample, i.e. arrays of shape (gridnum, n).
grid_densities = np.linspace(
    -np.ones(n) * truncation_point,
    np.ones(n) * truncation_point,
    gridnum,
)
grid_quantiles = np.linspace(np.full(n, 0.01), np.full(n, 0.99), gridnum)

# One-dimensional grids shared by all samples
grid_densities_univ = np.linspace(-truncation_point, truncation_point, gridnum)
grid_quantiles_univ = np.linspace(0, 1, gridnum)

# Draw different sigmas: log-sigmas are uniform on [-1.5, 1.5), means are all zero
log_sigmas = np.random.default_rng(seed=28071995).uniform(low=-1.5, high=1.5, size=n)
sigmas = np.exp(log_sigmas)
mus = np.zeros(n)

In [None]:
# Calculate discretized distributions

# Standardized truncation bounds, i.e. (bound - loc) / scale, which is the
# parametrization scipy's truncnorm expects for its a and b arguments
a = (-truncation_point - mus) / sigmas
b = (truncation_point - mus) / sigmas

# Calculate densities with three implementations:
# (1) normal pdf from misc, (2) scipy's truncated normal,
# (3) truncated normal pdf from misc
densities_discretized = norm_pdf(grid_densities, mus, sigmas).T
densities_discretized2 = truncnorm.pdf(
    grid_densities,
    a,
    b,
    loc=mus,
    scale=sigmas,
).T
densities_discretized3 = trunc_norm_pdf(
    grid_densities,
    mus,
    sigmas,
    -truncation_point,
    truncation_point,
).T

# Calculate distribution functions: the same cdf_from_density call is applied to
# each of the three density arrays, with a trailing singleton axis appended
cdfs_discretized, cdfs_discretized2, cdfs_discretized3 = (
    cdf_from_density(grid_densities_univ, np.expand_dims(dens, axis=2), axis=1)
    for dens in (
        densities_discretized,
        densities_discretized2,
        densities_discretized3,
    )
)

# Calculate quantiles
quantiles_discretized = norm.ppf(grid_quantiles, mus, sigmas).transpose()

# Implementations 2 and 3 both use scipy's truncated-normal ppf with identical
# arguments, so evaluate it once instead of twice.
quantiles_discretized2 = truncnorm.ppf(
    q=grid_quantiles,
    a=a,
    b=b,
    loc=mus,
    scale=sigmas,
).transpose()
# Independent copy so that in-place changes to one array cannot leak into the other
quantiles_discretized3 = quantiles_discretized2.copy()

# Calculate quantile densities: reciprocal of each pdf evaluated at its quantiles.
# The quantile arrays are transposed back to (gridnum, n) for the pdf call and the
# result is transposed again to match the layout of the other arrays.
pdf_at_quantiles = norm_pdf(quantiles_discretized.T, mus, sigmas)
quantile_densities_discretized = np.reciprocal(pdf_at_quantiles).T

pdf_at_quantiles2 = truncnorm.pdf(
    quantiles_discretized2.T,
    a=a,
    b=b,
    loc=mus,
    scale=sigmas,
)
quantile_densities_discretized2 = np.reciprocal(pdf_at_quantiles2).T

pdf_at_quantiles3 = trunc_norm_pdf(
    quantiles_discretized3.T,
    mus,
    sigmas,
    -truncation_point,
    truncation_point,
)
quantile_densities_discretized3 = np.reciprocal(pdf_at_quantiles3).T

In [None]:
# Truncated-normal densities from the misc implementation.
# The bounds are passed as raw truncation points, consistent with the call that
# produces densities_discretized3 (the result compared against scipy and used
# downstream). The standardized bounds a, b are scipy's truncnorm parametrization.
# NOTE(review): confirm against the signature of trunc_norm_pdf in misc.
densities_discretized = trunc_norm_pdf(
    grid_densities,
    mus,
    sigmas,
    -truncation_point,
    truncation_point,
).transpose()
densities_discretized.shape

In [None]:
### Check whether my function differs from scipy func
eps = 1e-14


def _n_mismatches(left, right, tol=eps):
    """Count entries whose absolute difference exceeds tol.

    The absolute value matters: a signed comparison ``left - right > tol`` would
    silently ignore every entry where ``left`` is smaller than ``right``.
    """
    return (np.abs(left - right) > tol).sum()


# Each entry compares the first sample of two implementations; all zeros means
# they agree up to eps on that sample.
(
    _n_mismatches(densities_discretized[0], densities_discretized2[0]),
    _n_mismatches(quantiles_discretized[0], quantiles_discretized2[0]),
    _n_mismatches(
        quantile_densities_discretized[0],
        quantile_densities_discretized2[0],
    ),
    _n_mismatches(densities_discretized2[0], densities_discretized3[0]),
    _n_mismatches(quantiles_discretized2[0], quantiles_discretized3[0]),
    _n_mismatches(
        quantile_densities_discretized2[0],
        quantile_densities_discretized3[0],
    ),
)

### Check how fast the different implementations are.

1. My normal distribution functions
2. Scipy's Truncated normals
3. My truncated normals

(For the quantiles, scipy's implementation of `ppf` is used in all three cases.)

In [None]:
%%timeit
# Implementation 1: normal pdf from misc, scipy's normal ppf
quantiles_discretized = norm.ppf(grid_quantiles, mus, sigmas).T
densities_discretized = norm_pdf(grid_densities, mus, sigmas).T
pdf_at_quantiles = norm_pdf(quantiles_discretized.T, mus, sigmas)
quantile_densities_discretized = np.reciprocal(pdf_at_quantiles).T

In [None]:
%%timeit
# Implementation 2: scipy's truncated normal for both pdf and ppf
trunc_params = {"a": a, "b": b, "loc": mus, "scale": sigmas}
densities_discretized2 = truncnorm.pdf(x=grid_densities, **trunc_params).T
quantiles_discretized2 = truncnorm.ppf(q=grid_quantiles, **trunc_params).T
quantile_densities_discretized2 = np.reciprocal(
    truncnorm.pdf(quantiles_discretized2.T, **trunc_params),
).T

In [None]:
%%timeit
# Implementation 3: truncated normal pdf from misc, scipy's truncated-normal ppf.
# trunc_norm_pdf receives the raw truncation points, consistent with the call that
# produces the densities_discretized3 used downstream; only scipy's truncnorm takes
# the standardized bounds a, b.
# NOTE(review): confirm against the signature of trunc_norm_pdf in misc.
densities_discretized3 = trunc_norm_pdf(
    grid_densities,
    mus,
    sigmas,
    -truncation_point,
    truncation_point,
).transpose()
quantiles_discretized3 = truncnorm.ppf(
    q=grid_quantiles,
    a=a,
    b=b,
    loc=mus,
    scale=sigmas,
).transpose()
quantile_densities_discretized3 = np.reciprocal(
    trunc_norm_pdf(
        quantiles_discretized3.transpose(),
        mus,
        sigmas,
        -truncation_point,
        truncation_point,
    ),
).transpose()

### Use implementation 3, faster than scipy but still correct

In [None]:
# Continue the analysis with the results of implementation 3
densities_discretized, quantiles_discretized, quantile_densities_discretized = (
    densities_discretized3,
    quantiles_discretized3,
    quantile_densities_discretized3,
)

In [None]:
# 2. Compute the mean function (average over axis 0 of the discretized densities)
mean_function = densities_discretized.mean(axis=0)

In [None]:
# 3. Center the data by subtracting the mean function from every row
centered_densities = np.subtract(densities_discretized, mean_function)

In [None]:
# 4. Estimate the covariance function using a discrete approximation.
# rowvar=False: columns (grid points) are the variables, rows are observations.
cov_matrix = np.cov(m=centered_densities, rowvar=False)

In [None]:
# 5. Compute the eigenfunctions (principal components) of the covariance matrix.
# eigh is the solver for symmetric matrices; eigenvectors are returned as columns.
eigen_decomposition = np.linalg.eigh(cov_matrix)
eigenvalues, eigenfunctions = eigen_decomposition

In [None]:
# Sort eigenvalues and eigenfunctions in decreasing order of eigenvalue.
# The permutation is computed once and applied to both arrays.
descending_order = np.argsort(-eigenvalues)
eigenvalues_sorted = eigenvalues[descending_order]
eigenfunctions_sorted = eigenfunctions[:, descending_order]

In [None]:
# 6. Rescale the eigenvectors by their L^2 norm on the support

# Support bounds shared by both l2_norm calls below
support_bounds = {
    "left_bound_support": -truncation_point,
    "right_bound_support": truncation_point,
}

# L^2 norm of each column (eigenvector)
l2_norms = l2_norm(array=eigenfunctions_sorted, axis=0, **support_bounds)

# Broadcasting divides each column of the eigenfunctions matrix by its own norm
eigenfunctions_scaled = eigenfunctions_sorted / l2_norms

# Verification: recompute the norms of the rescaled columns and show the first few
first_few_norms = l2_norm(array=eigenfunctions_scaled, axis=0, **support_bounds)
first_few_norms[:5]

In [None]:
# 7. Compute FPC scores / factor loadings
# products[i, j, k] = centered_densities[i, j] * eigenfunctions_scaled[j, k]
# NOTE(review): this materializes the full 3-D product array before integrating
# over axis 1; with the grid sizes set above that is presumably very large.
products = (
    centered_densities[:, :, np.newaxis] * eigenfunctions_scaled[np.newaxis, :, :]
)
fpc_scores = riemann_sum_arrays(
    array=products,
    axis=1,
    left_bound=-truncation_point,
    right_bound=truncation_point,
)

In [None]:
# Plot the mean function and the first two eigenfunctions side by side
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
panels = (
    (mean_function, "b-", "Mean Function"),
    (eigenfunctions_sorted[:, 0], "r-", "1st Eigenfunction"),
    (eigenfunctions_sorted[:, 1], "g-", "2nd Eigenfunction"),
)
for panel_ax, (curve, line_format, panel_title) in zip(axes, panels):
    panel_ax.plot(grid_densities_univ, curve, line_format)
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Overlay the mean function and the first two components in a single axes.
# The first component is plotted with flipped sign.
fig, ax = plt.subplots()
overlay_curves = {
    "Mean Function": mean_function,
    "First component": -eigenfunctions_sorted[:, 0],
    "Second component": eigenfunctions_sorted[:, 1],
}
for curve_label, curve in overlay_curves.items():
    ax.plot(grid_densities_univ, curve, label=curve_label)
ax.legend()
plt.show()

## More reproducing of Petersen & Müller (2016)

In [None]:
def fve(k, eigvals=None):
    """Compute the fraction of variance explained by the first k components.

    Parameters
    ----------
    k : int
        Number of leading components to include.
    eigvals : array_like, optional
        Eigenvalues sorted in decreasing order. When omitted, the notebook-level
        ``eigenvalues_sorted`` is used, so existing ``fve(k)`` calls are unchanged.

    Returns
    -------
    float
        Sum of the first k eigenvalues divided by the sum of all eigenvalues.
    """
    if eigvals is None:
        # Fall back to the global computed earlier in the notebook
        eigvals = eigenvalues_sorted
    eigvals = np.asarray(eigvals)
    return np.sum(eigvals[:k]) / np.sum(eigvals)


# Fraction of variance explained by the first component alone
fve(k=1)

In [None]:
def mode_of_variation(alpha, mean_func, eigval, eigfunc):
    """Compute kth mode of variation."""
    return mean_func + alpha * np.sqrt(eigval) * eigfunc

In [None]:
# Modes of variation (alpha = 1) for the two leading components
first_mode, second_mode = (
    mode_of_variation(
        1,
        mean_function,
        eigenvalues_sorted[component],
        eigenfunctions_sorted[:, component],
    )
    for component in range(2)
)

In [None]:
# Overlay the mean function with the first two modes of variation
fig, ax = plt.subplots()
for curve, curve_label in (
    (mean_function, "Mean function"),
    (first_mode, "First mode"),
    (second_mode, "Second mode"),
):
    ax.plot(grid_densities_univ, curve, label=curve_label)
ax.legend()
plt.show()

## Fréchet mean estimation

In [None]:
# Average the quantile densities over axis 0.
mean_qdf = np.mean(quantile_densities_discretized, axis=0)
# The quantile densities were evaluated at the probability levels in grid_quantiles
# (0.01 to 0.99, identical in every column), so plot against that grid rather than
# grid_quantiles_univ, which runs from 0 to 1.
plt.plot(grid_quantiles[:, 0], mean_qdf)

In [None]:
# For tests: recompute the CDFs from the selected densities and show the smallest
# value found at the last index along axis 1
cdfs_discretized = cdf_from_density(
    grid_densities_univ,
    np.expand_dims(densities_discretized, axis=2),
    axis=1,
)
cdfs_discretized[:, -1].min()

In [None]:
# Check if frechet mean density is a density: show the first and last value of its
# CDF and whether the density is non-negative everywhere
frechet_mean_density = density_from_qd(mean_qdf, grid_densities_univ)
frechet_mean_cdf = cdf_from_density(grid_densities_univ, frechet_mean_density, axis=0)
(
    frechet_mean_cdf[0],
    frechet_mean_cdf[-1],
    (frechet_mean_density >= 0).all(),
)

In [None]:
def wasserstein_frechet_mean(qds_discretized):
    """Compute Wasserstein-Fréchet mean from a sample of quantile densities.

    Averages ``qds_discretized`` over axis 0 and passes the result, together with
    the notebook-level ``grid_densities_univ``, to ``density_from_qd``.
    """
    return density_from_qd(np.mean(qds_discretized, axis=0), grid_densities_univ)