In [None]:
"""Sandbox module."""
import warnings

import matplotlib.pyplot as plt
import numpy as np
from fda_funcs import (
    compute_fpc_scores,
    compute_moments,
    compute_principal_components,
    mode_of_variation,
)
from misc import (
    cdf_from_density,
    norm_pdf,
    quantile_from_cdf,
    riemann_sum_arrays,
    trunc_norm_pdf,
)
from scipy.stats import norm, truncnorm

%matplotlib inline

In [None]:
# Report every warning on each occurrence instead of only the first time it
# is raised from a given location.
warnings.simplefilter("always")

$\int_0^1 f(t)\,dt$ is then replaced by the Riemann sum $\frac{1}{m} \sum_{j=1}^m f(s_j)$ ($s_j$ - grid points, $m$ - number of grid points).

# Functional Data Analysis

Ideas for simulation
- Uni- vs. Multivariate case
- Simulate different normal distributions
- Vary parameters of (generalized) Beta distribution, so principal components can be interpreted as varying parameters

$X(t) = \sum_{k=1}^n \eta_k \varphi_k(t)$


## Transformation Method Paper (Petersen & Müller 2016)

In [None]:
# 1. Generate synthetic functional data
n = 200  # number of simulated distributions
gridnum = 1000  # number of grid points per function
truncation_point = 3  # densities live on [-truncation_point, truncation_point]
delta = 0  # margin cut off at both ends of the probability grid

# Equidistant grids: one over the support, one over the probability levels
grid_densities = np.linspace(-truncation_point, truncation_point, gridnum)
grid_quantiles = np.linspace(delta, 1 - delta, gridnum)
range_support = grid_densities[-1] - grid_densities[0]

# All means are zero; only the standard deviations vary across samples.
mus = np.zeros(n)
# Draw different sigmas (uniform on the log scale, fixed seed for reproducibility)
log_sigmas = np.random.default_rng(seed=28071995).uniform(low=-1.5, high=1.5, size=n)
sigmas = np.exp(log_sigmas)

In [None]:
# Calculate discretized distributions, with different methods, to compare performance

# Calculate densities
# Truncation bounds in standardized units, the form scipy's truncnorm takes.
a = (-truncation_point - mus) / sigmas
b = (truncation_point - mus) / sigmas

# Each call evaluates on a (gridnum, 1) column against the per-sample
# parameters; the result is transposed so that one row holds one sample.
# 1. Own normal pdf
densities_discretized = norm_pdf(grid_densities[:, None], mus, sigmas).T
# 2. scipy's truncated normal
densities_discretized2 = truncnorm.pdf(
    grid_densities[:, None], a, b, loc=mus, scale=sigmas
).T
# 3. Own truncated normal (bounds passed on the original scale)
densities_discretized3 = trunc_norm_pdf(
    grid_densities[:, None], mus, sigmas, -truncation_point, truncation_point
).T

# Calculate distribution functions
# The same call is applied to each of the three density arrays (along axis 1).
cdfs_discretized, cdfs_discretized2, cdfs_discretized3 = (
    cdf_from_density(grid_densities, density_rows, axis=1)
    for density_rows in (
        densities_discretized,
        densities_discretized2,
        densities_discretized3,
    )
)

# Calculate quantiles
# 1. scipy's normal ppf
quantiles_discretized = norm.ppf(grid_quantiles[:, None], loc=mus, scale=sigmas).T
# 2. scipy's truncated normal ppf
quantiles_discretized2 = truncnorm.ppf(
    grid_quantiles[:, None], a, b, loc=mus, scale=sigmas
).T
# 3. Own inversion of the discretized CDFs; the support grid is passed as a
#    (1, gridnum) row.
quantiles_discretized3 = quantile_from_cdf(
    grid_densities[None, :],
    cdfs_discretized3,
    grid_quantiles,
)

# Calculate quantile densities
# Each one is the reciprocal of the density evaluated at the quantiles.
quantile_densities_discretized = np.reciprocal(
    norm_pdf(quantiles_discretized.T, mus, sigmas)
).T
quantile_densities_discretized2 = np.reciprocal(
    truncnorm.pdf(quantiles_discretized2.T, a, b, loc=mus, scale=sigmas)
).T
quantile_densities_discretized3 = np.reciprocal(
    trunc_norm_pdf(
        quantiles_discretized3.T, mus, sigmas, -truncation_point, truncation_point
    )
).T

# Normalize quantile densities
# Rescale every row by range_support divided by its Riemann sum over the
# probability grid. The same expression is applied to all three variants.
(
    quantile_densities_discretized,
    quantile_densities_discretized2,
    quantile_densities_discretized3,
) = (
    qdf_rows
    * range_support
    / riemann_sum_arrays(grid_quantiles, qdf_rows, axis=1)[:, np.newaxis]
    for qdf_rows in (
        quantile_densities_discretized,
        quantile_densities_discretized2,
        quantile_densities_discretized3,
    )
)

In [None]:
# Left end of the support grid plus the first entry of the cumulative Riemann
# sum of the first sample's quantile density (implementation 3).
grid_densities[0] + riemann_sum_arrays(
    grid_quantiles,
    quantile_densities_discretized3[0],
    axis=0,
    cumsum=True,
)[0]

In [None]:
# Check whether my functions differ from the scipy functions on the first
# generated sample: for each pair, count the grid points whose absolute
# difference exceeds eps. The first four entries compare implementation 1
# with 2, the last four compare implementation 2 with 3.
eps = 1e-3
tuple(
    (np.abs(left[0] - right[0]) > eps).sum()
    for left, right in (
        (densities_discretized, densities_discretized2),
        (cdfs_discretized, cdfs_discretized2),
        (quantiles_discretized, quantiles_discretized2),
        (quantile_densities_discretized, quantile_densities_discretized2),
        (densities_discretized2, densities_discretized3),
        (cdfs_discretized2, cdfs_discretized3),
        (quantiles_discretized2, quantiles_discretized3),
        (quantile_densities_discretized2, quantile_densities_discretized3),
    )
)

### Check how fast the different implementations are.

1. My normal distribution functions
2. Scipy's Truncated normals
3. My truncated normals

(Approaches 1 and 2 use scipy's `ppf` implementation for the quantiles; approach 3 inverts the discretized CDF with `quantile_from_cdf`.)

In [None]:
%%timeit
# Approach 1: own normal pdf, scipy's normal ppf, reciprocal pdf at the quantiles
densities_discretized = norm_pdf(grid_densities[:, None], mus, sigmas).T
quantiles_discretized = norm.ppf(grid_quantiles[:, None], loc=mus, scale=sigmas).T
quantile_densities_discretized = np.reciprocal(
    norm_pdf(quantiles_discretized.T, mus, sigmas)
).T

In [None]:
%%timeit
# Approach 2: scipy's truncated normal for pdf, ppf and the quantile density
densities_discretized2 = truncnorm.pdf(
    grid_densities[:, None], a, b, loc=mus, scale=sigmas
).T
quantiles_discretized2 = truncnorm.ppf(
    grid_quantiles[:, None], a, b, loc=mus, scale=sigmas
).T
quantile_densities_discretized2 = np.reciprocal(
    truncnorm.pdf(quantiles_discretized2.T, a, b, loc=mus, scale=sigmas)
).T

In [None]:
%%timeit
# Approach 3: own truncated normal. The calls use the same arguments as the
# computation cell: truncation bounds on the original scale (not the
# standardized a, b that scipy takes) and the CDFs of implementation 3.
densities_discretized3 = trunc_norm_pdf(
    grid_densities[:, np.newaxis],
    mus,
    sigmas,
    -truncation_point,
    truncation_point,
).transpose()
# NOTE: cdfs_discretized3 comes from an earlier cell, so computing the CDFs
# is not part of this timing.
quantiles_discretized3 = quantile_from_cdf(
    grid_densities[:, np.newaxis].transpose(),
    cdfs_discretized3,
    grid_quantiles,
)
quantile_densities_discretized3 = np.reciprocal(
    trunc_norm_pdf(
        quantiles_discretized3.transpose(),
        mus,
        sigmas,
        -truncation_point,
        truncation_point,
    ),
).transpose()

## 3rd approach faster than scipy's
It is less accurate for the quantiles, though. Accuracy improves with a finer grid, but I won't go
beyond 1000 grid points because of the computational cost.

### Use implementation 3, faster than scipy but still correct

In [None]:
# Short aliases for the implementation-3 results: densities, CDFs, quantile
# functions and quantile densities.
ds_discretized, cs_discretized, qs_discretized, qds_discretized = (
    densities_discretized3,
    cdfs_discretized3,
    quantiles_discretized3,
    quantile_densities_discretized3,
)

## Compare my discretized cdf to analytical cdf from scipy, to see whether S shape is there

In [None]:
# Sample whose computed CDF is inspected; adjust index as needed.
cdf_sample_idx = 14

# Analytical CDF of the truncated normal with this sample's own parameters.
# A standard normal reference would only match a sample with sigma = 1 and
# ignores the truncation, so it is not a like-for-like comparison.
analytical_cdf = truncnorm.cdf(
    grid_densities,
    a=a[cdf_sample_idx],
    b=b[cdf_sample_idx],
    loc=mus[cdf_sample_idx],
    scale=sigmas[cdf_sample_idx],
)

# Plot
fig, ax = plt.subplots()
ax.plot(grid_densities, analytical_cdf, label="Analytical CDF")
ax.plot(grid_densities, cs_discretized[cdf_sample_idx], label="Computed CDF")
ax.legend()
ax.set_xlabel("Value")
ax.set_ylabel("Cumulative Probability")
ax.set_title("Comparison of Analytical and Computed CDFs")
plt.show()

## Perform FPCA

In [None]:
# Compute moments to work with in FPCA. The three results are unpacked as the
# mean function, the centered densities and the covariance matrix; the input
# is the discretized densities of implementation 3.
mean_function, centered_densities, cov_matrix = compute_moments(ds_discretized)

In [None]:
# Compute the eigenvalues and eigenfunctions (principal components) of the
# covariance matrix; the support grid is passed along as second argument.
eigenvalues, eigenfunctions = compute_principal_components(cov_matrix, grid_densities)

In [None]:
# Compute FPC scores / factor loadings from the centered densities and the
# eigenfunctions on the support grid.
fpc_scores = compute_fpc_scores(centered_densities, eigenfunctions, grid_densities)

In [None]:
# Plot the mean function and the first two eigenfunctions
# (the eigenfunctions are stored column-wise).
fig, ax = plt.subplots()
ax.plot(grid_densities, mean_function, label="Mean Function")
ax.plot(grid_densities, eigenfunctions[:, 0], label="First component")
ax.plot(grid_densities, eigenfunctions[:, 1], label="Second component")
# Title and axis labels so the figure can be read on its own.
ax.set_title("Mean function and first two eigenfunctions")
ax.set_xlabel("x")
ax.set_ylabel("Function value")
ax.legend()
plt.show()

## More reproducing of Petersen & Müller (2016)

In [None]:
# Compute first two modes of variation from the mean function and the two
# leading eigenvalue / eigenfunction pairs.
# NOTE(review): the trailing 1 is presumably the scaling factor of the
# mode — confirm against mode_of_variation's signature.
first_two_modes = mode_of_variation(mean_function, eigenvalues[:2], eigenfunctions[:, :2], 1)

In [None]:
# Plot the mean function together with the first two modes of variation
# (the modes are stored column-wise).
fig, ax = plt.subplots()
ax.plot(grid_densities, mean_function, label="Mean function")
ax.plot(grid_densities, first_two_modes[:, 0], label="First mode")
ax.plot(grid_densities, first_two_modes[:, 1], label="Second mode")
# Title and axis labels so the figure can be read on its own.
ax.set_title("Mean function and first two modes of variation")
ax.set_xlabel("x")
ax.set_ylabel("Function value")
ax.legend()
plt.show()

## Fréchet mean estimation


Some errors in the code here, corrected in the debugging Jupyter Notebook. Will
continue in Petersen_Müller2016 notebook.

In [None]:
# Pointwise averages over the samples (axis 0) of the quantile functions and
# of the quantile densities.
mean_qf = qs_discretized.mean(axis=0)
mean_qdf = qds_discretized.mean(axis=0)

In [None]:
mqdf = mean_qdf

# Cumulative Riemann sum of the mean quantile density, shifted by the left end
# of the support, as a numerical counterpart of the quantile function.
numerical_qf = grid_densities[0] + riemann_sum_arrays(
    grid_quantiles,
    array=mqdf,
    axis=0,
    cumsum=True,
)

fig, ax = plt.subplots()
ax.plot(grid_quantiles, mqdf, label="mean qdf")
ax.plot(grid_quantiles, numerical_qf, label="num qf")
# Title and axis labels so the figure can be read on its own.
ax.set_title("Mean quantile density and numerically integrated quantile function")
ax.set_xlabel("Probability level")
ax.set_ylabel("Value")
ax.legend()
plt.show()

In [None]:
def fve(k, eigvals=None):
    """Compute the fraction of variance explained by the first ``k`` eigenvalues.

    Parameters
    ----------
    k : int
        Number of leading entries of the eigenvalue array to include.
    eigvals : array-like, optional
        Eigenvalues to use. When omitted, the notebook-level ``eigenvalues``
        is used, so existing ``fve(k)`` calls behave as before.

    Returns
    -------
    float
        ``sum(eigvals[:k]) / sum(eigvals)``.
    """
    if eigvals is None:
        # Fall back to the global computed in the FPCA section.
        eigvals = eigenvalues
    eigvals = np.asarray(eigvals)
    return np.sum(eigvals[:k]) / np.sum(eigvals)


# Fraction of variance explained by the first one and the first two eigenvalues.
fve(1), fve(2)

In [None]:
# `f_oplus` was plotted here without ever being defined in this notebook, so
# the cell raised a NameError on a fresh kernel. Build it from the mean
# quantile function and mean quantile density via f(x) = 1 / q(F(x)):
# 1. F(x): invert the mean quantile function by linear interpolation,
# 2. q(F(x)): evaluate the mean quantile density at those probability levels.
# NOTE(review): assumes mean_qf is non-decreasing over grid_quantiles, which
# np.interp needs for its x-coordinates — confirm.
cdf_oplus = np.interp(grid_densities, mean_qf, grid_quantiles)
f_oplus = np.reciprocal(np.interp(cdf_oplus, grid_quantiles, mean_qdf))

# Single example density shown next to the two mean curves.
density_idx = 40

fig, ax = plt.subplots()
ax.plot(grid_densities, ds_discretized[density_idx], label=f"Density {density_idx}")
ax.plot(grid_densities, mean_function, label="mean pdf")
ax.plot(grid_densities, f_oplus, label="wf mean")
# Title and axis labels so the figure can be read on its own.
ax.set_title("Example density, cross-sectional mean and wf mean")
ax.set_xlabel("x")
ax.set_ylabel("Density")
ax.legend()
plt.show()