In [None]:
"""Sandbox module."""
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import skfda
from scipy import stats
from skfda.preprocessing.dim_reduction import FPCA

%matplotlib inline

# Nonparametric Statistics

## Kernel density estimation

In [None]:
def _epanechnikov(u):
    """Epanechnikov kernel, non-zero only where ``abs(u) <= sqrt(5)``."""
    inside_support = int(abs(u) <= np.sqrt(5))
    return 3 / (4 * np.sqrt(5)) * (1 - (u**2) / 5) * inside_support


def _uniform(u):
    """Uniform kernel: constant 0.5 where ``abs(u) <= 1``, zero elsewhere."""
    inside_support = int(abs(u) <= 1)
    return 0.5 * inside_support


def _triangular(u):
    """Triangular kernel: ``1 - abs(u)`` where ``abs(u) <= 1``, zero elsewhere."""
    inside_support = int(abs(u) <= 1)
    return (1 - abs(u)) * inside_support


# Kernel functions keyed by name. Every kernel takes a single scalar ``u``
# (the indicator is built with ``int(...)``), so array inputs have to be
# wrapped with ``np.vectorize`` by the caller.
kernels = {
    "Epanechnikov": _epanechnikov,
    "Uniform": _uniform,
    "Triangular": _triangular,
}

In [None]:
def kernel_estimator(x, h, sample, kernel_type):
    """Kernel density estimator function.

    Computes ``1 / (len(sample) * h) * sum_i K((x - sample_i) / h)`` where ``K``
    is the kernel stored under ``kernel_type`` in the module-level ``kernels``
    dictionary.

    Parameters
    ----------
    x
        Point at which the density estimate is evaluated.
    h
        Bandwidth; must be strictly positive.
    sample
        Observations the estimate is built from (``len(sample)`` is the
        sample size used in the normalisation).
    kernel_type
        Key into ``kernels`` selecting the kernel function.

    Returns
    -------
    The estimated density at ``x``.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive. A zero or negative bandwidth would
        otherwise produce a division by zero or a negative "density".
    KeyError
        If ``kernel_type`` is not a key of ``kernels``.
    """
    if np.any(np.asarray(h) <= 0):
        msg = "bandwidth h must be strictly positive"
        raise ValueError(msg)
    # The kernels only accept scalars, hence the vectorised wrapper.
    k = np.vectorize(kernels[kernel_type])
    # np.sum over the first axis matches what the builtin ``sum`` did, but runs
    # in compiled code instead of a Python-level loop.
    return 1 / (len(sample) * h) * np.sum(k((x - sample) / h), axis=0)

### Fix parameters and generate sample

In [None]:
n = 200  # sample size
n_grid = 100  # number of points at which the estimators are evaluated
grid_ending = 10  # the evaluation grid spans [-grid_ending, grid_ending]
mu = 0  # location of the normal distribution the sample is drawn from
sigma = 1  # scale of the normal distribution the sample is drawn from

# A fixed seed keeps the sample, and everything computed from it, reproducible
# after "Restart Kernel -> Run All".
sample = np.random.default_rng(seed=2007).normal(loc=mu, scale=sigma, size=n)
grid = np.linspace(start=-grid_ending, stop=grid_ending, num=n_grid)
# Rule-of-Thumb bandwidth (Li and Racine 2007, p. 66)
# NOTE(review): normal-reference rules are usually stated with a kernel-specific
# constant (e.g. 1.06 for a Gaussian kernel); the constant here is 1 -- confirm.
bandwidth = np.std(sample) * (n ** (-0.2))  # should implement optimal bandwidth

In [None]:
# Bind the sample once, so that only x, h and kernel_type remain to be supplied.
kernel_estimator_given_sample = partial(kernel_estimator, sample=sample)

### Generate fitted values

In [None]:
# Evaluate the density estimate at every grid point, once per kernel type.
# The generator yields the three lists in the order of the kernel names.
values_epa, values_uni, values_tri = (
    [
        kernel_estimator_given_sample(x=point, h=bandwidth, kernel_type=name)
        for point in grid
    ]
    for name in ("Epanechnikov", "Uniform", "Triangular")
)

### Plots

In [None]:
fig, ax = plt.subplots()
ax.plot(grid, values_epa, label="Epanechnikov")
ax.plot(grid, stats.norm.pdf(grid, loc=mu, scale=sigma), label="True density")
# plot histogram for comparison
ax.hist(
    sample,
    bins=grid,
    density=True,
    histtype="step",
    edgecolor="black",
    linewidth=0.5,
    label="Histogram",
)
# Label the figure so that it can be read on its own when skimming the notebook.
ax.set_xlabel("x")
ax.set_ylabel("Density")
ax.set_title("Kernel density estimate vs. true density")
# Use the explicit Axes interface consistently instead of the pyplot state machine.
ax.legend()
plt.show()

## Kernel Regression

Context: we want to investigate the nonparametric regression relation $y_i = m(x_i) +
\epsilon_i$, where $y_i$ is the dependent variable, $x_i$ the explanatory variable, and
$\epsilon_i$ an i.i.d. error term, for observations $i = 1, \dots, n$.

### Sample generation

In [None]:
def m(x):
    """Return the true regression function ``3 * sin(x) + 2 * x`` at ``x``."""
    oscillation = 3 * np.sin(x)
    trend = 2 * x
    return oscillation + trend

In [None]:
# A fixed seed makes the noise draw reproducible across kernel restarts.
epsilon = np.random.default_rng(seed=1964).normal(0, sigma / 2, size=n)
# Responses: true regression function at the sampled points plus noise.
y = m(sample) + epsilon

In [None]:
def m_hat(x, y, h, sample, kernel_type):
    """Kernel-weighted average of ``y`` around the point ``x``.

    Computes ``sum_i K((x - sample_i) / h) * y_i / sum_i K((x - sample_i) / h)``
    where ``K`` is the kernel stored under ``kernel_type`` in the module-level
    ``kernels`` dictionary.

    Parameters
    ----------
    x
        Point at which the regression function is estimated.
    y
        Responses, aligned element by element with ``sample``.
    h
        Bandwidth.
    sample
        Observed values of the explanatory variable.
    kernel_type
        Key into ``kernels`` selecting the kernel function.

    Returns
    -------
    The estimate at ``x``. It is ``nan`` when all kernel weights are zero,
    i.e. when no observation lies inside the kernel window around ``x``.
    """
    k = np.vectorize(kernels[kernel_type])
    # The weights are shared by numerator and denominator, so evaluate the
    # (slow, Python-level) vectorised kernel only once.
    weights = k((x - sample) / h)
    numerator = np.sum(weights * y, axis=0)
    denominator = np.sum(weights, axis=0)
    # All weights can be zero far away from the data; 0 / 0 then yields NaN,
    # which is the intended "no estimate" marker, so silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return numerator / denominator

In [None]:
# Bind the data, the bandwidth and the kernel; only the evaluation point x
# remains to be supplied.
temp1 = partial(m_hat, y=y, h=bandwidth, sample=sample, kernel_type="Epanechnikov")
# Wrap with np.vectorize so the estimator can be called with an array of
# evaluation points.
temp2 = np.vectorize(temp1)

In [None]:
fig, ax = plt.subplots()
ax.plot(grid, m(grid), label="True relation")
ax.plot(grid, temp2(x=grid), label="Nadaraya-Watson estimator")
# Label the figure so that it can be read on its own when skimming the notebook.
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Kernel regression estimate vs. true relation")
# Use the explicit Axes interface consistently instead of the pyplot state machine.
ax.legend()
plt.show()

In the plot above, we see that the rule-of-thumb bandwidth is too large. When there is
time, I could implement a better algorithm to search for the optimal bandwidth.

# Functional Data Analysis

Ideas for simulation
- Uni- vs. Multivariate case
- Simulate different normal distributions
- Vary the parameters of a (generalized) Beta distribution, so that the principal components can be interpreted in terms of the varying parameters

## Transformation Method Paper (Petersen & Müller 2016)

In [None]:
# Equispaced grid of 2000 points on [-1, 1], replicated in n identical columns
# (resulting shape: (2000, n)), so that each column lines up with one (mu, sigma) pair.
# NOTE: this rebinds the name `grid` that the kernel sections above use for their
# one-dimensional evaluation grid.
grid = np.linspace(start=-np.ones(n), stop=np.ones(n), num=2000)


# Define normal density
def norm_density(x, mu, sigma):
    """Evaluate the normal density elementwise.

    All three arguments are converted to NumPy arrays and combined under the
    usual broadcasting rules, so the columns of ``x`` must align with the
    entries of ``mu`` and ``sigma`` (this alignment is still to be tested).

    Parameters
    ----------
    x
        Points at which the density is evaluated.
    mu
        Location parameter(s).
    sigma
        Scale parameter(s).

    Returns
    -------
    Array of density values with the broadcast shape of the inputs.
    """
    points, location, scale = np.array(x), np.array(mu), np.array(sigma)
    normalizing_factor = np.reciprocal(np.sqrt(2 * np.pi) * scale)
    standardized = (points - location) / scale
    return normalizing_factor * np.exp((-0.5) * standardized**2)

In [None]:
# One centred normal density per draw: the means are all zero, while the
# log-standard-deviations are drawn uniformly from [-1.5, 1.5) with a fixed seed.
mus = np.zeros(n)
log_sigmas = np.random.default_rng(seed=28071995).uniform(
    low=-1.5,
    high=1.5,
    size=n,
)
sigmas = np.exp(log_sigmas)
# Transpose so that each row holds one discretized density.
densities_discretized = norm_density(grid, mus, sigmas).T
# Show the first density together with the sigma that generated it.
(densities_discretized[0], sigmas[0])

In [None]:
# Do FPCA via package
fpca_discretized = FPCA(n_components=1)
# Pass the evaluation points explicitly. Without `grid_points`, FDataGrid assumes
# an equispaced grid on [0, 1], but the densities were evaluated on the points in
# `grid` (all of its columns are identical, so the first column is used).
fd = skfda.FDataGrid(
    data_matrix=densities_discretized,
    grid_points=grid[:, 0],
)
fpca_discretized.fit(fd)
# Plot the estimated principal component function(s).
fpca_discretized.components_.plot()

In [None]:
# Sample densities