# Comparing the False Positive Rate (Type I) Under the Null

The purpose of this simulation is to confirm the validity of the test, and thus, we expect the testing power to be close to the significance level $\alpha$. Here we use the independent AR(1) processes, and the sampling process is:

\begin{equation}
\begin{bmatrix}
X_t\\
Y_t
\end{bmatrix}
=
\begin{bmatrix}
\phi & 0\\
0 & \phi
\end{bmatrix}
\begin{bmatrix}
X_{t-1}\\
Y_{t-1}
\end{bmatrix} +
\begin{bmatrix}
\epsilon_t\\
\eta_t
\end{bmatrix},
\end{equation}

where $(\epsilon_t,\eta_t)$ is the noise generated by standard normal. For first experiment, we vary the length of time series from $n\in \{10, 20, 30, \ldots, 200\}$ with $\phi=0.5$. For second experiment, we vary the AR coefficient $\phi\in\{0.2,  0.25,\ldots, 0.95\}$ with $n=1200$. We use 1000 permutation per replication with 300 replications in total.

See here for wildHSIC and shiftHSIC computation done in matlab, and see here for data generation notebook.


In [6]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from joblib import Parallel, delayed

from hyppo.time_series import MGCX, DcorrX, LjungBox

test_dict = {"LjungBox": LjungBox, "DcorrX": DcorrX, "MGCX": MGCX}

In [None]:
def worker(X, Y, test, reps=1000):
    n, d = X.shape

    res = test.test(X, Y, reps=reps)
    return n, d, res[1]  # pvalue


# load data
data = sp.io.loadmat("./data/1-independent_ar_n.mat")
X = data["X"]
Y = data["Y"]

# parameters
n_reps = X.shape[0]
ns = list(range(10, 201, 10))

dfs = []
for test_name, test in test_dict.items():
    results = Parallel(-2, verbose=1)(
        delayed(worker)(X[i, :n, :], Y[i, :n, :], test(max_lag=1))
        for n in ns
        for i in range(n_reps)
    )

    df = pd.DataFrame(results, columns=["n", "d", "pval"])
    df["test"] = test_name
    dfs.append(df)

df = pd.concat(dfs, axis=0, ignore_index=True)
df.to_csv("./outs/indep_ar_n.csv", index=False)

In [None]:
def worker(X, Y, test, phi, reps=1000):
    n, d = X.shape

    res = test.test(X, Y, reps=reps)
    return n, d, phi, res[1]  # pvalue


# load data
data = sp.io.loadmat("./data/2-independent_ar_phi.mat")
X = data["X"]
Y = data["Y"]
phis = data["phi"]

# parameters
n_reps = X.shape[1]

dfs = []
for test_name, test in test_dict.items():
    results = Parallel(-2, verbose=1)(
        delayed(worker)(X[j, i, :, :], Y[j, i, :, :], test(max_lag=1), phis[j])
        for j in range(len(phis))
        for i in range(n_reps)
    )

    df = pd.DataFrame(results, columns=["n", "d", "pval", "phi"])
    df["test"] = test_name
    dfs.append(df)

df = pd.concat(dfs, axis=0, ignore_index=True)
df.to_csv("./outs/indep_ar_phi.csv", index=False)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 95 concurrent workers.
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:    2.8s finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 95 concurrent workers.
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  1.2min finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 95 concurrent workers.
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:  4.1min


# VAR(1)

In [None]:
def worker(X, Y, test, reps=1000):
    n, d = X.shape

    res = test.test(X, Y, reps=reps)
    return n, d, res[1]  # pvalue


# load data
data = sp.io.loadmat("./data/6-independent_var_n.mat")
X = data["X"]
Y = data["Y"]

# parameters
n_reps = X.shape[0]
ns = list(range(10, 201, 10))

dfs = []
for test_name, test in test_dict.items():
    results = Parallel(-2, verbose=1)(
        delayed(worker)(X[i, :n, :], Y[i, :n, :], test(max_lag=1))
        for n in ns
        for i in range(n_reps)
    )

    df = pd.DataFrame(results, columns=["n", "d", "pval"])
    df["test"] = test_name
    dfs.append(df)

df = pd.concat(dfs, axis=0, ignore_index=True)
df.to_csv("./outs/indep_var_n.csv", index=False)