In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [1]:
from functools import partial
from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed

from hyppo.ksample import Hotelling, KSample

from src import generate_binary_sbms, estimate_embeddings



In [2]:
def run_experiment(
    m, block_1, block_2, p, delta, n_components, reps, tests, alpha=0.05
):
    """Estimate per-node rejection rates of two-sample tests on embeddings.

    For each of ``reps`` repetitions, two populations are drawn with
    ``generate_binary_sbms``, embedded with both the "omni" and "mase"
    methods via ``estimate_embeddings``, and every test in ``tests`` is run
    on the embeddings of node 0 and node 19. A p-value counts as a rejection
    when it is at most ``alpha / (block_1 + block_2)``.

    Parameters
    ----------
    m : int
        Forwarded to ``generate_binary_sbms``; also the row index at which
        the embedding array is split into the two groups being compared
        (presumably the number of graphs per population -- TODO confirm).
    block_1, block_2 : int
        Forwarded to ``generate_binary_sbms``; their sum divides ``alpha``.
    p, delta : float
        Forwarded to ``generate_binary_sbms``.
    n_components : int
        Forwarded to ``estimate_embeddings``.
    reps : int
        Number of repetitions.
    tests : sequence
        Objects exposing ``test(x, y, reps=...)`` whose return value has the
        p-value at index 1.
    alpha : float, default 0.05
        Significance level before division by ``block_1 + block_2``.

    Returns
    -------
    list
        ``[m, p, delta, *omni, *mase]`` where ``omni`` and ``mase`` are the
        flattened ``(node, test)`` arrays of rejection rates (node-major, so
        all tests for node 0 come before all tests for node 19).
    """
    total_n = block_1 + block_2

    # NOTE(review): the node indices are hard-coded; index 19 only exists
    # when the embeddings have at least 20 nodes (block_1=5, block_2=15 in
    # this notebook) -- confirm before changing the block sizes.
    node_indices = (0, 19)
    methods = ("omni", "mase")

    # p-values indexed as [repetition, node, test] for each embedding method.
    pvals = {
        method: np.zeros((reps, len(node_indices), len(tests)))
        for method in methods
    }

    for i in range(reps):
        pop1, pop2, _ = generate_binary_sbms(
            m=m, block_1=block_1, block_2=block_2, p=p, delta=delta
        )

        for method in methods:
            embeddings = estimate_embeddings(
                pop1, pop2, method, n_components, sample_space=True
            )
            for idx, j in enumerate(node_indices):
                # Rows before/after index m form the two groups for node j.
                X_nodes = embeddings[:m, j, :]
                Y_nodes = embeddings[m:, j, :]

                for k, test in enumerate(tests):
                    try:
                        pval = test.test(X_nodes, Y_nodes, reps=500)[1]
                        if np.isnan(pval):
                            pval = 1
                    except Exception:
                        # A failing test is recorded as "no rejection".
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still stop the run.
                        pval = 1

                    pvals[method][i, idx, k] = pval

    # Fraction of repetitions in which each (node, test) pair rejected.
    threshold = alpha / total_n
    omni_powers = (pvals["omni"] <= threshold).mean(axis=0)
    mase_powers = (pvals["mase"] <= threshold).mean(axis=0)

    return [m, p, delta, *omni_powers.reshape(-1), *mase_powers.reshape(-1)]

In [3]:
# Number of steps along each axis of the (m, delta) grid.
spacing = 50

# Fixed simulation settings forwarded to run_experiment.
block_1 = 5  # different probability
block_2 = 15
p = 0.5
n_components = 2
reps = 50

# delta grid: spacing + 1 evenly spaced values from 0 to 1 - p inclusive.
deltas = np.linspace(0, 1 - p, spacing + 1)

# m grid: evenly spaced values up to 250, truncated to int, with the
# leading 0 dropped.
ms = np.linspace(0, 250, spacing + 1).astype(int)[1:]

tests = [KSample("MGC"), Hotelling()]

# run_experiment with everything except m and delta bound, so each grid
# point only has to supply those two.
partial_func = partial(
    run_experiment,
    **{
        "block_1": block_1,
        "block_2": block_2,
        "p": p,
        "reps": reps,
        "n_components": n_components,
        "tests": tests,
    },
)

In [4]:
# Offset selecting which slice of the parameter grid this run processes: the
# grid is split by taking every 4th entry starting at `task` (so 0-3 cover
# the whole grid), and the value also tags the output CSV filename.
task = 0

In [5]:
# Every (m, delta) grid point as keyword arguments for partial_func.
args = [{"m": m, "delta": delta} for m, delta in product(ms, deltas)]

# Keep every 4th grid point, starting at offset `task`.
args = args[task::4]

# Alternate entries taken from the back and the front of the list
# (last, first, second-to-last, second, ...), truncated to the original
# length so each entry appears exactly once. Presumably this mixes large-m
# and small-m jobs to balance the workers -- TODO confirm.
args = tuple(arg for pair in zip(reversed(args), args) for arg in pair)[
    : len(args)
]

res = Parallel(n_jobs=-1, verbose=5)(
    delayed(partial_func)(**arg) for arg in args
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed: 157.5min
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed: 250.8min
[Parallel(n_jobs=-1)]: Done 575 out of 638 | elapsed: 498.9min remaining: 54.7min
[Parallel(n_jobs=-1)]: Done 638 out of 638 | elapsed: 519.8min finished


In [6]:
# Column order mirrors the list returned by run_experiment: the three grid
# parameters, then the omni powers, then the mase powers; within each
# method the node index varies slowest and the test fastest.
omni_cols = [f"omni_power_{t}_{i+1}" for i in [0, 19] for t in ["mgc", "hotelling"]]
mase_cols = [f"mase_power_{t}_{i+1}" for i in [0, 19] for t in ["mgc", "hotelling"]]
cols = ["m", "p", "delta"] + omni_cols + mase_cols

res_df = pd.DataFrame(res, columns=cols).sort_values(by=["m", "delta"])

# NOTE(review): "2020401" in the filename looks like a date with a digit
# missing -- confirm before relying on it downstream.
res_df.to_csv(f"./results/2020401_weighted_correct_nodes_{task}.csv", index=False)