In [1]:
from functools import partial
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
from graspy.cluster import GaussianCluster
from joblib import Parallel, delayed
from scipy.stats import mannwhitneyu, ttest_ind, ks_2samp

from src import generate_truncnorm_sbms_with_communities, estimate_embeddings



In [2]:
def estimate_community(embeddings, n_clusters):
    """Cluster ``embeddings`` with ``GaussianCluster`` and return shifted labels.

    Parameters
    ----------
    embeddings
        Data handed directly to ``GaussianCluster.fit_predict``.
    n_clusters
        Passed as both the first and second positional argument of
        ``GaussianCluster`` (presumably the min and max number of components,
        i.e. a fixed cluster count -- confirm against the graspy signature).

    Returns
    -------
    The output of ``fit_predict`` with 1 added to every label.
    """
    # Third positional argument is "all"; presumably the covariance type
    # selection -- TODO confirm against graspy's GaussianCluster.
    clusterer = GaussianCluster(n_clusters, n_clusters, "all")
    raw_labels = clusterer.fit_predict(embeddings)
    return raw_labels + 1

def compute_statistic(tests, pop1, pop2):
    """Run every two-sample test on ``(pop1, pop2)`` and collect the p-values.

    Parameters
    ----------
    tests
        Sequence of callables. Each must expose ``__name__``, which selects the
        calling convention:

        * ``"multiscale_graphcorr"`` -- called with ``reps=250`` and
          ``is_twosamp=True``; must return three values.
        * ``"test"`` -- called with ``reps=250``; must return two values.
        * anything else -- called as ``test(pop1, pop2)``; must return two
          values.
    pop1, pop2
        The two samples, forwarded unchanged to every test.

    Returns
    -------
    numpy.ndarray
        1-D float array with one p-value per entry of ``tests``, in order.
    """

    def _p_value(test):
        # Only the p-value (second returned item) is kept; the statistic and
        # any extra outputs are discarded.
        name = test.__name__
        if name == "multiscale_graphcorr":
            _, p, _ = test(pop1, pop2, reps=250, is_twosamp=True)
        elif name == "test":
            _, p = test(pop1, pop2, reps=250)
        else:
            _, p = test(pop1, pop2)
        return p

    pvals = np.zeros(len(tests))
    for position, test in enumerate(tests):
        pvals[position] = _p_value(test)

    return pvals

def run_experiment(
    m,
    block_1,
    block_2,
    mean_1,
    mean_2,
    var_1,
    var_2,
    mean_delta,
    var_delta,
    n_clusters,
    reps,
    tests,
    alpha=0.05,
):
    """Average precision/recall of block-wise two-sample tests over ``reps`` simulations.

    For each repetition, two populations of graphs are simulated, the
    upper-triangular edges are grouped by the sum of their endpoints' true
    labels, and every test in ``tests`` is applied once per group on the pooled
    edge values. Every edge in a group is flagged when the group's p-value is
    ``<= alpha``. Precision and recall are then computed treating edges whose
    label sum equals 0 as the positives (presumably edges inside the community
    labelled 0, i.e. the one that differs between populations -- confirm
    against ``generate_truncnorm_sbms_with_communities``).

    Parameters
    ----------
    m, block_1, block_2, mean_1, mean_2, var_1, var_2, mean_delta, var_delta
        Forwarded unchanged, by keyword, to
        ``generate_truncnorm_sbms_with_communities``. ``block_1 + block_2`` is
        also used here as the number of vertices.
    n_clusters
        Unused by this function; kept so existing callers keep working.
    reps : int
        Number of independent simulations to average over.
    tests
        Sequence of test callables, as accepted by ``compute_statistic``.
    alpha : float, default 0.05
        Significance threshold applied to each p-value.

    Returns
    -------
    list
        ``[m, mean_1, mean_2, var_1, var_2, mean_delta, var_delta,
        <precision per test...>, <recall per test...>]`` where the metrics are
        means over ``reps``.
    """
    total_n = block_1 + block_2
    # Indices of the strict upper triangle: one entry per undirected edge.
    r, c = np.triu_indices(total_n, k=1)
    n_tests = len(tests)

    # res[i, 0] holds precision and res[i, 1] holds recall for repetition i.
    res = np.zeros((reps, 2, n_tests))

    for i in range(reps):
        pop1, pop2, true_labels = generate_truncnorm_sbms_with_communities(
            m=m,
            block_1=block_1,
            block_2=block_2,
            mean_1=mean_1,
            mean_2=mean_2,
            var_1=var_1,
            var_2=var_2,
            mean_delta=mean_delta,
            var_delta=var_delta,
        )
        # Indexing assumes each population is a (graphs, n, n) array -- the
        # first axis is kept and the last two are reduced to the edge list.
        pop1_edges = pop1[:, r, c]
        pop2_edges = pop2[:, r, c]
        # Edge "class" = sum of the two endpoint labels.
        true_edges = (true_labels[:, None] + true_labels[None, :])[r, c]

        # One row per test, one column per edge. Allocated directly at edge
        # resolution instead of slicing a dense (n_tests, n, n) tensor.
        sig_edges = np.zeros((n_tests, r.size))
        for j in np.unique(true_edges):
            tmp_labels = true_edges == j
            pvals = compute_statistic(
                tests,
                pop1_edges[:, tmp_labels].ravel(),
                pop2_edges[:, tmp_labels].ravel(),
            )
            # The edge classes partition the edges, so each column is written
            # exactly once; a NaN p-value compares False and stays unflagged.
            sig_edges[:, tmp_labels] = (pvals <= alpha)[:, None]

        is_positive = true_edges == 0
        true_pos = sig_edges[:, is_positive].sum(axis=1)
        flagged = sig_edges.sum(axis=1)

        # Precision is defined as 0 when a test flags nothing; the guarded
        # divide avoids the 0/0 RuntimeWarning and the NaN clean-up afterwards.
        prec = np.divide(
            true_pos, flagged, out=np.zeros_like(true_pos), where=flagged > 0
        )
        # Left unguarded on purpose: if no edge has label sum 0 the recall is
        # undefined and NaN propagates into the averaged result.
        recall = true_pos / is_positive.sum()

        res[i, 0] = prec
        res[i, 1] = recall

    # Average over repetitions, then flatten to
    # [precision per test..., recall per test...].
    res = res.mean(axis=0).reshape(-1)

    to_append = [
        m,
        mean_1,
        mean_2,
        var_1,
        var_2,
        mean_delta,
        var_delta,
        *res,
    ]
    return to_append

In [3]:
# ---- Sweep configuration -------------------------------------------------
# Number of steps along each swept axis (grids below have `spacing` or
# `spacing + 1` points).
spacing = 50
reps = 50

# Community sizes.
block_1 = 25  # different probability
block_2 = 25

# Edge-weight parameters handed to run_experiment unchanged.
mean_1 = 0
mean_2 = 0
var_1 = 0.25
var_2 = 0.25

# The variance shift is held fixed; the mean shift is swept via `mean_deltas`.
# `mean_delta` is only a placeholder here and is overridden per task below.
mean_delta = 0
var_delta = 0
mean_deltas = np.linspace(mean_1, 1 - mean_1, spacing + 1)

# Forwarded to run_experiment.
n_clusters = range(2, 5)

# 5, 10, ..., 250: the leading 0 of the linspace grid is dropped.
ms = np.linspace(0, 250, spacing + 1)[1:].astype(int)

tests = [ks_2samp, mannwhitneyu, ttest_ind]

# ---- Bind everything that stays constant across the sweep ----------------
# `m` and `mean_delta` are intentionally left unbound; they vary per task.
partial_func = partial(
    run_experiment,
    tests=tests,
    reps=reps,
    n_clusters=n_clusters,
    var_delta=var_delta,
    var_2=var_2,
    var_1=var_1,
    mean_2=mean_2,
    mean_1=mean_1,
    block_2=block_2,
    block_1=block_1,
)

# Full Cartesian grid, `m` varying slowest: len(ms) * len(mean_deltas) tasks.
args = [{"m": m, "mean_delta": delta} for m, delta in product(ms, mean_deltas)]

# ---- Run the sweep on every available core -------------------------------
run_one = delayed(partial_func)
res = Parallel(n_jobs=-1, verbose=7)(run_one(**kwargs) for kwargs in args)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed: 40.7min
[Parallel(n_jobs=-1)]: Done 1160 tasks      | elapsed: 57.4min
[Parallel(n_jobs=-1)]: Done 1376 tasks      | elapsed: 79.6min
[Parallel(n_jobs=-1)]: Done 1608 tasks      | elapsed: 106.6min
[Parallel(n_jobs=-1)]: Done 1856 tasks      | elapsed: 140.3min
[Parallel(n_jobs=-1)]: Done 2120 tasks      | elapsed: 181.2min
[Parallel(n_jobs=-1)]: Done 2550 out of 2550 | elapsed: 254.2min finished


In [4]:
# Column layout mirrors the list returned by run_experiment: the seven
# simulation parameters first, then one precision column per test followed by
# one recall column per test (same order as `tests`).
# The "omni_" prefix is part of the CSV schema and is kept unchanged.
cols = [
    "m",
    "mean_1",
    "mean_2",
    "var_1",
    "var_2",
    "mean_delta",
    "var_delta",
    *[
        f"omni_{metric}_{test.__name__}"
        for metric in ["precision", "recall"]
        for test in tests
    ],
]

res_df = pd.DataFrame(res, columns=cols)

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so the long-running sweep is not followed by a failed write.
out_path = Path("./results/20200321_truth_means.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
res_df.to_csv(out_path, index=False)