In [1]:
from argparse import ArgumentParser
from functools import partial
from itertools import product

import numpy as np
import pandas as pd
from graspy.cluster import GaussianCluster
from joblib import Parallel, delayed
from scipy.stats import mannwhitneyu, ttest_ind, ks_2samp

from src import generate_truncnorm_sbms_with_communities, estimate_embeddings



In [2]:
def estimate_community(embeddings, n_clusters):
    """Cluster the embedded vertices and return their community labels.

    A ``GaussianCluster`` is built with ``n_clusters`` as both of its first two
    positional arguments and ``"all"`` as the third, then fit on
    ``embeddings``.

    Parameters
    ----------
    embeddings
        Passed straight to ``GaussianCluster.fit_predict``.
    n_clusters : int
        Used as the first and second constructor argument of
        ``GaussianCluster`` (presumably the minimum and maximum number of
        components, which pins the model to exactly ``n_clusters`` -- confirm
        against the graspy version in use).

    Returns
    -------
    The labels produced by ``fit_predict``, each shifted up by one.
    """
    clusterer = GaussianCluster(n_clusters, n_clusters, "all")
    raw_labels = clusterer.fit_predict(embeddings)
    return raw_labels + 1

def compute_statistic(tests, pop1, pop2):
    """Run every two-sample test on ``pop1`` vs ``pop2`` and collect p-values.

    The calling convention is chosen from the callable's ``__name__``:

    * ``"multiscale_graphcorr"`` -- called with ``reps=250, is_twosamp=True``
      and must return exactly three values.
    * ``"test"`` -- called with ``reps=250`` and must return exactly two values.
    * anything else -- called with the two samples only and must return
      exactly two values.

    In every case the second returned value is taken as the p-value.

    Parameters
    ----------
    tests : sequence of callables
        Each must expose ``__name__``.
    pop1, pop2
        The two samples, forwarded unchanged to each test.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(tests)`` holding the p-values in the
        same order as ``tests``.
    """
    pvalues = np.zeros(len(tests))

    for position, stat_test in enumerate(tests):
        test_name = stat_test.__name__
        if test_name == "multiscale_graphcorr":
            _stat, p_value, _extra = stat_test(
                pop1, pop2, reps=250, is_twosamp=True
            )
        elif test_name == "test":
            _stat, p_value = stat_test(pop1, pop2, reps=250)
        else:
            # Plain two-argument tests (e.g. the scipy.stats functions).
            _stat, p_value = stat_test(pop1, pop2)
        pvalues[position] = p_value

    return pvalues

def run_experiment(
    m,
    block_1,
    block_2,
    mean_1,
    mean_2,
    var_1,
    var_2,
    mean_delta,
    var_delta,
    n_clusters,
    reps,
    tests,
):
    """Score community-level two-sample edge testing by precision and recall.

    For each of ``reps`` repetitions two populations of graphs are drawn with
    ``generate_truncnorm_sbms_with_communities``. The vertices are embedded
    with both ``"mase"`` and ``"omni"`` (via ``estimate_embeddings``) and, for
    every ``k`` in ``n_clusters``, clustered with ``estimate_community``. Every
    unordered pair of predicted communities defines a group of
    upper-triangular edges; the edges of each group are pooled per population
    and compared with every test in ``tests``. For each test, the group with
    the smallest p-value is declared significant and compared against the
    edges whose ``true_labels`` sum is zero.

    Parameters
    ----------
    m, mean_1, mean_2, var_1, var_2, mean_delta, var_delta
        Forwarded unchanged to ``generate_truncnorm_sbms_with_communities``
        and echoed at the start of the returned row. (``m`` is presumably the
        number of graphs per population -- confirm against the generator.)
    block_1, block_2 : int
        Forwarded to the generator; their sum is used as the number of
        vertices when building the upper-triangle index.
    n_clusters : sequence of int
        Numbers of communities to try.
    reps : int
        Number of independent repetitions that are averaged.
    tests : sequence of callables
        Two-sample tests, dispatched by ``compute_statistic``.

    Returns
    -------
    list
        ``[m, mean_1, mean_2, var_1, var_2, mean_delta, var_delta, *omni,
        *mase]`` where ``omni`` / ``mase`` are the across-repetition means
        flattened in ``(n_clusters, (precision, recall), tests)`` order.

    Notes
    -----
    Community pairs are identified by a collision-free ``(min, max)`` code.
    Multiplying the two labels instead is ambiguous once there are four or
    more clusters (``1 * 4 == 2 * 2``), which would pool unrelated community
    pairs into one test. For fewer than four clusters the code yields the same
    groups in the same sorted order as the label product.
    """
    total_n = block_1 + block_2
    r, c = np.triu_indices(total_n, k=1)
    n_tests = len(tests)

    # One (reps, k, [precision, recall], test) array per embedding method.
    results = {
        "omni": np.zeros((reps, len(n_clusters), 2, n_tests)),
        "mase": np.zeros((reps, len(n_clusters), 2, n_tests)),
    }

    for i in range(reps):
        pop1, pop2, true_labels = generate_truncnorm_sbms_with_communities(
            m=m,
            block_1=block_1,
            block_2=block_2,
            mean_1=mean_1,
            mean_2=mean_2,
            var_1=var_1,
            var_2=var_2,
            mean_delta=mean_delta,
            var_delta=var_delta,
        )
        # Keep only the strict upper triangle of every graph.
        pop1_edges = pop1[:, r, c]
        pop2_edges = pop2[:, r, c]

        # Edges whose endpoint labels sum to zero are the positives.
        # NOTE(review): presumably both endpoints carry label 0, i.e. the block
        # whose distribution differs between populations -- confirm against
        # generate_truncnorm_sbms_with_communities.
        true_edges = (true_labels[:, None] + true_labels[None, :])[r, c]
        is_true_edge = true_edges == 0
        n_true_edges = is_true_edge.sum()

        for method in ["mase", "omni"]:
            embeddings = estimate_embeddings(pop1, pop2, method, 2)

            for k_idx, k in enumerate(n_clusters):
                predicted_labels = estimate_community(embeddings, k)

                # Encode each unordered pair of communities as lo * base + hi.
                # With base > every label this is unique per pair, unlike the
                # plain product of the two labels.
                base = predicted_labels.max() + 1
                lo = np.minimum(predicted_labels[:, None], predicted_labels[None, :])
                hi = np.maximum(predicted_labels[:, None], predicted_labels[None, :])
                predicted_edge_labels = (lo * base + hi)[r, c]  # upper triangle

                cluster_labels = np.unique(predicted_edge_labels)
                community_pvals = np.zeros((cluster_labels.size, n_tests))

                for cdx, cluster_label in enumerate(cluster_labels):
                    in_community = predicted_edge_labels == cluster_label
                    # Pool all graphs' edges of this community pair per population.
                    community_pvals[cdx] = compute_statistic(
                        tests,
                        pop1_edges[:, in_community].ravel(),
                        pop2_edges[:, in_community].ravel(),
                    )

                sig_edges = np.zeros((n_tests, r.size))
                for u in range(n_tests):
                    # A stable argsort keeps the first minimum on ties and
                    # sorts NaN p-values last, so a NaN never wins.
                    best = np.argsort(community_pvals[:, u], kind="stable")[0]
                    sig_edges[u, predicted_edge_labels == cluster_labels[best]] = 1

                true_positives = sig_edges[:, is_true_edge].sum(axis=1)
                prec = true_positives / sig_edges.sum(axis=1)
                # 0 / 0 (nothing flagged) counts as zero precision.
                np.nan_to_num(prec, copy=False)
                recall = true_positives / n_true_edges

                results[method][i, k_idx, :] = np.array((prec, recall))

    omni_res = results["omni"].mean(axis=0).reshape(-1)
    mase_res = results["mase"].mean(axis=0).reshape(-1)

    to_append = [
        m,
        mean_1,
        mean_2,
        var_1,
        var_2,
        mean_delta,
        var_delta,
        *omni_res,
        *mase_res,
    ]
    return to_append

In [3]:
# Selects which half of the (m, mean_delta) grid this run processes
# (``args[task_index::2]``) and is embedded in the per-task CSV file name.
# The merge step reads the files for indices 0 and 1, so the notebook has to
# be run once with each value before the merge cells can succeed.
task_index = 0

In [4]:
# --- Experiment configuration -------------------------------------------------
# Number of grid steps along each swept axis (m and mean_delta).
spacing = 50

# Block sizes handed to the generator. The original note marks block_1 as the
# one with "different probability".
block_1, block_2 = 25, 25

# Baseline parameters passed to the generator for both blocks.
mean_1 = mean_2 = 0
var_1 = var_2 = 0.25

# Swept axes: mean_delta in [0, 1] and m in {5, 10, ..., 250}.
mean_deltas = np.linspace(0, 1, num=spacing + 1)
ms = np.linspace(0, 250, num=spacing + 1)[1:].astype(int)

# Held fixed in this notebook.
var_delta = 0
reps = 50
n_clusters = [2]

# Edge-wise two-sample tests; their __name__ is reused in the column names.
tests = [ks_2samp, mannwhitneyu, ttest_ind]

# run_experiment with everything except (m, mean_delta) bound.
partial_func = partial(
    run_experiment,
    tests=tests,
    reps=reps,
    n_clusters=n_clusters,
    var_delta=var_delta,
    var_2=var_2,
    var_1=var_1,
    mean_2=mean_2,
    mean_1=mean_1,
    block_2=block_2,
    block_1=block_1,
)

In [5]:
# Every (m, mean_delta) combination, keeping only this task's stride-2 share.
args = [
    {"m": m, "mean_delta": mean_delta}
    for m, mean_delta in product(ms, mean_deltas)
][task_index::2]

# Alternate between the end and the start of the list (last, first,
# second-to-last, second, ...) and cut back to the original length so each
# configuration appears exactly once.
args = tuple(
    kwargs for pair in zip(reversed(args), args) for kwargs in pair
)[: len(args)]

# One run_experiment call per configuration, on all available cores.
res = Parallel(n_jobs=-1, verbose=7)(
    delayed(partial_func)(**kwargs) for kwargs in args
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 125.2min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed: 143.9min
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed: 239.7min
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed: 259.8min
[Parallel(n_jobs=-1)]: Done 712 tasks      | elapsed: 334.0min
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed: 395.7min
[Parallel(n_jobs=-1)]: Done 1203 out of 1275 | elapsed: 483.4min remaining: 28.9min
[Parallel(n_jobs=-1)]: Done 1275 out of 1275 | elapsed: 493.2min finished


In [6]:
# Column names mirror the row layout returned by run_experiment: the seven
# echoed parameters, then the omni block, then the mase block, each flattened
# in (n_clusters, metric, test) order.
cols = ["m", "mean_1", "mean_2", "var_1", "var_2", "mean_delta", "var_delta"]
cols.extend(
    f"omni_{metric}_{k}_{test.__name__}"
    for k in n_clusters
    for metric in ["precision", "recall"]
    for test in tests
)
cols.extend(
    f"mase_{metric}_{k}_{test.__name__}"
    for k in n_clusters
    for metric in ["precision", "recall"]
    for test in tests
)

res_df = pd.DataFrame(res, columns=cols)

# Persist this task's share; the ./results directory must already exist.
res_df.to_csv(
    f"./results/20200409_weighted_correct_nodes_{task_index}.csv", index=False
)

In [9]:
# Stack the two per-task result files (task indices 0 and 1) into one frame
# with a fresh 0..n-1 index. Both files must already exist on disk.
dfs = pd.concat([pd.read_csv(f"./results/20200409_weighted_correct_nodes_{i}.csv") for i in range(2)], ignore_index=True)

In [11]:
# Order rows by m, then mean_delta. Each row keeps its pre-sort index label,
# so the index is no longer monotonic afterwards.
dfs = dfs.sort_values(['m', 'mean_delta'])

In [13]:
# Write the merged, sorted table. index=True stores the frame's index as the
# first, unnamed column.
# NOTE(review): the per-task files are written with index=False -- confirm
# whether downstream readers rely on this extra column.
dfs.to_csv("./results/20200409_weighted_correct_nodes.csv", index=True)