In [1]:
import csv
from itertools import product

import numpy as np
import pandas as pd
import rpy2.robjects as ro
from graspologic.embed import OmnibusEmbed
from joblib import Parallel, delayed
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

from generate import generate_graphs_1, generate_graphs_4

In [2]:
# Bind the MANOVA o OMNI function in R to a Python object
ro.r("""

    r_manova <- function(omni, n_vertices) {
        col1 <- which(grepl("omni", names(omni))) # column indices for the embeddings
        col2 <- which(grepl("genotype", names(omni))) # column index for the genotype

        embedding <- colnames(omni)[col1]
        genotype <- colnames(omni)[col2]
        form <- paste0("cbind(", paste(embedding, collapse=", "), ") ~ ", genotype)

        pvec <- rep(0, n_vertices)
        pillai <- rep(0, n_vertices)

        for (i in 1 : n_vertices) {
            omni.v <- omni[which(omni$ROI == i), ]
            ans <- manova(as.formula(form), data=omni.v)
            pvec[i] <- summary(ans)$stats[1, "Pr(>F)"]
        }

        signal_vertices <- data.frame(ROI=unique(omni$ROI), pvalue=pvec)
        return(signal_vertices)

    }
""")

r_manova = ro.r['r_manova']

In [3]:
def embed(X, Y):

    graphs = np.vstack([X, Y])
    n_vertices = graphs.shape[1]
    labels = np.concatenate([np.zeros(len(X)), np.ones(len(Y))]).astype(int)

    # Jointly embed graphs using OMNI
    embedder = OmnibusEmbed()
    omni_embedding = embedder.fit_transform(graphs)
    omni_embedding = omni_embedding.reshape(-1, embedder.n_components_)

    # Convert array to a dataframe
    omni_embedding = pd.DataFrame(omni_embedding, columns=[f"omni_{i + 1}" for i in range(embedder.n_components_)]).astype(np.float64)

    # Construct identifiers for each embedded vertex
    rois = np.arange(n_vertices) + 1
    participants = np.arange(len(graphs)) + 1

    identifiers = np.array(list(product(participants, rois))).reshape(-1, 2)
    identifiers = pd.DataFrame(identifiers, columns=["participant_id", "ROI"])
    identifiers["ROI"] = identifiers["ROI"].astype(np.int64)
    identifiers["genotype"] = np.array([[str(strain)] * n_vertices for strain in labels]).reshape(-1)

    omni = pd.concat([omni_embedding, identifiers], axis=1)
    omni.head()

    return omni


def correct(pvalues, methods, alpha=0.05):
    for method in methods:
        pvalues[method] = multipletests(pvalues["pvalue"], method=method, alpha=alpha)[1]


def fdr(pvalues, methods, alpha=0.05):
    true_disc = (pvalues.query("signal")[methods] < alpha).values.sum(axis=0)
    false_disc = (pvalues.query("not signal")[methods] < alpha).values.sum(axis=0)
    return np.array([true_disc, false_disc]).T.reshape(-1)


def test(X, Y, n_vertices, labels, methods=["bonferroni", "holm", "fdr_bh", "fdr_by"]):

    omni = embed(X, Y)
    
    with localconverter(ro.default_converter + pandas2ri.converter):
        pvalues = r_manova(ro.conversion.py2rpy(omni), int(n_vertices))
    pvalues["signal"] = labels

    correct(pvalues, methods)
    discoveries = fdr(pvalues, methods=methods)

    return pvalues, discoveries

In [4]:
def experiment(
    filename, sample_size, block_1, block_2, p, effect_size, second_angle, generate_func, reps=96
):

    m_per_pop = sample_size // 2
    block_size = np.array([block_1, block_2])
    n_vertices = block_1 + block_2
    labels = np.array([False] * block_1 + [True] * block_2)

    def worker(i):

        X, Y = generate_func(
            p, effect_size, block_size, m_per_pop, second_angle=second_angle
        )
        
        pvalues, discoveries = test(X, Y, n_vertices, labels)
        pvalues.to_csv(f"results/correction/{block_2}-{generate_func.__name__}-{i}.csv")

        discoveries = np.append(discoveries, [block_2, generate_func.__name__])

        # Write to csv
        with open(filename, "a") as outfile:
            writer = csv.writer(outfile)
            writer.writerow(discoveries)

    Parallel(-1)(delayed(worker)(i) for i in tqdm(range(reps)))


## Experiment

In [5]:
# %% Simulation parameters
n_nodes = 50
block_2s = np.array([5, 10, 15, 20, 25])
block_1s = n_nodes - block_2s
block_sizes = list(zip(block_1s, block_2s))
effect_size = 0.3
second_angle = 70.0
sample_size = 200
p = 0.25

generate_funcs = [generate_graphs_1, generate_graphs_4]

args = [
    dict(
        sample_size=sample_size,
        block_1=block_1,
        block_2=block_2,
        p=p,
        effect_size=effect_size,
        second_angle=second_angle,
        generate_func=generate_func,
    )
    for (block_1, block_2), generate_func in product(block_sizes, generate_funcs)
]

In [6]:
filename = "results/correction/correction.csv"
methods = ["bonferroni", "holm", "fdr_bh", "fdr_by"]

columns = [method + truth for method in methods for truth in ["_true", "_false"]] + ["block_2", "generate_func"]

with open(filename, "w") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(columns)

for arg in args:
    experiment(**arg, filename=filename)

100%|██████████| 96/96 [02:09<00:00,  1.35s/it]
100%|██████████| 96/96 [02:01<00:00,  1.26s/it]
100%|██████████| 96/96 [01:49<00:00,  1.14s/it]
100%|██████████| 96/96 [01:53<00:00,  1.19s/it]
100%|██████████| 96/96 [01:45<00:00,  1.10s/it]
100%|██████████| 96/96 [01:45<00:00,  1.10s/it]
100%|██████████| 96/96 [01:43<00:00,  1.08s/it]
100%|██████████| 96/96 [01:42<00:00,  1.07s/it]
100%|██████████| 96/96 [01:43<00:00,  1.07s/it]
100%|██████████| 96/96 [01:45<00:00,  1.10s/it]


## Analysis

In [7]:
# Calculate the false discover proportion for each correction method
df = pd.read_csv("results/correction/correction.csv")

for method in methods:
    df[f"{method}_fdp"] = df[f"{method}_false"] / df["block_2"]

df.groupby(["generate_func", "block_2"])[[f"{method}_fdp" for method in methods]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bonferroni_fdp,holm_fdp,fdr_bh_fdp,fdr_by_fdp
generate_func,block_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
generate_graphs_1,5,0.002083,0.004167,0.004167,0.002083
generate_graphs_1,10,0.023958,0.026042,0.071875,0.015625
generate_graphs_1,15,0.015278,0.015972,0.091667,0.01875
generate_graphs_1,20,0.014063,0.015625,0.129167,0.029167
generate_graphs_1,25,0.007917,0.011667,0.109583,0.027917
generate_graphs_4,5,0.004167,0.004167,0.01875,0.0
generate_graphs_4,10,0.089583,0.097917,0.484375,0.123958
generate_graphs_4,15,0.115972,0.127778,0.688194,0.220833
generate_graphs_4,20,0.161458,0.186458,0.707292,0.307812
generate_graphs_4,25,0.1575,0.18125,0.57125,0.283333
