In [1]:
import graspy
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dask
from dask.distributed import Client, progress
import dask.dataframe as ddf
# Uncomment the import below when this code is placed in the model folder rather than the doc folder
#from .base import BaseGraphEstimator, _calculate_p

from scipy import stats
from scipy.special import expit
import pandas as pd 
import mizani as miz

from graspy.simulations import sbm, er_np, er_nm, siem
from graspy.plot import heatmap
from graspy.models import SBMEstimator

import statsmodels.api as sm
import statsmodels.formula.api as smf

from graspy.models.base import BaseGraphEstimator 
from graspy.utils.utils import (
    augment_diagonal,
    cartprod,
    import_graph,
    is_unweighted,
    remove_loops,
    symmetrize,
)
import sys
import plotnine as p9
from dfply import *
import multiprocessing as multiproc

In [2]:
def gen_pmat(num_blocks, effect_size=0):
    """Build the 2x2 block-probability matrix for a two-community SBM.

    Parameters
    ----------
    num_blocks : int
        Number of distinct block probabilities in the matrix:
        2 -> [[a, b], [b, a]], 3 -> [[a, b], [b, d]], 4 -> [[a, b], [c, d]].
    effect_size : float, default 0
        Spread of the probabilities around 0.5. With 0 every entry is 0.5.

    Returns
    -------
    numpy.ndarray
        The 2x2 matrix of block connection probabilities.

    Raises
    ------
    ValueError
        If ``num_blocks`` is not 2, 3 or 4.
    """
    half = effect_size/2
    quarter = effect_size/4
    if num_blocks == 2:
        # tend to [[a, b], [b, a]]
        pmat = np.array([[.5 + half, .5 - half], [.5 - half, .5 + half]])
    elif num_blocks == 3:
        # tend to [[a, b], [b, d]]
        pmat = np.array([[.5 + half, .5], [.5, .5 - half]])
    elif num_blocks == 4:
        # tend to [[a, b], [c, d]]
        pmat = np.array([[.5 + half, .5 + quarter], [.5 - quarter, .5 - half]])
    else:
        # Previously this fell through to `return pmat` with pmat unbound.
        raise ValueError(
            "num_blocks must be 2, 3 or 4; got {!r}".format(num_blocks)
        )
    return pmat

def quad_sbm(n, pmat, weighted=False, return_labels=False):
    """Sample a directed, loopy two-community SBM via ``graspy.simulations.sbm``.

    The two communities each get ``n // 2`` vertices and ``pmat`` supplies
    the block probabilities. ``return_labels`` is forwarded to the sampler
    unchanged. Weighted sampling is not implemented and raises
    ``NotImplementedError``.
    """
    half = n // 2
    if weighted:
        raise NotImplementedError()
    return graspy.simulations.sbm(
        [half, half],
        pmat,
        directed=True,
        loops=True,
        return_labels=return_labels,
    )
        
def sample_quad_sbm(n, effect_size=0, num_blocks=4, weighted=False, return_labels=False):
    """Generate a block-probability matrix and sample one SBM from it.

    ``gen_pmat(num_blocks, effect_size)`` produces the probabilities, which
    are handed to ``quad_sbm`` together with ``n``, ``weighted`` and
    ``return_labels``. The sampler's result is returned as-is.
    """
    return quad_sbm(
        n,
        gen_pmat(num_blocks, effect_size),
        weighted=weighted,
        return_labels=return_labels,
    )

def sbm_2(n, effect_size=0, weighted=False, return_labels=True):
    """Sample from the ``num_blocks=2`` model; labels are returned by default."""
    return sample_quad_sbm(
        n,
        num_blocks=2,
        effect_size=effect_size,
        weighted=weighted,
        return_labels=return_labels,
    )

def sbm_3(n, effect_size=0, weighted=False, return_labels=True):
    """Sample from the ``num_blocks=3`` model; labels are returned by default."""
    return sample_quad_sbm(
        n,
        num_blocks=3,
        effect_size=effect_size,
        weighted=weighted,
        return_labels=return_labels,
    )

def sbm_4(n, effect_size=0, weighted=False, return_labels=True):
    """Sample from the ``num_blocks=4`` model; labels are returned by default."""
    return sample_quad_sbm(
        n,
        num_blocks=4,
        effect_size=effect_size,
        weighted=weighted,
        return_labels=return_labels,
    )

In [3]:
# Scratch check: tuples sort by their first element, so the (value, index)
# pairs come out ordered by value.
vals = sorted([(0.05, 0), (0.2, 1), (0.1, 2)])
print(vals)

[(0.05, 0), (0.1, 2), (0.2, 1)]


In [32]:
# Plot color for each test statistic.
# NOTE(review): this mapping is keyed by 'MGC' while test_stats and
# test_stat_use below are keyed by 'DCorr' -- confirm which label is intended.
test_stat_cols = dict(
    FET='red',
    LRT='teal',
    KW='blue',
    MGC='green',
    ANOVA='orange',
    Chi2='purple',
)

# Display label -> value passed as `test_method` when running the test.
# Iteration order matters: run_exp emits its result columns in this order.
test_stats = dict(
    FET='fisher_exact',
    Chi2='chi2',
    LRT='lrt',
    DCorr='mgc',
    KW='kw',
    ANOVA='anova',
)

# Which weighting regimes each test statistic may be applied to.
test_stat_use = {
    name: ["Unweighted"] if name in ('FET', 'LRT', 'Chi2') else ["Unweighted", "Weighted"]
    for name in ('FET', 'LRT', 'Chi2', 'DCorr', 'KW', 'ANOVA')
}

# Simulation settings, grouped by weighting and then by simulation name.
# Each entry holds the sampler, the effect sizes to sweep and extra sampler kwargs.
simulations = {
    'Unweighted': {
        sim_name: {
            'fn': sampler,
            'eff_sz': np.linspace(0, .15, 10),
            'kwarg': {"weighted": False},
        }
        for sim_name, sampler in (('abba', sbm_2), ('abbd', sbm_3), ('abcd', sbm_4))
    }
}

# Number of replicates per (simulation, effect size, n) setting.
nrep = 1
# Leave two cores free, but never drop below one worker: on machines with
# two or fewer cores the raw difference would be zero or negative.
ncores = max(1, multiproc.cpu_count() - 2)
# Ten graph sizes evenly spaced between 20 and 100, rounded to whole numbers
# (still float dtype here; cast to int when the experiment table is built).
nvertices = np.round(np.linspace(20, 100, 10))

In [33]:
# Start a dask.distributed client with `ncores` single-threaded workers,
# then display its summary as the cell output.
client = Client(n_workers=ncores, threads_per_worker=1)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43071 instead


0,1
Client  Scheduler: tcp://127.0.0.1:39029  Dashboard: http://127.0.0.1:43071/status,Cluster  Workers: 6  Cores: 6  Memory: 33.51 GB


In [34]:
def run_exp(row):
    """Simulate one graph and test its block structure with each statistic.

    Parameters
    ----------
    row : mapping
        One experiment specification. Must provide the keys "Weighting",
        "Simulation", "n", "Effect_Size" and "i".

    Returns
    -------
    tuple
        ``(Simulation, Weighting, n, Effect_Size, i)``, followed by one
        p-value per entry of ``test_stats`` (in its iteration order), followed
        by one selected model per entry of ``test_stats``. A test that is not
        applicable to the row's weighting, or that raised, contributes ``NaN``
        and ``""`` respectively, so the tuple length is always the same.
    """
    # generate simulation using the dictionary defined above
    sim_dict = simulations[row["Weighting"]][row["Simulation"]]
    sim_kwargs = sim_dict["kwarg"] if sim_dict["kwarg"] is not None else {}
    A, C = sim_dict["fn"](row["n"], row["Effect_Size"], **sim_kwargs)

    # run sim on all test statistics
    p_vals = []
    models = []
    for test_name, test in test_stats.items():
        # Placeholders keep the output aligned when a test is skipped or fails.
        pval, model = float("NaN"), ""
        # check if statistic is appropriate for the setting
        if row["Weighting"] in test_stat_use[test_name]:
            try:
                pval, model = SBMEstimator().estimate_block_structure(
                    A, C, ["abba", "abbd", "abcd"], test_method=test
                )
            except Exception as err:
                # Catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit still stop the run; report why the test failed.
                ex_str = "Failed Test: {}, Simulation: {}, n: {}, Effect Size: {}, Error: {!r}"
                print(ex_str.format(test_name, row["Simulation"], row["n"],
                                    row["Effect_Size"], err))
        p_vals.append(pval)
        models.append(model)
    return tuple([row["Simulation"], row["Weighting"], row["n"],
                 row["Effect_Size"], row["i"], *p_vals, *models])

In [35]:
# Enumerate the full experiment grid: one row per
# (simulation, weighting, graph size, effect size, replicate index).
exps = [
    [sim_name, weighting, n, es, i]
    for weighting, sims in simulations.items()
    for sim_name, sim in sims.items()
    for es in sim["eff_sz"]
    for n in nvertices
    for i in range(nrep)
]
# nvertices holds floats, so cast the vertex count to int.
sim_exps = pd.DataFrame(
    exps, columns=["Simulation", "Weighting", "n", "Effect_Size", "i"]
).astype({"n": int})
print(sim_exps.head())
print(sim_exps.shape)

  Simulation   Weighting   n  Effect_Size  i
0       abba  Unweighted  20          0.0  0
1       abba  Unweighted  23          0.0  0
2       abba  Unweighted  27          0.0  0
3       abba  Unweighted  30          0.0  0
4       abba  Unweighted  33          0.0  0
(60, 5)


In [36]:
# Convert the experiment table into a 10-partition dask dataframe.
# NOTE(review): this rebinds sim_exps from a pandas frame to a dask frame,
# so re-running this cell without re-running the previous one may fail.
sim_exps = ddf.from_pandas(sim_exps, npartitions=10)
# Lazily apply run_exp to every row. `meta` declares the 17 output columns:
# five identifying fields, six float p-values, six string model names.
sim_results = sim_exps.apply(
    run_exp,
    axis=1,
    result_type='expand',
    meta={
        **{0: str, 1: str, 2: int, 3: float, 4: int},
        **dict.fromkeys(range(5, 11), float),
        **dict.fromkeys(range(11, 17), str),
    },
)
sim_results

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,object,object,int64,float64,int64,float64,float64,float64,float64,float64,float64,object,object,object,object,object,object
6,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
# Run all experiments and give the positional result columns readable names.
# NOTE(review): scheduler="multiprocessing" presumably bypasses the
# distributed Client created earlier -- confirm this is intended.
sim_results = sim_results.compute(scheduler="multiprocessing").rename(
    columns=dict(enumerate([
        "Simulation", "Weighting", "n", "Effect_Size", "i",
        "FET", "Chi2", "LRT", "MGC", "KW", "ANOVA",
        "FET_pred", "Chi2_pred", "LRT_pred", "MGC_pred", "KW_pred", "ANOVA_pred",
    ]))
)
# Persist the results; the ./data directory must already exist.
sim_results.to_pickle('./data/block_est.pkl')

In [36]:
# Show the last rows of the collected results as a plain-text table.
print(sim_results.tail())

    Simulation   Weighting    n  Effect_Size  i  FET          Chi2           LRT  MGC            KW         ANOVA FET_pred Chi2_pred LRT_pred MGC_pred KW_pred ANOVA_pred
295       abcd  Unweighted   64         0.15  0  NaN  8.775968e-13  1.845957e-14  NaN  8.840966e-13  7.164495e-13               abbd     abcd             abbd       abbd
296       abcd  Unweighted   73         0.15  0  NaN  1.645611e-23  4.448272e-24  NaN  1.663230e-23  9.360871e-24               abcd     abcd             abcd       abcd
297       abcd  Unweighted   82         0.15  0  NaN  4.706011e-15  2.563437e-16  NaN  4.730942e-15  3.958104e-15               abcd     abcd             abcd       abcd
298       abcd  Unweighted   91         0.15  0  NaN  5.826799e-16  2.873860e-17  NaN  5.853942e-16  4.955378e-16               abcd     abcd             abcd       abcd
299       abcd  Unweighted  100         0.15  0  NaN  5.096071e-29  1.785900e-30  NaN  5.130839e-29  3.262408e-29               abcd     abcd         

In [22]:
# Manual spot check: draw a single "abba" graph with a large effect size.
# The "kwarg" entry in `row` is not read here; the sampler kwargs come from
# the `simulations` dictionary instead.
row = {
    "n": 20,
    "Effect_Size": .5,
    "kwarg": {"weighted": False},
    "Weighting": "Unweighted",
    "Simulation": "abba",
}
sim_dict = simulations[row["Weighting"]][row["Simulation"]]
A, C = sim_dict["fn"](row["n"], row["Effect_Size"], **sim_dict["kwarg"])

In [23]:
# Run block-structure estimation on the spot-check graph (A) and labels (C),
# choosing among the three candidate structures.
# NOTE(review): test_stats maps 'DCorr' to 'mgc', but this call passes
# test_method="dcorr" -- confirm which value the estimator accepts.
e = SBMEstimator()
pval, model = e.estimate_block_structure(A, C, ["abba", "abbd", "abcd"], test_method="dcorr")

In [24]:
# Display the p-value returned by the spot-check estimation.
pval

0.0

In [25]:
# Display the block structure selected by the spot-check estimation.
model

'abba'