# Overall performance of module detection methods

In [1]:
import sys
import os
# sys.path.insert(0,os.path.abspath("../lib/"))

import json

from util import JSONExtendedEncoder

%load_ext autoreload
%autoreload 2

%matplotlib inline
from matplotlib.pyplot import *

import pandas as pd
import numpy as np

import multiprocessing as mp

from itertools import product

import itertools
import shutil

import os

# Relative path of the configuration folder.
# NOTE(review): not referenced again in the visible cells — confirm whether it is still needed.
conf_folder = "conf/"

# Settings

In [2]:
# Default worker count: leave one core free, but never go below one worker,
# because mp.Pool refuses a process count of 0 (single-core machines).
n_jobs = max(1, mp.cpu_count() - 1)
# No default method; a later cell asserts that this has been overridden.
method_name = None

In [3]:
# Parameters
# Run-specific values; these replace the defaults assigned in the Settings cell.
n_jobs = 3
method_name = "agglom_clustermatch"


In [4]:
# Stop early when no method was selected; every later cell depends on it.
if method_name is None:
    raise AssertionError("You have to specify a method_name")

In [5]:
# Report how many worker processes the evaluation pools will use.
print("Using {} cores".format(n_jobs))

Using 3 cores


# Running a method on different parameter settings and datasets

Note: If you downloaded the results from zenodo, you don't need to rerun this for "dummy", "agglom", "ica_zscore", "spectral_biclust" and "meanshift"

The following code will explore the parameters of a module detection method on every dataset using a grid-search approach.

If you want to run your own method, you should wrap it into a Python function and add its parameters to `conf/paramexplo_blueprints.py`. We will show the whole workflow here for a "dummy" (but fast) clustering method, which will simply group genes randomly.

Every module detection method is wrapped in a python function (see `scripts/moduledetection.py`)

Because module detection methods usually take a while to run, we generate here the files necessary to run a method on several parameter settings and datasets. These can then easily be called from the command line, for example on a computer cluster or locally using GNU `parallel`.

The wrapper function is called by `scripts/moduledetection.py`, which saves the modules in the correct format along with additional run information (such as running times).

In [6]:
# Datasets to run. The order is significant: it determines the order of the
# generated settings further down.
datasetnames = [
    # E. coli and yeast expression compendia
    "ecoli_colombos", "ecoli_dream5",
    "yeast_gpl2529", "yeast_dream5",
    # synthetic datasets
    "synth_ecoli_regulondb", "synth_yeast_macisaac",
    # human datasets
    "human_tcga", "human_gtex", "human_seek_gpl5175",
    # additional E. coli dataset
    "ecoli_precise2",
]

# choose the method to evaluate
# method_name = "agglom_pearson_abs" # use the dummy method to check if everything works correctly
# method_name = "agglom" # this method runs very fast, and has the best performance among clustering methods
# method_name = "ica_zscore" # this method runs very slow, but has approx. the highest performance in the benchmark
# method_name = "spectral_biclust" # top biclustering method
# method_name = "meanshift"

To add your own method, create a function named "your_method_name" in the `lib/clustering.py` file (or any other file, as long as it's imported in `scripts/moduledetection.py`).
This function should accept an `E` object (which is a dataframe with genes in columns) and any additional parameters.
Then add reasonable parameter settings for your method to `conf/paramexplo_blueprints.py`.

method_name = "your_method_name"

In [7]:
# paramexplo_blueprints.py stores for every method the parameters which will be varied using a grid-search approach.
# NOTE(review): `blueprints` is not defined in this notebook; presumably the %run below
# brings it into the namespace — confirm. The path is relative to the notebook's working directory.
%run ../conf/paramexplo_blueprints.py
# Pick the blueprint of the selected method; raises KeyError if method_name has no entry.
methodblueprint = blueprints[method_name]

In [8]:
# Display the selected blueprint ("staticparams", "dynparams" and "type" entries).
methodblueprint

{'staticparams': {'method': 'agglom_clustermatch',
  'simdist_function': 'clustermatch'},
 'dynparams': {'linkage': ['complete', 'average'],
  'k': array([ 25.,  50.,  75., 100., 125., 150., 175., 200., 225., 250., 275.,
         300.])},
 'type': 'moduledetection'}

Generate different parameter settings using a grid-search.

In [9]:
# Start from an empty parameter folder so that files from an earlier run
# (possibly with a different grid) cannot linger next to the new ones.
params_folder = "conf/paramexplo/" + method_name + "/"
params_path = "../" + params_folder
if os.path.exists(params_path):
    shutil.rmtree(params_path)
os.makedirs(params_path)

# Sort the dynamic parameter names once so that the numbering of the
# generated settings is stable between runs.
dynparam_names = sorted(methodblueprint["dynparams"].keys())
dynparam_values = [methodblueprint["dynparams"][name] for name in dynparam_names]

methodsettings = []
method_locations = []
# One setting (and one JSON file) per combination of dynamic parameter values.
for i, dynparam_combination in enumerate(itertools.product(*dynparam_values)):
    # Static parameters first, then the values of this grid point.
    params = methodblueprint["staticparams"].copy()
    params.update(zip(dynparam_names, dynparam_combination))

    method = {
        "params": params,
        "location": params_folder + str(i) + ".json",
        "seed": 0,
    }

    # Use a context manager so the file is flushed and closed right away.
    with open("../" + method["location"], "w") as outfile:
        json.dump(method, outfile, cls=JSONExtendedEncoder)

    methodsettings.append(method)
    method_locations.append(method["location"])

Now combine the different parameter settings and datasets. Then generate the different python commands to run every parameter setting and dataset in parallel.

In [10]:
# Pair every dataset with every parameter setting; each pair is one run with
# its own id and output folder.
settings_name = "paramexplo/{method_name}".format(method_name=method_name)
settings = []
for datasetname in datasetnames:
    for setting_ix, methodsetting in enumerate(methodsettings):
        settingid = datasetname + "_" + str(setting_ix)
        settings.append({
            "dataset_location": "conf/datasets/" + datasetname + ".json",
            "dataset_name": datasetname,
            "method_location": methodsetting["location"],
            "output_folder": "results/" + methodblueprint["type"] + "/{settings_name}/{settingid}/".format(settings_name=settings_name, settingid=settingid),
            "settingid": settingid
        })

# settings_name contains a subfolder ("paramexplo/"), so make sure it exists
# before writing, and close the file deterministically.
settings_location = "../conf/settings/{settings_name}.json".format(settings_name=settings_name)
os.makedirs(os.path.dirname(settings_location), exist_ok=True)
with open(settings_location, "w") as outfile:
    json.dump(settings, outfile)

In [11]:
def _load_params(location):
    """Return the "params" entry of the JSON file at "../" + location.

    The file is opened in a ``with`` block so its handle is closed immediately
    instead of being left to the garbage collector.
    """
    with open("../" + location) as infile:
        return json.load(infile)["params"]


# One row per setting: the setting id plus the parameters of its dataset.
settings_dataset = pd.DataFrame([
    dict(settingid=setting["settingid"], **_load_params(setting["dataset_location"]))
    for setting in settings
])
# One row per setting: the setting id plus the parameters of its method.
settings_method = pd.DataFrame([
    dict(settingid=setting["settingid"], **_load_params(setting["method_location"]))
    for setting in settings
])

In [12]:
# commands = ""
# for i, setting in enumerate(settings):
#     #commands += "python scripts/moduledetection.py {method_location} {dataset_location} {output_folder} 0 test\n".format(**setting)
#     commands += "python3 scripts/" + methodblueprint["type"] + ".py {method_location} {dataset_location} {output_folder}\n".format(**setting)

# commands_location = "tmp/{settings_name}.txt".format(**locals())
# os.makedirs("../" + os.path.dirname(commands_location), exist_ok=True)
# with open("../" + commands_location, "w") as outfile:
#     outfile.write(commands)
# commands_location = "tmp/{settings_name}.txt".format(**locals())
# os.makedirs(os.path.dirname("../tmp/" + commands_location), exist_ok=True)
# with open("../tmp/" + commands_location, "w") as outfile:
#     outfile.write(commands)
    
# #script_location = generate_batchcode(commands_location, settings_name, len(settings), {"memory":"10G", "numcores":1}, "biclust_comp2")

# # this command can be used on most linux computers to run the different parameter settings in parallel
# print("parallel -j 4 -a " + commands_location)

# Evaluating the method

In [13]:
from modulescomparison import ModevalKnownmodules, ModevalCoverage

Note: If you downloaded the results from zenodo, you don't need to rerun this for "dummy", "agglom", "ica_zscore", "spectral_biclust" and "meanshift"

## By comparing with known modules

Evaluate by comparing with known modules

In [14]:
# create pool of processors
if "pool" in locals().keys():
    pool.close()
pool = mp.Pool(n_jobs)

In [15]:
# only evaluate non-human datasets
settings_filtered = [
    candidate
    for candidate in settings
    if not candidate["dataset_name"].startswith("human")
]
modeval = ModevalKnownmodules(settings_filtered, baseline=True)

In [16]:
# Run the known-modules evaluation on the worker pool, then store it under settings_name.
# NOTE(review): presumably save() persists the scores so that load() can read them back — confirm in modulescomparison.
modeval.run(pool)
modeval.save(settings_name)

In [17]:
# Load the evaluation stored under settings_name.
# NOTE(review): presumably this lets the scores be inspected without re-running the evaluation — confirm.
modeval.load(settings_name)

In [18]:
# Display the score table of the known-modules evaluation.
modeval.scores

Unnamed: 0,recovery,relevance,F1rr,recall,precision,F1rp,F1rprr,F1rr_permuted,F1rp_permuted,F1rprr_permuted,settingid,knownmodules_name,regnet_name,goldstandard,runningtime
0,0.204072,0.155772,0.176681,0.034730,0.052112,0.041681,0.067450,2.393683,6.003424,3.422678,ecoli_colombos_0,mcl2,ecoli_regulondb,ecoli_regulondb#mcl2,9860.165773
1,0.181225,0.131946,0.152708,0.045633,0.030637,0.036661,0.059127,2.076844,4.348791,2.811165,ecoli_colombos_0,minimal,ecoli_regulondb,ecoli_regulondb#minimal,9860.165773
2,0.147236,0.110189,0.126046,0.016048,0.033773,0.021758,0.037110,1.677602,3.103377,2.177893,ecoli_colombos_0,ap3,ecoli_regulondb,ecoli_regulondb#ap3,9860.165773
3,0.159253,0.128440,0.142197,0.031995,0.036934,0.034288,0.055252,2.128900,5.075888,2.999688,ecoli_colombos_0,tc1,ecoli_regulondb,ecoli_regulondb#tc1,9860.165773
4,0.171253,0.135728,0.151435,0.027523,0.039573,0.032466,0.053469,2.378674,6.058349,3.416095,ecoli_colombos_0,mcl3,ecoli_regulondb,ecoli_regulondb#mcl3,9860.165773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2131,0.263015,0.184682,0.216996,0.041213,0.053502,0.046560,0.076669,1.923664,2.727465,2.256109,synth_yeast_macisaac_19,ap2,yeast_macisaac,yeast_macisaac#ap2,0.314966
2132,0.513998,0.492474,0.503006,0.335392,0.324600,0.329908,0.398470,8.871212,81.424476,15.999298,synth_yeast_macisaac_19,mcl1,yeast_macisaac,yeast_macisaac#mcl1,0.314966
2133,0.443107,0.384471,0.411712,0.403232,0.203112,0.270148,0.326234,7.229137,36.503901,12.068299,synth_yeast_macisaac_19,tc2,yeast_macisaac,yeast_macisaac#tc2,0.314966
2134,0.240714,0.190353,0.212591,0.039618,0.058981,0.047398,0.077514,3.470634,10.408960,5.205583,synth_yeast_macisaac_19,ap1,yeast_macisaac,yeast_macisaac#ap1,0.314966


## Using the coverage of regulators

In [19]:
# create pool of processors
if "pool" in locals().keys():
    pool.close()
pool = mp.Pool(n_jobs)

In [20]:
# only evaluate human datasets
settings_filtered = [
    candidate
    for candidate in settings
    if candidate["dataset_name"].startswith("human")
]
modeval = ModevalCoverage(settings_filtered, baseline=True)

In [21]:
# Run the regulator-coverage evaluation on the worker pool, then store it under settings_name.
# NOTE(review): presumably save() persists the scores so that load() can read them back — confirm in modulescomparison.
modeval.run(pool)
modeval.save(settings_name)

Evaluating a total of 72 settings.


In [22]:
# Load the evaluation stored under settings_name.
# NOTE(review): presumably this lets the scores be inspected without re-running the evaluation — confirm.
modeval.load(settings_name)

In [23]:
# Display the score table of the coverage evaluation.
modeval.scores

Unnamed: 0,aucodds,aucodds_permuted,settingid,goldstandard,runningtime
0,0.092095,3.475159,human_tcga_0,regcircuit,108280.023016
1,0.005299,0.199965,human_tcga_1,regcircuit,2.270966
2,0.129057,4.869895,human_tcga_6,regcircuit,2.183533
3,0.152574,5.757295,human_tcga_12,regcircuit,2.157218
4,0.032507,1.226642,human_tcga_7,regcircuit,2.318005
...,...,...,...,...,...
67,0.175378,6.617782,human_gtex_23,regcircuit,1.818512
68,0.174717,6.592872,human_seek_gpl5175_20,regcircuit,1.252326
69,0.123249,4.650741,human_seek_gpl5175_21,regcircuit,1.284648
70,0.169265,6.387120,human_seek_gpl5175_22,regcircuit,1.412935


**TODO:** I understand that the previous code generates some files that will be used later to create the final dataframe with scores and the plots.