# Overall performance of module detection methods

In [1]:
import sys
import os
# sys.path.insert(0,os.path.abspath("../lib/"))

import json

from util import JSONExtendedEncoder

%load_ext autoreload
%autoreload 2

%matplotlib inline
from matplotlib.pyplot import *

import pandas as pd
import numpy as np

import multiprocessing as mp

from itertools import product

import itertools
import shutil

import os

# Relative path (with trailing slash) of the configuration folder.
conf_folder = "conf/"

# Settings

In [2]:
# Number of worker processes: leave one core free for the rest of the system,
# but never go below one worker. On a single-core machine `cpu_count() - 1`
# is 0, and `mp.Pool(0)` raises a ValueError.
n_jobs = max(1, mp.cpu_count() - 1)
# Name of the method to run. It defaults to None and has to be overridden
# before the assertion further down is reached.
method_name = None

In [3]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by a notebook runner
# (e.g. papermill) - confirm. It replaces the None default of method_name.
method_name = "agglom_clustermatch_linear"


In [4]:
# Fail fast if method_name still has its None default.
assert method_name is not None, "You have to specify a method_name"

In [5]:
# Report the value of n_jobs (the size of the worker pools created later on).
print(f"Using {n_jobs} cores")

Using 3 cores


# Running a method on different parameter settings and datasets

Note: If you downloaded the results from Zenodo, you don't need to rerun this for "dummy", "agglom", "ica_zscore", "spectral_biclust" and "meanshift".

The following code will explore the parameters of a module detection method on every dataset using a grid-search approach.

If you want to run your own method, you should wrap it into a Python function and add its parameters to `conf/paramexplo_blueprints.py`. We will show the whole workflow here for a "dummy" (but fast) clustering method, which simply groups genes randomly.

Every module detection method is wrapped in a python function (see `scripts/moduledetection.py`)

Because module detection methods usually take a while to run, we generate here the files necessary to run a method on the various parameter settings and datasets. These can then easily be called from the command line, for example on a computer cluster or locally using GNU `parallel`.

This function will be called by `scripts/moduledetection.py`, which will save the modules in the correct format along with additional run information (such as running times).

In [6]:
# datasets to run
# Datasets to run. The order fixes the order of the generated settings.
# Names starting with "human" are later evaluated with ModevalCoverage,
# all other names with ModevalKnownmodules.
datasetnames = [
    "ecoli_colombos", "ecoli_dream5",
    "yeast_gpl2529", "yeast_dream5",
    "synth_ecoli_regulondb", "synth_yeast_macisaac",
    "human_tcga", "human_gtex", "human_seek_gpl5175",
    "ecoli_precise2",
]

# choose the method to evaluate
# method_name = "agglom_pearson_abs" # use the dummy method to check if everything works correctly
# method_name = "agglom" # this method runs very fast, and has the best performance among clustering methods
# method_name = "ica_zscore" # this method runs very slow, but has approx. the highest performance in the benchmark
# method_name = "spectral_biclust" # top biclustering method
# method_name = "meanshift"

To add your own method, create a function named "your_method_name" in the `lib/clustering.py` file (or any other file, as long as it is imported in `scripts/moduledetection.py`).
This function should accept an `E` object (a dataframe with genes in columns) and any additional parameters.
Then add reasonable parameter settings for your method to `conf/paramexplo_blueprints.py`.

method_name = "your_method_name"

In [7]:
# paramexplo_blueprints.py stores for every method the parameters which will be varied using a grid-search approach.
# NOTE(review): `blueprints` is not defined in this notebook; presumably the
# %run below puts it into the namespace - confirm in conf/paramexplo_blueprints.py.
%run ../conf/paramexplo_blueprints.py
# Look up the blueprint of the selected method.
methodblueprint = blueprints[method_name]

In [8]:
# Display the selected blueprint (static parameters, grid parameters and type).
methodblueprint

{'staticparams': {'method': 'agglom_clustermatch_linear',
  'simdist_function': 'clustermatch_linear'},
 'dynparams': {'linkage': ['complete', 'average'],
  'k': array([ 25.,  50.,  75., 100., 125., 150., 175., 200., 225., 250., 275.,
         300.])},
 'type': 'moduledetection'}

Generate different parameter settings using a grid-search.

In [9]:
# Folder that receives one JSON file per parameter combination. All paths
# stored in the settings are written without the "../" prefix; the prefix is
# only added here when touching the file system from this notebook.
params_folder = "conf/paramexplo/" + method_name + "/"
# Start from an empty folder so files of a previous (possibly larger) grid
# cannot linger next to the freshly generated ones.
if os.path.exists("../" + params_folder):
    shutil.rmtree("../" + params_folder)
os.makedirs("../" + params_folder)

# Sort the names of the varied parameters once, so the mapping from file index
# to parameter combination does not depend on dictionary ordering.
dynparam_names = sorted(methodblueprint["dynparams"].keys())
dynparam_values = [methodblueprint["dynparams"][param] for param in dynparam_names]

methodsettings = []
method_locations = []
# Grid search: one setting for every element of the cartesian product.
for i, dynparam_combination in enumerate(itertools.product(*dynparam_values)):
    # Static parameters first, then the values of this grid point on top.
    params = methodblueprint["staticparams"].copy()
    params.update(dict(zip(dynparam_names, dynparam_combination)))

    method = {
        "params": params,
        "location": params_folder + str(i) + ".json",
        "seed": 0,
    }

    methodsettings.append(method)
    method_locations.append(method["location"])

    # Use a context manager so every file is flushed and closed right away
    # instead of relying on garbage collection of the open handle.
    with open("../" + method["location"], "w") as outfile:
        json.dump(method, outfile, cls=JSONExtendedEncoder)

Now combine the different parameter settings and datasets. Then generate the different python commands to run every parameter setting and dataset in parallel.

In [10]:
# Name under which this parameter exploration is stored. It contains a "/",
# so the settings file ends up in a "paramexplo" sub-folder.
settings_name = "paramexplo/{method_name}".format(method_name = method_name)

# One entry per (dataset, parameter setting) pair.
settings = []
for datasetname in datasetnames:
    for setting_ix, methodsetting in enumerate(methodsettings):
        settingid = datasetname + "_" + str(setting_ix)
        settings.append({
            "dataset_location":"conf/datasets/" + datasetname + ".json",
            "dataset_name":datasetname,
            "method_location":methodsetting["location"],
            "output_folder":"results/" + methodblueprint["type"] + "/{settings_name}/{settingid}/".format(settings_name=settings_name, settingid=settingid),
            "settingid":settingid
        })

settings_location = "../conf/settings/{settings_name}.json".format(settings_name=settings_name)
# Make sure the target sub-folder exists; open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(settings_location), exist_ok=True)
# Context manager so the file is flushed and closed deterministically.
with open(settings_location, "w") as outfile:
    json.dump(settings, outfile)

In [11]:
def _load_params(location):
    """Return the "params" entry of the JSON file at "../" + location.

    The file is opened in a context manager so the handle is closed as soon
    as it has been read, instead of leaking one open handle per setting.
    """
    with open("../" + location) as infile:
        return json.load(infile)["params"]


# One row per setting: the setting id plus the parameters stored in the
# dataset / method configuration file that the setting points to.
settings_dataset = pd.DataFrame([
    dict(settingid=setting["settingid"], **_load_params(setting["dataset_location"]))
    for setting in settings
])
settings_method = pd.DataFrame([
    dict(settingid=setting["settingid"], **_load_params(setting["method_location"]))
    for setting in settings
])

In [12]:
# commands = ""
# for i, setting in enumerate(settings):
#     #commands += "python scripts/moduledetection.py {method_location} {dataset_location} {output_folder} 0 test\n".format(**setting)
#     commands += "python3 scripts/" + methodblueprint["type"] + ".py {method_location} {dataset_location} {output_folder}\n".format(**setting)

# commands_location = "tmp/{settings_name}.txt".format(**locals())
# os.makedirs("../" + os.path.dirname(commands_location), exist_ok=True)
# with open("../" + commands_location, "w") as outfile:
#     outfile.write(commands)
# commands_location = "tmp/{settings_name}.txt".format(**locals())
# os.makedirs(os.path.dirname("../tmp/" + commands_location), exist_ok=True)
# with open("../tmp/" + commands_location, "w") as outfile:
#     outfile.write(commands)
    
# #script_location = generate_batchcode(commands_location, settings_name, len(settings), {"memory":"10G", "numcores":1}, "biclust_comp2")

# # this command can be used on most linux computers to run the different parameter settings in parallel
# print("parallel -j 4 -a " + commands_location)

# Evaluating the method

In [13]:
from modulescomparison import ModevalKnownmodules, ModevalCoverage

Note: If you downloaded the results from Zenodo, you don't need to rerun this for "dummy", "agglom", "ica_zscore", "spectral_biclust" and "meanshift".

## By comparing with known modules

Evaluate by comparing with known modules

In [14]:
# (Re)create the pool of worker processes: if this cell was executed before,
# close the pool it left behind before starting a new one with n_jobs workers.
if "pool" in locals():
    pool.close()
pool = mp.Pool(n_jobs)

In [15]:
# only evaluate non-human datasets
settings_filtered = [
    setting
    for setting in settings
    if not setting["dataset_name"].startswith("human")
]
modeval = ModevalKnownmodules(settings_filtered, baseline=True)

In [16]:
# Run the evaluation on the worker pool, then save it under settings_name.
# NOTE(review): what run/save compute and where they write is defined in
# modulescomparison - confirm there.
modeval.run(pool)
modeval.save(settings_name)

In [17]:
# Load what was saved under settings_name (presumably fills modeval.scores - confirm).
modeval.load(settings_name)

In [18]:
# Display the scores table of the known-modules evaluation.
modeval.scores

Unnamed: 0,recovery,relevance,F1rr,recall,precision,F1rp,F1rprr,F1rr_permuted,F1rp_permuted,F1rprr_permuted,settingid,knownmodules_name,regnet_name,goldstandard,runningtime
0,0.160323,0.115185,0.134056,0.023544,0.057103,0.033341,0.053400,1.816284,4.808081,2.636582,ecoli_colombos_0,mcl2,ecoli_regulondb,ecoli_regulondb#mcl2,64.392650
1,0.161990,0.116331,0.135415,0.027470,0.046665,0.034583,0.055095,1.841562,4.098843,2.541333,ecoli_colombos_0,minimal,ecoli_regulondb,ecoli_regulondb#minimal,64.392650
2,0.138781,0.089873,0.109097,0.011325,0.040541,0.017705,0.030466,1.451533,2.526991,1.843906,ecoli_colombos_0,ap3,ecoli_regulondb,ecoli_regulondb#ap3,64.392650
3,0.136757,0.110509,0.122240,0.020601,0.049427,0.029081,0.046985,1.830111,4.292295,2.566107,ecoli_colombos_0,tc1,ecoli_regulondb,ecoli_regulondb#tc1,64.392650
4,0.140813,0.110539,0.123853,0.020329,0.049141,0.028760,0.046680,1.945370,5.379820,2.857466,ecoli_colombos_0,mcl3,ecoli_regulondb,ecoli_regulondb#mcl3,64.392650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2131,0.218961,0.130819,0.163784,0.026629,0.035206,0.030323,0.051172,1.451927,1.776219,1.597784,synth_yeast_macisaac_19,ap2,yeast_macisaac,yeast_macisaac#ap2,0.320813
2132,0.485264,0.454219,0.469228,0.299951,0.292661,0.296262,0.363204,8.275350,73.117081,14.867954,synth_yeast_macisaac_19,mcl1,yeast_macisaac,yeast_macisaac#mcl1,0.320813
2133,0.395816,0.328407,0.358974,0.331798,0.175743,0.229779,0.280202,6.304107,31.051643,10.480469,synth_yeast_macisaac_19,tc2,yeast_macisaac,yeast_macisaac#tc2,0.320813
2134,0.208612,0.159689,0.180902,0.033736,0.051003,0.040610,0.066330,2.953116,8.918483,4.437029,synth_yeast_macisaac_19,ap1,yeast_macisaac,yeast_macisaac#ap1,0.320813


## Using the coverage of regulators

In [19]:
# (Re)create the pool of worker processes: close the pool that is still around
# from an earlier cell, then start a new one with n_jobs workers.
if "pool" in locals():
    pool.close()
pool = mp.Pool(n_jobs)

In [20]:
# only evaluate human datasets
settings_filtered = [
    setting
    for setting in settings
    if setting["dataset_name"].startswith("human")
]
modeval = ModevalCoverage(settings_filtered, baseline=True)

In [21]:
# Run the coverage evaluation on the worker pool, then save it under settings_name.
# NOTE(review): what run/save compute and where they write is defined in
# modulescomparison - confirm there.
modeval.run(pool)
modeval.save(settings_name)

Evaluating a total of 72 settings.


In [22]:
# Load what was saved under settings_name (presumably fills modeval.scores - confirm).
modeval.load(settings_name)

In [23]:
# Display the scores table of the coverage evaluation.
modeval.scores

Unnamed: 0,aucodds,aucodds_permuted,settingid,goldstandard,runningtime
0,0.025362,0.957039,human_tcga_0,regcircuit,684.209317
1,0.004162,0.157069,human_tcga_1,regcircuit,2.243876
2,0.124455,4.696235,human_tcga_6,regcircuit,2.156149
3,0.148484,5.602955,human_tcga_12,regcircuit,2.163012
4,0.098598,3.720532,human_tcga_2,regcircuit,2.175296
...,...,...,...,...,...
67,0.081281,3.067092,human_seek_gpl5175_17,regcircuit,1.275766
68,0.161598,6.097835,human_seek_gpl5175_20,regcircuit,1.258450
69,0.084471,3.187474,human_seek_gpl5175_21,regcircuit,1.275337
70,0.163004,6.150857,human_seek_gpl5175_22,regcircuit,1.267121


**TODO:** I understand that the previous code generates some files that will be used later to create the final dataframe with scores and the plots.