In [1]:
import copy
import itertools
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from run_desmond import run_DESMOND
from utils.eval import calculate_perfromance, generate_exprs
from utils.method import read_bic_table

# Simulated expression data 

* 10000 genes x 200 samples
* background ~N(0,1), bicluster ~ N(4,1)
* four biclusters covering fractions [0.05, 0.1, 0.25, 0.5] of all samples simulate four subtypes
* three scenarios A,B,C
    * for each scenario, bicluster sizes in genes were 5,50,500 
    * 3 scenarios x 3 gene sizes = 9 expression matrices in total

### scenario A:
 * biclusters are not overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario B:
 * biclusters are overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario C:
 * overlapping in genes and samples
 * four co-expressed modules of 500 genes each
 

# Evaluation

# 1. Example: scenario C, 500 genes, and k-means

#### Inputs:

In [2]:
# Expression matrix of scenario C with 500-gene biclusters
# (rows are genes, columns are samples; the "_z" suffix presumably means z-scored -- TODO confirm)
exprs_file = "data/simulated_m=4,std=1/C/C.n_genes=500.exprs_z.tsv"
exprs = pd.read_csv(exprs_file, sep="\t", index_col=0)
exprs.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_190,s_191,s_192,s_193,s_194,s_195,s_196,s_197,s_198,s_199
g_0,3.234252,-0.537743,-0.145318,2.122808,-0.58562,1.945721,0.31979,1.776159,-0.703115,-0.197809,...,-0.691652,-0.04111,2.058545,-1.090703,-0.382239,-0.27632,0.936099,-0.391953,-0.439644,-1.03939
g_1,0.2755,0.48117,1.010313,0.980679,-1.482808,-1.037173,0.434818,0.433552,0.434831,3.816463,...,-0.997243,0.411398,-1.424616,1.768575,1.107971,-0.562352,-1.822691,1.2847,-0.203047,1.167116
g_2,-1.119433,-1.541442,-0.106289,-0.109311,-0.560576,0.087471,-0.642202,-0.955568,0.935068,0.425404,...,-0.456908,-0.574621,0.257983,1.475459,0.894755,-1.273852,-0.015582,-1.586488,-1.036009,0.365994
g_3,0.266405,-0.849287,0.095549,1.942929,0.280191,0.875908,-1.490424,-1.769411,-1.592939,1.915069,...,0.222925,0.069257,0.057946,-0.020052,-0.330713,1.71118,0.242721,0.35803,0.16532,0.274322
g_4,0.847373,-0.673997,-0.033612,-0.617749,-0.588689,-0.457588,0.09821,-0.634982,1.179481,-1.070011,...,0.083825,-2.270019,-0.392735,-0.847586,-1.181957,-0.428222,1.746393,0.536221,-0.731673,0.464814


In [3]:
ground_truth_file = "data/simulated/C/C.n_genes=500,m=4,std=1,overlap=yes.biclusters.tsv"
ground_truth = pd.read_csv(ground_truth_file, sep="\t", index_col=0)

# Members are stored as space-separated strings; turn them into sets.
# "samples" is always converted, "genes" only when the table has that column.
member_columns = ["samples"] + [col for col in ["genes"] if col in ground_truth.columns]
for column in member_columns:
    ground_truth[column] = ground_truth[column].apply(lambda members: set(members.split(" ")))

ground_truth.head()

Unnamed: 0,genes,samples,frac,n_genes,n_samples:,n_samples
0.05,"{g_149, g_8066, g_1422, g_8038, g_5769, g_8605...","{s_75, s_197, s_55, s_26, s_24, s_46, s_185, s...",0.05,500,10,10
0.1,"{g_6028, g_9437, g_9815, g_2982, g_7439, g_934...","{s_149, s_45, s_51, s_199, s_175, s_109, s_153...",0.1,500,20,20
0.25,"{g_7621, g_1340, g_1180, g_386, g_8919, g_4440...","{s_34, s_123, s_52, s_29, s_158, s_168, s_46, ...",0.25,500,50,50
0.5,"{g_4459, g_4846, g_8605, g_5189, g_8850, g_217...","{s_25, s_162, s_154, s_182, s_37, s_48, s_80, ...",0.5,500,100,100


In [4]:
# prepare a dict with sample groups corresponding to known bicluster
known_groups = {}
known_groups["simulated"] = {}
for group in ground_truth.index.values:
    known_groups["simulated"][group] = ground_truth.loc[group,"samples"]

### Method output
* must be a DataFrame indexed by (bi-)cluster id with a "samples" column
* the "samples" column contains sets of sample names

In [5]:
from sklearn.cluster import KMeans

# A fixed random_state makes the cluster assignment -- and therefore the
# performance reported below -- reproducible across kernel restarts.
model = KMeans(n_clusters=4, random_state=42)
# exprs has genes in rows and samples in columns; transpose to cluster the samples
labels = model.fit_predict(exprs.T)

# Collect, for every cluster label, the set of member samples and its size
sample_names = exprs.columns.values
clusters = {}
for clust in sorted(set(labels)):
    members = set(sample_names[labels == clust])
    clusters[clust] = {"samples": members, "n_samples": len(members)}

# One row per cluster with the columns "samples" and "n_samples"
result = pd.DataFrame.from_dict(clusters).T
result



Unnamed: 0,samples,n_samples
0,"{s_178, s_96, s_33, s_195, s_86, s_124, s_49, ...",63
1,"{s_25, s_160, s_162, s_189, s_154, s_182, s_18...",70
2,"{s_34, s_123, s_52, s_29, s_158, s_46, s_168, ...",47
3,"{s_45, s_149, s_51, s_199, s_175, s_109, s_153...",20


In [6]:
# All samples present in the expression matrix
all_samples = set(exprs.columns.values)
performance, best_matches = calculate_perfromance(result,known_groups,all_samples,adjust_pvals="B")
# Select the score by its classification label, exactly as the parameter-sweep
# cells do, instead of by position.
# NOTE(review): assumes `performance` is label-indexed (e.g. a pandas Series keyed
# by the classification names of known_groups) -- confirm against utils.eval.
print("Sum of weighted Jaccard indexes:",performance["simulated"])
best_matches

Sum of weighted Jaccard indexes: 0.7777372262773723


Unnamed: 0,bm_id,J,weight,adj_pval,is_enriched,samples,n_samples,classification
0.05,,0.0,0.055556,,,{},0,simulated
0.1,3.0,1.0,0.111111,0.0,True,"{s_45, s_149, s_51, s_199, s_175, s_109, s_153...",20,simulated
0.25,2.0,0.94,0.277778,0.0,True,"{s_34, s_123, s_52, s_29, s_158, s_46, s_168, ...",47,simulated
0.5,0.0,0.729927,0.555556,0.0,False,"{s_178, s_96, s_33, s_195, s_86, s_124, s_49, ...",63,simulated


# Running UnPaSt (DESMOND2) with multiple parameter combinations

In [12]:
from time import time
from run_unpast import run

# where result tables are written to / read back from
out_dir= "results_on_simulated_data/"
# folder with the simulated expression matrices and ground-truth bicluster tables
fname_prefix = "data/simulated_m=4,std=1/"

# one seed per repeated run; n_runs is derived from the list so that
# seeds[n_run] can never go out of range when seeds are added or removed
seeds =  [85822412, 14942603, 3356886, 99529223, 36913810]
n_runs = len(seeds)

# method parameters to try
pvals = [0.005,0.01,0.05,0.1]  # 0.001 is currently left out
bin_methods = ["kmeans","ward","GMM"]

# each entry is one list of directions passed to a single run
directions =   [["UP","DOWN"]]


In [13]:
clust_method = "Louvain"
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# one record (dict) per successfully evaluated parameter combination
df = []
for scenario in ["A","B","C"]:
    for gsize in [500,50,5]:
        exprs_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".exprs_z.tsv"
        # nrows=0 parses the header only: just the sample names are needed here
        all_samples = set(pd.read_csv(exprs_file,sep="\t",index_col=0, nrows=0).columns.values)

        ground_truth_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".biclusters.tsv"
        ground_truth = pd.read_csv(ground_truth_file,sep ="\t",index_col=0)
        # members are stored as space-separated strings; convert them to sets
        ground_truth["samples"] = ground_truth["samples"].apply(lambda x: set(x.split(" ")))
        if "genes" in ground_truth.columns.values:
            ground_truth["genes"] = ground_truth["genes"].apply(lambda x: set(x.split(" ")))
        # dictionary with the known sample sets, as expected by calculate_perfromance()
        known_groups = {"simulated": {group: ground_truth.loc[group,"samples"]
                                      for group in ground_truth.index.values}}

        basename = scenario+".n_genes="+str(gsize)
        print(scenario,gsize)

        # Flattened parameter grid; the order is the same as with nested loops
        # (n_run varies slowest, direction fastest).
        for n_run, pval, bin_method, m, direction in itertools.product(
                range(n_runs), pvals, bin_methods, modularities, directions):
            seed = seeds[n_run]
            direction_str = "-".join(direction)

            # save parameters as a ;-separated string
            params = "bin="+bin_method+";pval="+str(pval)+";direction="+direction_str
            params += ";clust="+clust_method+";m="+str(m)

            # table checked for an earlier result of exactly this combination
            fname = (out_dir+basename+".seed="+str(seed)
                     + ".bin="+bin_method+",pval="+str(pval)+",clust="+clust_method
                     + ",direction="+direction_str+",m="+str(m)+".biclusters.tsv")
            try:
                t0 = time()
                if os.path.exists(fname):
                    # reuse the stored result instead of re-running the method
                    result = read_bic_table(fname)
                else:
                    result = run(exprs_file, basename, out_dir=out_dir,
                                 save=True, load=True,
                                 min_n_samples=5,
                                 bin_method=bin_method, pval=pval,
                                 directions=direction,
                                 clust_method=clust_method,
                                 modularity=m,
                                 seed=seed,
                                 verbose=False)
                # Stop the clock before the evaluation so that "runtime" does not
                # include scoring. When the result came from an existing file this
                # is only the time needed to read it.
                runtime = time()-t0

                # find best matches and calculate performance
                performance, best_matches = calculate_perfromance(result, known_groups, all_samples)

                df.append({"scenario":scenario,"gsize":gsize,
                           "n_run":n_run,"seed":seed,
                           "parameters":params,
                           "performance":performance["simulated"],
                           "runtime":runtime})
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt stop the sweep;
                # the error itself is reported so that failures can be diagnosed.
                print("\t failed",n_run,seed, params, repr(err), file = sys.stderr)

A 500
A 50
A 5
B 500
B 50
B 5
C 500
C 50
C 5


In [14]:
# records of the WGCNA parameter sweep are collected here
df2 = list()

In [15]:
# Passed to run() as `rpath` (presumably the directory holding the R executables
# -- TODO confirm). The original machine-specific location stays the default and
# can be overridden with the UNPAST_RPATH environment variable.
rpath = os.environ.get("UNPAST_RPATH", "/home/olya/anaconda3/envs/r4_env/bin/")
clust_method = "WGCNA"
ds_values = [0,1,2,3,4]
dchs = [0.95,0.995]

for scenario in ["A","B","C"]:
    for gsize in [500,50,5]:
        exprs_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".exprs_z.tsv"
        # nrows=0 parses the header only: just the sample names are needed here
        all_samples = set(pd.read_csv(exprs_file,sep="\t",index_col=0, nrows=0).columns.values)

        ground_truth_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".biclusters.tsv"
        ground_truth = pd.read_csv(ground_truth_file,sep ="\t",index_col=0)
        # members are stored as space-separated strings; convert them to sets
        ground_truth["samples"] = ground_truth["samples"].apply(lambda x: set(x.split(" ")))
        if "genes" in ground_truth.columns.values:
            ground_truth["genes"] = ground_truth["genes"].apply(lambda x: set(x.split(" ")))
        # dictionary with the known sample sets, as expected by calculate_perfromance()
        known_groups = {"simulated": {group: ground_truth.loc[group,"samples"]
                                      for group in ground_truth.index.values}}

        basename = scenario+".n_genes="+str(gsize)
        print(scenario,gsize)

        # Flattened parameter grid; the order is the same as with nested loops
        # (n_run varies slowest, direction fastest).
        for n_run, pval, bin_method, ds, dch, direction in itertools.product(
                range(n_runs), pvals, bin_methods, ds_values, dchs, directions):
            seed = seeds[n_run]
            direction_str = "-".join(direction)

            # save parameters as a ;-separated string
            params = "bin="+bin_method+";pval="+str(pval)+";direction="+direction_str
            params += ";clust="+clust_method+";dch="+str(dch)+";ds="+str(ds)+";preClustering=T"

            # table checked for an earlier result of exactly this combination
            # NOTE(review): "max_power=10" is part of the file name but is not passed
            # to run() below -- presumably it is run()'s default; confirm.
            fname = (out_dir+basename+".seed="+str(seed)
                     + ".bin="+bin_method+",pval="+str(pval)+",clust="+clust_method
                     + ",direction="+direction_str+",ds="+str(ds)+",dch="+str(dch)
                     + ",max_power=10,precluster=True.biclusters.tsv")

            try:
                t0 = time()
                if os.path.exists(fname):
                    # reuse the stored result instead of re-running the method
                    result = read_bic_table(fname)
                else:
                    result = run(exprs_file, basename, out_dir=out_dir,
                                 save=True, load=True,
                                 min_n_samples=5,
                                 bin_method=bin_method, pval=pval,
                                 directions=direction,
                                 clust_method=clust_method,
                                 precluster=True,
                                 ds=ds, dch=dch,
                                 rpath=rpath,
                                 seed=seed,
                                 verbose=False)
                # Stop the clock before the evaluation so that "runtime" does not
                # include scoring. When the result came from an existing file this
                # is only the time needed to read it.
                runtime = time()-t0

                # find best matches and calculate performance
                performance, best_matches = calculate_perfromance(result, known_groups, all_samples)

                df2.append({"scenario":scenario,"gsize":gsize,
                            "n_run":n_run,"seed":seed,
                            "parameters":params,
                            "performance":performance["simulated"],
                            "runtime":runtime})
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt stop the sweep;
                # the error itself is reported so that failures can be diagnosed.
                print("\t failed",n_run,seed, params, repr(err), file = sys.stderr)



A 500
A 50
A 5


	 failed 1 14942603 bin=GMM;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


B 500
B 50
B 5


	 failed 3 99529223 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


C 500
C 50
C 5


	 failed 0 85822412 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T
	 failed 1 14942603 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T
	 failed 4 36913810 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


In [18]:
# Mean performance of every parameter combination over all scenarios, bicluster
# sizes and runs (Louvain and WGCNA records together), best combination first
(
    pd.DataFrame.from_records(df + df2)[["parameters", "performance"]]
    .groupby(["parameters"])
    .mean()
    .sort_values("performance", ascending=False)
)

Unnamed: 0_level_0,performance
parameters,Unnamed: 1_level_1
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,0.981790
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T,0.981790
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T,0.981790
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,0.981790
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,0.981658
...,...
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=Louvain;m=0.6,0.906042
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=Louvain;m=0.5,0.906042
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=Louvain;m=0.4,0.906042
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=Louvain;m=0.3,0.906042


In [19]:
# One table with the records of both sweeps (Louvain first, then WGCNA)
all_records = df + df2
overall_performance = pd.DataFrame.from_records(all_records)
overall_performance.head()

Unnamed: 0,scenario,gsize,n_run,seed,parameters,performance,runtime
0,A,500,0,85822412,bin=kmeans;pval=0.005;direction=UP-DOWN;clust=...,1.0,0.012582
1,A,500,0,85822412,bin=kmeans;pval=0.005;direction=UP-DOWN;clust=...,1.0,0.01096
2,A,500,0,85822412,bin=kmeans;pval=0.005;direction=UP-DOWN;clust=...,1.0,0.011012
3,A,500,0,85822412,bin=kmeans;pval=0.005;direction=UP-DOWN;clust=...,1.0,0.010937
4,A,500,0,85822412,bin=kmeans;pval=0.005;direction=UP-DOWN;clust=...,1.0,0.010773


In [20]:
# Store the combined results as a tab-separated table (the row index is written too)
overall_performance.to_csv(path_or_buf="UnPaSt_ABC.tsv", sep="\t")