In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy

import matplotlib.pyplot as plt
import seaborn as sns

from run_desmond import run_DESMOND
from utils.eval import find_best_matches, generate_exprs
from utils.method import read_bic_table

# Simulated expression data 

* 10000 genes x 200 samples
* background ~N(0,1), bicluster ~ N(4,1)
* four biclusters with fraction [0.05,0.1,0.25,0.5] of all samples simulate four subtypes
* three scenarios A,B,C
    * for each scenario, bicluster sizes in genes were 5,50,500 
    * 3 scenarios x 3 gene sizes = 9 expression matrices in total

### scenario A:
 * biclusters are not overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario B:
 * biclusters are overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario C:
 * overlapping in genes and samples
 * four co-expressed modules of 500 genes each
 

# Evaluation

# 1. Example: scenario C,50 and k-means

#### Inputs:

In [2]:
exprs_file = "data/simulated_m=4,std=1/C/C.n_genes=500.exprs_z.tsv" 
exprs = pd.read_csv(exprs_file,sep = "\t",index_col=0)
exprs.head(5)

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_190,s_191,s_192,s_193,s_194,s_195,s_196,s_197,s_198,s_199
g_0,3.234252,-0.537743,-0.145318,2.122808,-0.58562,1.945721,0.31979,1.776159,-0.703115,-0.197809,...,-0.691652,-0.04111,2.058545,-1.090703,-0.382239,-0.27632,0.936099,-0.391953,-0.439644,-1.03939
g_1,0.2755,0.48117,1.010313,0.980679,-1.482808,-1.037173,0.434818,0.433552,0.434831,3.816463,...,-0.997243,0.411398,-1.424616,1.768575,1.107971,-0.562352,-1.822691,1.2847,-0.203047,1.167116
g_2,-1.119433,-1.541442,-0.106289,-0.109311,-0.560576,0.087471,-0.642202,-0.955568,0.935068,0.425404,...,-0.456908,-0.574621,0.257983,1.475459,0.894755,-1.273852,-0.015582,-1.586488,-1.036009,0.365994
g_3,0.266405,-0.849287,0.095549,1.942929,0.280191,0.875908,-1.490424,-1.769411,-1.592939,1.915069,...,0.222925,0.069257,0.057946,-0.020052,-0.330713,1.71118,0.242721,0.35803,0.16532,0.274322
g_4,0.847373,-0.673997,-0.033612,-0.617749,-0.588689,-0.457588,0.09821,-0.634982,1.179481,-1.070011,...,0.083825,-2.270019,-0.392735,-0.847586,-1.181957,-0.428222,1.746393,0.536221,-0.731673,0.464814


In [3]:
ground_truth_file = "data/simulated/C/C.n_genes=500,m=4,std=1,overlap=yes.biclusters.tsv"
ground_truth = pd.read_csv(ground_truth_file,sep ="\t",index_col=0)
ground_truth["samples"] = ground_truth["samples"].apply(lambda x: set(x.split(" ")))
if "genes" in ground_truth.columns.values:
    ground_truth["genes"] = ground_truth["genes"].apply(lambda x: set(x.split(" ")))

ground_truth.head(5)

Unnamed: 0,genes,samples,frac,n_genes,n_samples:,n_samples
0.05,"{g_6783, g_8430, g_8831, g_5410, g_7052, g_202...","{s_197, s_131, s_55, s_46, s_75, s_26, s_144, ...",0.05,500,10,10
0.1,"{g_1104, g_5375, g_1134, g_318, g_3346, g_5408...","{s_51, s_91, s_88, s_40, s_19, s_149, s_184, s...",0.1,500,20,20
0.25,"{g_3028, g_3555, g_2351, g_6830, g_9555, g_143...","{s_196, s_29, s_52, s_23, s_75, s_38, s_112, s...",0.25,500,50,50
0.5,"{g_8286, g_8636, g_6783, g_2487, g_8598, g_774...","{s_51, s_118, s_188, s_18, s_67, s_150, s_171,...",0.5,500,100,100


In [4]:
# prepare a dict with sample groups corresponding to known bicluster
known_groups = {}
for group in ground_truth.index.values:
    known_groups[group] = ground_truth.loc[group,"samples"]

### Method output
* must be a dataFrame with (bi-)cluster index and "samples" coulum
* "samples" column contrain sets of samples

In [5]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=4)
labels = model.fit_predict(exprs.T)


result = {}
sample_names = exprs.columns.values
for clust in set(labels):
    s = set(sample_names[labels == clust])
    result[clust] = {"samples":s,"n_samples":len(s)}
result = pd.DataFrame.from_dict(result).T
result

Unnamed: 0,samples,n_samples
0,"{s_76, s_197, s_106, s_124, s_152, s_26, s_69,...",63
1,"{s_55, s_92, s_111, s_118, s_188, s_79, s_18, ...",70
2,"{s_51, s_91, s_88, s_40, s_21, s_19, s_149, s_...",20
3,"{s_196, s_29, s_52, s_23, s_75, s_38, s_112, s...",47


In [6]:
all_samples = set(exprs.columns.values)
best_matches = find_best_matches(result,known_groups,all_samples,FDR=0.05)
print("Total weighted J:",best_matches["J_weighted"].sum())
best_matches

Total weighted J: 0.7777372262773723


Unnamed: 0_level_0,group_size,J,is_enriched,best_match_id,samples,n_samples,J_weighted
known_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.1,20.0,1.0,True,2.0,"{s_51, s_91, s_88, s_40, s_21, s_19, s_149, s_...",20.0,0.111111
0.25,50.0,0.94,True,3.0,"{s_196, s_29, s_52, s_23, s_75, s_38, s_112, s...",47.0,0.261111
0.5,100.0,0.729927,False,0.0,"{s_76, s_197, s_106, s_124, s_152, s_26, s_69,...",63.0,0.405515
0.05,10.0,0.0,,,,,0.0


# Running DESMOND2 with multiple parameter combinations

In [7]:
def calc_performance(found_clusters, all_samples, ground_truth_file,match_unique=True):
    
    ground_truth = pd.read_csv(ground_truth_file,sep ="\t",index_col=0)
    ground_truth["samples"] = ground_truth["samples"].apply(lambda x: set(x.split(" ")))
    if "genes" in ground_truth.columns.values:
        ground_truth["genes"] = ground_truth["genes"].apply(lambda x: set(x.split(" ")))
        
    # prepare a dict with sample groups corresponding to known bicluster
    known_groups = {}
    known_gsets = {}
    for group in ground_truth.index.values:
        known_groups[group] = ground_truth.loc[group,"samples"]
        known_gsets[group] = ground_truth.loc[group,"genes"]
    
    if found_clusters is None:
        performance= {}
        performance["total"] = 0
        best_matches = None
    else:
        best_matches = find_best_matches(found_clusters,known_groups,all_samples,
                                         FDR=0.05,verbose = False,match_unique=match_unique)
        J_total = best_matches["J_weighted"].sum()
        performance = best_matches.loc[:,["J"]].to_dict()["J"]
        # renaming subtype-specific performances to "performance_"+subt
        subtypes = list(performance.keys())
        for subt in subtypes:
            performance["performance_"+str(subt)] = performance.pop(subt)
        performance["overall_performance"] = J_total
    return performance, best_matches


In [8]:
# number of runs and seeds - set to 5 
n_runs = 5
seeds =  [85822412, 14942603, 3356886, 99529223, 36913810]

# method parameters to try
pvals = [0.0001, 0.0005,0.001,0.005,0.01]
bin_methods = ["kmeans","ward","GMM"] 
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

In [9]:
from time import time
from run_desmond2 import run_DESMOND

out_dir= "results_on_simulated_data/"
fname_prefix = "data/simulated_m=4,std=1/"

df = []
for scenario in ["A","B","C"]:
    for gsize in [500,50,5]:
        exprs_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".exprs_z.tsv"
        ground_truth_file =fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".biclusters.tsv"
        all_samples = set(pd.read_csv(exprs_file,sep="\t",index_col=0, nrows=0).columns.values)
        print(scenario,gsize)
        for run in range(n_runs):
            seed = seeds[run]
            for pval in pvals:
                for bin_method in bin_methods:
                    for m in modularities:
                        basename = scenario+".n_genes="+str(gsize)
                        try:
                            t0 = time()
                            # read output
                            fname = out_dir+basename+".seed="+str(seed)+\
                            ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+\
                            ",m="+str(m)+".biclusters.tsv"
                            result = read_bic_table(fname)
                            # calculate performance and bmids
                            performance, best_matches = calc_performance(result, all_samples, ground_truth_file)
                            
                            # save parameters as a string
                            params = "bin_method="+bin_method+";pval="+str(pval)
                            params += ";clust_method="+"Louvain"+";modularity="+str(m)
                            #print("\trun",run,params,performance)
                            
                            d = {"scenario":scenario,"gsize":gsize,
                                 "run":run,"seed":seed,
                                 "parameters":params}
                            d.update(performance)
                            d["runtime"] = time()-t0
                            df.append(d)
                        except:
                            print("\t failed",run,seed, params, file = sys.stderr)        

A 500
A 50
A 5


	 failed 0 85822412 bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0.9
	 failed 0 85822412 bin_method=ward;pval=0.0001;clust_method=Louvain;modularity=0.9
	 failed 1 14942603 bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0.9
	 failed 2 3356886 bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0.9
	 failed 3 99529223 bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0.9
	 failed 4 36913810 bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0.9


B 500
B 50
B 5
C 500
C 50
C 5


In [10]:
df2 = []
ds_values = [0,1,2,3,4]
dch = 0.995

for scenario in ["A","B","C"]:
    for gsize in [500,50,5]:
        exprs_file = fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".exprs_z.tsv"
        ground_truth_file =fname_prefix + "/"+scenario+"/"+scenario+".n_genes="+str(gsize)+".biclusters.tsv"
        all_samples = set(pd.read_csv(exprs_file,sep="\t",index_col=0, nrows=0).columns.values)
        print(scenario,gsize)

        for run in range(n_runs):
            seed = seeds[run]
            for pval in pvals:
                for bin_method in bin_methods:
                    for ds in ds_values:
                        basename = scenario+".n_genes="+str(gsize)
                        ### running TCGA or reading results
                        try:
                            t0 = time()
                            fname = out_dir+basename+".seed="+str(seed)+\
                                    ".bin="+bin_method +",pval="+str(pval)+",clust=WGCNA"+\
                                    ",ds="+str(ds)+",dch="+str(dch)+".biclusters.tsv"
                            result = read_bic_table(fname)
                            time_t = time()-t0
                            # find the best matches between TCGA biclusters and subtypes
                            # and calculate overall performance == weighted sum of Jaccard indexes
                            performance, best_matches = calc_performance(result, all_samples, ground_truth_file)

                            # save parameters as a string
                            params = "bin_method="+bin_method+";pval="+str(pval)
                            params += ";clust_method="+"WGCNA"+";ds="+str(ds)
                            #print("\trun",run,params,performance)

                            d = {"scenario":scenario,"gsize":gsize,
                                 "run":run,"seed":seed,
                                 "parameters":params}
                            d.update(performance)
                            d["runtime"] = time()-t0
                            df2.append(d)
                        except:
                            print("\t failed",run,seed, params, file = sys.stderr)



A 500
A 50
A 5


	 failed 1 14942603 bin_method=GMM;pval=0.005;clust_method=WGCNA;ds=4
	 failed 1 14942603 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 2 3356886 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4
	 failed 2 3356886 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 3 99529223 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 4 36913810 bin_method=GMM;pval=0.005;clust_method=WGCNA;ds=4
	 failed 4 36913810 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4


B 500
B 50
B 5


	 failed 0 85822412 bin_method=GMM;pval=0.005;clust_method=WGCNA;ds=4
	 failed 1 14942603 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 2 3356886 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4
	 failed 3 99529223 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4
	 failed 3 99529223 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 4 36913810 bin_method=GMM;pval=0.005;clust_method=WGCNA;ds=4
	 failed 4 36913810 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4


C 500
C 50
C 5


	 failed 0 85822412 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4
	 failed 1 14942603 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4
	 failed 3 99529223 bin_method=kmeans;pval=0.01;clust_method=WGCNA;ds=4
	 failed 4 36913810 bin_method=kmeans;pval=0.005;clust_method=WGCNA;ds=4


In [13]:
overall_performance = pd.concat([pd.DataFrame.from_records(df),pd.DataFrame.from_records(df2)])
overall_performance.groupby(["parameters"]).agg("mean").sort_values("overall_performance",ascending=False).head(10)

Unnamed: 0_level_0,gsize,run,seed,performance_0.5,performance_0.25,performance_0.1,performance_0.05,overall_performance,runtime
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bin_method=GMM;pval=0.01;clust_method=WGCNA;ds=4,185.0,2.0,48112986.8,1.0,1.0,1.0,0.669444,0.981636,0.036091
bin_method=ward;pval=0.0001;clust_method=Louvain;modularity=0.3,185.0,2.0,48112986.8,1.0,1.0,1.0,0.666667,0.981481,0.021053
bin_method=ward;pval=0.0001;clust_method=Louvain;modularity=0.4,185.0,2.0,48112986.8,1.0,1.0,1.0,0.666667,0.981481,0.020945
bin_method=ward;pval=0.0001;clust_method=Louvain;modularity=0.5,185.0,2.0,48112986.8,1.0,1.0,1.0,0.666667,0.981481,0.021099
bin_method=ward;pval=0.005;clust_method=WGCNA;ds=3,185.0,2.0,48112986.8,0.997778,1.0,0.998889,0.678533,0.980783,0.02875
bin_method=GMM;pval=0.01;clust_method=WGCNA;ds=3,185.0,2.0,48112986.8,1.0,1.0,1.0,0.65122,0.980623,0.032092
bin_method=ward;pval=0.001;clust_method=WGCNA;ds=4,185.0,2.0,48112986.8,0.997778,1.0,1.0,0.671667,0.980525,0.025398
bin_method=ward;pval=0.001;clust_method=WGCNA;ds=3,185.0,2.0,48112986.8,0.997778,1.0,1.0,0.669167,0.980386,0.025126
bin_method=GMM;pval=0.01;clust_method=Louvain;modularity=0,185.0,2.0,48112986.8,0.998,1.0,1.0,0.666667,0.98037,0.026219
bin_method=ward;pval=0.01;clust_method=WGCNA;ds=4,185.0,2.0,48112986.8,0.997778,1.0,0.994444,0.679111,0.980321,0.039848


In [12]:
overall_performance.to_csv("UnPaSt_ABC.tsv",sep = "\t")