In [1]:
import copy
import itertools
import os
import random
import sys

import pandas as pd

# prepend the parent directory to the import path (presumably where the unpast package lives)
sys.path.insert(0, '..')

from unpast.run_unpast import unpast
from unpast.utils.method import prepare_input_matrix
from unpast.utils.consensus import make_consensus_biclusters

from unpast.utils.io import read_bic_table

from unpast.utils.eval import make_ref_groups
from unpast.utils.eval import calculate_perfromance, compare_gene_clusters

# Simulated expression data 

* 10000 genes x 200 samples
* background ~N(0,1), bicluster ~ N(4,1)
* four biclusters, covering fractions [0.05, 0.1, 0.25, 0.5] of all samples, simulate four subtypes
* three scenarios A,B,C
    * for each scenario, bicluster sizes in genes were 5,50,500 
    * 3 scenarios x 3 gene sizes = 9 expression matrices in total

### scenario A:
 * biclusters are not overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario B:
 * biclusters are overlapping in genes and samples
 * all other genes are independent  ~N(0,1)

### scenario C:
 * overlapping in genes and samples
 * four co-expressed modules of 500 genes each
 

# Evaluation

# 1. Example: scenario C, 500 genes per bicluster, and k-means

#### Inputs:

In [2]:
# Tab-separated expression matrix: the first column becomes the row index
# (gene ids g_*), the remaining columns are the samples (s_*).
exprs_file = "../../data/simulated_m=4,std=1/C/C.n_genes=500.exprs_z.tsv"
exprs = pd.read_csv(
    exprs_file,
    index_col=0,
    sep="\t",
)
# preview the first five genes (head() defaults to 5 rows)
exprs.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_190,s_191,s_192,s_193,s_194,s_195,s_196,s_197,s_198,s_199
g_0,3.234252,-0.537743,-0.145318,2.122808,-0.58562,1.945721,0.31979,1.776159,-0.703115,-0.197809,...,-0.691652,-0.04111,2.058545,-1.090703,-0.382239,-0.27632,0.936099,-0.391953,-0.439644,-1.03939
g_1,0.2755,0.48117,1.010313,0.980679,-1.482808,-1.037173,0.434818,0.433552,0.434831,3.816463,...,-0.997243,0.411398,-1.424616,1.768575,1.107971,-0.562352,-1.822691,1.2847,-0.203047,1.167116
g_2,-1.119433,-1.541442,-0.106289,-0.109311,-0.560576,0.087471,-0.642202,-0.955568,0.935068,0.425404,...,-0.456908,-0.574621,0.257983,1.475459,0.894755,-1.273852,-0.015582,-1.586488,-1.036009,0.365994
g_3,0.266405,-0.849287,0.095549,1.942929,0.280191,0.875908,-1.490424,-1.769411,-1.592939,1.915069,...,0.222925,0.069257,0.057946,-0.020052,-0.330713,1.71118,0.242721,0.35803,0.16532,0.274322
g_4,0.847373,-0.673997,-0.033612,-0.617749,-0.588689,-0.457588,0.09821,-0.634982,1.179481,-1.070011,...,0.083825,-2.270019,-0.392735,-0.847586,-1.181957,-0.428222,1.746393,0.536221,-0.731673,0.464814


In [3]:
def _as_id_set(cell):
    """Turn a space-separated string of ids into a set of ids."""
    return set(cell.split(" "))


# Ground-truth biclusters; the first column of the table is used as the index
ground_truth_file = "../../data/simulated/C/C.n_genes=500,m=4,std=1,overlap=yes.biclusters.tsv"
ground_truth = pd.read_csv(ground_truth_file, index_col=0, sep="\t")

# "samples" is always converted; "genes" only when the table provides that column
ground_truth["samples"] = ground_truth["samples"].apply(_as_id_set)
if "genes" in ground_truth.columns:
    ground_truth["genes"] = ground_truth["genes"].apply(_as_id_set)

ground_truth.head()

Unnamed: 0,genes,samples,frac,n_genes,n_samples:,n_samples
0.05,"{g_1140, g_5010, g_2353, g_995, g_6199, g_9655...","{s_75, s_46, s_185, s_24, s_26, s_131, s_194, ...",0.05,500,10,10
0.1,"{g_7636, g_3865, g_8956, g_4630, g_5884, g_683...","{s_58, s_184, s_63, s_199, s_21, s_45, s_159, ...",0.1,500,20,20
0.25,"{g_6846, g_6304, g_6072, g_6830, g_3555, g_893...","{s_165, s_46, s_158, s_89, s_112, s_13, s_176,...",0.25,500,50,50
0.5,"{g_3453, g_8871, g_2404, g_9657, g_333, g_9735...","{s_105, s_4, s_73, s_110, s_184, s_63, s_54, s...",0.5,500,100,100


In [4]:
# Nested dict {classification name -> {bicluster id -> set of samples}};
# the only classification here is "simulated". This is the layout handed to
# calculate_perfromance() as its known-groups argument.
known_groups = {
    "simulated": {
        bic_id: ground_truth.loc[bic_id, "samples"]
        for bic_id in ground_truth.index.values
    }
}

### Method output
* must be a DataFrame with a (bi-)cluster index and a "samples" column
* the "samples" column contains sets of samples

In [5]:
from sklearn.cluster import KMeans

# Fixed seed: k-means initialisation is randomised, so without it the cluster
# assignment (and every score derived from it) changes from run to run.
KMEANS_SEED = 42

# samples are the columns of exprs, hence the transpose before clustering
model = KMeans(n_clusters=4, random_state=KMEANS_SEED)
labels = model.fit_predict(exprs.T)

# one entry per cluster: the set of member samples and its size;
# sorted() makes the row order independent of set iteration order
sample_names = exprs.columns.values
result = {}
for clust in sorted(set(labels)):
    s = set(sample_names[labels == clust])
    result[clust] = {"samples": s, "n_samples": len(s)}
# orient="index" builds one row per cluster directly (no transpose), which keeps
# n_samples numeric instead of an object column
result = pd.DataFrame.from_dict(result, orient="index")
result

Unnamed: 0,samples,n_samples
0,"{s_75, s_46, s_144, s_185, s_24, s_131, s_194,...",10
1,"{s_4, s_105, s_73, s_110, s_184, s_63, s_54, s...",98
2,"{s_165, s_158, s_89, s_113, s_112, s_49, s_58,...",56
3,"{s_68, s_122, s_99, s_14, s_178, s_12, s_195, ...",36


In [6]:
# score the k-means clusters against the simulated sample groups
all_samples = set(exprs.columns.values)
performance, best_matches = calculate_perfromance(
    result,
    known_groups,
    all_samples,
    adjust_pvals="BH",
    performance_measure="ARI",
)
# look the score up by classification name (the key used in known_groups)
# instead of by position, matching how the parameter sweep reads it
print("Sum of weighted ARIs:", performance["simulated"])
best_matches

Sum of weighted ARIs: 0.6306305991471917


Unnamed: 0,bm_id,ARI,weight,adj_pval,is_enriched,samples,n_samples,classification
0.05,0.0,1.0,0.055556,2e-06,True,"{s_75, s_46, s_144, s_185, s_24, s_131, s_194,...",10,simulated
0.1,,0.0,0.111111,,,{},0,simulated
0.25,2.0,0.14987,0.277778,0.000476,True,"{s_165, s_158, s_89, s_113, s_112, s_49, s_58,...",56,simulated
0.5,1.0,0.9602,0.555556,3e-06,True,"{s_4, s_105, s_73, s_110, s_184, s_63, s_54, s...",98,simulated


# 2. Running UnPaSt with multiple parameter combinations

In [7]:
# NOTE(review): machine-specific absolute path, passed to unpast(rpath=...) in the
# sweep; presumably the bin/ directory of the R environment - adjust locally
rpath = "/home/olya/anaconda3/envs/r4_env/bin/"
# directory where UnPaSt results are written and looked up
out_dir = "../unpast_results/simulated_ABC/"

# root folder of the simulated datasets (one sub-folder per scenario)
fname_prefix = "../../data/simulated_m=4,std=1/"

# one seed per run; n_runs is derived from the list so that the seeds[n_run]
# lookup in the sweep can never go out of range
seeds =  [85822412, 14942603, 3356886, 99529223, 36913810]
n_runs = len(seeds)

# method parameters
pvals = [0.05,0.01,0.005,0.001]
bin_methods = ["kmeans","GMM","ward"]
directions =  [["UP","DOWN"],["BOTH"]]


In [8]:
# evaluation records collected by the parameter sweep (one dict per successful run)
df = []
# rpath is already defined in the configuration cell above; a second copy here
# would only give the two values a chance to drift apart
clust_method = "WGCNA"
ds_values = [0,1,2,3]
dchs = [0.95,0.995]

In [10]:
# Run (or load cached) UnPaSt results for every scenario / bicluster size /
# seed / parameter combination and record the ARI-based performance of each.
for scenario in ["A","B","C"]:
    for gsize in [500,50,5]:
        basename = scenario+".n_genes="+str(gsize)

        # os.path.join avoids the doubled "/" that plain concatenation produced
        exprs_file = os.path.join(fname_prefix, scenario, basename+".exprs_z.tsv")
        # nrows=0 reads only the header, which is all that is needed for the sample names
        all_samples = set(pd.read_csv(exprs_file,sep="\t",index_col=0, nrows=0).columns.values)

        ground_truth_file = os.path.join(fname_prefix, scenario, basename+".biclusters.tsv")
        ground_truth = pd.read_csv(ground_truth_file,sep ="\t",index_col=0)
        ground_truth["samples"] = ground_truth["samples"].apply(lambda x: set(x.split(" ")))
        if "genes" in ground_truth.columns.values:
            ground_truth["genes"] = ground_truth["genes"].apply(lambda x: set(x.split(" ")))

        # make dictionary with sample sets for calculate_perfromance()
        known_groups = {
            "simulated": {
                group: ground_truth.loc[group,"samples"]
                for group in ground_truth.index.values
            }
        }

        print(scenario,gsize)

        # same iteration order as the nested loops run -> pval -> bin_method -> ds -> dch -> direction
        sweep = itertools.product(range(n_runs), pvals, bin_methods, ds_values, dchs, directions)
        for n_run, pval, bin_method, ds, dch, direction in sweep:
            seed = seeds[n_run]
            direction_str = "-".join(direction)

            # save parameters as a ;-separated string
            params = "bin="+bin_method+";pval="+str(pval)+";direction="+direction_str
            params += ";clust="+clust_method+";dch="+str(dch)+";ds="+str(ds)+";preClustering=T"

            # file a previous run with these settings is expected to have left in out_dir
            fname = out_dir+basename+".seed="+str(seed)+\
            ".bin="+bin_method +",pval="+str(pval)+",clust="+clust_method+",direction="+\
            direction_str+",ds="+str(ds)+",dch="+str(dch)+",max_power=10,precluster=True.biclusters.tsv"

            try:
                if os.path.exists(fname):
                    # reuse the stored result instead of recomputing it
                    result = read_bic_table(fname)
                else:
                    result = unpast(exprs_file,
                                    basename,
                                    out_dir=out_dir,
                                    save=True,
                                    load = True,
                                    min_n_samples = 5,
                                    bin_method = bin_method,
                                    pval = pval,
                                    directions = direction,
                                    clust_method = clust_method,
                                    precluster=True,
                                    ds=ds,
                                    dch=dch,
                                    rpath=rpath,
                                    seed = seed,
                                    verbose = False)

                # find best matches and calculate performance
                performance, best_matches = calculate_perfromance(result, known_groups,all_samples,
                                                                 performance_measure = "ARI")

                record = {"scenario":scenario,"gsize":gsize,
                          "n_run":n_run,"seed":seed,
                          "parameters":params,
                          "performance":performance["simulated"]}
                df.append(record)
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt stop the sweep;
                # the error itself is reported so failed combinations can be diagnosed
                print("\t failed",n_run,seed, params, repr(err), file = sys.stderr)



A 500
A 50
A 5


	 failed 1 14942603 bin=GMM;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


B 500
B 50
B 5


	 failed 3 99529223 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


C 500
C 50
C 5


	 failed 0 85822412 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T
	 failed 1 14942603 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T
	 failed 4 36913810 bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T


In [11]:
# Gather the per-run records into one table and write it into the results folder
performance_file = out_dir +"UnPaSt_ABC_ARI.tsv"
overall_performance = pd.DataFrame.from_records(df)
overall_performance.to_csv(performance_file, sep="\t")
# echo the path of the written file as the cell output
performance_file

'../unpast_results/simulated_ABC/UnPaSt_ABC_ARI.tsv'