In [1]:
import copy
import itertools
import os
import random
import sys

import pandas as pd

# make the local unpast checkout importable before importing from it
sys.path.insert(0, '..')

from unpast.run_unpast import unpast
from unpast.utils.method import prepare_input_matrix
from unpast.utils.consensus import make_consensus_biclusters

from unpast.utils.io import read_bic_table

from unpast.utils.eval import make_ref_groups
from unpast.utils.eval import calculate_perfromance, compare_gene_clusters

# 1. Reading expressions and annotations

In [2]:
# Directory (relative to the notebook's working directory) from which the
# input tables of this notebook are read.
data_dir = "../../data/preprocessed_v6/"

In [3]:
def _read_tsv(path):
    """Load a tab-separated table, using its first column as the index."""
    return pd.read_csv(path, sep="\t", index_col=0)


# expression matrices; the file paths are kept because they are passed on later
exprs_file_t = data_dir + "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
exprs_file_m = data_dir + "METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
exprs_t = _read_tsv(exprs_file_t)
exprs_m = _read_tsv(exprs_file_m)

# METABRIC: subtypes/signatures and sample annotation
m_subtypes = _read_tsv(data_dir + "METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv")
m_annotation = _read_tsv(data_dir + "METABRIC_1904.annotation_v6.tsv")

# TCGA-BRCA: subtypes/signatures and sample annotation
t_subtypes = _read_tsv(data_dir + "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv")
t_annotation = _read_tsv(data_dir + "TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv")

## 1.1 Preparing ground truth samples sets for performance evaluation

### Example of known_groups dictionary for TCGA-BRCA

*make_ref_groups(subtypes, annotation, exprs)*

**input:**
  - subtypes - subtypes dataframe
  - annotation - annotation dataframe
  - exprs - expression dataframe
  
**returns:**
  -  known_groups = {"classification1":{"subt1":{s1,s2,...} , "subt2":{...}, "subt3":{...}, ...}, "classification2":{"subtA":{...}}, ... }
*known_groups* is a dictionary with known sample classifications. Each classification (e.g. PAM50 or IHC or Luminal) is a dict that can contain one or several sample sets 
  -  all_samples = {} set of all samples in expression and annotation files; necessary for computing overlap p-values

In [4]:
# Build the reference sample groups and the set of all samples for each cohort
# (TCGA-BRCA first, then METABRIC); both are needed by calculate_perfromance below.
known_groups_t, all_samples_t = make_ref_groups(t_subtypes, t_annotation,exprs_t)
known_groups_m, all_samples_m = make_ref_groups(m_subtypes, m_annotation,exprs_m)

## The structure of known_groups dict for TCGA-BRCA:

We calculate performance for **classifications**:

    * PAM50 = [Luminal, Basal, Her2, Normal]
    * Intrinsic = [Luminal, Basal, Her2, Normal, Claudin-low]
    * PAM50_AB =  [LumA, LumB, Basal, Her2, Normal]
    * SCMOD2 = [ER-/HER2-, ER+/HER2- High Prolif, ER+/HER2- Low Prolif,  HER2+]
    * IHC = [IHC_HER2, IHC_ER, IHC_PR, IHC_TNBC]

And for **isolated sample sets** corresponding to Luminal, Basal, LumA, NEC subtypes etc. 

In [5]:
# List every classification; those holding more than one sample set also get
# their subtype names printed, the others are flagged as individual subtypes.
for classification, sample_sets in known_groups_t.items():
    if len(sample_sets) <= 1:
        print(" classification", classification, "(individual subtype)")
        continue
    print("classification", classification)
    print("\tsbtypes:", " ".join(sample_sets))

classification PAM50
	sbtypes: Normal Basal Her2 Luminal
classification Intrinsic
	sbtypes: Normal Basal Her2 Luminal Claudin-low
classification PAM50_AB
	sbtypes: Normal Basal LumA Her2 LumB
classification SCMOD2
	sbtypes: HER2+ ER-/HER2- ER+/HER2- Low Prolif ER+/HER2- High Prolif
classification IHC
	sbtypes: IHC_HER2 IHC_ER IHC_PR IHC_TNBC
 classification Luminal (individual subtype)
 classification Basal (individual subtype)
 classification Her2 (individual subtype)
 classification LumA (individual subtype)
 classification LumB (individual subtype)
 classification Normal (individual subtype)
 classification Claudin-low (individual subtype)
 classification IHC_HER2 (individual subtype)
 classification IHC_ER (individual subtype)
 classification IHC_PR (individual subtype)
 classification IHC_TNBC (individual subtype)
 classification NET_kmeans (individual subtype)
 classification NET_ward (individual subtype)


## 1.2 evaluation of the resulting sample sets (on the example of UnPaSt file) 
reading the results 

In [6]:
# reading pre-computed UnPaSt results
bic_file = "../../results_on_real_data_WGCNA2/TCGA.seed=670487.bin=kmeans,pval=0.01,clust=WGCNA,direction=UP-DOWN,ds=3,dch=0.995,max_power=10,precluster=True.biclusters.tsv"
result = read_bic_table(bic_file) 
print("sample clusters: ", result.shape[0])
result.head(2)

sample clusters:  168


Unnamed: 0_level_0,SNR,n_genes,n_samples,genes,samples,direction,genes_up,genes_down,gene_indexes,sample_indexes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4.042148,116,14,"{RNU6ATAC, FSHB, OR1N2, CBLL2, LRRTM3, DLGAP2-...","{TCGA-A7-A0DC-01, TCGA-A7-A13G-01, TCGA-A7-A26...",UP,"{RNU6ATAC, FSHB, OR1N2, CBLL2, LRRTM3, DLGAP2-...",{},"{8194, 10251, 13836, 13838, 15375, 2063, 13841...","{130, 131, 132, 133, 262, 135, 137, 138, 202, ..."
1,3.314768,52,191,"{FOXA1, ZG16B, CNTNAP2, TSPAN1, GALNT10, SLC44...","{TCGA-AO-A128-01, TCGA-D8-A1XQ-01, TCGA-B6-A3Z...",DOWN,{},"{FOXA1, ZG16B, CNTNAP2, TSPAN1, GALNT10, SLC44...","{2947, 7430, 17160, 17161, 15755, 7821, 12174,...","{1024, 1018, 514, 1030, 519, 8, 1035, 524, 103..."


* ensure that the results file is a dataframe with a "samples" column
* each row in the "samples" column must contain a non-empty set of samples

## performance evaluation
* requires *known_groups* dict and *all_samples* set  
     - using *make_ref_groups()* is recommended for this breast cancer analysis
     - alternatively, *known_groups* dict and *all_samples* can be created manually
* if samples in (bi)clusters do not match the *all_samples* set, an error is presumably thrown (the original note was cut off here — TODO: confirm against *calculate_perfromance*)

*calculate_perfromance(bi_clusters_df, known_groups, all_samples, performance_measure="ARI")*

**input:**
  - bi_clusters_df - a dataframe with sample clusters (sets in "samples" column)
  - *known_groups* is a dictionary with known sample classifications. Each classification (e.g. PAM50 or IHC or Luminal) is a dict that can contain one or several sample sets 
  - *all_samples* = {} set of all samples in expression and annotation files; necessary for computing overlap p-values
  
**returns:**
  - performances - *pandas.Series* with overall performance for each classification from *known_groups* 
  - best_matches - a dataframe with information about the best matching (bi)cluster for each sample set from *known_groups* (helpful for debugging and validation)

In [7]:
# Score the pre-computed biclusters against the TCGA-BRCA reference groups,
# using ARI as the performance measure.
performances, best_matches = calculate_perfromance(result, known_groups_t,all_samples_t, performance_measure="ARI")
# Uncomment one of the lines below to display the corresponding output:
#performances
#best_matches

# 2. Evaluation of the results obtained with different parameters
(UnPaSt)

In [8]:
# selecting 5 seeds for probabilistic methods 
analysis_seed = 42
n_runs = 5
seeds = []
random.seed(analysis_seed)
for i in range(n_runs):
    seeds.append(random.randint(0,1000000))
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [9]:
# Accumulators filled by the parameter-sweep cells below (one record per run).
subt_t = [] # Performances for TCGA-BRCA
subt_m = [] # Performances for METABRIC
clustering_similarities = [] # Similarities of gene clusters found in TCGA and METABRIC

# UnPaSt parameters

# Value passed to unpast() as `rpath`. The default is the path used on the
# author's machine; set the UNPAST_RPATH environment variable to override it
# without editing the notebook.
rpath = os.environ.get("UNPAST_RPATH", "/home/olya/anaconda3/envs/r4_env/bin/")
# Directory where bicluster tables and performance summaries are read/written.
out_dir = "../unpast_results/breast_cancer/"
# Basenames used as prefixes of the per-dataset output files.
basename_t = "TCGA"
basename_m = "METABRIC"
# Parameter grids explored by the sweep.
pvals = [0.05,0.01,0.005,0.001]
bin_methods = ["kmeans","GMM","ward"]
directions =  [["UP","DOWN"],["BOTH"]]

In [None]:
clust_methods = ["WGCNA"]
dss = [0,1,2,3]
dchs = [0.95,0.995]
cseed = 0

pc = True


def _bicluster_path(basename, seed, bin_method, pval, d, clust_method, ds, dch):
    """Return the path of the bicluster table for one dataset / parameter set / seed.

    The pattern is the one this notebook uses to look up previously saved
    results in `out_dir`. `clust_method` and `pc` are taken from the actual
    settings so the looked-up file always corresponds to the requested run.
    NOTE(review): `max_power=10` is fixed here; presumably it is the value
    unpast() uses by default - confirm if that setting is ever changed.
    """
    return (out_dir + basename + ".seed=" + str(seed) + ".bin=" + bin_method
            + ",pval=" + str(pval) + ",clust=" + clust_method
            + ",direction=" + "-".join(d) + ",ds=" + str(ds) + ",dch=" + str(dch)
            + ",max_power=10,precluster=" + str(pc) + ".biclusters.tsv")


def _load_or_run_unpast(exprs_file, basename, seed, bin_method, pval, d, clust_method, ds, dch):
    """Read the saved bicluster table for this run, or run UnPaSt if none was read."""
    fname = _bicluster_path(basename, seed, bin_method, pval, d, clust_method, ds, dch)
    result = read_bic_table(fname)
    # NOTE(review): this relies on read_bic_table() returning a non-DataFrame
    # when the table cannot be read - confirm against its implementation.
    if not isinstance(result, pd.DataFrame):
        result = unpast(exprs_file, basename, out_dir=out_dir,
                        save=True, load=True,
                        min_n_samples=5,
                        bin_method=bin_method, pval=pval,
                        directions=d,
                        clust_method=clust_method,
                        precluster=pc,
                        ds=ds, dch=dch,
                        rpath=rpath,
                        seed=seed,
                        verbose=False)
    return result


def _score(result, known_groups, all_samples, params_dict):
    """Return one performance record (ARI per classification plus run metadata).

    Best matches between biclusters and the known subtypes are found by
    calculate_perfromance(); overall performance is the sum of ARIs weighted
    proportionally to true cluster sizes.
    """
    performance, _best_matches = calculate_perfromance(result, known_groups, all_samples,
                                                       performance_measure="ARI")
    record = performance.to_dict()
    record.update(params_dict)
    return record


# Same iteration order as nested loops over pvals > dss > dchs > directions >
# clust_methods > bin_methods (the last one varies fastest).
param_grid = itertools.product(pvals, dss, dchs, directions, clust_methods, bin_methods)
for pval, ds, dch, d, clust_method, bin_method in param_grid:
    # save parameters as a ;-separated string
    params = "bin="+bin_method+";pval="+str(pval)+";direction="+"-".join(d)
    params += ";clust="+clust_method+";dch="+str(dch)+";ds="+str(ds)+";preClustering=T"
    for r, seed in zip(range(n_runs), seeds):
        params_dict = {"parameters":params, "seed":seed,"run":r}
        run_args = (seed, bin_method, pval, d, clust_method, ds, dch)

        ### running TCGA or reading results, then scoring against known subtypes
        result_t = _load_or_run_unpast(exprs_file_t, basename_t, *run_args)
        performance_t = _score(result_t, known_groups_t, all_samples_t, params_dict)
        subt_t.append(performance_t)

        ### running METABRIC or reading results, then scoring against known subtypes
        result_m = _load_or_run_unpast(exprs_file_m, basename_m, *run_args)
        performance_m = _score(result_m, known_groups_m, all_samples_m, params_dict)
        subt_m.append(performance_m)

        print(params,seed, round(performance_t["PAM50"],3),round(performance_m["PAM50"],3))
    

### Saving method performances for all parameter combinations

In [11]:
# Write one tab-separated performance table per dataset, then show the TCGA path.
tcga_ari_file = out_dir +"UnPaSt_TCGA_ARI.tsv"
metabric_ari_file = out_dir +"UnPaSt_METABRIC_ARI.tsv"
for records, target in ((subt_t, tcga_ari_file), (subt_m, metabric_ari_file)):
    pd.DataFrame.from_records(records).to_csv(target, sep="\t")
tcga_ari_file

'../unpast_results/breast_cancer/UnPaSt_TCGA_ARI.tsv'

## 3. Selecting parameters for TCGA and METABRIC
* "best": max. performance for PAM50 classification
* "optimal": minimal rank sum for TCGA and METABRIC

In [None]:
ds1 = "TCGA-BRCA"
ds2 = "METABRIC"
method = "UnPaSt"
performance_col = "PAM50"

df1 = pd.read_csv(out_dir +"UnPaSt_TCGA_ARI.tsv",sep = "\t",index_col =0)
df2 = pd.read_csv(out_dir +"UnPaSt_METABRIC_ARI.tsv",sep = "\t",index_col =0)

# Average the runs (seeds) of every parameter combination.
# numeric_only=True keeps this working if a non-numeric column is present.
df1 = df1.groupby("parameters").mean(numeric_only=True)
df2 = df2.groupby("parameters").mean(numeric_only=True)

df1 = df1.sort_values(by=performance_col,ascending= False)
df2 = df2.sort_values(by=performance_col,ascending= False)

# Rank 1 = best performance; ranks are averaged across the two datasets
# (the two Series are aligned on the parameter string).
df1["rank"] =df1[performance_col].rank(ascending= False)
df2["rank"] =df2[performance_col].rank(ascending= False)
mean_ranks = ((df1["rank"]+df2["rank"])*0.5).sort_values()
# The index holds parameter strings, so the first element must be taken by
# position with .iloc (integer keys are not positional on such a Series).
best_mean_rank = mean_ranks.iloc[0]
optimized_params = mean_ranks[mean_ranks == best_mean_rank].index.values
# Multiply before rounding to avoid float artefacts such as 1.0399999999999998.
print(method+"\tbest mean rank:",best_mean_rank, "top-%:", round(100*best_mean_rank/mean_ranks.shape[0],2))

print("\topt. parameters:\n\t\t"+"\n\t\t".join(optimized_params) )

# performance with optimized parameters (highest value if several are tied on rank)
opt_perf1 = df1.loc[optimized_params,performance_col].max()
opt_perf2 = df2.loc[optimized_params,performance_col].max()
print("\tperformance w. optimized:\t%s:%.2f\t%s:%.2f"%(ds1,opt_perf1,ds2,opt_perf2))


def _report_best(ds, df):
    """Print and return the top performance in `df` and all parameter sets reaching it."""
    best_perf = df[performance_col].max()
    best_param = df.loc[df[performance_col]==best_perf,:].index.values
    print("\tbest parameters %s:\t%.2f"%(ds,best_perf))
    print("\t\t"+"\n\t\t".join(best_param))
    return best_perf, best_param


# best performance per dataset
best_perf1, best_param1 = _report_best(ds1, df1)
best_perf2, best_param2 = _report_best(ds2, df2)
