In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from utils.method import read_bic_table

from utils.eval import find_best_matches, make_known_groups
from utils.eval import find_best_matching_biclusters

In [2]:
def make_ref_groups(subtypes, annotation,exprs):
    """Build reference sample groups and the fraction of samples in each group.

    Parameters
    ----------
    subtypes : pandas.DataFrame
        Sample-indexed table; the columns "PAM50", "SCMOD2" and "claudin_low"
        are read.
    annotation : pandas.DataFrame
        Sample-indexed table; the columns "IHC_HER2", "IHC_ER", "IHC_PR"
        (compared with "Positive") and "IHC_TNBC" (compared with 1) are read.
    exprs : pandas.DataFrame
        Expression matrix whose columns are the samples under study.

    Returns
    -------
    known_groups : dict
        {"classification": {"group": set_of_sample_ids}} for the classifications
        "PAM50", "Luminal", "Claudin-low", "SCMOD2" and "IHC".
    freqs : dict
        {"group": n_samples_in_group / n_expression_samples}. Keys are group
        names only, so a group name shared by two classifications would be
        overwritten by the later one.
    """
    all_samples = set(exprs.columns.values)

    # NOTE(review): make_known_groups is assumed to return {group: set of samples}
    # (the result is used with .union() and len() below) - confirm in utils.eval.
    pam50 = make_known_groups(subtypes, exprs,target_col = "PAM50",verbose=False)
    lum = {"Luminal": pam50["LumA"].union(pam50["LumB"])}
    scmod2 = make_known_groups(subtypes, exprs,target_col = 'SCMOD2',verbose=False)

    claudin = {}
    claudin["Claudin-low"] = set(subtypes.loc[subtypes['claudin_low']==1,:].index.values).intersection(all_samples)

    # Restrict IHC groups to samples that actually have expression data, exactly
    # as done for Claudin-low above. Without this, annotated samples that are
    # missing from `exprs` would inflate the frequencies computed below.
    ihc = {}
    for x in ["IHC_HER2","IHC_ER","IHC_PR"]:
        positive = set(annotation.loc[annotation[x]=="Positive",:].index.values)
        ihc[x] = positive.intersection(all_samples)
    tnbc = set(annotation.loc[annotation["IHC_TNBC"]==1,:].index.values)
    ihc["IHC_TNBC"] = tnbc.intersection(all_samples)

    known_groups = {"PAM50":pam50,"Luminal":lum,"Claudin-low":claudin,"SCMOD2":scmod2,"IHC":ihc}

    freqs = {}
    N =  exprs.shape[1]
    for groups in known_groups.values():
        for group, members in groups.items():
            # An expression matrix without samples yields frequency 0 instead of
            # a ZeroDivisionError.
            freqs[group] = len(members)/N if N else 0.0

    return known_groups, freqs

def calculate_perfromance(results, known_groups, freqs, all_samples,
                          classifications={"Intrinsic":["Luminal","Basal","Her2","Normal","Claudin-low"]}):
    """Find the best match for every known group and summarise it per classification.

    Parameters
    ----------
    results :
        Bicluster table passed through to ``find_best_matches``.
    known_groups : dict
        {"classification": {"group": set_of_samples}}; every classification is
        matched separately.
    freqs : dict
        {"group": frequency} used as weights of the per-group Jaccard index.
    all_samples : set
        Background set of samples, passed through to ``find_best_matches``.
    classifications : dict
        {"name": [group, ...]}; for each entry a frequency-weighted average of
        J over the listed groups is computed. The default is only read, never
        modified.

    Returns
    -------
    dict
        {"group": J} for every matched group plus one
        "overall_performance_<name>" entry per classification.

    Raises
    ------
    KeyError
        If a group listed in `classifications` has no best match or no frequency.
    """
    per_classification = []

    for classification in known_groups.keys():
        bm = find_best_matches(results,known_groups[classification],all_samples,FDR=0.05,verbose = False)
        per_classification.append(bm)

    # The concatenated table is indexed by group name; only the J column is kept.
    best_matches = pd.concat(per_classification, axis=0)
    best_matches = best_matches["J"].to_dict()

    for cl_name in classifications.keys():
        overall_performance = 0
        norm_factor = 0
        for group in classifications[cl_name]:
            overall_performance += best_matches[group]*freqs[group]
            norm_factor +=freqs[group]
        # When none of the groups is present (total weight 0) the weighted
        # average is undefined. Report NaN instead of raising ZeroDivisionError,
        # which would discard the per-group results computed above.
        if norm_factor > 0:
            overall_performance = overall_performance/norm_factor
        else:
            overall_performance = float("nan")
        best_matches["overall_performance_"+cl_name] = overall_performance
    return best_matches

def compare_gene_clusters(tcga_result,metabric_result, N):
    """Compare two bicluster tables in both directions.

    N is the total number of genes and is passed through to
    ``find_best_matching_biclusters``. Best matches are computed
    TCGA -> METABRIC and METABRIC -> TCGA; rows with missing values or with
    at most one shared gene are discarded.

    Returns ``(clust_similarity, bm, bm2)`` where ``clust_similarity`` holds,
    for direction 1 (TCGA) and 2 (METABRIC): the number of biclusters, the
    fraction of biclusters with a retained match, the total number of shared
    genes and the mean J of the retained matches. ``bm`` and ``bm2`` are the
    retained match tables sorted by decreasing ``n_shared``.
    """
    def _retained(matches):
        # keep matches sharing more than one gene, largest overlap first
        shares_genes = matches["n_shared"]>1
        return matches.loc[shares_genes,:].sort_values(by="n_shared",ascending = False)

    tcga_to_metabric = find_best_matching_biclusters(tcga_result,metabric_result, N).dropna()
    metabric_to_tcga = find_best_matching_biclusters(metabric_result, tcga_result, N).dropna()

    tcga_to_metabric = _retained(tcga_to_metabric)
    metabric_to_tcga = _retained(metabric_to_tcga)

    n_tcga = tcga_result.shape[0]
    n_metabric = metabric_result.shape[0]

    # Key order is kept stable because it determines the column order of the
    # table later built from these records.
    clust_similarity = {
        "n_1": n_tcga,
        "n_2": n_metabric,
        "percent_matched_1": tcga_to_metabric.shape[0]/n_tcga,
        "percent_matched_2": metabric_to_tcga.shape[0]/n_metabric,
        "n_shared_genes_1": tcga_to_metabric.loc[:,"n_shared"].sum(),
        "n_shared_genes_2": metabric_to_tcga.loc[:,"n_shared"].sum(),
        "avg_bm_J_1": tcga_to_metabric.loc[:,"J"].mean(),
        "avg_bm_J_2": metabric_to_tcga.loc[:,"J"].mean(),
    }

    return clust_similarity, tcga_to_metabric, metabric_to_tcga




In [3]:
# Groups that enter each "overall_performance_<name>" score.
classifications = {
    "Intrinsic": ["Luminal","Basal","Her2","Normal","Claudin-low"],
    "SCMOD2": ["ER-/HER2-","ER+/HER2- Low Prolif","ER+/HER2- High Prolif","HER2+"],
    "IHC": ["IHC_TNBC","IHC_ER","IHC_HER2","IHC_PR"],
}


def _read_indexed_tsv(path):
    # tab-separated table whose first column is the index
    return pd.read_csv(path,sep = "\t",index_col=0)


# --- TCGA-BRCA inputs ---
basename_t = "TCGA"
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
t_subtypes = _read_indexed_tsv("data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv")
t_annotation = _read_indexed_tsv("data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv")

# --- METABRIC inputs ---
basename_m = "METABRIC"
exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
m_subtypes = _read_indexed_tsv("data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv")
m_annotation = _read_indexed_tsv("data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv")

# Expression matrices with values limited to the range [-3, 3].
exprs_t = _read_indexed_tsv(exprs_file_t).clip(lower=-3, upper=3)
exprs_m = _read_indexed_tsv(exprs_file_m).clip(lower=-3, upper=3)

# Reference sample groups and their frequencies, one pair per cohort.
known_groups_t, freqs_t = make_ref_groups(t_subtypes, t_annotation,exprs_t)
known_groups_m, freqs_m = make_ref_groups(m_subtypes, m_annotation,exprs_m)

In [4]:
# One seed per run, drawn from a fixed master seed so the list is the same on
# every execution.
n_runs = 5
random.seed(42)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [5]:
# Accumulators: per-run subtype performance (TCGA, METABRIC) and the
# similarity of gene clusters between the two cohorts.
subt_t, subt_m, clustering_similarities = [], [], []

# Parameter grid shared by all clustering methods.
pvals = [
    0.0001,
    0.0005,
    0.005,
    0.001,
    0.01,
    0.05,
]
bin_methods = [
    "kmeans",
    "GMM",
    "ward",
]

In [6]:
### Louvain
# Evaluate the saved Louvain-based bicluster tables for every combination of
# seed, p-value, binarization method and modularity. Each result is scored
# against the known subtypes of its cohort, and the TCGA and METABRIC gene
# clusters are compared.
out_dir= "results_on_real_data/"
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# Start from empty accumulators so that re-running this cell does not
# duplicate records.
subt_t = []
subt_m = []
clustering_similarities = []
for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for m in modularities:
                #print("run",run,bin_method,pval,m,seed)

                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"Louvain"+";m="+str(m)
                params_dict = {"parameters":params, "seed":seed,"run":run}

                ### reading TCGA results
                # The file name is built outside the try block so it is always
                # available to the error report.
                fname = out_dir+basename_t+".seed="+str(seed)+\
                ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",m="+str(m)+".biclusters.tsv"
                try:
                    t0 = time()
                    result_t = read_bic_table(fname)
                    # "time" only measures reading the table from disk
                    time_t = time()-t0
                    # find the best matches between TCGA biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_t = calculate_perfromance(result_t, known_groups_t,
                                                          freqs_t, set(exprs_t.columns.values),
                                                          classifications=classifications)
                    performance_t.update(params_dict)
                    performance_t["time"] = time_t
                    subt_t.append(performance_t)
                    t_failed = False
                except Exception as err:
                    # `except Exception` keeps KeyboardInterrupt working and the
                    # underlying error is reported instead of being hidden.
                    print("TCGA biclustering failed with ",seed,  pval,bin_method ,repr(err),file = sys.stderr)
                    print(fname)
                    t_failed = True
                    # Record the parameters only. The original `{params_dict}`
                    # built a set containing a dict and raised TypeError here.
                    subt_t.append(dict(params_dict))

                ### reading METABRIC results
                fname = out_dir+basename_m+".seed="+str(seed)+\
                ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",m="+str(m)+".biclusters.tsv"
                try:
                    t0 = time()
                    result_m = read_bic_table(fname)
                    time_m = time()-t0
                    # find the best matches between METABRIC biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_m = calculate_perfromance(result_m, known_groups_m,
                                                          freqs_m, set(exprs_m.columns.values),
                                                          classifications=classifications)
                    performance_m.update(params_dict)
                    performance_m["time"] = time_m
                    subt_m.append(performance_m)
                    m_failed = False
                except Exception as err:
                    print("METABRIC biclustering failed with ",seed,  pval,bin_method ,repr(err),file = sys.stderr)
                    print(fname)
                    m_failed = True
                    subt_m.append(dict(params_dict))

                # compare clustering results - only if gene sets are defined for each cluster
                if not (t_failed or m_failed):
                    N = exprs_m.shape[0]
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)
                               

In [7]:
out_dir = "results_on_real_data_WGCNA/"
### WGCNA
# Evaluate the saved WGCNA-based bicluster tables for every combination of
# seed, p-value, binarization method and `ds` value. Records are appended to
# subt_t, subt_m and clustering_similarities, which this cell does not reset:
# they must already exist, and re-running only this cell adds duplicates.
ds_values = [0,1,2,3,4]
dch = 0.995

for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for ds in ds_values:

                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"WGCNA"+";ds="+str(ds)+";dch="+str(dch)
                #print("run",run,seed,params)
                params_dict = {"parameters":params, "seed":seed,"run":run}

                ### reading TCGA results
                # The file name is built outside the try block so it is always
                # available to the error report.
                fname = out_dir+basename_t+".seed="+str(seed)+".bin="+bin_method \
                +",pval="+str(pval)+",clust=WGCNA"+",ds="+str(ds)+",dch="+str(dch)+".biclusters.tsv"
                try:
                    t0 = time()
                    result_t = read_bic_table(fname)
                    # "time" only measures reading the table from disk
                    time_t = time()-t0
                    # find the best matches between TCGA biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_t = calculate_perfromance(result_t, known_groups_t,
                                                          freqs_t, set(exprs_t.columns.values),
                                                          classifications=classifications)
                    performance_t.update(params_dict)
                    performance_t["time"] = time_t
                    subt_t.append(performance_t)
                    t_failed = False
                except Exception as err:
                    # `except Exception` keeps KeyboardInterrupt working and the
                    # underlying error is reported instead of being hidden.
                    print("TCGA biclustering failed with ",seed,  pval,bin_method ,repr(err),file = sys.stderr)
                    print(fname)
                    t_failed = True
                    # Record the parameters only. The original `{params_dict}`
                    # built a set containing a dict and raised TypeError here.
                    subt_t.append(dict(params_dict))

                ### reading METABRIC results
                fname = out_dir+basename_m+".seed="+str(seed)+".bin="+bin_method \
                +",pval="+str(pval)+",clust=WGCNA"+",ds="+str(ds)+",dch="+str(dch)+".biclusters.tsv"
                try:
                    t0 = time()
                    result_m = read_bic_table(fname)
                    time_m = time()-t0
                    # find the best matches between METABRIC biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_m = calculate_perfromance(result_m, known_groups_m,
                                                          freqs_m, set(exprs_m.columns.values),
                                                          classifications=classifications)
                    performance_m.update(params_dict)
                    performance_m["time"] = time_m
                    subt_m.append(performance_m)
                    m_failed = False
                except Exception as err:
                    print("METABRIC biclustering failed with ",seed,  pval,bin_method,repr(err),file = sys.stderr)
                    print(fname)
                    m_failed = True
                    subt_m.append(dict(params_dict))

                # compare clustering results - only if gene sets are defined for each cluster
                if not (t_failed or m_failed):
                    N = exprs_m.shape[0]
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)


In [8]:
# TCGA: average every column over the records sharing a parameter string and
# rank the parameter sets by their Intrinsic overall performance.
tcga_records = pd.DataFrame.from_records(subt_t)
tcga_mean_by_params = tcga_records.groupby("parameters").agg("mean")
df = tcga_mean_by_params.sort_values("overall_performance_Intrinsic",ascending = False)
df.head(3)

Unnamed: 0_level_0,Basal,LumA,LumB,Her2,Normal,Luminal,Claudin-low,ER-/HER2-,HER2+,ER+/HER2- Low Prolif,...,IHC_ER,IHC_PR,IHC_TNBC,IHC_HER2,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.01;clust=WGCNA;ds=3;dch=0.995,0.920175,0.705,0.479765,0.473805,0.092771,0.936716,0.115793,0.85325,0.775418,0.722508,...,0.833445,0.734483,0.462887,0.419193,0.832475,0.690762,0.732767,375882.4,2.0,0.045097
bin=kmeans;pval=0.005;clust=WGCNA;ds=3;dch=0.995,0.910904,0.722438,0.472924,0.479386,0.099015,0.934925,0.113662,0.858906,0.794304,0.725314,...,0.830769,0.749261,0.459758,0.417075,0.830309,0.690629,0.736929,375882.4,2.0,0.039529
bin=kmeans;pval=0.0005;clust=WGCNA;ds=2;dch=0.995,0.883853,0.672015,0.503178,0.508823,0.09547,0.93791,0.109811,0.861287,0.813917,0.611906,...,0.837458,0.737931,0.452449,0.407772,0.830166,0.645458,0.734206,375882.4,2.0,0.030332


In [9]:
# METABRIC: average every column over the records sharing a parameter string
# and rank the parameter sets by their Intrinsic overall performance.
metabric_records = pd.DataFrame.from_records(subt_m)
metabric_mean_by_params = metabric_records.groupby("parameters").agg("mean")
df = metabric_mean_by_params.sort_values("overall_performance_Intrinsic",ascending = False)
df.head(3)

Unnamed: 0_level_0,Basal,LumB,LumA,Her2,Normal,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,...,IHC_ER,IHC_HER2,IHC_PR,IHC_TNBC,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.0001;clust=WGCNA;ds=0;dch=0.995,0.849441,0.525238,0.470244,0.563183,0.087507,0.969742,0.162624,0.669133,0.686179,0.515569,...,0.997753,0.680394,0.54838,0.676319,0.846321,0.545285,0.78982,375882.4,2.0,0.029243
bin=kmeans;pval=0.005;clust=WGCNA;ds=3;dch=0.995,0.855456,0.592458,0.513004,0.479413,0.10713,0.976384,0.161526,0.802946,0.689765,0.564508,...,0.996404,0.846992,0.704134,0.679299,0.841819,0.601998,0.854887,375882.4,2.0,0.045272
bin=kmeans;pval=0.0005;clust=WGCNA;ds=0;dch=0.995,0.855674,0.526752,0.484206,0.532637,0.087241,0.96679,0.161102,0.721651,0.691933,0.530483,...,0.997303,0.746905,0.566034,0.679674,0.841247,0.561779,0.801094,375882.4,2.0,0.028384


In [10]:
# Write the collected records as tab-separated tables, one file per list.
for out_name, records in (
    ("UnPaSt_similarities.tsv", clustering_similarities),
    ("UnPaSt_TCGA.tsv", subt_t),
    ("UnPaSt_METABRIC.tsv", subt_m),
):
    pd.DataFrame.from_records(records).to_csv(out_name,sep = "\t")
