In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from utils.method import read_bic_table

In [2]:
from utils.eval import find_best_matches, make_known_groups
from utils.eval import find_best_matching_biclusters

def make_ref_groups(subtypes, annotation,exprs):
    """Collect the reference sample groups of one cohort and their frequencies.

    Parameters
    ----------
    subtypes : DataFrame
        Must provide the "PAM50" and "SCMOD2" columns (handed to
        ``make_known_groups``) and a "claudin_low" column that is compared
        with 1. Its index is intersected with the columns of ``exprs``.
    annotation : DataFrame
        Must provide "IHC_HER2", "IHC_ER", "IHC_PR" (compared with
        "Positive") and "IHC_TNBC" (compared with 1).
        Presumably indexed by sample id like ``subtypes`` - TODO confirm.
    exprs : DataFrame
        Expression table; its columns are used as the set of sample ids.

    Returns
    -------
    known_groups : dict
        {"PAM50": {...}, "Luminal": {...}, "Claudin-low": {...},
         "SCMOD2": {...}, "IHC": {...}}, each value mapping a group name
        to a set of sample ids.
    freqs : dict
        {group name: group size / number of columns in exprs}.
    """
    samples_in_exprs = set(exprs.columns.values)

    pam50 = make_known_groups(subtypes, exprs,target_col = "PAM50",verbose=False)
    # Luminal = LumA and LumB taken together
    luminal = {"Luminal": pam50["LumA"].union(pam50["LumB"])}
    scmod2 = make_known_groups(subtypes, exprs,target_col = 'SCMOD2',verbose=False)

    claudin_low_ids = subtypes.loc[subtypes['claudin_low']==1,:].index.values
    claudin = {"Claudin-low": set(claudin_low_ids).intersection(samples_in_exprs)}

    # NOTE(review): unlike Claudin-low, the IHC sets are not intersected with the
    # columns of exprs - confirm that annotation only lists profiled samples.
    ihc = {
        marker: set(annotation.loc[annotation[marker]=="Positive",:].index.values)
        for marker in ("IHC_HER2","IHC_ER","IHC_PR")
    }
    ihc["IHC_TNBC"] = set(annotation.loc[annotation["IHC_TNBC"]==1,:].index.values)

    known_groups = {
        "PAM50": pam50,
        "Luminal": luminal,
        "Claudin-low": claudin,
        "SCMOD2": scmod2,
        "IHC": ihc,
    }

    # frequency of a group = its size relative to the number of expression columns
    n_samples = exprs.shape[1]
    freqs = {
        group: len(members)/n_samples
        for groups in known_groups.values()
        for group, members in groups.items()
    }

    return known_groups, freqs

def calculate_perfromance(results, known_groups, freqs, all_samples,
                          classifications={"Intrinsic":["Luminal","Basal","Her2","Normal","Claudin-low"]}):
    """Score a biclustering result against known sample groups.

    For every classification in ``known_groups`` the best matching bicluster
    of each group is looked up with ``find_best_matches`` (FDR=0.05) and the
    "J" column of the returned tables is kept. For every entry of
    ``classifications`` an overall performance is then computed as the
    frequency-weighted mean of the J values of the listed groups.

    Parameters
    ----------
    results : biclustering result, passed through to ``find_best_matches``.
    known_groups : dict {classification: {group: set of sample ids}}.
    freqs : dict {group: frequency}; used as weights.
    all_samples : set of all sample ids, passed through to ``find_best_matches``.
    classifications : dict {name: [group, ...]}
        Groups that enter each overall score. The default dict is only read,
        never modified.

    Returns
    -------
    dict
        {group: J} for every matched group plus one
        "overall_performance_<name>" entry per classification. The overall
        value is NaN when the frequencies of the listed groups sum to zero,
        so one unannotated classification no longer aborts the whole
        evaluation with ZeroDivisionError.

    Raises
    ------
    KeyError
        If a group listed in ``classifications`` is missing from the best
        matches or from ``freqs``.
    """
    # best matches per classification, stacked into one table indexed by group
    per_classification = [
        find_best_matches(results,known_groups[classification],all_samples,FDR=0.05,verbose = False)
        for classification in known_groups.keys()
    ]
    best_matches = pd.concat(per_classification, axis=0)["J"].to_dict()

    for cl_name, groups in classifications.items():
        weighted_sum = 0
        norm_factor = 0
        for group in groups:
            weighted_sum += best_matches[group]*freqs[group]
            norm_factor += freqs[group]
        if norm_factor:
            overall_performance = weighted_sum/norm_factor
        else:
            # none of the groups occurs in the cohort: the weighted mean is undefined
            overall_performance = float("nan")
        best_matches["overall_performance_"+cl_name] = overall_performance
    return best_matches

def compare_gene_clusters(tcga_result,metabric_result, N):
    """Compare the gene clusters of two biclustering results in both directions.

    Best matches are searched TCGA -> METABRIC and METABRIC -> TCGA with
    ``find_best_matching_biclusters``. Rows containing NaN are dropped and only
    matches sharing more than one gene are kept, sorted by the number of
    shared genes in descending order.

    Parameters
    ----------
    tcga_result, metabric_result : DataFrame
        Bicluster tables, one row per bicluster.
    N : int
        Total number of genes, passed through to ``find_best_matching_biclusters``.

    Returns
    -------
    clust_similarity : dict
        "n_1"/"n_2" - number of biclusters in each input;
        "percent_matched_1"/"_2" - fraction (0..1, not multiplied by 100) of
        biclusters with a retained best match, NaN when the input has no
        biclusters (previously a ZeroDivisionError);
        "n_shared_genes_1"/"_2" - sum of "n_shared" over retained matches;
        "avg_bm_J_1"/"_2" - mean "J" over retained matches.
        Suffix 1 refers to TCGA -> METABRIC, suffix 2 to METABRIC -> TCGA.
    bm, bm2 : DataFrame
        The retained best matches for direction 1 and 2.
    """
    def _retained_best_matches(query, target):
        # best match of every query bicluster among the target biclusters,
        # restricted to matches that share more than one gene
        matches = find_best_matching_biclusters(query, target, N).dropna()
        matches = matches.loc[matches["n_shared"]>1,:]
        return matches.sort_values(by="n_shared",ascending = False)

    def _matched_fraction(matches, result):
        n_biclusters = result.shape[0]
        if n_biclusters == 0:
            # no biclusters at all: the fraction is undefined, do not divide by zero
            return float("nan")
        return matches.shape[0]/n_biclusters

    bm = _retained_best_matches(tcga_result, metabric_result)
    bm2 = _retained_best_matches(metabric_result, tcga_result)

    # key order is kept as before because it defines the column order of the summary table
    clust_similarity = {}
    # number of biclusters
    clust_similarity["n_1"] = tcga_result.shape[0]
    clust_similarity["n_2"] = metabric_result.shape[0]
    # fraction of biclusters with a retained best match
    clust_similarity["percent_matched_1"] = _matched_fraction(bm, tcga_result)
    clust_similarity["percent_matched_2"] = _matched_fraction(bm2, metabric_result)
    # total number of genes shared within the retained matches
    clust_similarity["n_shared_genes_1"] = bm.loc[:,"n_shared"].sum()
    clust_similarity["n_shared_genes_2"] = bm2.loc[:,"n_shared"].sum()
    # average Jaccard index of the retained matches
    clust_similarity["avg_bm_J_1"] = bm.loc[:,"J"].mean()
    clust_similarity["avg_bm_J_2"] = bm2.loc[:,"J"].mean()

    return clust_similarity, bm, bm2




In [3]:
# Groups that enter each overall-performance score, keyed by classification name.
classifications={
    "Intrinsic":["Luminal","Basal","Her2","Normal","Claudin-low"],
    "SCMOD2":["ER-/HER2-","ER+/HER2- Low Prolif","ER+/HER2- High Prolif","HER2+"],
    "IHC":["IHC_TNBC","IHC_ER","IHC_HER2","IHC_PR"],
}

# --- input locations and output name prefixes -------------------------------
basename_t = "TCGA"
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"

basename_m = "METABRIC"
exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"

# --- subtype and annotation tables (first column is used as the index) -------
m_subtypes = pd.read_csv("data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
m_annotation = pd.read_csv("data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv",sep = "\t",index_col=0)

t_subtypes = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
t_annotation = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv",sep = "\t",index_col=0)

# --- expression matrices, limited to the range [-3, 3] -----------------------
# (values are presumably z-scores, judging by the file names)
exprs_t = pd.read_csv(exprs_file_t,sep = "\t",index_col=0).clip(lower=-3, upper=3)
exprs_m = pd.read_csv(exprs_file_m,sep = "\t",index_col=0).clip(lower=-3, upper=3)

# --- reference groups and their frequencies for each cohort ------------------
known_groups_t, freqs_t = make_ref_groups(t_subtypes, t_annotation,exprs_t)
known_groups_m, freqs_m = make_ref_groups(m_subtypes, m_annotation,exprs_m)

In [4]:
# Draw one reproducible integer seed per run from a fixed master seed.
n_runs = 5
random.seed(42)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [5]:
pvals = [0.0001, 0.0005,0.005,0.001,0.01,0.05]
bin_methods = ["kmeans","GMM","ward"] 

### Louvain 
out_dir= "results_on_real_data/"
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]


def _evaluate_biclusters(label, basename, known_groups, freqs, samples,
                         seed, pval, bin_method, m, params_dict):
    """Read one precomputed bicluster table and score it against known subtypes.

    The same routine serves both cohorts, so the failure handling cannot
    diverge between them. Reads the module-level ``out_dir`` and
    ``classifications``.

    Parameters
    ----------
    label : cohort name used in the failure message ("TCGA" or "METABRIC").
    basename : file name prefix of the cohort.
    known_groups, freqs : reference groups and frequencies of the cohort.
    samples : set of all sample ids of the cohort.
    seed, pval, bin_method, m : parameters encoded in the file name.
    params_dict : run description merged into the returned record.

    Returns
    -------
    (record, result, failed)
        ``record`` holds the performance values plus parameters and the read
        time; on failure it is a copy of ``params_dict`` only. ``result`` is
        the bicluster table (None on failure). ``failed`` is a bool.
    """
    fname = out_dir+basename+".seed="+str(seed)+\
    ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",m="+str(m)+".biclusters.tsv"
    try:
        t0 = time()
        result = read_bic_table(fname)
        elapsed = time()-t0
        # find the best matches between biclusters and subtypes
        # and calculate overall performance == weighted sum of Jaccard indexes
        record = calculate_perfromance(result, known_groups,
                                       freqs, samples,
                                       classifications=classifications)
        record.update(params_dict)
        record["time"] = elapsed
        return record, result, False
    except Exception as err:
        # `except Exception` lets KeyboardInterrupt/SystemExit through;
        # the error itself is reported so that real bugs are not mistaken
        # for a missing result file
        print(label+" biclustering failed with ",seed,  pval,bin_method ,repr(err),file = sys.stderr)
        print(fname)
        # a copy, so the placeholder row does not alias the shared params_dict
        return dict(params_dict), None, True


# loop-invariant inputs
samples_t = set(exprs_t.columns.values)
samples_m = set(exprs_m.columns.values)
# total number of genes = number of rows of the METABRIC expression matrix
N = exprs_m.shape[0]

subt_t = []
subt_m = []
clustering_similarities = []
for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for m in modularities:
                print("run",run,bin_method,pval,m,seed)
                
                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"Louvain"+";m="+str(m)
                params_dict = {"parameters":params, "seed":seed,"run":run}
                
                ### reading TCGA results
                performance_t, result_t, t_failed = _evaluate_biclusters(
                    "TCGA", basename_t, known_groups_t, freqs_t, samples_t,
                    seed, pval, bin_method, m, params_dict)
                subt_t.append(performance_t)
                
                ### reading METABRIC results
                performance_m, result_m, m_failed = _evaluate_biclusters(
                    "METABRIC", basename_m, known_groups_m, freqs_m, samples_m,
                    seed, pval, bin_method, m, params_dict)
                subt_m.append(performance_m)
                    
                # compare clustering results - only if both tables were read and scored
                if not (t_failed or m_failed): 
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)
                
                
pd.DataFrame.from_records(clustering_similarities)

run 0 kmeans 0.0001 0 670487
run 0 kmeans 0.0001 0.3 670487
run 0 kmeans 0.0001 0.4 670487
run 0 kmeans 0.0001 0.5 670487
run 0 kmeans 0.0001 0.6 670487
run 0 kmeans 0.0001 0.7 670487
run 0 kmeans 0.0001 0.8 670487
run 0 kmeans 0.0001 0.9 670487
run 0 GMM 0.0001 0 670487
run 0 GMM 0.0001 0.3 670487
run 0 GMM 0.0001 0.4 670487
run 0 GMM 0.0001 0.5 670487
run 0 GMM 0.0001 0.6 670487
run 0 GMM 0.0001 0.7 670487
run 0 GMM 0.0001 0.8 670487
run 0 GMM 0.0001 0.9 670487
run 0 ward 0.0001 0 670487
run 0 ward 0.0001 0.3 670487
run 0 ward 0.0001 0.4 670487
run 0 ward 0.0001 0.5 670487
run 0 ward 0.0001 0.6 670487
run 0 ward 0.0001 0.7 670487
run 0 ward 0.0001 0.8 670487
run 0 ward 0.0001 0.9 670487
run 0 kmeans 0.0005 0 670487
run 0 kmeans 0.0005 0.3 670487
run 0 kmeans 0.0005 0.4 670487
run 0 kmeans 0.0005 0.5 670487
run 0 kmeans 0.0005 0.6 670487
run 0 kmeans 0.0005 0.7 670487
run 0 kmeans 0.0005 0.8 670487
run 0 kmeans 0.0005 0.9 670487
run 0 GMM 0.0005 0 670487
run 0 GMM 0.0005 0.3 670487
ru

run 2 kmeans 0.0001 0.6 26225
run 2 kmeans 0.0001 0.7 26225
run 2 kmeans 0.0001 0.8 26225
run 2 kmeans 0.0001 0.9 26225
run 2 GMM 0.0001 0 26225
run 2 GMM 0.0001 0.3 26225
run 2 GMM 0.0001 0.4 26225
run 2 GMM 0.0001 0.5 26225
run 2 GMM 0.0001 0.6 26225
run 2 GMM 0.0001 0.7 26225
run 2 GMM 0.0001 0.8 26225
run 2 GMM 0.0001 0.9 26225
run 2 ward 0.0001 0 26225
run 2 ward 0.0001 0.3 26225
run 2 ward 0.0001 0.4 26225
run 2 ward 0.0001 0.5 26225
run 2 ward 0.0001 0.6 26225
run 2 ward 0.0001 0.7 26225
run 2 ward 0.0001 0.8 26225
run 2 ward 0.0001 0.9 26225
run 2 kmeans 0.0005 0 26225
run 2 kmeans 0.0005 0.3 26225
run 2 kmeans 0.0005 0.4 26225
run 2 kmeans 0.0005 0.5 26225
run 2 kmeans 0.0005 0.6 26225
run 2 kmeans 0.0005 0.7 26225
run 2 kmeans 0.0005 0.8 26225
run 2 kmeans 0.0005 0.9 26225
run 2 GMM 0.0005 0 26225
run 2 GMM 0.0005 0.3 26225
run 2 GMM 0.0005 0.4 26225
run 2 GMM 0.0005 0.5 26225
run 2 GMM 0.0005 0.6 26225
run 2 GMM 0.0005 0.7 26225
run 2 GMM 0.0005 0.8 26225
run 2 GMM 0.0005 0.

run 4 GMM 0.0001 0.7 288389
run 4 GMM 0.0001 0.8 288389
run 4 GMM 0.0001 0.9 288389
run 4 ward 0.0001 0 288389
run 4 ward 0.0001 0.3 288389
run 4 ward 0.0001 0.4 288389
run 4 ward 0.0001 0.5 288389
run 4 ward 0.0001 0.6 288389
run 4 ward 0.0001 0.7 288389
run 4 ward 0.0001 0.8 288389
run 4 ward 0.0001 0.9 288389
run 4 kmeans 0.0005 0 288389
run 4 kmeans 0.0005 0.3 288389
run 4 kmeans 0.0005 0.4 288389
run 4 kmeans 0.0005 0.5 288389
run 4 kmeans 0.0005 0.6 288389
run 4 kmeans 0.0005 0.7 288389
run 4 kmeans 0.0005 0.8 288389
run 4 kmeans 0.0005 0.9 288389
run 4 GMM 0.0005 0 288389
run 4 GMM 0.0005 0.3 288389
run 4 GMM 0.0005 0.4 288389
run 4 GMM 0.0005 0.5 288389
run 4 GMM 0.0005 0.6 288389
run 4 GMM 0.0005 0.7 288389
run 4 GMM 0.0005 0.8 288389
run 4 GMM 0.0005 0.9 288389
run 4 ward 0.0005 0 288389
run 4 ward 0.0005 0.3 288389
run 4 ward 0.0005 0.4 288389
run 4 ward 0.0005 0.5 288389
run 4 ward 0.0005 0.6 288389
run 4 ward 0.0005 0.7 288389
run 4 ward 0.0005 0.8 288389
run 4 ward 0.0005

Unnamed: 0,n_1,n_2,percent_matched_1,percent_matched_2,n_shared_genes_1,n_shared_genes_2,avg_bm_J_1,avg_bm_J_2,parameters,seed,run
0,54,47,0.185185,0.191489,32,29,0.378284,0.410512,bin=kmeans;pval=0.0001;clust=Louvain;m=0,670487,0
1,69,44,0.217391,0.363636,197,175,0.223249,0.210149,bin=kmeans;pval=0.0001;clust=Louvain;m=0.3,670487,0
2,47,45,0.297872,0.333333,155,148,0.229930,0.221145,bin=kmeans;pval=0.0001;clust=Louvain;m=0.4,670487,0
3,56,45,0.321429,0.400000,122,121,0.402653,0.403209,bin=kmeans;pval=0.0001;clust=Louvain;m=0.5,670487,0
4,61,49,0.295082,0.387755,96,105,0.400219,0.384806,bin=kmeans;pval=0.0001;clust=Louvain;m=0.6,670487,0
...,...,...,...,...,...,...,...,...,...,...,...
715,73,68,0.301370,0.323529,315,277,0.333160,0.332164,bin=ward;pval=0.05;clust=Louvain;m=0.5,288389,4
716,84,66,0.297619,0.393939,208,226,0.318742,0.315555,bin=ward;pval=0.05;clust=Louvain;m=0.6,288389,4
717,96,65,0.260417,0.415385,168,181,0.352025,0.331444,bin=ward;pval=0.05;clust=Louvain;m=0.7,288389,4
718,93,58,0.258065,0.379310,128,103,0.295633,0.317917,bin=ward;pval=0.05;clust=Louvain;m=0.8,288389,4


In [7]:
# Write every collected result list to its own tab-separated file ...
tables_to_save = {
    "UnPaSt_similarities.tsv": clustering_similarities,
    "UnPaSt_TCGA.tsv": subt_t,
    "UnPaSt_METABRIC.tsv": subt_m,
}
for out_path, rows in tables_to_save.items():
    pd.DataFrame.from_records(rows).to_csv(out_path,sep = "\t")

# ... and show the TCGA performance table as the cell output
pd.DataFrame.from_records(subt_t)

Unnamed: 0,Normal,Her2,Basal,LumB,LumA,Luminal,Claudin-low,HER2+,ER+/HER2- High Prolif,ER+/HER2- Low Prolif,...,IHC_PR,IHC_ER,IHC_HER2,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,parameters,seed,run,time
0,0.068826,0.477124,0.924623,0.489796,0.697974,0.853731,0.114833,0.798319,0.562925,0.681729,...,0.711823,0.759197,0.419689,0.777915,0.677229,0.690088,bin=kmeans;pval=0.0001;clust=Louvain;m=0,670487,0,0.017318
1,0.109589,0.477124,0.894009,0.427549,0.491577,0.853731,0.120172,0.798319,0.516245,0.454798,...,0.689655,0.785953,0.419689,0.773979,0.595739,0.693852,bin=kmeans;pval=0.0001;clust=Louvain;m=0.3,670487,0,0.022913
2,0.080000,0.477124,0.887850,0.390870,0.590741,0.832836,0.115385,0.798319,0.448071,0.500000,...,0.667488,0.782609,0.419689,0.758045,0.583580,0.683541,bin=kmeans;pval=0.0001;clust=Louvain;m=0.4,670487,0,0.026934
3,0.068433,0.477124,0.935961,0.430743,0.552083,0.868657,0.119816,0.798319,0.562998,0.496523,...,0.706897,0.795987,0.419689,0.789950,0.622867,0.705269,bin=kmeans;pval=0.0001;clust=Louvain;m=0.5,670487,0,0.025383
4,0.070938,0.477124,0.935961,0.450909,0.624809,0.910448,0.119816,0.798319,0.553655,0.529862,...,0.736453,0.799331,0.419689,0.817684,0.630484,0.718264,bin=kmeans;pval=0.0001;clust=Louvain;m=0.6,670487,0,0.023644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.094512,0.478261,0.949495,0.456929,0.658996,0.874627,0.119658,0.846154,0.468591,0.726841,...,0.677340,0.678930,0.392265,0.797095,0.668831,0.638758,bin=ward;pval=0.05;clust=Louvain;m=0.5,288389,4,0.029649
716,0.071759,0.492647,0.944444,0.436187,0.729630,0.925373,0.119048,0.854369,0.557099,0.737288,...,0.783251,0.762542,0.394444,0.830519,0.702066,0.718327,bin=ward;pval=0.05;clust=Louvain;m=0.6,288389,4,0.030696
717,0.086758,0.492647,0.923858,0.475954,0.663415,0.835821,0.117073,0.854369,0.587065,0.757812,...,0.719212,0.729097,0.394444,0.768019,0.714677,0.678627,bin=ward;pval=0.05;clust=Louvain;m=0.7,288389,4,0.033831
718,0.098413,0.480769,0.945274,0.438356,0.666667,0.767164,0.120930,0.780488,0.489871,0.661224,...,0.628079,0.745819,0.423469,0.725634,0.648662,0.652805,bin=ward;pval=0.05;clust=Louvain;m=0.8,288389,4,0.029343
