In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from run_desmond2 import run_DESMOND

In [2]:
from utils.eval import find_best_matches, make_known_groups

def match_known_subtypes(results, subtypes, annotation,exprs):
    """Find the best-matching bicluster for every known sample group.

    Known groups are assembled from five sources:
      * PAM50 classes (``make_known_groups`` on the "PAM50" column),
      * a merged "Luminal" group (union of PAM50 "LumA" and "LumB"),
      * "Claudin-low" samples (``subtypes['claudin_low'] == 1``),
      * SCMOD2 classes (``make_known_groups`` on the "SCMOD2" column),
      * IHC status: samples labelled "Positive" in each of the columns
        IHC_HER2, IHC_ER, IHC_PR and IHC_TNBC of ``annotation``.

    Parameters
    ----------
    results : biclustering output, passed through to ``find_best_matches``.
    subtypes : pandas.DataFrame
        Indexed by sample; must provide the "PAM50", "SCMOD2" and
        "claudin_low" columns.
    annotation : pandas.DataFrame
        Indexed by sample; must provide the four IHC_* columns.
    exprs : pandas.DataFrame
        Expression matrix whose columns are the samples under evaluation.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the ``find_best_matches`` outputs
        (called with FDR=0.05), one call per group source, in the order
        PAM50, Luminal, Claudin-low, SCMOD2, IHC.
    """
    all_samples = set(exprs.columns.values)

    pam50 = make_known_groups(subtypes, exprs,target_col = "PAM50",verbose=False)
    lum = {"Luminal": pam50["LumA"].union(pam50["LumB"])}
    scmod2 = make_known_groups(subtypes, exprs,target_col = 'SCMOD2',verbose=False)

    claudin_samples = set(subtypes.loc[subtypes['claudin_low']==1,:].index.values)
    claudin = {"Claudin-low": claudin_samples.intersection(all_samples)}

    ihc = {}
    for x in ["IHC_HER2","IHC_ER","IHC_PR","IHC_TNBC"]:
        positive = set(annotation.loc[annotation[x]=="Positive",:].index.values)
        # Restrict to samples present in `exprs`, exactly as done for the
        # Claudin-low group: annotated samples that are not part of the
        # evaluated universe must not count towards the known group.
        ihc[x] = positive.intersection(all_samples)

    known_groups = [pam50,lum,claudin,scmod2,ihc]
    best_matches = [
        find_best_matches(results,group,all_samples,FDR=0.05,verbose = False)
        for group in known_groups
    ]
    return pd.concat(best_matches, axis=0)


from utils.eval import find_best_matching_biclusters

def compare_gene_clusters(tcga_result,metabric_result, N):
    """Summarise how well two sets of biclusters agree on their genes.

    Best matches are computed in both directions (TCGA -> METABRIC and
    METABRIC -> TCGA) with ``find_best_matching_biclusters``. Rows with
    missing values are dropped and only matches with ``n_shared`` > 1 are
    kept, ordered by ``n_shared`` descending.

    Parameters
    ----------
    tcga_result, metabric_result : pandas.DataFrame
        One row per bicluster.
    N : total number of genes, forwarded to
        ``find_best_matching_biclusters``.

    Returns
    -------
    (clust_similarity, bm, bm2)
        ``clust_similarity`` is a dict; suffix ``_1`` refers to the
        TCGA -> METABRIC direction and ``_2`` to the reverse:
          * ``n_1`` / ``n_2``: number of biclusters in each input,
          * ``percent_matched_*``: retained matches divided by the number
            of biclusters in the query set (a fraction, not multiplied
            by 100),
          * ``n_shared_genes_*``: sum of ``n_shared`` over retained matches,
          * ``avg_bm_J_*``: mean of the ``J`` column over retained matches.
        ``bm`` and ``bm2`` are the retained match tables for the two
        directions.
    """
    def _retained_matches(query, reference):
        # Best match per query bicluster, minus incomplete rows and
        # matches that share at most one gene.
        matches = find_best_matching_biclusters(query, reference, N).dropna()
        shares_several = matches["n_shared"] > 1
        return matches.loc[shares_several, :].sort_values(by="n_shared", ascending=False)

    bm = _retained_matches(tcga_result, metabric_result)
    bm2 = _retained_matches(metabric_result, tcga_result)

    n_tcga = tcga_result.shape[0]
    n_metabric = metabric_result.shape[0]

    clust_similarity = {
        "n_1": n_tcga,
        "n_2": n_metabric,
        "percent_matched_1": bm.shape[0] / n_tcga,
        "percent_matched_2": bm2.shape[0] / n_metabric,
        "n_shared_genes_1": bm["n_shared"].sum(),
        "n_shared_genes_2": bm2["n_shared"].sum(),
        "avg_bm_J_1": bm["J"].mean(),
        "avg_bm_J_2": bm2["J"].mean(),
    }
    return clust_similarity, bm, bm2




In [3]:
# Directory handed to run_DESMOND (as `out_dir`) in the parameter sweep below.
out_dir= "results_on_real_data/"

# ---- TCGA-BRCA cohort ----
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
basename_t = "TCGA"
t_subtypes = pd.read_csv(
    "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv",
    sep="\t", index_col=0,
)
t_annotation = pd.read_csv(
    "data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv",
    sep="\t", index_col=0,
)

# ---- METABRIC cohort ----
exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
basename_m = "METABRIC" 
m_subtypes = pd.read_csv(
    "data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv",
    sep="\t", index_col=0,
)
m_annotation = pd.read_csv(
    "data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv",
    sep="\t", index_col=0,
)

# Expression matrices, with every value limited to the range [-3, 3].
# The same bound is passed to run_DESMOND as `ceiling = 3`.
# (Values are presumably z-scores, judging by the file names - TODO confirm.)
exprs_t = pd.read_csv(exprs_file_t, sep="\t", index_col=0).clip(lower=-3, upper=3)
exprs_m = pd.read_csv(exprs_file_m, sep="\t", index_col=0).clip(lower=-3, upper=3)


In [4]:
# Draw one integer seed per biclustering run. The generator itself is seeded
# with a fixed value so the same list comes out on every re-execution.
n_runs = 2
random.seed(42)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

generate  2  seeds [670487, 116739]


In [5]:
# Parameter grid for the sweep; the trailing comments list the wider grids
# that were used or considered.
pvals = [0.005,0.001]#[0.005,0.001, 0.0001, 0.0005,0.01,0.05]
bin_methods = ["kmeans"] #["kmeans","GMM","ward"] 

### Louvain 
modularities = [False]#[False,0.3,0.4,0.5,0.6,0.7,0.8,0.9]


def _run_and_match(exprs_file, basename, subtypes, annotation, exprs,
                   bin_method, pval, modularity, seed, params_dict):
    """Run (or load) DESMOND on one cohort and score it against known subtypes.

    Parameters
    ----------
    exprs_file, basename : forwarded to ``run_DESMOND``.
    subtypes, annotation, exprs : forwarded to ``match_known_subtypes``.
    bin_method, pval, modularity, seed : settings of this run.
    params_dict : dict describing the run; merged into the returned record.

    Returns
    -------
    (result, up, down, record, failed)
        ``result, up, down`` are the three values returned by
        ``run_DESMOND`` (all ``None`` on failure). ``record`` holds the
        ``J`` value per known group, the run parameters and the
        ``run_DESMOND`` wall time under "time"; on failure it holds only
        the run parameters. ``failed`` is True when any step raised.
    """
    try:
        t0 = time()
        result, up, down = run_DESMOND(exprs_file, basename, out_dir=out_dir,
                                       save=True, load = True,
                                       ceiling = 3,
                                       bin_method = bin_method, pval = pval,
                                       min_n_samples = 5,
                                       clust_method = "Louvain", cluster_binary=False,
                                       modularity=modularity,
                                       seed = seed,
                                       verbose = False, plot_all = False,
                                       merge = 1)
        elapsed = time()-t0
        # find the best matches between biclusters and known subtypes
        best_matches = match_known_subtypes(result, subtypes, annotation, exprs)
        record = best_matches["J"].to_dict()
        record.update(params_dict)
        record["time"] = elapsed
        return result, up, down, record, False
    except Exception as err:
        # Best-effort sweep: report the failure (including its cause) and
        # carry on. KeyboardInterrupt/SystemExit are deliberately not
        # caught, so the sweep can still be interrupted.
        print(basename + " biclustering failed with ", seed, pval, bin_method,
              repr(err), file = sys.stderr)
        # Copy so that every record owns its dict.
        return None, None, None, dict(params_dict), True


subt_t = []
subt_m = []
clustering_similarities = []
ud = []
for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for m in modularities:
                print("run",run,bin_method,pval,m,seed)

                # save parameters as a string
                params = "bin_method="+bin_method+";pval="+str(pval)
                params += ";clust_method="+"Louvain"+";modularity="+str(m)
                params_dict = {"parameters":params, "seed":seed,"run":run}

                ### running TCGA or reading results
                result_t, u, d, t_best_matches, t_failed = _run_and_match(
                    exprs_file_t, basename_t, t_subtypes, t_annotation, exprs_t,
                    bin_method, pval, m, seed, params_dict)
                subt_t.append(t_best_matches)

                ### running METABRIC or reading results
                result_m, u2, d2, m_best_matches, m_failed = _run_and_match(
                    exprs_file_m, basename_m, m_subtypes, m_annotation, exprs_m,
                    bin_method, pval, m, seed, params_dict)
                subt_m.append(m_best_matches)

                # compare clustering results; exactly one record is stored
                # per parameter combination, holding only the parameters
                # when either cohort failed
                if not (t_failed or m_failed):
                    N = exprs_m.shape[0]
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)

                ####  relevant only for DESMOND ####
                # Disabled: overlap of the UP / DOWN outputs between cohorts.
                # While this stays disabled, `ud` remains empty.
                #
                # o_u = set(u.columns).intersection(set(u2.columns))
                # union = set(u.columns).union(set(u2.columns))
                # j_u = len(o_u)/len(union)
                #
                # o_d = set(d.columns).intersection(set(d2.columns))
                # union = set(d.columns).union(set(d2.columns))
                # j_d = len(o_d)/len(union)
                #
                # ud.append({"J_UP":j_u,"overlap_UP":len(o_u),"J_DOWN":j_d,"overlap_DOWN":len(o_d),"parameters":params})
pd.DataFrame.from_records(clustering_similarities)

run 0 kmeans 0.005 False 670487
run 0 kmeans 0.001 False 670487
run 1 kmeans 0.005 False 116739
run 1 kmeans 0.001 False 116739


Unnamed: 0,n_1,n_2,percent_matched_1,percent_matched_2,n_shared_genes_1,n_shared_genes_2,avg_bm_J_1,avg_bm_J_2,parameters,seed,run
0,57,42,0.192982,0.238095,35,32,0.394043,0.427064,bin_method=kmeans;pval=0.005;clust_method=Louv...,670487,0
1,64,40,0.171875,0.25,34,31,0.368775,0.397758,bin_method=kmeans;pval=0.001;clust_method=Louv...,670487,0
2,62,41,0.16129,0.243902,32,31,0.38068,0.403128,bin_method=kmeans;pval=0.005;clust_method=Louv...,116739,1
3,64,41,0.140625,0.195122,31,28,0.404622,0.445331,bin_method=kmeans;pval=0.001;clust_method=Louv...,116739,1


In [6]:
# One row per TCGA run: J per known group, run parameters and run time.
pd.DataFrame.from_records(data=subt_t)

Unnamed: 0,Basal,LumA,Normal,Her2,LumB,Luminal,Claudin-low,ER-/HER2-,ER+/HER2- High Prolif,ER+/HER2- Low Prolif,HER2+,IHC_TNBC,IHC_PR,IHC_ER,IHC_HER2,parameters,seed,run,time
0,0.934673,0.723282,0.068826,0.477419,0.496622,0.889552,0.119048,0.837838,0.564189,0.729339,0.778689,0.0,0.763547,0.795987,0.420513,bin_method=kmeans;pval=0.005;clust_method=Louv...,670487,0,101.4634
1,0.934673,0.705993,0.068826,0.477124,0.489865,0.889552,0.119048,0.837838,0.564189,0.703629,0.798319,0.0,0.763547,0.782609,0.419689,bin_method=kmeans;pval=0.001;clust_method=Louv...,670487,0,111.525453
2,0.934673,0.72045,0.068826,0.471338,0.496644,0.889552,0.119048,0.837838,0.565436,0.705411,0.766129,0.0,0.763547,0.772575,0.416244,bin_method=kmeans;pval=0.005;clust_method=Louv...,116739,1,112.229571
3,0.934673,0.700952,0.068826,0.477124,0.490849,0.889552,0.119048,0.837838,0.565724,0.712215,0.798319,0.0,0.763547,0.799331,0.419689,bin_method=kmeans;pval=0.001;clust_method=Louv...,116739,1,107.616838


In [7]:
# One row per METABRIC run: J per known group, run parameters and run time.
pd.DataFrame.from_records(data=subt_m)

Unnamed: 0,Normal,Her2,Basal,LumA,LumB,Luminal,Claudin-low,ER-/HER2-,ER+/HER2- High Prolif,ER+/HER2- Low Prolif,HER2+,IHC_TNBC,IHC_PR,IHC_ER,IHC_HER2,parameters,seed,run,time
0,0.111486,0.468657,0.837121,0.547438,0.575439,0.881919,0.159574,0.662675,0.48552,0.609108,0.8,0.0,0.639106,0.957303,0.883268,bin_method=kmeans;pval=0.005;clust_method=Louv...,670487,0,79.371482
1,0.111486,0.468657,0.837121,0.548418,0.575439,0.881919,0.159574,0.659363,0.48552,0.616491,0.8,0.0,0.639106,0.952809,0.883268,bin_method=kmeans;pval=0.001;clust_method=Louv...,670487,0,69.258133
2,0.112245,0.468657,0.837121,0.552224,0.575439,0.835793,0.159574,0.662675,0.48552,0.619923,0.8,0.0,0.631285,0.957303,0.883268,bin_method=kmeans;pval=0.005;clust_method=Louv...,116739,1,80.011026
3,0.112245,0.468657,0.86692,0.542533,0.575439,0.95203,0.15625,0.672131,0.48552,0.60775,0.8,0.0,0.649162,0.997753,0.883268,bin_method=kmeans;pval=0.001;clust_method=Louv...,116739,1,78.364729


In [9]:
# NOTE(review): the only code that appends to `ud` is disabled in the sweep
# cell, so this displays an empty frame unless that block is re-enabled.
pd.DataFrame.from_records(data=ud)