In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from run_desmond2 import run_DESMOND

In [2]:
# Draw one integer seed per repetition from a fixed master seed (42), so the
# same list of seeds is regenerated every time this cell runs.
n_runs = 5
random.seed(42)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [3]:
# Directory handed to run_DESMOND as `out_dir` by the benchmark cell.
out_dir= "results_on_real_data/"

# --- TCGA-BRCA cohort: expression file, subtype table, sample annotation ---
basename_t = "TCGA"
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
t_subtypes = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
t_annotation = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv",sep = "\t",index_col=0)

# --- METABRIC cohort: expression file, subtype table, sample annotation ---
basename_m = "METABRIC"
exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
m_subtypes = pd.read_csv("data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
m_annotation = pd.read_csv("data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv",sep = "\t",index_col=0)

# Read both expression matrices and cap every value to the range [-3, 3]
# (the same bound that is passed to run_DESMOND as `ceiling = 3`).
# Presumably the values are z-scores, judging by the file names - TODO confirm.
exprs_t = pd.read_csv(exprs_file_t,sep = "\t",index_col=0).clip(lower=-3, upper=3)
exprs_m = pd.read_csv(exprs_file_m,sep = "\t",index_col=0).clip(lower=-3, upper=3)


In [4]:
from utils.eval import find_best_matches , make_known_groups

def match_known_subtypes(results, subtypes, annotation,exprs):
    """Match biclustering results against several collections of known sample groups.

    The known groups are built from:
      * the "PAM50" column of `subtypes` (via make_known_groups),
      * "Luminal" = union of the PAM50 "LumA" and "LumB" groups,
      * "Claudin-low" = samples with `claudin_low == 1` in `subtypes`,
      * the "SCMOD2" column of `subtypes` (via make_known_groups),
      * samples labelled "Positive" in each of the IHC_* columns of `annotation`.

    Parameters
    ----------
    results : biclustering output, passed unchanged to find_best_matches.
    subtypes : DataFrame indexed by sample, with "PAM50", "SCMOD2" and
        "claudin_low" columns.
    annotation : DataFrame indexed by sample, with "IHC_HER2", "IHC_ER",
        "IHC_PR" and "IHC_TNBC" columns.
    exprs : expression DataFrame; its columns define the set of all samples.

    Returns
    -------
    The row-wise concatenation (pd.concat, axis=0) of what find_best_matches
    returns for each collection of groups.
    """
    all_samples = set(exprs.columns.values)

    pam50 = make_known_groups(subtypes, exprs,target_col = "PAM50",verbose=False)
    lum = {"Luminal": pam50["LumA"].union(pam50["LumB"])}
    scmod2 = make_known_groups(subtypes, exprs,target_col = 'SCMOD2',verbose=False)

    claudin_ids = subtypes.loc[subtypes['claudin_low']==1,:].index.values
    claudin = {"Claudin-low": set(claudin_ids).intersection(all_samples)}

    ihc = {}
    for marker in ["IHC_HER2","IHC_ER","IHC_PR","IHC_TNBC"]:
        positive_ids = annotation.loc[annotation[marker]=="Positive",:].index.values
        # Restrict to profiled samples, exactly as done for Claudin-low above:
        # `all_samples` is handed to find_best_matches as the sample universe,
        # so a known group must not contain samples outside of it.
        ihc[marker] = set(positive_ids).intersection(all_samples)
    # NOTE(review): the saved outputs of this notebook show J = 0.0 for
    # IHC_TNBC in both cohorts - confirm that this column really encodes
    # positives as the string "Positive".

    known_groups = [pam50,lum,claudin,scmod2,ihc]
    best_matches = [
        find_best_matches(results,group,all_samples,FDR=0.05,verbose = False)
        for group in known_groups
    ]
    return pd.concat(best_matches, axis=0)


from utils.eval import find_best_matching_biclusters

def compare_gene_clusters(tcga_result,metabric_result, N):
    """Summarise the agreement between two sets of biclusters, in both directions.

    Best matches are searched TCGA -> METABRIC (suffix "_1") and
    METABRIC -> TCGA (suffix "_2") with find_best_matching_biclusters.
    Rows containing NaN and matches with `n_shared` <= 1 are discarded; the
    remaining matches are sorted by `n_shared`, largest first.

    Parameters
    ----------
    tcga_result, metabric_result : biclustering results (one row per bicluster).
    N : total number of genes, forwarded to find_best_matching_biclusters.

    Returns
    -------
    (clust_similarity, bm, bm2) where `clust_similarity` is a dict holding,
    per direction, the fraction of biclusters that kept a match
    ("percent_matched_*"), the sum of `n_shared` over kept matches
    ("n_shared_genes_*") and the mean "J" of kept matches ("avg_bm_J_*");
    `bm` and `bm2` are the filtered best-match tables.
    """
    def _kept_matches(matches):
        # drop incomplete rows, require more than one shared gene, biggest overlap first
        matches = matches.dropna()
        return matches.loc[matches["n_shared"]>1,:].sort_values(by="n_shared",ascending = False)

    bm = _kept_matches(find_best_matching_biclusters(tcga_result,metabric_result, N))
    bm2 = _kept_matches(find_best_matching_biclusters(metabric_result, tcga_result, N))

    clust_similarity = {
        # share of biclusters in each input that retained a match
        "percent_matched_1": bm.shape[0]/tcga_result.shape[0],
        "percent_matched_2": bm2.shape[0]/metabric_result.shape[0],
        # total number of shared genes over the retained matches
        "n_shared_genes_1": bm.loc[:,"n_shared"].sum(),
        "n_shared_genes_2": bm2.loc[:,"n_shared"].sum(),
        # average J index of the retained matches
        "avg_bm_J_1": bm.loc[:,"J"].mean(),
        "avg_bm_J_2": bm2.loc[:,"J"].mean(),
    }

    return clust_similarity, bm, bm2




In [None]:
pvals = [0.005,0.001] # [0.0001, 0.0005,0.001,0.005,0.01,0.05]
bin_methods = ["kmeans","GMM","ward"]

### Louvain
modularities = [False,0.3,0.4,0.5,0.6,0.7,0.8,0.9]


def _run_and_match(exprs_file, basename, subtypes, annotation, exprs,
                   bin_method, pval, seed, params):
    """Run DESMOND on one cohort (or load saved results) and score it against known subtypes.

    Returns (result, up, down, best_matches): the three values returned by
    run_DESMOND, plus the "J" column of match_known_subtypes() extended with
    a "parameters" entry and a "time" entry (seconds spent in run_DESMOND).
    Any exception raised by run_DESMOND or by the matching step propagates.
    """
    t0 = time()
    result, up, down = run_DESMOND(exprs_file, basename, out_dir=out_dir,
                                   save=True, load = True,
                                   ceiling = 3,
                                   bin_method = bin_method, pval = pval,
                                   min_n_samples = 5,
                                   clust_method = "Louvain", cluster_binary=False,
                                   seed = seed,
                                   verbose = False, plot_all = False,
                                   merge = 1)
    elapsed = time()-t0
    best_matches = match_known_subtypes(result, subtypes, annotation, exprs)["J"]
    best_matches["parameters"] = params
    best_matches["time"] = elapsed
    return result, up, down, best_matches


def _column_jaccard(table_1, table_2):
    """Return (Jaccard index, overlap size) for the column labels of two tables.

    The Jaccard index is NaN when neither table has any column.
    """
    cols_1, cols_2 = set(table_1.columns), set(table_2.columns)
    shared = cols_1 & cols_2
    union = cols_1 | cols_2
    jaccard = len(shared)/len(union) if union else float("nan")
    return jaccard, len(shared)


# One entry is appended to each list per parameter combination, so the four
# lists stay aligned row by row.
subt_t = []
subt_m = []
clustering_similarities = []
ud = []
for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for m in modularities:
                print("run",run,bin_method,pval,m,seed)

                # save parameters as a string
                params = "bin_method="+bin_method+";pval="+str(pval)
                params += ";clust_method="+"Louvain"+";modularity="+str(m)
                # NOTE(review): `m` is only written into `params`; it is never
                # passed to run_DESMOND, so every modularity value repeats the
                # same call. Confirm the intended keyword and pass it through.

                ### running TCGA or reading results
                try:
                    result_t, u, d, t_best_matches = _run_and_match(
                        exprs_file_t, basename_t, t_subtypes, t_annotation, exprs_t,
                        bin_method, pval, seed, params)
                    t_failed = False
                    subt_t.append(t_best_matches)
                except Exception as err:
                    # Best effort: log the cause and continue with the next run.
                    # `Exception` (not a bare except) lets Ctrl-C stop the loop.
                    print("TCGA biclustering failed with ",seed,  pval,bin_method, repr(err) ,file = sys.stderr)
                    t_failed = True
                    subt_t.append({})

                ### running METABRIC or reading results
                try:
                    result_m, u2, d2, m_best_matches = _run_and_match(
                        exprs_file_m, basename_m, m_subtypes, m_annotation, exprs_m,
                        bin_method, pval, seed, params)
                    m_failed = False
                    subt_m.append(m_best_matches)
                except Exception as err:
                    print("METABRIC biclustering failed with ",seed,  pval,bin_method, repr(err) ,file = sys.stderr)
                    m_failed = True
                    subt_m.append({})

                if t_failed or m_failed:
                    # Nothing to compare: result_*/u/d/u2/d2 are either undefined
                    # or left over from an earlier iteration, so do not use them.
                    clustering_similarities.append({})
                    ud.append({"parameters":params})
                    continue

                # compare clustering results
                N = exprs_m.shape[0]
                clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                clustering_similarities.append(clust_sim)

                ####  relevant only for DESMOND ####
                # overlap between the column labels of the 2nd (u) and 3rd (d)
                # outputs of run_DESMOND for the two cohorts
                j_u, n_u = _column_jaccard(u, u2)
                j_d, n_d = _column_jaccard(d, d2)

                ud.append({"J_UP":j_u,"overlap_UP":n_u,"J_DOWN":j_d,"overlap_DOWN":n_d,"parameters":params})
pd.DataFrame.from_records(clustering_similarities)

run 0 kmeans 0.005 False 670487


results_on_real_data//TCGA.kmeans.seed=670487.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.kmeans.seed=670487.binarized.tsv file not found and will be created


run 0 kmeans 0.005 0.3 670487
run 0 kmeans 0.005 0.4 670487
run 0 kmeans 0.005 0.5 670487
run 0 kmeans 0.005 0.6 670487
run 0 kmeans 0.005 0.7 670487
run 0 kmeans 0.005 0.8 670487
run 0 kmeans 0.005 0.9 670487
run 0 GMM 0.005 False 670487


results_on_real_data//TCGA.GMM.seed=670487.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.GMM.seed=670487.binarized.tsv file not found and will be created


run 0 GMM 0.005 0.3 670487
run 0 GMM 0.005 0.4 670487
run 0 GMM 0.005 0.5 670487
run 0 GMM 0.005 0.6 670487
run 0 GMM 0.005 0.7 670487
run 0 GMM 0.005 0.8 670487
run 0 GMM 0.005 0.9 670487
run 0 ward 0.005 False 670487
run 0 ward 0.005 0.3 670487
run 0 ward 0.005 0.4 670487
run 0 ward 0.005 0.5 670487
run 0 ward 0.005 0.6 670487
run 0 ward 0.005 0.7 670487
run 0 ward 0.005 0.8 670487
run 0 ward 0.005 0.9 670487
run 0 kmeans 0.001 False 670487
run 0 kmeans 0.001 0.3 670487
run 0 kmeans 0.001 0.4 670487
run 0 kmeans 0.001 0.5 670487
run 0 kmeans 0.001 0.6 670487
run 0 kmeans 0.001 0.7 670487
run 0 kmeans 0.001 0.8 670487
run 0 kmeans 0.001 0.9 670487
run 0 GMM 0.001 False 670487
run 0 GMM 0.001 0.3 670487
run 0 GMM 0.001 0.4 670487
run 0 GMM 0.001 0.5 670487
run 0 GMM 0.001 0.6 670487
run 0 GMM 0.001 0.7 670487
run 0 GMM 0.001 0.8 670487
run 0 GMM 0.001 0.9 670487
run 0 ward 0.001 False 670487
run 0 ward 0.001 0.3 670487
run 0 ward 0.001 0.4 670487
run 0 ward 0.001 0.5 670487
run 0 ward 

results_on_real_data//TCGA.kmeans.seed=116739.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.kmeans.seed=116739.binarized.tsv file not found and will be created


run 1 kmeans 0.005 0.3 116739
run 1 kmeans 0.005 0.4 116739
run 1 kmeans 0.005 0.5 116739
run 1 kmeans 0.005 0.6 116739
run 1 kmeans 0.005 0.7 116739
run 1 kmeans 0.005 0.8 116739
run 1 kmeans 0.005 0.9 116739
run 1 GMM 0.005 False 116739


results_on_real_data//TCGA.GMM.seed=116739.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.GMM.seed=116739.binarized.tsv file not found and will be created


run 1 GMM 0.005 0.3 116739
run 1 GMM 0.005 0.4 116739
run 1 GMM 0.005 0.5 116739
run 1 GMM 0.005 0.6 116739
run 1 GMM 0.005 0.7 116739
run 1 GMM 0.005 0.8 116739
run 1 GMM 0.005 0.9 116739
run 1 ward 0.005 False 116739
run 1 ward 0.005 0.3 116739
run 1 ward 0.005 0.4 116739
run 1 ward 0.005 0.5 116739
run 1 ward 0.005 0.6 116739
run 1 ward 0.005 0.7 116739
run 1 ward 0.005 0.8 116739
run 1 ward 0.005 0.9 116739
run 1 kmeans 0.001 False 116739
run 1 kmeans 0.001 0.3 116739
run 1 kmeans 0.001 0.4 116739
run 1 kmeans 0.001 0.5 116739
run 1 kmeans 0.001 0.6 116739
run 1 kmeans 0.001 0.7 116739
run 1 kmeans 0.001 0.8 116739
run 1 kmeans 0.001 0.9 116739
run 1 GMM 0.001 False 116739
run 1 GMM 0.001 0.3 116739
run 1 GMM 0.001 0.4 116739
run 1 GMM 0.001 0.5 116739
run 1 GMM 0.001 0.6 116739
run 1 GMM 0.001 0.7 116739
run 1 GMM 0.001 0.8 116739
run 1 GMM 0.001 0.9 116739
run 1 ward 0.001 False 116739
run 1 ward 0.001 0.3 116739
run 1 ward 0.001 0.4 116739
run 1 ward 0.001 0.5 116739
run 1 ward 

results_on_real_data//TCGA.kmeans.seed=26225.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.kmeans.seed=26225.binarized.tsv file not found and will be created


run 2 kmeans 0.005 0.3 26225
run 2 kmeans 0.005 0.4 26225
run 2 kmeans 0.005 0.5 26225
run 2 kmeans 0.005 0.6 26225
run 2 kmeans 0.005 0.7 26225
run 2 kmeans 0.005 0.8 26225
run 2 kmeans 0.005 0.9 26225
run 2 GMM 0.005 False 26225


results_on_real_data//TCGA.GMM.seed=26225.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.GMM.seed=26225.binarized.tsv file not found and will be created


run 2 GMM 0.005 0.3 26225
run 2 GMM 0.005 0.4 26225
run 2 GMM 0.005 0.5 26225
run 2 GMM 0.005 0.6 26225
run 2 GMM 0.005 0.7 26225
run 2 GMM 0.005 0.8 26225
run 2 GMM 0.005 0.9 26225
run 2 ward 0.005 False 26225


results_on_real_data//TCGA.ward.seed=26225.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.ward.seed=26225.binarized.tsv file not found and will be created


run 2 ward 0.005 0.3 26225
run 2 ward 0.005 0.4 26225
run 2 ward 0.005 0.5 26225
run 2 ward 0.005 0.6 26225
run 2 ward 0.005 0.7 26225
run 2 ward 0.005 0.8 26225
run 2 ward 0.005 0.9 26225
run 2 kmeans 0.001 False 26225
run 2 kmeans 0.001 0.3 26225
run 2 kmeans 0.001 0.4 26225
run 2 kmeans 0.001 0.5 26225
run 2 kmeans 0.001 0.6 26225
run 2 kmeans 0.001 0.7 26225
run 2 kmeans 0.001 0.8 26225
run 2 kmeans 0.001 0.9 26225
run 2 GMM 0.001 False 26225
run 2 GMM 0.001 0.3 26225
run 2 GMM 0.001 0.4 26225
run 2 GMM 0.001 0.5 26225
run 2 GMM 0.001 0.6 26225
run 2 GMM 0.001 0.7 26225
run 2 GMM 0.001 0.8 26225
run 2 GMM 0.001 0.9 26225
run 2 ward 0.001 False 26225
run 2 ward 0.001 0.3 26225
run 2 ward 0.001 0.4 26225
run 2 ward 0.001 0.5 26225
run 2 ward 0.001 0.6 26225
run 2 ward 0.001 0.7 26225
run 2 ward 0.001 0.8 26225
run 2 ward 0.001 0.9 26225
run 3 kmeans 0.005 False 777572


results_on_real_data//TCGA.kmeans.seed=777572.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.kmeans.seed=777572.binarized.tsv file not found and will be created


run 3 kmeans 0.005 0.3 777572
run 3 kmeans 0.005 0.4 777572
run 3 kmeans 0.005 0.5 777572
run 3 kmeans 0.005 0.6 777572
run 3 kmeans 0.005 0.7 777572
run 3 kmeans 0.005 0.8 777572
run 3 kmeans 0.005 0.9 777572
run 3 GMM 0.005 False 777572


results_on_real_data//TCGA.GMM.seed=777572.binarized.tsv file not found and will be created
  return (np.mean(ar1) - np.mean(ar2)) / (np.std(ar1) + np.std(ar2))
results_on_real_data//METABRIC.GMM.seed=777572.binarized.tsv file not found and will be created


run 3 GMM 0.005 0.3 777572


In [7]:
# Tabulate the TCGA entries collected in `subt_t` by the benchmark loop: one
# record per parameter combination, holding the best-match "J" per known group
# plus the "parameters" string and the run "time" (failed runs were stored as {}).
pd.DataFrame.from_records(subt_t)

known_group,Normal,Her2,Basal,LumB,LumA,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,ER+/HER2- High Prolif,IHC_TNBC,IHC_PR,IHC_ER,IHC_HER2,parameters,time
0,0.102389,0.48227,0.913706,0.410935,0.653465,0.925373,0.107317,0.858491,0.809955,0.566007,0.465608,0.0,0.743842,0.732441,0.396739,bin_method=ward;pval=0.005;clust_method=Louvai...,101.749673
1,0.106762,0.48227,0.93401,0.388889,0.477795,0.802985,0.110577,0.858491,0.828054,0.432735,0.421365,0.0,0.6133,0.749164,0.396739,bin_method=ward;pval=0.005;clust_method=Louvai...,102.786896


In [8]:
# Tabulate the METABRIC entries collected in `subt_m` by the benchmark loop: one
# record per parameter combination, holding the best-match "J" per known group
# plus the "parameters" string and the run "time" (failed runs were stored as {}).
pd.DataFrame.from_records(subt_m)

known_group,Basal,Her2,LumB,Normal,LumA,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,ER+/HER2- High Prolif,IHC_TNBC,IHC_PR,IHC_ER,IHC_HER2,parameters,time
0,0.865613,0.477204,0.513514,0.115108,0.496988,0.869004,0.141748,0.790323,0.690058,0.536481,0.478764,0.0,0.61676,0.952809,0.852713,bin_method=ward;pval=0.005;clust_method=Louvai...,81.397651
1,0.853755,0.477612,0.513514,0.180488,0.557692,0.902214,0.181034,0.783465,0.681818,0.618421,0.478764,0.0,0.724022,0.92809,0.810409,bin_method=ward;pval=0.005;clust_method=Louvai...,90.358938


In [9]:
# Tabulate the TCGA-vs-METABRIC overlap records collected in `ud`
# (J_UP / overlap_UP / J_DOWN / overlap_DOWN / parameters).
# NOTE(review): the saved output of this cell looks stale - its overlap columns
# show gene sets and there is no "parameters" column, whereas the loop stores
# overlap counts and a "parameters" string. Re-run to refresh.
pd.DataFrame.from_records(ud)

Unnamed: 0,J_UP,overlap_UP,J_DOWN,overlap_DOWN
0,0.129538,"{COX7B2, SH3GL3, SRPK3, RFWD3, RGS22, GNAT1, M...",0.094323,"{CERS6, ZNF439, CAVIN1, PIP, ANKRD30A, MT1E, S..."
1,0.129357,"{COX7B2, SH3GL3, SRPK3, RFWD3, RGS22, GNAT1, M...",0.094209,"{CERS6, ZNF439, CAVIN1, PIP, ANKRD30A, MT1E, S..."
