In [1]:
import os, sys
import pandas as pd
import numpy as np
import random

from utils.method import read_bic_table, make_consensus_biclusters2,zscore, write_bic_table
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Run configuration for the TCGA-BRCA cohort.
#   in_dir     - directory the per-seed bicluster tables are read from
#   out_dir    - directory used when composing the consensus output file name
#   dataset    - cohort name; used as the file-name prefix of every table
#   exprs_file - tab-separated expression matrix loaded into `exprs`
in_dir = "results_on_real_data_tuned_v2/TCGA_optimized/"
out_dir = "results_on_real_data_tuned_v2/TCGA_optimized/"
dataset = 'TCGA-BRCA'
exprs_file = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"

# Alternative settings for the METABRIC cohort (disabled here; the METABRIC
# section further down assigns these same values).
#in_dir = "results_on_real_data_tuned_v2/METABRIC_optimized/"
#out_dir = "results_on_real_data_tuned_v2/METABRIC_optimized/"
#dataset = 'METABRIC'
#exprs_file = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"

# Common tail of every bicluster file name; it is appended after the
# ".seed=<seed>" / ".consensus_seed=<seed>" part when file names are built.
suffix  = ".bin=kmeans,pval=0.01,clust=WGCNA,direction=DOWN-UP,ds=3,dch=0.995,max_power=10,precluster=True.biclusters.tsv" 

In [3]:
# Load the expression matrix: tab-separated file, first column used as the row index.
exprs = pd.read_csv(
    exprs_file,
    sep="\t",
    index_col=0,
)

In [4]:
# Draw the per-run seeds from a single master seed so that the list is
# reproducible; these seeds are later used to locate the per-seed result files.
cseed = 42 # seed for consensus of 5 runs
n_runs = 5
random.seed(cseed)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [5]:
# Read the bicluster table produced for every seed and collect the tables in
# one list (one entry per independent run).
biclustering_results = []
for seed in seeds:
    seed_tag = str(seed)
    biclusters = read_bic_table(in_dir+dataset+".seed="+seed_tag+suffix)
    print("seed %s n biclusters: %s"%(seed,biclusters.shape[0]))
    # reindexing: every bicluster id gets the seed of its run as a prefix
    id_prefix = "seed="+seed_tag+"_"
    biclusters.index = [id_prefix+str(x) for x in biclusters.index.values]
    biclustering_results.append(biclusters)


seed 670487 n biclusters: 168
seed 116739 n biclusters: 197
seed 26225 n biclusters: 150
seed 777572 n biclusters: 134
seed 288389 n biclusters: 210


In [None]:
# Build consensus biclusters from the per-seed results. The expression matrix
# is passed through zscore() first; the consensus seed is the master seed.
# (The consensus procedure itself lives in utils.method.make_consensus_biclusters2.)
consensus_biclusters = make_consensus_biclusters2(
    biclustering_results,
    zscore(exprs),
    seed=cseed,
    verbose=True,
    plot=True,
)

In [7]:
# Compose the path of the consensus bicluster table.
# os.path.join is used instead of gluing "/" in by hand: out_dir already ends
# with a separator, so the manual concatenation produced a doubled one
# ("...optimized//<dataset>..."). join() inserts a separator only when needed.
outfname = os.path.join(out_dir, dataset+".consensus_seed="+str(cseed)+suffix)
print(outfname)
# Saving is currently disabled; uncomment to write the consensus table to outfname.
#write_bic_table(consensus_biclusters, outfname)

results_on_real_data_tuned_v2/TCGA_optimized//TCGA-BRCA.consensus_seed=42.bin=kmeans,pval=0.01,clust=WGCNA,direction=DOWN-UP,ds=3,dch=0.995,max_power=10,precluster=True.biclusters.tsv


# Evaluating performance

In [8]:
from utils.eval import make_ref_groups
from utils.eval import calculate_perfromance, compare_gene_clusters

In [9]:
# TCGA: load the expression matrix together with the subtype and annotation
# tables, then derive the reference sample groups used for evaluation.
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
subtypes_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv"
annotation_file_t = "data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv"

# all three tables are tab-separated with the first column as index
exprs_t = pd.read_csv(exprs_file_t, sep="\t", index_col=0)
t_subtypes = pd.read_csv(subtypes_file_t, sep="\t", index_col=0)
t_annotation = pd.read_csv(annotation_file_t, sep="\t", index_col=0)

known_groups_t, all_samples_t = make_ref_groups(t_subtypes, t_annotation, exprs_t)


In [11]:
# Score the consensus biclusters against the TCGA reference groups using the
# "ARI" measure; the call returns the scores and the best-matching biclusters.
performances, best_matches = calculate_perfromance(
    consensus_biclusters,
    known_groups_t,
    all_samples_t,
    performance_measure="ARI",
)
# display the per-classification / per-subtype scores
performances

PAM50          0.723389
Intrinsic      0.699051
PAM50_AB       0.472755
SCMOD2         0.485605
IHC            0.505737
Luminal        0.717999
Basal          0.938534
Her2           0.559805
LumA           0.465327
LumB           0.206061
Normal         0.089773
Claudin-low    0.115465
IHC_HER2       0.453328
IHC_ER         0.586139
IHC_PR         0.424782
IHC_TNBC       0.505250
NET_kmeans     0.770237
NET_ward       0.715634
dtype: float64

In [12]:
# Show the best-match table returned by calculate_perfromance in the previous cell.
best_matches

Unnamed: 0,bm_id,ARI,weight,adj_pval,is_enriched,samples,n_samples,classification
Basal,2,0.938534,0.180723,0.000335,True,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-C8-A12...",195,PAM50
Normal,21,0.089773,0.029657,0.000162,True,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-C8-A12...",193,PAM50
Her2,22,0.559805,0.100093,0.000148,True,"{TCGA-D8-A1XT-01, TCGA-A7-A2KD-01, TCGA-C8-A13...",118,PAM50
Luminal,11,0.717999,0.689527,0.0006,False,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-D8-A1X...",266,PAM50
Basal,2,0.938534,0.173488,0.000335,True,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-C8-A12...",195,Intrinsic
Normal,21,0.089773,0.02847,0.000162,True,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-C8-A12...",193,Intrinsic
Her2,22,0.559805,0.096085,0.000148,True,"{TCGA-D8-A1XT-01, TCGA-A7-A2KD-01, TCGA-C8-A13...",118,Intrinsic
Luminal,11,0.717999,0.661922,0.0006,False,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-D8-A1X...",266,Intrinsic
Claudin-low,10,0.115465,0.040036,0.001455,True,"{TCGA-B6-A3ZX-01, TCGA-D8-A1JL-01, TCGA-C8-A12...",133,Intrinsic
LumB,84,0.206061,0.294717,0.000556,True,"{TCGA-D8-A1J9-01, TCGA-A7-A26H-01, TCGA-AR-A24...",510,PAM50_AB


In [10]:
# NOTE(review): this cell repeats the preceding TCGA evaluation cell and its
# execution count is lower than that cell's, so the two recorded outputs come
# from different runs - confirm which one is current and drop the other.
performances, best_matches = calculate_perfromance(
    consensus_biclusters,
    known_groups_t,
    all_samples_t,
    performance_measure="ARI",
)
# display the per-classification / per-subtype scores
performances

PAM50          0.723435
Intrinsic      0.699095
PAM50_AB       0.473822
SCMOD2         0.484726
IHC            0.505737
Luminal        0.717999
Basal          0.938534
Her2           0.559805
LumA           0.467914
LumB           0.206061
Normal         0.091334
Claudin-low    0.115465
IHC_HER2       0.453328
IHC_ER         0.586139
IHC_PR         0.424782
IHC_TNBC       0.505250
NET_kmeans     0.770237
NET_ward       0.715634
dtype: float64

In [11]:
# Show the best-match table returned by calculate_perfromance in the previous cell.
best_matches

Unnamed: 0,bm_id,ARI,weight,adj_pval,is_enriched,samples,n_samples,classification
Normal,20,0.091334,0.029657,0.000137,True,"{TCGA-AR-A1AO-01, TCGA-AC-A62X-01, TCGA-S3-AA1...",191,PAM50
Her2,23,0.559805,0.100093,0.000149,True,"{TCGA-AQ-A0Y5-01, TCGA-C8-A26W-01, TCGA-A2-A3X...",118,PAM50
Basal,1,0.938534,0.180723,0.000337,True,"{TCGA-AC-A62X-01, TCGA-S3-AA15-01, TCGA-B6-A40...",195,PAM50
Luminal,10,0.717999,0.689527,0.000602,False,"{TCGA-AR-A1AO-01, TCGA-A2-A4S1-01, TCGA-S3-AA1...",266,PAM50
Normal,20,0.091334,0.02847,0.000137,True,"{TCGA-AR-A1AO-01, TCGA-AC-A62X-01, TCGA-S3-AA1...",191,Intrinsic
Her2,23,0.559805,0.096085,0.000149,True,"{TCGA-AQ-A0Y5-01, TCGA-C8-A26W-01, TCGA-A2-A3X...",118,Intrinsic
Basal,1,0.938534,0.173488,0.000337,True,"{TCGA-AC-A62X-01, TCGA-S3-AA15-01, TCGA-B6-A40...",195,Intrinsic
Luminal,10,0.717999,0.661922,0.000602,False,"{TCGA-AR-A1AO-01, TCGA-A2-A4S1-01, TCGA-S3-AA1...",266,Intrinsic
Claudin-low,9,0.115465,0.040036,0.001462,True,"{TCGA-A2-A4S1-01, TCGA-A7-A3J0-01, TCGA-A2-A3X...",133,Intrinsic
Normal,20,0.091334,0.029657,0.000137,True,"{TCGA-AR-A1AO-01, TCGA-AC-A62X-01, TCGA-S3-AA1...",191,PAM50_AB


# METABRIC

In [13]:
# Run configuration for the METABRIC cohort; these assignments replace the
# TCGA-BRCA values set at the top of the notebook.
#   in_dir     - directory the per-seed bicluster tables are read from
#   out_dir    - output directory (assigned here; not read by the cells shown below)
#   dataset    - cohort name; used as the file-name prefix of every table
#   exprs_file - tab-separated expression matrix loaded into `exprs`
in_dir = "results_on_real_data_tuned_v2/METABRIC_optimized/"
out_dir = "results_on_real_data_tuned_v2/METABRIC_optimized/"
dataset = 'METABRIC'
exprs_file = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"

# Common tail of every bicluster file name (same value as in the TCGA section).
suffix  = ".bin=kmeans,pval=0.01,clust=WGCNA,direction=DOWN-UP,ds=3,dch=0.995,max_power=10,precluster=True.biclusters.tsv" 

In [14]:
# Load the expression matrix: tab-separated file, first column used as the row index.
exprs = pd.read_csv(
    exprs_file,
    sep="\t",
    index_col=0,
)

# Draw the per-run seeds from a single master seed so the list is reproducible.
cseed = 42 # seed for consensus of 5 runs
n_runs = 5
random.seed(cseed)
seeds = [random.randint(0,1000000) for _ in range(n_runs)]
print("generate ",n_runs," seeds",seeds)

# reading biclusters detected in independent runs
biclustering_results = []
for seed in seeds:
    seed_tag = str(seed)
    biclusters = read_bic_table(in_dir+dataset+".seed="+seed_tag+suffix)
    print("seed %s n biclusters: %s"%(seed,biclusters.shape[0]))
    # reindexing: every bicluster id gets the seed of its run as a prefix
    id_prefix = "seed="+seed_tag+"_"
    biclusters.index = [id_prefix+str(x) for x in biclusters.index.values]
    biclustering_results.append(biclusters)



generate  5  seeds [670487, 116739, 26225, 777572, 288389]
seed 670487 n biclusters: 168
seed 116739 n biclusters: 164
seed 26225 n biclusters: 159
seed 777572 n biclusters: 161
seed 288389 n biclusters: 154


In [None]:
# Build consensus biclusters from the per-seed METABRIC results. The expression
# matrix is passed through zscore() first; the consensus seed is the master seed.
# (The consensus procedure itself lives in utils.method.make_consensus_biclusters2.)
consensus_biclusters = make_consensus_biclusters2(
    biclustering_results,
    zscore(exprs),
    seed=cseed,
    verbose=True,
    plot=True,
)


In [16]:
# METABRIC: load the expression matrix together with the subtype and annotation
# tables, then derive the reference sample groups used for evaluation.
exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
subtypes_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv"
annotation_file_m = "data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv"

# all three tables are tab-separated with the first column as index
exprs_m = pd.read_csv(exprs_file_m, sep="\t", index_col=0)
m_subtypes = pd.read_csv(subtypes_file_m, sep="\t", index_col=0)
m_annotation = pd.read_csv(annotation_file_m, sep="\t", index_col=0)

known_groups_m, all_samples_m = make_ref_groups(m_subtypes, m_annotation, exprs_m)

In [17]:
# Score the consensus biclusters against the METABRIC reference groups using
# the "ARI" measure; the call returns the scores and the best-matching biclusters.
performances, best_matches = calculate_perfromance(
    consensus_biclusters,
    known_groups_m,
    all_samples_m,
    performance_measure="ARI",
)
# display the per-classification / per-subtype scores
performances

PAM50          0.754447
Intrinsic      0.729983
PAM50_AB       0.334729
SCMOD2         0.399722
IHC            0.624434
Luminal        0.792972
Basal          0.891643
Her2           0.534394
LumA           0.239427
LumB           0.180247
Normal         0.157154
Claudin-low    0.188365
IHC_HER2       0.877487
IHC_ER         0.810650
IHC_PR         0.269513
IHC_TNBC       0.713750
NET_kmeans     0.891851
NET_ward       0.747780
dtype: float64

In [18]:
# Show the best-match table returned by calculate_perfromance in the previous cell.
best_matches

Unnamed: 0,bm_id,ARI,weight,adj_pval,is_enriched,samples,n_samples,classification
Basal,0,0.891643,0.128151,0.000464,True,"{MB-5295, MB-4679, MB-5070, MB-0901, MB-3046, ...",255,PAM50
Normal,73,0.157154,0.028361,7.8e-05,True,"{MB-0534, MB-5295, MB-0901, MB-5070, MB-3046, ...",303,PAM50
Her2,14,0.534394,0.128151,0.00031,True,"{MB-2984, MB-2823, MB-6114, MB-5019, MB-5199, ...",245,PAM50
Luminal,7,0.792972,0.715336,0.000673,False,"{MB-5052, MB-7079, MB-7256, MB-6063, MB-0350, ...",520,PAM50
Basal,0,0.891643,0.122613,0.000464,True,"{MB-5295, MB-4679, MB-5070, MB-0901, MB-3046, ...",255,Intrinsic
Normal,73,0.157154,0.027136,7.8e-05,True,"{MB-0534, MB-5295, MB-0901, MB-5070, MB-3046, ...",303,Intrinsic
Her2,14,0.534394,0.122613,0.00031,True,"{MB-2984, MB-2823, MB-6114, MB-5019, MB-5199, ...",245,Intrinsic
Luminal,7,0.792972,0.684422,0.000673,False,"{MB-5052, MB-7079, MB-7256, MB-6063, MB-0350, ...",520,Intrinsic
Claudin-low,0,0.188365,0.043216,0.00011,True,"{MB-5295, MB-4679, MB-5070, MB-0901, MB-3046, ...",255,Intrinsic
LumA,57,0.239427,0.314076,0.000555,True,"{MB-0534, MB-0899, MB-5451, MB-0192, MB-0345, ...",466,PAM50_AB
