In [1]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from utils.method import read_bic_table

from utils.eval import find_best_matches, make_known_groups, make_ref_groups
from utils.eval import calculate_perfromance, compare_gene_clusters

In [4]:
classifications={"Intrinsic":["Luminal","Basal","Her2","Normal","Claudin-low"],
                "SCMOD2":["ER-/HER2-","ER+/HER2- Low Prolif","ER+/HER2- High Prolif","HER2+"],
                "IHC":["IHC_TNBC","IHC_ER","IHC_HER2","IHC_PR"]}

exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
basename_t = "TCGA"

exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
basename_m = "METABRIC" 

m_subtypes = pd.read_csv("data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
m_annotation = pd.read_csv("data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv",sep = "\t",index_col=0)

t_subtypes = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
t_annotation = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv",sep = "\t",index_col=0)


exprs_t= pd.read_csv(exprs_file_t,sep = "\t",index_col=0)
exprs_t[exprs_t>3] = 3
exprs_t[exprs_t<-3] = -3

exprs_m= pd.read_csv(exprs_file_m,sep = "\t",index_col=0)
exprs_m[exprs_m>3] = 3
exprs_m[exprs_m<-3] = -3


known_groups_t, freqs_t = make_ref_groups(t_subtypes, t_annotation,exprs_t)
known_groups_m, freqs_m = make_ref_groups(m_subtypes, m_annotation,exprs_m)

In [5]:
n_runs = 5
seeds = []
random.seed(42)
for i in range(n_runs):
    seeds.append(random.randint(0,1000000))
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [6]:
subt_t = []
subt_m = []
clustering_similarities = []

pvals = [0.0001, 0.0005,0.005,0.001,0.01,0.05]
bin_methods = ["kmeans","GMM","ward"] 

In [8]:
from run_desmond import run_DESMOND

In [9]:
### Louvain 
out_dir= "results_on_real_data/"
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

subt_t = []
subt_m = []
clustering_similarities = []
for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for m in modularities:
                #print("run",run,bin_method,pval,m,seed)
                
                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"Louvain"+";m="+str(m)
                params_dict = {"parameters":params, "seed":seed,"run":run}
                
                ### running TCGA or reading results
                try:
                    t0 = time()
                    fname = out_dir+basename_t+".seed="+str(seed)+\
                    ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",m="+str(m)+".biclusters.tsv"
                    result_t = read_bic_table(fname)
                    """result_t = run_DESMOND(exprs_file_t, basename_t, out_dir=out_dir,
                                                save=True, load = True,
                                                ceiling = 3,
                                                min_n_samples = 5,
                                                bin_method = bin_method, pval = pval,
                                                clust_method = "Louvain",
                                                similarity_cutoffs = similarity_cutoffs,
                                                seed = seed,
                                                verbose = False)"""
                    time_t = time()-t0
                    # find the best matches between TCGA biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_t = calculate_perfromance(result_t, known_groups_t,
                                                          freqs_t, set(exprs_t.columns.values),
                                                          classifications=classifications)
                    performance_t.update(params_dict)
                    performance_t["time"] = time_t
                    subt_t.append(performance_t)
                    t_failed = False
                except:
                    print("TCGA biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                    print(fname)
                    t_failed = True
                    subt_t.append({params_dict})
                
                ### running METABRIC or reading results
                try:
                    t0 = time()
                    fname = out_dir+basename_m+".seed="+str(seed)+\
                    ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",m="+str(m)+".biclusters.tsv"
                    result_m = read_bic_table(fname)
                    """result_m = run_DESMOND(exprs_file_m, basename_m, out_dir=out_dir,
                                                save=True, load = True,
                                                ceiling = 3,
                                                min_n_samples = 5,
                                                bin_method = bin_method, pval = pval,
                                                clust_method = "Louvain",
                                                similarity_cutoffs = similarity_cutoffs,
                                                seed = seed,
                                                verbose = False)"""
                    time_m = time()-t0
                    # find the best matches between METABRIC biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_m = calculate_perfromance(result_m, known_groups_m,
                                                          freqs_m, set(exprs_m.columns.values),
                                                          classifications=classifications)
                    performance_m.update(params_dict)
                    performance_m["time"] = time_m
                    subt_m.append(performance_m)
                    m_failed = False
                except:
                    print("METABRIC biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                    print(fname)
                    m_failed = True
                    subt_m.append(params_dict)
                    
                # compare clustering results - only if gene sets are defined for each cluster
                if not (t_failed or m_failed): 
                    N = exprs_m.shape[0]
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)                    
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)
                               

In [10]:
out_dir = "results_on_real_data_WGCNA/"
### WGCNA
ds_values = [0,1,2,3,4]
dch = 0.995

for run in range(n_runs):
    seed = seeds[run]
    for pval in pvals:
        for bin_method in bin_methods:
            for ds in ds_values:
                
                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"WGCNA"+";ds="+str(ds)+";dch="+str(dch)
                #print("run",run,seed,params)
                params_dict = {"parameters":params, "seed":seed,"run":run}
                
                ### running TCGA or reading results
                try:
                    t0 = time()
                    fname = out_dir+basename_t+".seed="+str(seed)+".bin="+bin_method \
                    +",pval="+str(pval)+",clust=WGCNA"+",ds="+str(ds)+",dch="+str(dch)+".biclusters.tsv"
                    result_t = read_bic_table(fname)
                    """result_t = run_DESMOND(exprs_file_t, basename_t, out_dir=out_dir,
                                                save=True, load = True,
                                                ceiling = 3,
                                                min_n_samples = 5,
                                                bin_method = bin_method, pval = pval,
                                                clust_method = "WGCNA",
                                                ds = ds, dch  = dch,
                                                #similarity_cutoffs = similarity_cutoffs,
                                                seed = seed,
                                                verbose = False)"""
                    time_t = time()-t0
                    # find the best matches between TCGA biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_t = calculate_perfromance(result_t, known_groups_t,
                                                          freqs_t, set(exprs_t.columns.values),
                                                          classifications=classifications)
                    performance_t.update(params_dict)
                    performance_t["time"] = time_t
                    subt_t.append(performance_t)
                    t_failed = False
                except:
                    print("TCGA biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                    print(fname)
                    t_failed = True
                    subt_t.append({params_dict})
                
                ### running METABRIC or reading results
                try:
                    t0 = time()
                    fname = out_dir+basename_m+".seed="+str(seed)+".bin="+bin_method \
                    +",pval="+str(pval)+",clust=WGCNA"+",ds="+str(ds)+",dch="+str(dch)+".biclusters.tsv"
                    result_m = read_bic_table(fname)
                    """result_m = run_DESMOND(exprs_file_m, basename_m, out_dir=out_dir,
                                                save=True, load = True,
                                                ceiling = 3,
                                                min_n_samples = 5,
                                                bin_method = bin_method, pval = pval,
                                                clust_method = "WGCNA",
                                                ds = ds, dch  = dch,
                                                #similarity_cutoffs = similarity_cutoffs,
                                                seed = seed,
                                                verbose = False)"""
                    time_m = time()-t0
                    # find the best matches between METABRIC biclusters and subtypes
                    # and calculate overall performance == weighted sum of Jaccard indexes
                    performance_m = calculate_perfromance(result_m, known_groups_m,
                                                          freqs_m, set(exprs_m.columns.values),
                                                          classifications=classifications)
                    performance_m.update(params_dict)
                    performance_m["time"] = time_m
                    subt_m.append(performance_m)
                    m_failed = False
                except:
                    print("METABRIC biclustering failed with ",seed,  pval,bin_method,file = sys.stderr)
                    print(fname)
                    m_failed = True
                    subt_m.append(params_dict)
                    
                # compare clustering results - only if gene sets are defined for each cluster
                if not (t_failed or m_failed): 
                    N = exprs_m.shape[0]
                    clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)
                    
                    
                else:
                    clust_sim = {}
                clust_sim.update(params_dict)
                clustering_similarities.append(clust_sim)


In [14]:
df = pd.DataFrame.from_records(subt_t).groupby("parameters").agg("mean").sort_values("overall_performance_Intrinsic",ascending = False)
df.head(10)

Unnamed: 0_level_0,Basal,LumA,Her2,LumB,Normal,Luminal,Claudin-low,ER-/HER2-,HER2+,ER+/HER2- Low Prolif,...,IHC_ER,IHC_PR,IHC_TNBC,IHC_HER2,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=ward;pval=0.0005;clust=Louvain;m=0.9,0.950249,0.673546,0.480769,0.431694,0.103878,0.920398,0.125581,0.860987,0.780488,0.661972,...,0.865407,0.755864,0.466981,0.423469,0.828268,0.644551,0.756246,375882.4,2.0,0.028799
bin=ward;pval=0.005;clust=Louvain;m=0.6,0.953,0.580247,0.492647,0.464988,0.094164,0.918184,0.121156,0.863063,0.854369,0.520532,...,0.863274,0.75405,0.463245,0.394444,0.827967,0.607545,0.751696,375882.4,2.0,0.035804
bin=ward;pval=0.0001;clust=Louvain;m=0.7,0.949749,0.603748,0.492647,0.410673,0.088888,0.916049,0.117371,0.851617,0.854369,0.552818,...,0.86604,0.750361,0.468257,0.394444,0.825688,0.612192,0.751839,375882.4,2.0,0.029265
bin=ward;pval=0.0005;clust=Louvain;m=0.6,0.944724,0.641405,0.478261,0.457088,0.105642,0.918388,0.117925,0.846847,0.846154,0.657258,...,0.863587,0.749572,0.469156,0.392265,0.825481,0.647384,0.750283,375882.4,2.0,0.03085
bin=ward;pval=0.05;clust=Louvain;m=0.6,0.945455,0.703397,0.492647,0.465157,0.070904,0.91473,0.118935,0.847059,0.854369,0.740938,...,0.862983,0.749868,0.475269,0.394444,0.823621,0.683741,0.750726,375882.4,2.0,0.036727
bin=kmeans;pval=0.0001;clust=Louvain;m=0.6,0.937576,0.606481,0.477803,0.464504,0.063948,0.918403,0.118838,0.861514,0.79568,0.521117,...,0.867627,0.739713,0.469258,0.420287,0.823057,0.613688,0.750804,375882.4,2.0,0.030984
bin=kmeans;pval=0.001;clust=Louvain;m=0.7,0.938077,0.61934,0.477803,0.48019,0.121951,0.914608,0.118415,0.850774,0.79568,0.54403,...,0.870372,0.741531,0.476031,0.420287,0.822266,0.626807,0.753206,375882.4,2.0,0.035238
bin=kmeans;pval=0.0005;clust=Louvain;m=0.7,0.936879,0.617115,0.477803,0.479489,0.119681,0.914385,0.118148,0.84572,0.79568,0.537536,...,0.871082,0.746729,0.47919,0.420287,0.821835,0.623006,0.755766,375882.4,2.0,0.028358
bin=ward;pval=0.001;clust=Louvain;m=0.5,0.949444,0.568162,0.478261,0.44155,0.120482,0.909414,0.118712,0.853545,0.846154,0.506814,...,0.863367,0.747712,0.469118,0.392265,0.820814,0.598334,0.749454,375882.4,2.0,0.033661
bin=ward;pval=0.01;clust=Louvain;m=0.4,0.949495,0.661342,0.478261,0.411705,0.121172,0.909045,0.118483,0.850679,0.846154,0.625537,...,0.858154,0.757744,0.47343,0.392265,0.820589,0.639422,0.751298,375882.4,2.0,0.030577


In [16]:
#df2 = pd.DataFrame.from_records(subt_m).groupby("parameters").agg("mean").sort_values("overall_performance_Intrinsic",ascending = False)
df2.sort_values("overall_performance_Intrinsic",ascending = False).head(60)

Unnamed: 0_level_0,Basal,LumB,LumA,Her2,Normal,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,...,IHC_HER2,IHC_PR,IHC_TNBC,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time,rank
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.0001;clust=WGCNA;ds=0;dch=0.995,0.849441,0.546428,0.466814,0.563183,0.086653,0.930145,0.162624,0.669133,0.686179,0.514888,...,0.680394,0.6308,0.676319,0.819198,0.548141,0.784318,375882.4,2.0,0.039411,0
bin=kmeans;pval=0.05;clust=WGCNA;ds=0;dch=0.995,0.859728,0.547714,0.577181,0.527241,0.084985,0.930618,0.159904,0.719759,0.693298,0.687489,...,0.755871,0.625107,0.683256,0.816212,0.612929,0.791554,375882.4,2.0,0.047503,1
bin=kmeans;pval=0.0005;clust=WGCNA;ds=0;dch=0.995,0.855674,0.546121,0.480255,0.532637,0.084862,0.930028,0.161102,0.721651,0.691933,0.530134,...,0.746905,0.624203,0.679674,0.816022,0.559676,0.787604,375882.4,2.0,0.037732,2
bin=kmeans;pval=0.05;clust=WGCNA;ds=1;dch=0.995,0.859728,0.546451,0.577181,0.517938,0.085035,0.930618,0.159904,0.744659,0.693298,0.687489,...,0.778546,0.631223,0.683256,0.815073,0.615571,0.795391,375882.4,2.0,0.043231,3
bin=kmeans;pval=0.05;clust=WGCNA;ds=2;dch=0.995,0.862203,0.544383,0.577181,0.49788,0.095057,0.933209,0.159904,0.798582,0.693298,0.687489,...,0.835699,0.630635,0.683256,0.814963,0.623856,0.80167,375882.4,2.0,0.051257,4
bin=kmeans;pval=0.005;clust=WGCNA;ds=0;dch=0.995,0.855456,0.546688,0.500634,0.523139,0.085552,0.929955,0.161526,0.713629,0.689765,0.556259,...,0.742671,0.635449,0.679299,0.814817,0.567144,0.790341,375882.4,2.0,0.03991,5
bin=kmeans;pval=0.001;clust=WGCNA;ds=0;dch=0.995,0.855674,0.539539,0.488602,0.514266,0.085338,0.929714,0.161102,0.742279,0.691933,0.541111,...,0.776353,0.625341,0.679674,0.813567,0.565626,0.790027,375882.4,2.0,0.040921,6
bin=kmeans;pval=0.05;clust=WGCNA;ds=3;dch=0.995,0.859728,0.549503,0.577181,0.494035,0.107918,0.930759,0.159904,0.794199,0.693298,0.687489,...,0.838214,0.643216,0.683256,0.81286,0.624196,0.807232,375882.4,2.0,0.090379,7
bin=kmeans;pval=0.005;clust=WGCNA;ds=1;dch=0.995,0.855456,0.544393,0.498434,0.506131,0.086325,0.929955,0.161526,0.739777,0.689765,0.551175,...,0.773042,0.635033,0.679299,0.812753,0.568195,0.792588,375882.4,2.0,0.040694,8
bin=kmeans;pval=0.05;clust=WGCNA;ds=4;dch=0.995,0.859728,0.545587,0.577181,0.487437,0.115682,0.930834,0.159904,0.797305,0.693298,0.687489,...,0.84413,0.650191,0.683256,0.812313,0.623343,0.810844,375882.4,2.0,0.132307,9


(234, 22)

In [41]:
df.loc["bin=kmeans;pval=0.01;clust=WGCNA;ds=0;dch=0.995",:]

Basal                                 0.884920
LumA                                  0.723474
Her2                                  0.508427
LumB                                  0.492807
Normal                                0.077641
Luminal                               0.885366
Claudin-low                           0.108444
ER-/HER2-                             0.850129
HER2+                                 0.549617
ER+/HER2- Low Prolif                  0.714178
ER+/HER2- High Prolif                 0.499169
IHC_ER                                0.856127
IHC_PR                                0.746717
IHC_TNBC                              0.387198
IHC_HER2                              0.331488
overall_performance_Intrinsic         0.794970
overall_performance_SCMOD2            0.646301
overall_performance_IHC               0.734865
seed                             375882.400000
run                                   2.000000
time                                  0.040497
Name: bin=kme

In [42]:
df2.loc["bin=kmeans;pval=0.01;clust=WGCNA;ds=0;dch=0.995",:]

Basal                                 0.855987
LumB                                  0.552651
LumA                                  0.505270
Her2                                  0.492839
Normal                                0.085385
Luminal                               0.930752
Claudin-low                           0.159272
HER2+                                 0.789980
ER-/HER2-                             0.694705
ER+/HER2- Low Prolif                  0.562054
ER+/HER2- High Prolif                 0.470832
IHC_ER                                0.930804
IHC_HER2                              0.831263
IHC_PR                                0.625012
IHC_TNBC                              0.684830
overall_performance_Intrinsic         0.811611
overall_performance_SCMOD2            0.578511
overall_performance_IHC               0.795745
seed                             375882.400000
run                                   2.000000
time                                  0.044270
Name: bin=kme

In [22]:
pd.DataFrame.from_records(clustering_similarities).to_csv("UnPaSt_similarities.tsv",sep = "\t")
pd.DataFrame.from_records(subt_t).to_csv("UnPaSt_TCGA.tsv",sep = "\t")
pd.DataFrame.from_records(subt_m).to_csv("UnPaSt_METABRIC.tsv",sep = "\t")


In [26]:
df = pd.read_csv("UnPaSt_TCGA.tsv",sep = "\t",index_col =0).groupby("parameters").agg("mean").sort_values(by= "overall_performance_Intrinsic",ascending=False)
df2 = pd.read_csv("UnPaSt_METABRIC.tsv",sep = "\t",index_col =0).groupby("parameters").agg("mean").sort_values(by= "overall_performance_Intrinsic",ascending=False)


In [33]:
df2.sort_values(by= "overall_performance_Intrinsic",ascending=False).head(3)

Unnamed: 0_level_0,Basal,LumB,LumA,Her2,Normal,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,...,IHC_ER,IHC_HER2,IHC_PR,IHC_TNBC,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.0001;clust=WGCNA;ds=0;dch=0.995,0.849441,0.546428,0.466814,0.563183,0.086653,0.930145,0.162624,0.669133,0.686179,0.514888,...,0.929429,0.680394,0.6308,0.676319,0.819198,0.548141,0.784318,375882.4,2.0,0.039411
bin=kmeans;pval=0.05;clust=WGCNA;ds=0;dch=0.995,0.859728,0.547714,0.577181,0.527241,0.084985,0.930618,0.159904,0.719759,0.693298,0.687489,...,0.93463,0.755871,0.625107,0.683256,0.816212,0.612929,0.791554,375882.4,2.0,0.047503
bin=kmeans;pval=0.0005;clust=WGCNA;ds=0;dch=0.995,0.855674,0.546121,0.480255,0.532637,0.084862,0.930028,0.161102,0.721651,0.691933,0.530134,...,0.92931,0.746905,0.624203,0.679674,0.816022,0.559676,0.787604,375882.4,2.0,0.037732


In [34]:
df2.sort_values(by= "overall_performance_IHC",ascending=False).head(3)

Unnamed: 0_level_0,Basal,LumB,LumA,Her2,Normal,Luminal,Claudin-low,HER2+,ER-/HER2-,ER+/HER2- Low Prolif,...,IHC_ER,IHC_HER2,IHC_PR,IHC_TNBC,overall_performance_Intrinsic,overall_performance_SCMOD2,overall_performance_IHC,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=GMM;pval=0.001;clust=Louvain;m=0,0.823901,0.601145,0.579131,0.47093,0.085714,0.928415,0.168358,0.764259,0.721841,0.687527,...,0.947443,0.87218,0.725843,0.684012,0.803792,0.624747,0.840842,375882.4,2.0,0.038301
bin=GMM;pval=0.0005;clust=Louvain;m=0,0.823901,0.601145,0.576046,0.468109,0.085714,0.927636,0.168358,0.754978,0.721841,0.68359,...,0.945113,0.865602,0.725843,0.684012,0.802913,0.622184,0.839193,375882.4,2.0,0.036453
bin=GMM;pval=0.0001;clust=Louvain;m=0,0.823901,0.601145,0.570273,0.468109,0.084907,0.927636,0.168358,0.754978,0.721841,0.670538,...,0.945113,0.865602,0.725843,0.684012,0.802891,0.617763,0.839193,375882.4,2.0,0.037562


In [10]:

df["rank"] = range(df.shape[0])
df2["rank"] = range(df2.shape[0])
r = df["rank"]+df2["rank"]
r.sort_values()

parameters
bin=kmeans;pval=0.0005;clust=Louvain;m=0       49
bin=kmeans;pval=0.0001;clust=Louvain;m=0.7     50
bin=ward;pval=0.005;clust=Louvain;m=0.5        51
bin=kmeans;pval=0.0001;clust=Louvain;m=0.6     54
bin=kmeans;pval=0.0005;clust=Louvain;m=0.7     54
                                             ... 
bin=GMM;pval=0.05;clust=Louvain;m=0.4         452
bin=GMM;pval=0.001;clust=Louvain;m=0.3        456
bin=GMM;pval=0.005;clust=Louvain;m=0.3        461
bin=GMM;pval=0.01;clust=Louvain;m=0.3         463
bin=GMM;pval=0.05;clust=Louvain;m=0.3         466
Name: rank, Length: 234, dtype: int64

In [13]:
#df2.loc["bin=kmeans;pval=0.0005;clust=Louvain;m=0",:]

In [38]:
s = pd.read_csv("UnPaSt_similarities.tsv",sep = "\t",index_col=0).groupby("parameters").agg("mean")
s["avg_percent_matched"] = (s["percent_matched_1"]+s["percent_matched_2"])*0.5
s.sort_values(by = "avg_percent_matched",ascending = False)

Unnamed: 0_level_0,n_1,n_2,percent_matched_1,percent_matched_2,n_shared_genes_1,n_shared_genes_2,avg_bm_J_1,avg_bm_J_2,seed,run,avg_percent_matched
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bin=kmeans;pval=0.01;clust=WGCNA;ds=0;dch=0.995,41.2,48.8,0.602776,0.541623,352.8,343.2,0.155471,0.150165,375882.4,2.0,0.572200
bin=kmeans;pval=0.005;clust=WGCNA;ds=0;dch=0.995,46.4,46.6,0.568464,0.570817,316.6,314.0,0.164179,0.160031,375882.4,2.0,0.569641
bin=kmeans;pval=0.0001;clust=WGCNA;ds=0;dch=0.995,32.8,39.4,0.562200,0.543741,205.2,225.4,0.106610,0.098606,375882.4,2.0,0.552971
bin=GMM;pval=0.0001;clust=WGCNA;ds=0;dch=0.995,21.8,42.2,0.649769,0.449309,192.4,255.8,0.154863,0.129065,375882.4,2.0,0.549539
bin=kmeans;pval=0.01;clust=WGCNA;ds=1;dch=0.995,59.4,62.8,0.548608,0.503577,362.4,347.2,0.190632,0.197385,375882.4,2.0,0.526093
...,...,...,...,...,...,...,...,...,...,...,...
bin=ward;pval=0.005;clust=WGCNA;ds=4;dch=0.995,352.4,270.0,0.121451,0.160050,214.8,233.2,0.301238,0.292877,375882.4,2.0,0.140750
bin=ward;pval=0.01;clust=WGCNA;ds=4;dch=0.995,367.0,292.4,0.117732,0.145710,221.6,229.8,0.320397,0.318961,375882.4,2.0,0.131721
bin=ward;pval=0.001;clust=WGCNA;ds=4;dch=0.995,311.6,251.6,0.105935,0.151194,170.4,204.6,0.330959,0.298815,375882.4,2.0,0.128564
bin=ward;pval=0.0001;clust=WGCNA;ds=4;dch=0.995,275.6,212.6,0.103799,0.147670,149.2,178.6,0.309060,0.282133,375882.4,2.0,0.125734
