In [25]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time
import subprocess
import time

import matplotlib.pyplot as plt
import seaborn as sns

from run_desmond import run_DESMOND

from utils.eval import find_best_matches, make_known_groups

def match_known_subtypes(results, subtypes, annotation,exprs):

    all_samples = set(exprs.columns.values)
    pam50 = make_known_groups(subtypes, exprs,target_col = "PAM50",verbose=False)
    lum = {}
    lum["Luminal"] = pam50["LumA"].union(pam50["LumB"])
    scmod2 = make_known_groups(subtypes, exprs,target_col = 'SCMOD2',verbose=False)
    claudin = {}
    claudin["Claudin-low"] = set(subtypes.loc[subtypes['claudin_low']==1,:].index.values).intersection(all_samples)

    ihc = {}
    for x in ["IHC_HER2","IHC_ER","IHC_PR","IHC_TNBC"]:
        ihc[x] = set(annotation.loc[annotation[x]=="Positive",:].index.values)

    known_groups = [pam50,lum,claudin,scmod2,ihc]
    best_matches = []
    for group in known_groups:
        bm = find_best_matches(results,group,all_samples,FDR=0.05,verbose = False)
        best_matches.append(bm)
    best_matches = pd.concat(best_matches, axis=0)
    return best_matches


from utils.eval import find_best_matching_biclusters

def compare_gene_clusters(tcga_result,metabric_result, N):
    # N - total number of genes
    # finds best matched TCGA -> METABRIC and METABRIC -> TCGA
    # calculates % of matched clusterst, number of genes in matched cluster,
    # and the average J index for best matches
    bm = find_best_matching_biclusters(tcga_result,metabric_result, N)
    bm = bm.dropna()
    bm2 = find_best_matching_biclusters(metabric_result, tcga_result, N)
    bm2 = bm2.dropna()

    bm = bm.loc[bm["n_shared"]>1,:].sort_values(by="n_shared",ascending = False)
    bm2 = bm2.loc[bm2["n_shared"]>1,:].sort_values(by="n_shared",ascending = False)

    clust_similarity = {}
    # number of biclusters
    clust_similarity["n_1"] = tcga_result.shape[0]
    clust_similarity["n_2"] = metabric_result.shape[0]
    #print("% matched biclusters:",bm.shape[0]/tcga_result.shape[0],bm2.shape[0]/metabric_result.shape[0])
    clust_similarity["percent_matched_1"] = bm.shape[0]/tcga_result.shape[0]
    clust_similarity["percent_matched_2"] = bm2.shape[0]/metabric_result.shape[0]
    #print("n matched genes:",bm.loc[:,"n_shared"].sum(),bm2.loc[:,"n_shared"].sum())
    clust_similarity["n_shared_genes_1"] = bm.loc[:,"n_shared"].sum()
    clust_similarity["n_shared_genes_2"] = bm2.loc[:,"n_shared"].sum()
    #print("avg. J:",bm.loc[:,"J"].mean(),bm2.loc[:,"J"].mean())
    clust_similarity["avg_bm_J_1"] = bm.loc[:,"J"].mean()
    clust_similarity["avg_bm_J_2"] = bm2.loc[:,"J"].mean()


    return clust_similarity, bm, bm2

In [5]:
real_data_path = '/Users/fernando/Documents/Research/DESMOND2_data_simulated/preprocessed_v6/'

file_metabric_annotation = f'{real_data_path}METABRIC_1904.annotation_v6.tsv'
file_metabric_expression = f'{real_data_path}/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv'
file_metabric_subtypes = f'{real_data_path}/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv'
file_tcga_annotation = f'{real_data_path}TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv'
file_tcga_expression = f'{real_data_path}TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv'
file_tcga_subtypes = f'{real_data_path}TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv'
file_gene_mapping = f'{real_data_path}gene_id_mapping.tsv'

out_dir = '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/results/'

basename_t = "TCGA"
basename_m = "METABRIC"


m_subtypes = pd.read_csv(file_metabric_subtypes,sep = "\t",index_col=0)
m_annotation = pd.read_csv(file_metabric_annotation,sep = "\t",index_col=0)

t_subtypes = pd.read_csv(file_tcga_subtypes,sep = "\t",index_col=0)
t_annotation = pd.read_csv(file_tcga_annotation,sep = "\t",index_col=0)


exprs_t= pd.read_csv(file_tcga_expression, sep = "\t",index_col=0)
exprs_t[exprs_t>3] = 3
exprs_t[exprs_t<-3] = -3

exprs_m= pd.read_csv(file_metabric_expression, sep = "\t",index_col=0)
exprs_m[exprs_m>3] = 3
exprs_m[exprs_m<-3] = -3


In [16]:

tool_list = {
    'kmeans': 'run_kmeans.py',
    'WGCNAkmeans': 'run_WGCNAkmeans.py',
    'HC': 'run_HC.py',
    'WGCNAHC': 'run_WGCNAHC.py'
}


expr_files = {}
result_files = {}

for realdata_file in os.listdir(real_data_path):
    realdatafile_path = os.path.join(real_data_path, realdata_file)
    # print(realdatafile_path)
    prefix = realdata_file.split("_")[0]
    # print(prefix)
    if "exprs" in realdata_file:
        expr_files[prefix] = realdatafile_path
        result_files[prefix] = f'{prefix}.tsv'

expr_files
result_files



{'METABRIC': 'METABRIC.tsv', 'TCGA-BRCA': 'TCGA-BRCA.tsv'}

In [32]:
commands = list()
running = list()

def get_command(tool_name, script_location, expr_file, out_file):
    command = []
    if tool_name in ['kmeans', 'WGCNAkmeans', 'WGCNAHC', 'HC']:
        command.append("python3")
        command.append(script_location)
        command.append(expr_file)
        command.append(out_file)
    return command

tool_name = 'kmeans'
dataset = 'METABRIC'


for tool_name in tool_list.keys():
    tool_name = 'HC'

    for dataset in expr_files.keys():
        # print(test_case)
        expr_file = expr_files[dataset]
        for r in range(1, 6):
            commands.append(get_command(tool_name, os.path.join(script_folder, tool_list[tool_name]), expr_files[dataset], os.path.join(out_dir, f'{dataset}_run{r}.tsv')))

commands

[['python3',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/run_HC.py',
  '/Users/fernando/Documents/Research/DESMOND2_data_simulated/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/results/METABRIC_run1.tsv'],
 ['python3',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/run_HC.py',
  '/Users/fernando/Documents/Research/DESMOND2_data_simulated/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/results/METABRIC_run2.tsv'],
 ['python3',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/run_HC.py',
  '/Users/fernando/Documents/Research/DESMOND2_data_simulated/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv',
  '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/results/METABRIC_run3.tsv'],
 ['python3',
  '/Users/fernando/Documents/Research/DESM

Traceback (most recent call last):
  File "/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/run_kmeans.py", line 5, in <module>
    from kneed import KneeLocator
ModuleNotFoundError: No module named 'kneed'


In [33]:
print(f"Commands running")
#parallel_execs = int(sys.argv[1])
parallel_execs = 1
while len(commands) > 0 or len(running) > 0:
    if len(running) < parallel_execs and len(commands) > 0:
        command = commands[0]
        print(f'starting command: {command}')
        commands = commands[1:]
        p = subprocess.Popen(command)
        running.append(p)
    done = []
    for i in range(0, len(running)):
        if running[i].poll() is not None:
            done.append(i)
    for i in done:
        del running[i]
    time.sleep(1)

print(f"All done, result scores are in their directories")

Commands running
starting command: ['python3', '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/run_HC.py', '/Users/fernando/Documents/Research/DESMOND2_data_simulated/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv', '/Users/fernando/Documents/Research/DESMOND2/evaluation/clustering/results/METABRIC_run1.tsv']
single braycurtis


100%|██████████| 20/20 [00:24<00:00,  1.20s/it]


KeyboardInterrupt: 