In [6]:
import os
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
import sys,os
import random
import copy

import matplotlib.pyplot as plt
import seaborn as sns

from utils.eval import find_best_matches, generate_exprs

from methods import NMF, PCA, sparse_PCA, moCluster, MOFA2, iClusterPlus

from methods.utils import interpret_results, resultsHandler

from pathlib import Path
import multiprocessing as mp

import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns

from utils.method import read_bic_table

from utils.eval import find_best_matches, make_known_groups, make_ref_groups
from utils.eval import calculate_perfromance, compare_gene_clusters

from contextlib import redirect_stdout


def read_nmf_results(comb, exprs):
    """Load a stored NMF factorisation and convert it into a result table.

    Parameters
    ----------
    comb : mapping
        Parameter combination; only ``comb['output_path']`` is read here. That
        directory must contain ``W.csv`` and ``H.csv``.
    exprs : pandas.DataFrame
        Expression matrix whose ``columns`` and ``index`` are passed to
        ``format_sklearn_output`` as labels for the H- and W-derived tables.

    Returns
    -------
    tuple
        ``(result, runtime)``: the table built from ``H.csv`` extended with the
        ``genes`` / ``n_genes`` columns taken from the ``W.csv`` table, and the
        value returned by ``resultsHandler.read_runtime`` for the same path.
    """
    from methods.NMF import interpret_results, resultsHandler

    result_dir = comb['output_path']

    # W is read before H, so a missing W.csv is the first error reported.
    w_frame = pd.read_csv(os.path.join(result_dir, 'W.csv'), index_col=0)
    h_frame = pd.read_csv(os.path.join(result_dir, 'H.csv'), index_col=0)

    # H has one row per component, W one column per component.
    by_samples = interpret_results.format_sklearn_output(
        h_frame.values, h_frame.shape[0], exprs.columns, False
    )
    by_genes = interpret_results.format_sklearn_output(
        w_frame.values, w_frame.shape[1], exprs.index, True
    )

    # The W-derived table reuses the 'samples' column names for its gene sets;
    # copy them over under gene-specific names.
    by_samples['genes'] = by_genes['samples']
    by_samples['n_genes'] = by_genes['n_samples']

    runtime = resultsHandler.read_runtime(comb["output_path"])
    return by_samples, runtime


# Methods whose result tables carry a gene set per cluster; eval_method only
# compares gene clusters between cohorts for methods named here.
gene_sets_are_defined = [
    'NMF',
    'sparse_PCA',
]

# Subtype labels grouped by classification scheme; passed to
# calculate_perfromance as its `classifications` argument.
classifications = {
    "Intrinsic": ["Luminal", "Basal", "Her2", "Normal", "Claudin-low"],
    "SCMOD2": [
        "ER-/HER2-",
        "ER+/HER2- Low Prolif",
        "ER+/HER2- High Prolif",
        "HER2+",
    ],
    "IHC": ["IHC_TNBC", "IHC_ER", "IHC_HER2", "IHC_PR"],
}

# --- METABRIC input files ---------------------------------------------------
file_metabric_expression = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv'
file_metabric_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904.annotation_v6.tsv'
file_metabric_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv'

# --- TCGA-BRCA input files --------------------------------------------------
file_tcga_expression = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv'
file_tcga_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv'
file_tcga_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv'

# --- Shared resources -------------------------------------------------------
file_gene_mapping = '/local/DESMOND2_data/v6/preprocessed_v6/gene_id_mapping.tsv'

# Root directory for per-cohort, per-method outputs
# (<out_dir>/<basename>/<method_name>/...).
out_dir = '/cosybio/project/hartung/unpast/unpast_real'

# Sub-directory names used for each cohort below out_dir.
basename_t, basename_m = "TCGA", "METABRIC"

# Subtype/signature and annotation tables; the first column becomes the index.
m_subtypes = pd.read_csv(file_metabric_subtypes, sep="\t", index_col=0)
m_annotation = pd.read_csv(file_metabric_annotation, sep="\t", index_col=0)

t_subtypes = pd.read_csv(file_tcga_subtypes, sep="\t", index_col=0)
t_annotation = pd.read_csv(file_tcga_annotation, sep="\t", index_col=0)

# Expression matrices with every value limited to the interval [-3, 3]
# (presumably z-scores, judging by the file names -- TODO confirm).
exprs_t = pd.read_csv(file_tcga_expression, sep="\t", index_col=0).clip(lower=-3, upper=3)
exprs_m = pd.read_csv(file_metabric_expression, sep="\t", index_col=0).clip(lower=-3, upper=3)

# make_ref_groups returns two objects per cohort; both are handed to
# calculate_perfromance inside eval_method.
known_groups_t, freqs_t = make_ref_groups(t_subtypes, t_annotation, exprs_t)
known_groups_m, freqs_m = make_ref_groups(m_subtypes, m_annotation, exprs_m)

# Module-level slots that eval_method overwrites (via `global`) with the most
# recently loaded result of each cohort, so they can be inspected afterwards.
result_t = None
result_m = None

def _parse_gene_sets(result):
    """Convert a string-encoded 'genes' column of `result` into sets, in place.

    Does nothing when `result` is False (no result available), has no 'genes'
    column, has an empty 'genes' column, or already holds non-string values.
    """
    if result is False or 'genes' not in result:
        return
    genes = result['genes']
    # Positional access: the result index is not guaranteed to start at label 0.
    if len(genes) > 0 and isinstance(genes.iloc[0], str):
        # NOTE(security): eval() executes whatever text the result file holds.
        # Only use this on result files produced by trusted runs.
        result['genes'] = genes.map(eval).map(set)


def _score_cohort(result, known_groups, freqs, exprs, comb, runtime):
    """Compute the performance record of one cohort for one combination.

    Returns ``(performance, failed)``. When calculate_perfromance (or the
    bookkeeping after it) raises ZeroDivisionError, an empty record and
    ``failed=True`` are returned instead of propagating the error.
    """
    try:
        performance = calculate_perfromance(result, known_groups,
                                            freqs, set(exprs.columns.values),
                                            classifications=classifications)
        performance.update({'parameters': comb['output_path'], 'run': comb['random_state']})
        performance['time'] = runtime
    except ZeroDivisionError:
        return {}, True
    return performance, False


def _save_eval_tables(method_name, subt_m, subt_t, clustering_similarities):
    """Write the tables collected so far for `method_name` as TSV files."""
    dir_m = os.path.join(out_dir, basename_m, method_name)
    dir_t = os.path.join(out_dir, basename_t, method_name)
    # to_csv does not create missing directories.
    os.makedirs(dir_m, exist_ok=True)
    os.makedirs(dir_t, exist_ok=True)

    table_m = pd.DataFrame.from_records(subt_m)
    table_t = pd.DataFrame.from_records(subt_t)

    table_m.to_csv(os.path.join(dir_m, f'{method_name}_METABRIC.tsv'), sep="\t")
    table_t.to_csv(os.path.join(dir_t, f'{method_name}_TCGA.tsv'), sep="\t")

    pd.DataFrame.from_records(clustering_similarities).to_csv(
        os.path.join(dir_m, f'{method_name}_similarities.tsv'), sep="\t")
    # The per-cohort tables are also written next to the similarity table (in
    # the METABRIC directory) under *_similarities.tsv names. File names are
    # kept unchanged because readers of these files are not visible here.
    table_t.to_csv(os.path.join(dir_m, f'{method_name}_TCGA_similarities.tsv'), sep="\t")
    table_m.to_csv(os.path.join(dir_m, f'{method_name}_METABRIC_similarities.tsv'), sep="\t")


def eval_method(METHOD):
    """Evaluate one method module on METABRIC and TCGA and save the results.

    For every parameter combination the method's result is obtained for both
    cohorts (read from W.csv/H.csv for NMF, via ``METHOD.run_real`` otherwise),
    scored with calculate_perfromance and, for methods listed in
    ``gene_sets_are_defined``, the gene clusters of both cohorts are compared.

    Parameters
    ----------
    METHOD : module
        Must provide ``generate_arg_list(expression_file, output_path,
        ground_truth_file)`` and ``run_real(comb, is_terminated=True)``.
        The last component of ``METHOD.__name__`` is used as the method name.

    Side effects
    ------------
    * Overwrites the module-level ``result_m`` / ``result_t`` with the most
      recently loaded results.
    * Writes TSV tables below ``out_dir`` after every 100th completed
      iteration index and once more at the end.

    Raises
    ------
    AssertionError
        If the two cohorts yield a different number of combinations.
    """
    global result_t
    global result_m

    method_name = METHOD.__name__.split('.')[-1]
    print('method_name:', method_name)

    #### Preparation
    # Both argument lists are generated the same way; only the input files and
    # the output directory differ between cohorts.
    combinations_m = METHOD.generate_arg_list(
        file_metabric_expression,
        os.path.join(out_dir, basename_m, method_name),
        file_metabric_annotation,
    )
    combinations_t = METHOD.generate_arg_list(
        file_tcga_expression,
        os.path.join(out_dir, basename_t, method_name),
        file_tcga_annotation,
    )

    #### Run
    # Methods will compute results or read existing results
    # sanity check
    assert len(combinations_m) == len(combinations_t)

    print(f'{len(combinations_m)} combinations.')
    subt_t = []
    subt_m = []
    clustering_similarities = []
    for _iteration, (comb_m, comb_t) in enumerate(zip(combinations_m, combinations_t)):

        if method_name == 'NMF':
            try:
                result_m, runtime_m = read_nmf_results(comb_m, exprs_m)
                result_t, runtime_t = read_nmf_results(comb_t, exprs_t)
            except Exception as e:
                # Typically a missing W.csv/H.csv; report and skip this pair.
                print(e)
                continue
        else:
            result_m, runtime_m = METHOD.run_real(comb_m, is_terminated=True)
            result_t, runtime_t = METHOD.run_real(comb_t, is_terminated=True)

        # `False` marks a missing result file. Skip the pair as a whole so the
        # METABRIC, TCGA and similarity tables stay aligned row by row.
        if result_m is False or result_t is False:
            continue

        _parse_gene_sets(result_m)
        _parse_gene_sets(result_t)

        performance_m, m_failed = _score_cohort(
            result_m, known_groups_m, freqs_m, exprs_m, comb_m, runtime_m)
        subt_m.append(performance_m)

        performance_t, t_failed = _score_cohort(
            result_t, known_groups_t, freqs_t, exprs_t, comb_t, runtime_t)
        subt_t.append(performance_t)

        if method_name in gene_sets_are_defined:
            # compare clustering results - only if gene sets are defined for each cluster
            clust_sim = {}
            if not (t_failed or m_failed):
                N = exprs_m.shape[0]
                try:
                    t_has_clusters = ((result_t['n_genes'] > 0) & (result_t['n_samples'] > 0)).any()
                    m_has_clusters = ((result_m['n_genes'] > 0) & (result_m['n_samples'] > 0)).any()
                    if t_has_clusters and m_has_clusters:
                        clust_sim, _, _ = compare_gene_clusters(result_t, result_m, N)
                except KeyError:
                    # A required column (or 'n_shared' inside the comparison)
                    # is missing because the gene clusters are empty.
                    pass
            # comb_m and comb_t have same parameters besides input file
            clust_sim.update(comb_m)
            clustering_similarities.append(clust_sim)

        if _iteration % 100 == 0:
            print('Iteration:', _iteration)
            # Periodic checkpoint so an interrupted run keeps partial results.
            _save_eval_tables(method_name, subt_m, subt_t, clustering_similarities)

    print(f'found {len(subt_m)} combinations for metabric')
    print(f'found {len(subt_t)} combinations for tcga')

    # save results
    _save_eval_tables(method_name, subt_m, subt_t, clustering_similarities)




In [None]:
# Method modules of interest. NOTE(review): this list is not referenced by any
# other cell shown here, and iClusterPlus (evaluated in a later cell) is absent.
METHODS = [NMF, sparse_PCA, moCluster, MOFA2]



In [None]:
# Score every MOFA2 parameter combination on METABRIC and TCGA; eval_method
# writes the resulting tables below out_dir.
eval_method(MOFA2)

method_name: MOFA2
1900 combinations.
Iteration: 0
Iteration: 1000
found 1900 combinations for metabric
found 1900 combinations for tcga


In [None]:
# Score every moCluster parameter combination on METABRIC and TCGA.
eval_method(moCluster)

method_name: moCluster
4560 combinations.
Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
found 4560 combinations for metabric
found 4560 combinations for tcga


In [None]:
# NMF results are loaded from existing W.csv/H.csv files (read_nmf_results)
# instead of going through METHOD.run_real.
eval_method(NMF)

method_name: NMF
30000 combinations.
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900
Iteration: 1000
Iteration: 1100
Iteration: 1200
Iteration: 1300
Iteration: 1400
Iteration: 1500
Iteration: 1600
Iteration: 1700
Iteration: 1800
Iteration: 1900
Iteration: 2000
Iteration: 2100
Iteration: 2200
Iteration: 2300
Iteration: 2400
Iteration: 2500
Iteration: 2600
Iteration: 2700
Iteration: 2800
Iteration: 2900
Iteration: 3000
Iteration: 3100
Iteration: 3200
Iteration: 3300
Iteration: 3400
Iteration: 3500
Iteration: 3600
Iteration: 3700
Iteration: 3800
Iteration: 3900
Iteration: 4000
Iteration: 4100
Iteration: 4200
Iteration: 4300
Iteration: 4400
Iteration: 4500
Iteration: 4600
Iteration: 4700
Iteration: 4800
Iteration: 4900
Iteration: 5000
Iteration: 5100
Iteration: 5200
Iteration: 5300
Iteration: 5400
Iteration: 5500
Iteration: 5600
Iteration: 5700


KeyboardInterrupt: 

In [None]:
# sparse_PCA is listed in gene_sets_are_defined, so gene-cluster similarities
# between the two cohorts are collected in addition to the performance tables.
eval_method(sparse_PCA)

method_name: sparse_PCA
600 combinations.
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
found 575 combinations for metabric
found 575 combinations for tcga


In [7]:
# iClusterPlus is not in gene_sets_are_defined, so no gene-cluster similarity
# rows are collected for it.
eval_method(iClusterPlus)

method_name: iClusterPlus
380 combinations.
Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
found 380 combinations for metabric
found 380 combinations for tcga


In [5]:
# Inspect the TCGA result most recently stored by eval_method (it assigns the
# module-level result_t through its `global` statement).
result_t

Unnamed: 0,samples,n_samples
1,"{TCGA-B6-A1KC-01, TCGA-E9-A1RD-01, TCGA-EW-A1I...",712
2,"{TCGA-GM-A5PX-01, TCGA-E2-A572-01, TCGA-AC-A62...",367
