In [None]:
### Set number of worker processes for parallel computation
# KERNEL is the size of the multiprocessing pool used by the optional
# "Compute in parallel" step further down; that step only runs when KERNEL > 1.
# For just reading existing output files and creating the summary, use KERNEL = 1.
KERNEL = 1


import os
# Set before numpy/pandas are imported below.
# NOTE(review): presumably this keeps OpenMP-backed numerical libraries at one
# thread per process so that pool workers do not oversubscribe cores - confirm.
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
import sys,os
import random
import copy

import matplotlib.pyplot as plt
import seaborn as sns

from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus

from methods.utils import interpret_results, resultsHandler, miscellaneous

from pathlib import Path
import multiprocessing as mp

from utils.eval import compare_gene_clusters, make_ref_groups, calculate_perfromance


# Directory holding all preprocessed input tables. Every input path below is
# derived from it, so switching to another data release means editing one line.
data_dir = '/local/DESMOND2_data/v6/preprocessed_v6'

# METABRIC inputs: sample annotation, expression matrix, subtype/signature table.
file_metabric_annotation = f'{data_dir}/METABRIC_1904.annotation_v6.tsv'
file_metabric_expression = f'{data_dir}/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv'
file_metabric_subtypes = f'{data_dir}/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv'

# TCGA-BRCA inputs, same three kinds of table.
file_tcga_annotation = f'{data_dir}/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv'
file_tcga_expression = f'{data_dir}/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv'
file_tcga_subtypes = f'{data_dir}/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv'

file_gene_mapping = f'{data_dir}/gene_id_mapping.tsv'

# Root directory for per-method outputs and the summary tables.
# Previously used alternative: '/home/hartung/data/preprocessed_v6/results'
out_dir = '/cosybio/project/hartung/unpast/unpast_real'

# Sub-directory names (one per dataset) used below out_dir.
basename_t = "TCGA"
basename_m = "METABRIC"

def _read_tsv(path):
    """Read a tab-separated table, using its first column as the row index."""
    return pd.read_csv(path, sep="\t", index_col=0)


def _read_clipped_expression(path, limit=3):
    """Read an expression table and clamp every value into [-limit, limit].

    Values above ``limit`` become ``limit`` and values below ``-limit``
    become ``-limit``; everything else (including missing values) is kept.
    """
    return _read_tsv(path).clip(lower=-limit, upper=limit)


# Subtype/signature and annotation tables for both cohorts.
m_subtypes = _read_tsv(file_metabric_subtypes)
m_annotation = _read_tsv(file_metabric_annotation)

t_subtypes = _read_tsv(file_tcga_subtypes)
t_annotation = _read_tsv(file_tcga_annotation)

# Expression matrices, clamped to [-3, 3].
exprs_t = _read_clipped_expression(file_tcga_expression)
exprs_m = _read_clipped_expression(file_metabric_expression)

# Reference sample groups used later to score the clustering results.
known_groups_t, freqs_t = make_ref_groups(t_subtypes, t_annotation, exprs_t)
known_groups_m, freqs_m = make_ref_groups(m_subtypes, m_annotation, exprs_m)

# Samples are the columns of the expression matrices.
n_samples_m = len(exprs_m.columns)
n_samples_t = len(exprs_t.columns)

        
# Clusters with fewer samples than this, or with more than
# (all samples - this), are excluded before performance evaluation.
# The same value is handed to calculate_perfromance as min_n_samples.
MIN_CLUSTER_SAMPLES = 5


def _drop_extreme_clusters(result, n_samples):
    """Keep only clusters of size MIN_CLUSTER_SAMPLES .. n_samples - MIN_CLUSTER_SAMPLES.

    ``result`` is returned untouched when it is ``False`` (the value the
    evaluation loop treats as "nothing to filter"). Otherwise a filtered
    *copy* is returned, so that later column assignments do not write into
    a slice of the frame handed back by the method.
    """
    if result is False:
        return result
    sizes = result['n_samples']
    keep = (sizes >= MIN_CLUSTER_SAMPLES) & (sizes <= (n_samples - MIN_CLUSTER_SAMPLES))
    return result[keep].copy()


def _score_against_reference(result, known_groups, sample_names, combination, runtime):
    """Score one run against the reference groups.

    Returns ``(performance, best_matches, failed)``. ``performance`` is the
    score dict extended with the parameter string, the run's random state
    and the runtime. When scoring raises ZeroDivisionError or AttributeError
    both dicts are empty and ``failed`` is True.
    """
    try:
        # min_n_samples is passed for every dataset so that both cohorts are
        # scored with identical settings (it used to be passed for METABRIC only).
        performance, best_matches = calculate_perfromance(
            result, known_groups, sample_names, min_n_samples=MIN_CLUSTER_SAMPLES)
        performance = performance.to_dict()
        best_matches = best_matches.to_dict()

        performance.update({
            'parameters': miscellaneous.combination_to_string(combination),
            'run': combination['random_state'],
        })
        performance['time'] = runtime
    except (ZeroDivisionError, AttributeError):
        return {}, {}, True
    return performance, best_matches, False


def _genes_as_sets(result):
    """Turn a string-encoded 'genes' column into a column of sets, in place.

    Raises KeyError when there is no 'genes' column and IndexError when the
    frame has no rows; the caller treats both as "comparison not possible".
    """
    # Positional access: the frame was filtered beforehand, so a row labelled
    # 0 is not guaranteed to exist any more.
    if isinstance(result['genes'].iloc[0], str):
        # NOTE(review): eval() executes whatever is stored in the result
        # files. Acceptable only while these files are produced by this
        # pipeline itself; consider ast.literal_eval if the stored format allows.
        result['genes'] = result['genes'].map(eval).map(set)


METHODS = [sparse_PCA] # [NMF, moCluster, MOFA2, iClusterPlus, ]
for METHOD in METHODS:
    method_name = METHOD.__name__.split('.')[-1]
    print('method_name', method_name)

    #### Preparation
    # METABRIC
    file_path_m = file_metabric_expression
    output_path_m = os.path.join(out_dir, basename_m, method_name)
    ground_truth_file_m = file_metabric_annotation
    combinations_m = METHOD.generate_arg_list(file_path_m, output_path_m, ground_truth_file_m)
    # TCGA
    file_path_t = file_tcga_expression
    output_path_t = os.path.join(out_dir, basename_t, method_name)
    ground_truth_file_t = file_tcga_annotation
    combinations_t = METHOD.generate_arg_list(file_path_t, output_path_t, ground_truth_file_t)

    #### Compute in parallel
    # Option to compute the results in parallel, methods will store results
    # Follow up with executing the 'Run' below to read existing results and evaluate
    if KERNEL > 1:
        with mp.Pool(KERNEL) as pool:
            pool.map(METHOD.run_real, combinations_m + combinations_t)

    #### Run
    # Methods will compute results or read existing results
    # sanity check: the two lists are walked in lockstep below
    assert len(combinations_m) == len(combinations_t)
    subt_t = []
    subt_m = []
    best_matches_m_list = []
    best_matches_t_list = []
    clustering_similarities = []
    # Number of rows of the METABRIC expression matrix, passed to the
    # gene-cluster comparison; constant across parameter combinations.
    N = exprs_m.shape[0]
    for comb_m, comb_t in zip(combinations_m, combinations_t):
        result_m, runtime_m = METHOD.run_real(comb_m, is_terminated=True)
        result_t, runtime_t = METHOD.run_real(comb_t, is_terminated=True)

        # Exclude too small and too large clusters before performance evaluation
        result_m = _drop_extreme_clusters(result_m, n_samples_m)
        result_t = _drop_extreme_clusters(result_t, n_samples_t)

        performance_m, best_matches_m, m_failed = _score_against_reference(
            result_m, known_groups_m, set(exprs_m.columns.values), comb_m, runtime_m)
        subt_m.append(performance_m)
        best_matches_m_list.append(best_matches_m)

        performance_t, best_matches_t, t_failed = _score_against_reference(
            result_t, known_groups_t, set(exprs_t.columns.values), comb_t, runtime_t)
        subt_t.append(performance_t)
        best_matches_t_list.append(best_matches_t)

        # compare clustering results - only if gene sets are defined for each cluster
        clust_sim = {}
        if not (t_failed or m_failed):
            try:
                _genes_as_sets(result_t)
                _genes_as_sets(result_m)
                clust_sim, _, _ = compare_gene_clusters(result_t, result_m, N)
            except (ZeroDivisionError, KeyError, IndexError) as e:
                # Best effort: the comparison is skipped and clust_sim stays
                # empty (original note: 'n_shared' is not defined because
                # gene clusters are empty). Report it instead of hiding it.
                print(f'{method_name}: gene cluster comparison skipped ({type(e).__name__}: {e})')
        # comb_m and comb_t have same parameters besides input file
        clust_sim.update(comb_m)
        clustering_similarities.append(clust_sim)

    # save results: one tab-separated table per record list, written to out_dir
    outputs = [
        (subt_m, f'{method_name}_METABRIC.tsv'),
        (subt_t, f'{method_name}_TCGA.tsv'),
        (clustering_similarities, f'{method_name}_similarities.tsv'),
        (best_matches_m_list, f'{method_name}_METABRIC_best_matches.tsv'),
        (best_matches_t_list, f'{method_name}_TCGA_best_matches.tsv'),
    ]
    for records, filename in outputs:
        pd.DataFrame.from_records(records).to_csv(os.path.join(out_dir, filename), sep="\t")
