In [None]:
import os
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
import sys,os
import random
import copy

import matplotlib.pyplot as plt
import seaborn as sns

from utils.eval import generate_exprs, calculate_perfromance

from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus

from methods.utils import interpret_results, resultsHandler, miscellaneous

from pathlib import Path
import multiprocessing as mp
from contextlib import redirect_stdout
from utils.method import read_bic_table



# Settings for simulated data.
# N_GENES_LIST holds the values of the `n_genes=` field of the data file names;
# they stay strings because they are interpolated into file names and used as
# output sub-folder names below.
N_GENES_LIST = ['5', '50', '500']
SCENARIOS = ['A', 'B', 'C']

# For each method, gather the argument sets produced by generate_arg_list()
# over every scenario / n_genes data set into one flat list.
for METHOD in [NMF]:  # MOFA2, moCluster, sparse_PCA, iClusterPlus,
    # Output folder is named after the LAST component of the dotted name, the
    # same rule the evaluation loop of this notebook uses. Indexing with [1]
    # only agrees with that for two-component names and raises IndexError for
    # a one-component name.
    method_name = METHOD.__name__.split('.')[-1]
    all_combinations = []
    for SCENARIO in SCENARIOS:
        # Scenario 'A' files carry `overlap=no` in their names; the other
        # scenarios carry `overlap=yes`.
        overlap = 'no' if SCENARIO == 'A' else 'yes'
        for N_GENES in N_GENES_LIST:

            #### prep file paths
            # data files
            data_path = f'/local/DESMOND2_data_simulated/simulated/{SCENARIO}'
            stem = f'{SCENARIO}.n_genes={N_GENES},m=4,std=1,overlap={overlap}'
            FILE = f'{stem}.exprs_z.tsv'
            file_path = os.path.join(data_path, FILE)
            # output folder
            output_path = os.path.join('/cosybio/project/hartung/unpast/unpast_simluated', method_name, SCENARIO, N_GENES)
            # ground truth file
            ground_truth_file = os.path.join(data_path, f'{stem}.biclusters.tsv')

            #### prep params
            # NOTE(review): generate_arg_list() is defined in the project's
            # `methods` package; it is only assumed here to return an iterable
            # of per-run argument sets -- confirm against its implementation.
            combinations = METHOD.generate_arg_list(file_path, output_path, ground_truth_file)
            all_combinations.extend(combinations)
    print('Method', METHOD)
    print('len(all_combinations)', len(all_combinations))
    print()
    
    
    
    # # check if all are done
    # done = []
    # not_done = []
    # for i, comb in enumerate(all_combinations):
    #     if os.path.isfile(os.path.join(comb['output_path'], 'result.csv')):
    #         done.append(comb)
    #     else:
    #         not_done.append(comb)
       
       
            
    # # run all 
    # with mp.Pool(40) as pool:
    #     pool.map(METHOD.run_simulated, all_combinations)
    
    # performance, best_matches = calculate_perfromance(result, known_groups,all_samples)

    # d = {"scenario":scenario,"gsize":gsize,
    #         "n_run":n_run,"seed":seed,
    #         "parameters":params, 
    #         "performance":performance["simulated"]}
    # d["runtime"] = time()-t0
    # df2.append(d)
    
    # eval
    



In [None]:
# Evaluation: for every method, run each argument set on every simulated
# data set, score the result against the ground-truth sample groups and write
# one TSV of performance records per method.
for METHOD in [MOFA2, moCluster, sparse_PCA, iClusterPlus, NMF]:  # NMF, MOFA2, moCluster, sparse_PCA
    method_name = METHOD.__name__.split('.')[-1]
    performances = []
    for SCENARIO in SCENARIOS:
        for N_GENES in N_GENES_LIST:

            #### prep file paths
            # data files
            data_path = f'/local/DESMOND2_data_simulated/simulated/{SCENARIO}'
            stem = f'{SCENARIO}.n_genes={N_GENES},m=4,std=1,overlap=yes'
            FILE = stem + '.exprs_z.tsv'
            file_path = os.path.join(data_path, FILE)
            # output folder
            output_path = os.path.join(
                '/cosybio/project/hartung/unpast/unpast_simluated',
                method_name,
                SCENARIO,
                N_GENES,
            )
            # ground truth file
            ground_truth_file = os.path.join(data_path, stem + '.biclusters.tsv')

            # scenario 'A' uses the `overlap=no` variants of both input files
            if SCENARIO == 'A':
                file_path, ground_truth_file = (
                    path.replace('overlap=yes', 'overlap=no')
                    for path in (file_path, ground_truth_file)
                )

            # nrows=0 parses the header only: the column labels are the sample names
            header = pd.read_csv(file_path, sep="\t", index_col=0, nrows=0)
            all_samples = set(header.columns.values)

            # ground-truth table: space-separated member lists become sets
            ground_truth = pd.read_csv(ground_truth_file, sep="\t", index_col=0)
            set_columns = ["samples"]
            if "genes" in ground_truth.columns.values:
                set_columns.append("genes")
            for column in set_columns:
                ground_truth[column] = ground_truth[column].apply(lambda x: set(x.split(" ")))

            # dictionary with sample sets, as passed to calculate_perfromance()
            known_groups = {
                "simulated": {
                    group: ground_truth.loc[group, "samples"]
                    for group in ground_truth.index.values
                }
            }

            #### prep params
            combinations = METHOD.generate_arg_list(file_path, output_path, ground_truth_file)

            for comb in combinations:
                result = METHOD.run_simulated(comb)
                performance, best_matches = calculate_perfromance(result, known_groups, all_samples)
                params = miscellaneous.combination_to_string(comb)

                # argument sets without a 'random_state' entry are recorded as run/seed 1
                if 'random_state' in comb:
                    seed = comb['random_state']
                else:
                    seed = 1

                if 'simulated' in performance:
                    score = performance["simulated"]
                else:
                    score = None

                performances.append({
                    "scenario": SCENARIO,
                    "gsize": N_GENES,
                    "n_run": seed,
                    "seed": seed,
                    "parameters": params,
                    "performance": score,
                })

    # one table per method, written next to the per-method output folders
    df_performances = pd.DataFrame.from_records(performances)
    summary_file = os.path.join('/cosybio/project/hartung/unpast/unpast_simluated', f"{method_name}_ABC.tsv")
    df_performances.to_csv(summary_file, sep="\t")
        

    