## Get best parameters

In [1]:
# Best average rank in BRCA.
# For every method, rank all parameter combinations by mean PAM50 score within
# TCGA and within METABRIC, add the two ranks, and print the combination with
# the lowest rank sum as a dict.

import ast

import pandas as pd
from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus

for method in [NMF, moCluster, MOFA2, iClusterPlus, sparse_PCA]:
    method_name = method.__name__.split('.')[-1]
    print('method_name', method_name)

    df_meta = pd.read_csv(f'/cosybio/project/hartung/unpast/unpast_real/{method_name}_METABRIC.tsv', sep='\t', index_col=0)
    df_tcga = pd.read_csv(f'/cosybio/project/hartung/unpast/unpast_real/{method_name}_TCGA.tsv', sep='\t', index_col=0)

    # rows without a parameter string cannot be grouped
    # (list form of `subset` also works on pandas versions that reject a scalar)
    df_meta = df_meta.dropna(subset=['parameters'])
    df_tcga = df_tcga.dropna(subset=['parameters'])

    # remove random state from params so runs that differ only by seed share a key
    df_meta['parameters'] = df_meta['parameters'].map(lambda x: ';'.join([y for y in x.split(';') if not y.startswith('random_state=')]))
    df_tcga['parameters'] = df_tcga['parameters'].map(lambda x: ';'.join([y for y in x.split(';') if not y.startswith('random_state=')]))

    # merge permutations: average all columns over the seeds of each combination
    df_meta = df_meta.groupby('parameters').mean()
    df_tcga = df_tcga.groupby('parameters').mean()

    # rank 0 = highest mean PAM50 within the dataset
    df_meta = df_meta.sort_values('PAM50', ascending=False)
    df_meta['rank'] = list(range(len(df_meta.index)))

    df_tcga = df_tcga.sort_values('PAM50', ascending=False)
    df_tcga['rank'] = list(range(len(df_tcga.index)))

    # Only combinations evaluated on BOTH datasets have a comparable rank sum.
    # A combination present in a single dataset would contribute just one rank
    # to the sum below and could win for that reason alone.
    df_tcga = df_tcga[df_tcga.index.isin(df_meta.index)]
    df_meta = df_meta[df_meta.index.isin(df_tcga.index)]

    df = pd.concat([df_tcga, df_meta]).groupby('parameters').sum()

    # lowest summed rank wins
    params = df.sort_values('rank', ascending=True).index[0]
    d = {}
    for x in params.split(';'):
        if not x:
            continue
        # split on the first '=' only so a value that itself contains '=' stays intact
        key, value = x.split('=', 1)
        try:
            # literal_eval parses numbers, booleans, None and containers but,
            # unlike eval, never resolves a bare word (e.g. 'd' or 'method')
            # to a variable that happens to exist in the notebook
            value = ast.literal_eval(value.strip())
        except (ValueError, TypeError, SyntaxError):
            # not a Python literal (e.g. 'nndsvda'): keep the raw string
            pass
        d[key] = value
    print(d)

method_name NMF
{'k': 8, 'init': 'nndsvda', 'tol': 0.0001, 'transposed': False, 'alpha_W': -0.1, 'alpha_H': 0.0, 'shuffle': False, 'solver': 'cd', 'beta_loss': 'frobenius', 'max_iter': 1000}
method_name moCluster
{'n_dimensions': 5, 'n_cluster': 13, 'solver': 'svd', 'center': True, 'method': 'globalScore', 'option': 'uniform', 'scale': False, 'k': 1}
method_name MOFA2
{'n_factors': 12, 'n_cluster': 9, 'ard_weights': True, 'ard_factors': False, 'likelihood': 'gaussian', 'spikeslab_weights': True, 'spikeslab_factors': False}
method_name iClusterPlus
{'lambda_n': 10, 'n_cluster': 12, 'lambda_scale': 1, 'iter_max': 20, 'eps': 0.0001, 'type': 'gaussian', 'burnin_n': 200, 'draw_n': 200, 'sdev': 0.05}
method_name sparse_PCA
{'n_components': 9, 'alpha': 1, 'ridge_alpha': 0.001, 'max_iter': 1000, 'method': 'cd', 'tol': 1e-08}


In [3]:
# Highest PAM50 in each dataset.
# For every method and dataset, print the parameter combination whose mean
# PAM50 score (averaged over random states) is the highest.

import ast

import pandas as pd
from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus

for method in [NMF, moCluster, MOFA2, iClusterPlus, sparse_PCA]:
    method_name = method.__name__.split('.')[-1]
    print('method_name', method_name)
    for dataset in ['TCGA', 'METABRIC']:
        print(dataset)

        # NOTE: despite its name, df_meta holds the results of whichever dataset is current
        df_meta = pd.read_csv(f'/cosybio/project/hartung/unpast/unpast_real/{method_name}_{dataset}.tsv', sep='\t', index_col=0)

        # rows without a parameter string cannot be grouped
        # (list form of `subset` also works on pandas versions that reject a scalar)
        df_meta = df_meta.dropna(subset=['parameters'])

        # remove random state from params so runs that differ only by seed share a key
        df_meta['parameters'] = df_meta['parameters'].map(lambda x: ';'.join([y for y in x.split(';') if not y.startswith('random_state=')]))

        # merge permutations: average all columns over the seeds of each combination
        df_meta = df_meta.groupby('parameters').mean()

        params = df_meta.sort_values('PAM50', ascending=False).index[0]
        d = {}
        for x in params.split(';'):
            if not x:
                continue
            # split on the first '=' only so a value that itself contains '=' stays intact
            key, value = x.split('=', 1)
            try:
                # literal_eval parses numbers, booleans, None and containers but,
                # unlike eval, never resolves a bare word (e.g. 'd' or 'method')
                # to a variable that happens to exist in the notebook
                value = ast.literal_eval(value.strip())
            except (ValueError, TypeError, SyntaxError):
                # not a Python literal (e.g. 'nndsvda'): keep the raw string
                pass
            d[key] = value
        print(d)

method_name NMF
TCGA
{'k': 8, 'init': 'nndsvda', 'tol': 0.0001, 'transposed': False, 'alpha_W': -0.1, 'alpha_H': 0.0, 'shuffle': False, 'solver': 'cd', 'beta_loss': 'frobenius', 'max_iter': 1000}
METABRIC
{'k': 3, 'init': 'nndsvda', 'tol': 0.0001, 'transposed': False, 'alpha_W': -0.1, 'alpha_H': 0.0, 'shuffle': False, 'solver': 'cd', 'beta_loss': 'frobenius', 'max_iter': 200}
method_name moCluster
TCGA
{'n_dimensions': 4, 'n_cluster': 10, 'solver': 'fast', 'center': True, 'method': 'globalScore', 'option': 'inertia', 'scale': False, 'k': 1}
METABRIC
{'n_dimensions': 15, 'n_cluster': 20, 'solver': 'fast', 'center': True, 'method': 'globalScore', 'option': 'lambda1', 'scale': False, 'k': 0.1}
method_name MOFA2
TCGA
{'n_factors': 2, 'n_cluster': 7, 'ard_weights': True, 'ard_factors': False, 'likelihood': 'gaussian', 'spikeslab_weights': True, 'spikeslab_factors': False}
METABRIC
{'n_factors': 19, 'n_cluster': 11, 'ard_weights': True, 'ard_factors': False, 'likelihood': 'gaussian', 'spikes

In [15]:
# Highest in simulated.
# For every method, print the parameter combination with the highest mean
# 'performance' (averaged over random states) on the simulated ABC results.

import ast

import pandas as pd
from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus

for method in [NMF, moCluster, MOFA2, iClusterPlus, sparse_PCA]:
    method_name = method.__name__.split('.')[-1]
    print('method_name', method_name)

    # NOTE(review): 'unpast_simluated' is kept exactly as written; presumably it
    # matches the directory name on disk - confirm before "fixing" the spelling.
    df_meta = pd.read_csv(f'/cosybio/project/hartung/unpast/unpast_simluated/{method_name}_ABC.tsv', sep='\t', index_col=0)

    # rows without a parameter string cannot be grouped
    # (list form of `subset` also works on pandas versions that reject a scalar)
    df_meta = df_meta.dropna(subset=['parameters'])

    # remove random state from params so runs that differ only by seed share a key
    df_meta['parameters'] = df_meta['parameters'].map(lambda x: ';'.join([y for y in x.split(';') if not y.startswith('random_state=')]))

    # average over seeds, then take the best-performing combination
    params = df_meta.groupby('parameters').mean().sort_values(['performance'], ascending=False).index[0]

    d = {}
    for x in params.split(';'):
        if not x:
            continue
        # split on the first '=' only so a value that itself contains '=' stays intact
        key, value = x.split('=', 1)
        try:
            # literal_eval parses numbers, booleans, None and containers but,
            # unlike eval, never resolves a bare word (e.g. 'd' or 'method')
            # to a variable that happens to exist in the notebook
            value = ast.literal_eval(value.strip())
        except (ValueError, TypeError, SyntaxError):
            # not a Python literal (e.g. 'nndsvd'): keep the raw string
            pass
        d[key] = value
    print(d)

method_name NMF
{'k': 3, 'init': 'nndsvd', 'tol': 0.0001, 'transposed': False, 'alpha_W': 0.2, 'alpha_H': 0.0, 'shuffle': True, 'solver': 'cd', 'beta_loss': 'frobenius', 'max_iter': 1000}
method_name moCluster
{'n_dimensions': 8, 'n_cluster': 8, 'solver': 'svd', 'center': True, 'method': 'globalScore', 'option': 'uniform', 'scale': False, 'k': 0.1}
method_name MOFA2
{'n_factors': 18, 'n_cluster': 5, 'ard_weights': True, 'ard_factors': False, 'likelihood': 'gaussian', 'spikeslab_weights': True, 'spikeslab_factors': False}
method_name iClusterPlus
{'lambda_n': 10, 'n_cluster': 4, 'lambda_scale': 1, 'iter_max': 20, 'eps': 0.0001, 'type': 'gaussian', 'burnin_n': 200, 'draw_n': 200, 'sdev': 0.05}
method_name sparse_PCA
{'n_components': 2, 'alpha': 1, 'ridge_alpha': 0.1, 'max_iter': 1000, 'method': 'cd', 'tol': 1e-08}


## Get results with parameters

In [5]:
# asthma

import best_parameters
import pandas as pd
from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus
import glob

# Result directories of the asthma runs (only the default one is read below).
asthma_default_dir = '/cosybio/project/hartung/unpast/unpast_asthma/results_default'
asthma_optimized_dir = '/cosybio/project/hartung/unpast/unpast_asthma/results'

# Re-save every <method>/<dataset>/<run>/result.tsv found under the default
# directory into the current working directory as <method>_<run>.tsv.
for method in [NMF, moCluster, MOFA2, iClusterPlus, sparse_PCA]:
    for dataset in ['GSE4302']: #  GSE89809, 
        method_name = method.__name__.split('.')[-1]
        print('method_name', method_name)
        result_files = glob.glob(f'{asthma_default_dir}/{method_name}/{dataset}/*/result.tsv')
        for result_file in result_files:
            # the directory that holds result.tsv is used as the seed label
            seed = result_file.rsplit('/', 2)[-2]
            result = pd.read_csv(result_file, sep='\t', index_col=0)
            result.to_csv(f'{method_name}_{seed}.tsv', sep='\t')

method_name NMF
method_name moCluster
method_name MOFA2
method_name iClusterPlus
method_name sparse_PCA


In [28]:
# BRCA

import best_parameters
import pandas as pd
from methods import NMF, sparse_PCA, moCluster, MOFA2, iClusterPlus
import glob
import os
import copy

# Preprocessed (v6) input files for METABRIC and TCGA-BRCA.
# Only the METABRIC expression and annotation files are used by the loop below.
file_metabric_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904.annotation_v6.tsv'
file_metabric_expression = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv'
file_metabric_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv'
file_tcga_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv'
file_tcga_expression = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv'
file_tcga_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv'
file_gene_mapping = '/local/DESMOND2_data/v6/preprocessed_v6/gene_id_mapping.tsv'

# out_dir = '/home/hartung/data/preprocessed_v6/results'
out_dir = '/cosybio/project/hartung/unpast/unpast_real'

basename_m = "METABRIC"

# For every method, run each METABRIC argument combination that matches the
# stored best parameters and save the result as <method>_<random_state>.tsv
# in the current working directory.
METHODS = [sparse_PCA, NMF, moCluster, MOFA2, iClusterPlus]
for METHOD in METHODS:
    method_name = METHOD.__name__.split('.')[-1]
    print('method_name', method_name)

    #### Preparation
    # METABRIC
    file_path_m = file_metabric_expression
    output_path_m = os.path.join(out_dir, basename_m, method_name)
    ground_truth_file_m = file_metabric_annotation
    combinations_m = METHOD.generate_arg_list(file_path_m, output_path_m, ground_truth_file_m)

    params = getattr(best_parameters.BestBrcaMETABRIC, method_name)
    for comb_m in combinations_m:
        # skip combinations that disagree with any of the best parameters
        if any(comb_m[key] != value for key, value in params.items()):
            continue

        seed = comb_m['random_state']

        # comb_m['output_path'] = comb_m['output_path'].replace('METABRIC', 'TCGA')

        result_m, runtime_m = METHOD.run_real(comb_m, is_terminated=True)
        # Look at the first row by position: a label lookup (`[0]`) raises when
        # the result is empty or its index does not contain the label 0.
        if len(result_m) and isinstance(result_m['genes'].iloc[0], str):
            # NOTE(review): eval executes whatever the string contains; it is kept
            # because the exact serialized form of 'genes' is not visible here.
            # If the column is always a plain set literal, ast.literal_eval is safer.
            result_m['genes'] = result_m['genes'].map(eval).map(set)

        result_m.to_csv(f'{method_name}_{seed}.tsv', sep='\t')
    

method_name sparse_PCA
method_name NMF
method_name moCluster
method_name MOFA2
method_name iClusterPlus


In [27]:
# scp -r bba1401@llaima.zbh.uni-hamburg.de:/home/bba1401/Projects/unpast/DESMOND2/evaluation/factorization/*.tsv .

TypeError: unsupported operand type(s) for +: 'int' and 'str'

Unnamed: 0,samples,n_samples,genes,n_genes
0,{},0,"{ELANE, CCDC34, CDC42, NDN, SLIT3, DPY30, EPHA...",14963
1,"{MB-4674, MB-4310, MB-3417, MB-5086}",4,"{ELANE, CDC42, NDN, BCL2L15, DPY30, KDELR3, RO...",14075
2,"{MB-7044, MB-0641, MB-2900, MB-0442, MB-0286, ...",54,"{ELANE, CCDC34, CDC42, NDN, SLIT3, DPY30, EPHA...",14711
3,"{MB-0079, MB-2613, MB-0048, MB-3351, MB-2618, ...",38,"{ELANE, CDC42, NDN, SLIT3, DPY30, EPHA10, KDEL...",14540
4,"{MB-0906, MB-4189, MB-6062, MB-2643, MB-0209, ...",47,"{CCDC34, CDC42, NDN, SLIT3, DPY30, ROR1, PROS1...",13453
