In [None]:
# Number of worker processes handed to multiprocessing.Pool for the parallel
# pre-computation step further down; that step is skipped unless KERNEL > 1.
KERNEL = 10


import os
# Set before numpy is imported below.
# NOTE(review): presumably this keeps each pool worker single-threaded so the
# KERNEL processes do not oversubscribe the CPU -- confirm intent.
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
import sys,os
import random
import copy

import matplotlib.pyplot as plt
import seaborn as sns

from utils.eval import find_best_matches, generate_exprs

from methods import NMF, PCA, sparse_PCA, moCluster, MOFA2, iClusterPlus

from methods.utils import interpret_results, resultsHandler

from pathlib import Path
import multiprocessing as mp

from utils.eval import find_best_matches, make_known_groups

def match_known_subtypes(results, subtypes, annotation, exprs):
    """Match clustering results against several known sample groupings.

    Five families of known groups are assembled (PAM50, a merged "Luminal"
    group, Claudin-low, SCMOD2 and IHC marker positivity) and each family is
    passed to ``find_best_matches`` together with ``results``.

    Parameters
    ----------
    results :
        Clustering output, forwarded unchanged to ``find_best_matches``.
    subtypes : pandas.DataFrame
        Sample-indexed table; the code reads ``claudin_low`` directly and
        hands the table to ``make_known_groups`` with target columns
        ``PAM50`` and ``SCMOD2``.
    annotation : pandas.DataFrame
        Sample-indexed table with the columns ``IHC_HER2``, ``IHC_ER``,
        ``IHC_PR`` and ``IHC_TNBC``; samples labelled ``"Positive"`` form
        the corresponding group.
    exprs : pandas.DataFrame
        Expression matrix whose column labels define the set of all samples.

    Returns
    -------
    pandas.DataFrame
        The per-family outputs of ``find_best_matches`` stacked row-wise.
    """
    sample_universe = set(exprs.columns.values)

    pam50_groups = make_known_groups(subtypes, exprs, target_col="PAM50", verbose=False)
    # LumA and LumB are additionally evaluated as one combined group.
    luminal_groups = {"Luminal": pam50_groups["LumA"].union(pam50_groups["LumB"])}
    scmod2_groups = make_known_groups(subtypes, exprs, target_col='SCMOD2', verbose=False)

    is_claudin_low = subtypes['claudin_low'] == 1
    claudin_groups = {
        "Claudin-low": set(subtypes.loc[is_claudin_low].index.values) & sample_universe
    }

    # NOTE(review): unlike the Claudin-low group, the IHC groups are not
    # intersected with the expression samples -- confirm this is intended.
    ihc_groups = {
        marker: set(annotation.loc[annotation[marker] == "Positive"].index.values)
        for marker in ("IHC_HER2", "IHC_ER", "IHC_PR", "IHC_TNBC")
    }

    group_families = (pam50_groups, luminal_groups, claudin_groups, scmod2_groups, ihc_groups)
    per_family_matches = [
        find_best_matches(results, family, sample_universe, FDR=0.05, verbose=False)
        for family in group_families
    ]
    return pd.concat(per_family_matches, axis=0)


from utils.eval import find_best_matching_biclusters

def compare_gene_clusters(tcga_result,metabric_result, N):
    """Summarise how well TCGA and METABRIC gene clusters match each other.

    Best matches are searched in both directions (TCGA -> METABRIC and
    METABRIC -> TCGA) with ``find_best_matching_biclusters``. Rows containing
    missing values are dropped, and only matches sharing more than one gene
    are kept, sorted by the number of shared genes in descending order.

    Parameters
    ----------
    tcga_result, metabric_result : pandas.DataFrame
        One row per bicluster (the row count is used as the cluster count).
    N : int
        Total number of genes; forwarded to ``find_best_matching_biclusters``.

    Returns
    -------
    tuple
        ``(clust_similarity, bm, bm2)`` where ``clust_similarity`` is a dict
        holding, for each direction (suffix ``_1`` = TCGA -> METABRIC,
        ``_2`` = METABRIC -> TCGA), the number of clusters, the fraction of
        matched clusters, the total number of shared genes and the mean
        ``J`` value of the retained matches; ``bm`` and ``bm2`` are the
        retained match tables for the two directions.
    """
    def _retained_matches(query, reference):
        # Best matches of `query` clusters in `reference`, without NaN rows
        # and without matches that share at most one gene.
        matches = find_best_matching_biclusters(query, reference, N).dropna()
        shares_several_genes = matches["n_shared"] > 1
        return matches.loc[shares_several_genes].sort_values(by="n_shared", ascending=False)

    bm = _retained_matches(tcga_result, metabric_result)
    bm2 = _retained_matches(metabric_result, tcga_result)

    n_tcga = tcga_result.shape[0]
    n_metabric = metabric_result.shape[0]

    clust_similarity = {
        # number of biclusters
        "n_1": n_tcga,
        "n_2": n_metabric,
        # fraction of biclusters with a retained match
        "percent_matched_1": bm.shape[0] / n_tcga,
        "percent_matched_2": bm2.shape[0] / n_metabric,
        # total genes shared across retained matches
        "n_shared_genes_1": bm["n_shared"].sum(),
        "n_shared_genes_2": bm2["n_shared"].sum(),
        # average J of the retained best matches
        "avg_bm_J_1": bm["J"].mean(),
        "avg_bm_J_2": bm2["J"].mean(),
    }
    return clust_similarity, bm, bm2




# Input tables (tab-separated) for the METABRIC cohort: sample annotation,
# expression matrix, and subtype/signature table.
file_metabric_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904.annotation_v6.tsv'
file_metabric_expression = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv'
file_metabric_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv'
# The same three tables for the TCGA-BRCA cohort.
file_tcga_annotation = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv'
file_tcga_expression = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv'
file_tcga_subtypes = '/local/DESMOND2_data/v6/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv'
# Gene identifier mapping table (not read anywhere in the code shown here).
file_gene_mapping = '/local/DESMOND2_data/v6/preprocessed_v6/gene_id_mapping.tsv'

# Root directory under which per-dataset / per-method results are written.
out_dir= '/home/bba1401/data/unpast_real'

# Sub-directory names used below to separate TCGA and METABRIC outputs.
basename_t = "TCGA"
basename_m = "METABRIC" 


# Subtype/signature and annotation tables, indexed by their first column.
m_subtypes = pd.read_csv(file_metabric_subtypes, sep="\t", index_col=0)
m_annotation = pd.read_csv(file_metabric_annotation, sep="\t", index_col=0)

t_subtypes = pd.read_csv(file_tcga_subtypes, sep="\t", index_col=0)
t_annotation = pd.read_csv(file_tcga_annotation, sep="\t", index_col=0)


# Expression matrices with every value limited to the range [-3, 3].
exprs_t = pd.read_csv(file_tcga_expression, sep="\t", index_col=0)
exprs_t = exprs_t.clip(lower=-3, upper=3)

exprs_m = pd.read_csv(file_metabric_expression, sep="\t", index_col=0)
exprs_m = exprs_m.clip(lower=-3, upper=3)


# Method modules to evaluate. Each one is used through two entry points:
# generate_arg_list(...) and run_real(...).
METHODS = [iClusterPlus] # [NMF, sparse_PCA, moCluster, MOFA2]
for METHOD in METHODS:
    # Last component of the module's dotted name; used as the output sub-directory.
    method_name = METHOD.__name__.split('.')[-1]
    
    #### Preparation
    # Build the list of parameter combinations for each dataset, using the
    # same cluster range (10..20 inclusive) for both.
    # METABRIC
    file_path_m = file_metabric_expression
    output_path_m = os.path.join(out_dir, basename_m, method_name)
    ground_truth_file_m = file_metabric_annotation
    combinations_m = METHOD.generate_arg_list(file_path_m, output_path_m, ground_truth_file_m, cluster_range=range(10, 21))
    # TCGA
    file_path_t = file_tcga_expression
    output_path_t = os.path.join(out_dir, basename_t, method_name)
    ground_truth_file_t = file_tcga_annotation
    combinations_t = METHOD.generate_arg_list(file_path_t, output_path_t, ground_truth_file_t, cluster_range=range(10, 21))

    
    #### Compute in parallel
    # Option to compute the results in parallel, methods will store results
    # Follow up with executing the 'Run' below to read existing results and evaluate
    # The values returned by pool.map are discarded here; only the results
    # stored by the method are relied upon afterwards.
    if KERNEL > 1:
        with mp.Pool(KERNEL) as pool:
            pool.map(METHOD.run_real, combinations_m + combinations_t)

            
    #### Run
    # Methods will compute results or read existing results
    # sanity check: the zip below pairs METABRIC and TCGA combinations by
    # position, so both lists must have the same length
    assert len(combinations_m) == len(combinations_t)
    subt_t = []
    subt_m = []
    # NOTE(review): clustering_similarities is never filled inside this loop.
    clustering_similarities = []
    for comb_m, comb_t in zip(combinations_m, combinations_t):
        # runtime_m / runtime_t are returned by run_real but not used below.
        result_m, runtime_m = METHOD.run_real(comb_m)
        result_t, runtime_t = METHOD.run_real(comb_t)
        
        # Keep only the "J" column of the best matches, as a dict keyed by the
        # index of the match table. A ZeroDivisionError raised while matching
        # is recorded as an empty dict instead of aborting the run.
        try:
            m_best_matches = match_known_subtypes(result_m, m_subtypes, m_annotation,exprs_m)
            m_best_matches = m_best_matches["J"].to_dict()
        except ZeroDivisionError:
            m_best_matches = {}
        subt_m.append(m_best_matches)

        # Same evaluation for the TCGA result.
        try:
            t_best_matches = match_known_subtypes(result_t, t_subtypes, t_annotation,exprs_t)
            t_best_matches = t_best_matches["J"].to_dict()
        except ZeroDivisionError:
            t_best_matches = {}
        subt_t.append(t_best_matches)
        
    # save results: one CSV per dataset, one row per parameter combination in
    # the order of combinations_m / combinations_t. The parameter values
    # themselves are not written to the file.
    pd.DataFrame.from_records(subt_m).to_csv(os.path.join(out_dir, basename_m, method_name, 'best_cluster_matches.csv'))    
    pd.DataFrame.from_records(subt_t).to_csv(os.path.join(out_dir, basename_t, method_name, 'best_cluster_matches.csv'))


Returning existing results: Returning existing results:Returning existing results:Returning existing results:/home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=1/n_cluster=10/type=gaussian/n_burnin=200/n_draw=200/lambda_n=5/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1
Returning existing results: Returning existing results:Returning existing results: Returning existing results:Returning existing results:/home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=1/n_cluster=18/type=gaussian/n_burnin=200/n_draw=200/lambda_n=10/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1  /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=3/n_cluster=10/type=gaussian/n_burnin=200/n_draw=200/lambda_n=5/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1  
Returning existing results:/home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=1/n_cluster=15/type=gaussian/n_burnin=200/n_draw=200/lambda_n=25/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1 /home/bba1401/

Returning existing results: Running /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=2/n_cluster=19/type=gaussian/n_burnin=200/n_draw=200/lambda_n=10/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1... /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=2/n_cluster=10/type=gaussian/n_burnin=200/n_draw=200/lambda_n=null/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1
Running /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=3/n_cluster=13/type=gaussian/n_burnin=200/n_draw=200/lambda_n=null/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1...
/home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=3/n_cluster=11/type=gaussian/n_burnin=200/n_draw=200/lambda_n=5/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1

Returning existing results: Running /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=1/n_cluster=16/type=gaussian/n_burnin=200/n_draw=200/lambda_n=25/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1.../home/bba1401/data

Loading required package: parallel
Loading required package: data.table
Loading required package: parallel
Loading required package: parallel
Loading required package: data.table
Loading required package: parallel
Loading required package: parallel
Loading required package: parallel
Loading required package: data.table
Loading required package: parallel
Loading required package: data.table
Loading required package: data.table
Loading required package: parallel
Loading required package: data.table
Loading required package: parallel
Loading required package: parallel
Loading required package: data.table
Loading required package: data.table
Loading required package: data.table
Loading required package: data.table


50  points of lambdas are used to tune parameters.
50  points of lambdas are used to tune parameters.
Begin parallel computation
Begin parallel computation
10  points of lambdas are used to tune parameters.
Begin parallel computation
50  points of lambdas are used to tune parameters.
Begin parallel computation
25  points of lambdas are used to tune parameters.
Begin parallel computation
50  points of lambdas are used to tune parameters.
Begin parallel computation
50  points of lambdas are used to tune parameters.
Begin parallel computation
50  points of lambdas are used to tune parameters.
Begin parallel computation
50  points of lambdas are used to tune parameters.
Begin parallel computation
10  points of lambdas are used to tune parameters.
Begin parallel computation
End parallel computation
Saved /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=1/n_cluster=19/type=gaussian/n_burnin=200/n_draw=200/lambda_n=10/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1.
Running /h

Loading required package: parallel
Loading required package: data.table


25  points of lambdas are used to tune parameters.
Begin parallel computation
End parallel computation
Saved /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=2/n_cluster=19/type=gaussian/n_burnin=200/n_draw=200/lambda_n=10/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1.
Running /home/bba1401/data/unpast_real/METABRIC/iClusterPlus/random_state=2/n_cluster=19/type=gaussian/n_burnin=200/n_draw=200/lambda_n=25/maxiter=20/sdev=0.05/eps=0.0001/lambda_scale=1...


Loading required package: parallel
Loading required package: data.table


25  points of lambdas are used to tune parameters.
Begin parallel computation
