In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import sys 
import bokeh
from bokeh.plotting import figure, show

sys.path.append('../../')
from utils import bokeh_ui_utils

In [2]:
def csv_files_to_df(files: list):
    """Read every CSV file in ``files`` and stack their rows into one DataFrame.

    Parameters
    ----------
    files : list
        Paths (or anything else ``pd.read_csv`` accepts) to load. Rows appear
        in the result in the same order as the files are given.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated rows of all files with a fresh ``0..n-1`` index, or
        ``None`` when ``files`` is empty.
    """
    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously accumulated rows on every iteration.
    frames = [pd.read_csv(f) for f in files]

    if not frames:
        # Keep the established contract: no input files -> None.
        return None

    return pd.concat(frames, ignore_index=True)

In [3]:
# Column names in the scan statistics table, one per gene subset; each is
# plotted as its own heatmap further down.
SUBSETS = [
    'max_fraction_same_cluster_MCMsubset.tsv',
    'max_fraction_same_cluster_RibosomalSubset.tsv',
    'max_fraction_same_cluster_MucocystSubsets.tsv',
    'max_fraction_same_cluster_Histone_subset.tsv',
    'max_fraction_same_cluster_ProteosomeSubset.tsv',
]

In [4]:
# Fraction cutoff. NOTE(review): not referenced by any cell visible in this
# notebook section -- presumably meant for the max_fraction_same_cluster_*
# columns; confirm it is still needed.
FRACTION_THRESHOLD = 0.1

In [5]:
# Glob patterns selecting the statistics files of each dataset.
microarray_data_pattern = './microarray/*'

rna_seq_data_pattern = './rna_seq/*'

# glob returns matches in filesystem-dependent order; sort them so the row
# order of the combined tables is identical on every run and machine.
microarray_stats_files = sorted(glob.glob(microarray_data_pattern))

rna_seq_stats_files = sorted(glob.glob(rna_seq_data_pattern))

microarray_stats_df = csv_files_to_df(microarray_stats_files)

scan_data = csv_files_to_df(rna_seq_stats_files)

# csv_files_to_df returns None when no files are given. Fail here with a clear
# message rather than with a TypeError the first time scan_data is indexed.
if scan_data is None:
    raise FileNotFoundError(f'No files matched {rna_seq_data_pattern!r}')

In [6]:
#             TTHERM_ID module   phase  normalized_expression
# 0       YF00037831.t1  m0595  000min              -0.361857
# 1       YF00030341.t1  m0595  000min              -1.183010
# 2       YF00030300.t1  m0595  000min              -1.395015
# 3       YF00028635.t1  m0595  000min              -1.039003
# 4       YF00027788.t1  m0595  000min              -0.337790
# ...               ...    ...     ...                    ...
# 193711  YF00013907.t1  m0000  240min               1.180763
# 193712  YF00009909.t1  m0000  240min              -0.944980
# 193713  YF00007402.t1  m0000  240min              -0.145474
# 193714  YF00003605.t1  m0000  240min              -0.363803
# 193715  YF00001862.t1  m0000  240min              -0.189259

In [7]:
METRIC = 'manhattan'

# Row masks: scans run with the chosen distance metric, and scans whose
# 'parameter' value does not exceed 0.3.
uses_metric = scan_data['metric'].eq(METRIC)
within_parameter_range = scan_data['parameter'].le(0.3)

# Keep only the rows satisfying both conditions.
scan_data_filtered = scan_data.loc[uses_metric & within_parameter_range]

scan_data_filtered.head()

Unnamed: 0,partition_type,dimensionality,metric,graph,nns,clustering,parameter,silhouette_score,modularity,nclusters,...,sd_enriched_cluster_size,max_enriched_cluster_size,min_enriched_cluster_size,nenriched_cluster_genes,datetime,max_fraction_same_cluster_MCMsubset.tsv,max_fraction_same_cluster_ProteosomeSubset.tsv,max_fraction_same_cluster_Histone_subset.tsv,max_fraction_same_cluster_MucocystSubsets.tsv,max_fraction_same_cluster_RibosomalSubset.tsv
5720,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,12,leiden_cpm,0.0,,0.0,1,...,0.0,21524,21524,21524,2024-05-07 20:40:29.755280,1.0,1.0,1.0,1.0,1.0
5721,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,12,leiden_cpm,0.005,0.01472,0.559733,160,...,76.832819,475,475,12139,2024-05-07 20:40:29.755280,0.5,0.367347,0.714286,0.566667,0.722222
5722,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,12,leiden_cpm,0.01,0.003422,0.495548,300,...,38.37898,219,219,9715,2024-05-07 20:40:29.755280,0.5,0.367347,0.714286,0.566667,0.522222
5723,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,12,leiden_cpm,0.015,-0.000666,0.453829,444,...,24.369918,140,140,8951,2024-05-07 20:40:29.755280,0.5,0.367347,0.714286,0.566667,0.533333
5724,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,12,leiden_cpm,0.02,-0.000599,0.42981,567,...,20.563502,116,116,8326,2024-05-07 20:40:29.755280,0.5,0.367347,0.714286,0.566667,0.533333


In [8]:
# Distinct 'nns' values as strings, ordered numerically. Sorting must happen
# BEFORE the str conversion: sorting the strings puts '10' ahead of '2'.
x_heatmap_profile = [str(n) for n in sorted(scan_data_filtered['nns'].unique())]
x_heatmap_profile

['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

In [9]:
# Distinct 'parameter' values as strings, ordered numerically. Sort the numbers
# first and stringify afterwards, so the order stays correct even for values
# whose text form does not sort like the number (e.g. '1e-05' or '10.0').
rps = [str(n) for n in sorted(scan_data_filtered['parameter'].unique())]
rps

['0.0',
 '0.005',
 '0.01',
 '0.015',
 '0.02',
 '0.025',
 '0.03',
 '0.035',
 '0.04',
 '0.045',
 '0.05',
 '0.055',
 '0.06',
 '0.065',
 '0.07',
 '0.075',
 '0.08',
 '0.085',
 '0.09',
 '0.095',
 '0.1',
 '0.105',
 '0.11',
 '0.115',
 '0.12',
 '0.125',
 '0.13',
 '0.135',
 '0.14',
 '0.145',
 '0.15',
 '0.155',
 '0.16',
 '0.165',
 '0.17',
 '0.175',
 '0.18',
 '0.185',
 '0.19',
 '0.195',
 '0.2',
 '0.205',
 '0.21',
 '0.215',
 '0.22',
 '0.225',
 '0.23',
 '0.235',
 '0.24',
 '0.245',
 '0.25',
 '0.255',
 '0.26',
 '0.265',
 '0.27',
 '0.275',
 '0.28',
 '0.285',
 '0.29',
 '0.295',
 '0.3']

In [10]:
# Show the column labels available in the filtered scan table.
scan_data_filtered.columns

Index(['partition_type', 'dimensionality', 'metric', 'graph', 'nns',
       'clustering', 'parameter', 'silhouette_score', 'modularity',
       'nclusters', 'mean_cluster_size', 'median_cluster_size',
       'sd_cluster_size', 'max_cluster_size', 'min_cluster_size', 'ngenes',
       'nenriched_clusters', 'mean_enriched_cluster_size',
       'median_enriched_cluster_size', 'sd_enriched_cluster_size',
       'max_enriched_cluster_size', 'min_enriched_cluster_size',
       'nenriched_cluster_genes', 'datetime',
       'max_fraction_same_cluster_MCMsubset.tsv',
       'max_fraction_same_cluster_ProteosomeSubset.tsv',
       'max_fraction_same_cluster_Histone_subset.tsv',
       'max_fraction_same_cluster_MucocystSubsets.tsv',
       'max_fraction_same_cluster_RibosomalSubset.tsv'],
      dtype='object')

In [11]:
# Draw one heatmap per gene-subset column listed in SUBSETS.
for SUBSET in SUBSETS:

    # Column label without the '.tsv' suffix; used for the renamed column and
    # passed to the heatmap helper as s_z.
    subset_label = SUBSET.replace('.tsv', '')

    heatmap_columns = scan_data_filtered.loc[:, ['parameter', 'nclusters', 'nns', SUBSET]]

    # Stringify the three key columns (x_heatmap_profile and rps hold the same
    # values as strings), then rename for plotting.
    scan_data_filtered_hm = heatmap_columns.assign(
        parameter=heatmap_columns['parameter'].map(str),
        nns=heatmap_columns['nns'].map(str),
        nclusters=heatmap_columns['nclusters'].map(str),
    ).rename(columns={'nclusters': 'module', SUBSET: subset_label})

    hm_cds = bokeh.plotting.ColumnDataSource(scan_data_filtered_hm)

    # Same constant alpha for every row, for both fill and outline.
    n_rows = len(scan_data_filtered_hm)
    hm_cds.data['fill_alpha'] = [0.7] * n_rows
    hm_cds.data['line_alpha'] = [0.7] * n_rows

    hm = bokeh_ui_utils.heatmap(
        hm_cds,
        bokeh.palettes.Inferno256,
        0,
        1,
        x_heatmap_profile,
        rps,
        s_z=subset_label,
        index_name='parameter',
        col_name='nns',
        plot_sizing_mode='stretch_both',
    )

    show(hm)