In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import sys 
import bokeh
from bokeh.plotting import figure, show
from bokeh.models import TabPanel, Tabs

sys.path.append('../../')
from utils import bokeh_ui_utils

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
def csv_files_to_df(files: list):
    """Read every CSV in ``files`` and stack the rows into a single DataFrame.

    Parameters
    ----------
    files : list
        Paths (or file-like objects) that ``pandas.read_csv`` can read.

    Returns
    -------
    pandas.DataFrame or None
        The rows of all files, in the order the files were given, with a
        fresh 0..n-1 index. ``None`` when ``files`` is empty.
    """
    frames = [pd.read_csv(f) for f in files]

    # Preserve the existing contract: no input files yields None
    # (pd.concat would raise on an empty list).
    if not frames:
        return None

    # One concat at the end instead of one per file, so the accumulated rows
    # are not re-copied every iteration.
    return pd.concat(frames, ignore_index=True)

In [3]:
# Names of the per-subset columns in the scan statistics; one heatmap tab is
# built for each entry, in this order.
SUBSETS = [
    'max_fraction_same_cluster_MCMsubset.tsv',
    'max_fraction_same_cluster_RibosomalSubset.tsv',
    'max_fraction_same_cluster_MucocystSubsets.tsv',
    'max_fraction_same_cluster_Histone_subset.tsv',
    'max_fraction_same_cluster_ProteosomeSubset.tsv',
]

# MODIFIABLE PARAMETERS

In [4]:
# Dataset name interpolated into the scan-stats directory pattern.
DATASET = 'microarray'
# DATASET = 'rna_seq'

# Only scan rows whose 'metric' column equals this value are kept.
METRIC = 'manhattan'

# Only scan rows whose 'parameter' value is <= this bound are kept.
MAX_RESOLUTION_PARAMETER = 0.5

In [5]:
# Glob pattern for the per-run scan statistics of the selected DATASET.
# NOTE(review): the leading '././' is just the current directory written twice;
# confirm this is intended (the import path setup uses '../../').
data_pattern = f'././tgd2024_may15_avg_scan_stats_{DATASET}/*'

# glob returns matches in arbitrary order; sort so the row order of scan_data
# is the same on every run.
stats_files = sorted(glob.glob(data_pattern))

# Fail here with a clear message instead of passing None to later cells.
if not stats_files:
    raise FileNotFoundError(f'No scan-stats files matched {data_pattern!r}')

scan_data = csv_files_to_df(stats_files)

In [6]:
# Row masks for the two filter criteria set in the parameters cell.
matches_metric = scan_data['metric'].eq(METRIC)
within_max_resolution = scan_data['parameter'].le(MAX_RESOLUTION_PARAMETER)

# Keep only rows satisfying both criteria.
scan_data_filtered = scan_data.loc[matches_metric & within_max_resolution]

scan_data_filtered.head()

Unnamed: 0,partition_type,dimensionality,metric,graph,nns,clustering,parameter,silhouette_score,modularity,nclusters,...,sd_enriched_cluster_size,max_enriched_cluster_size,min_enriched_cluster_size,nenriched_cluster_genes,datetime,max_fraction_same_cluster_MCMsubset.tsv,max_fraction_same_cluster_ProteosomeSubset.tsv,max_fraction_same_cluster_Histone_subset.tsv,max_fraction_same_cluster_MucocystSubsets.tsv,max_fraction_same_cluster_RibosomalSubset.tsv
0,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,11,leiden_cpm,0.0,,0.0,1,...,0.0,20426,20426,20426,2024-05-20 11:36:20.510456,1.0,1.0,1.0,1.0,1.0
1,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,11,leiden_cpm,0.005,0.032619,0.56081,173,...,62.729253,344,25,14288,2024-05-20 11:36:20.510456,0.333333,0.5,1.0,1.0,0.428571
2,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,11,leiden_cpm,0.01,0.02841,0.499882,330,...,32.895869,162,8,11457,2024-05-20 11:36:20.510456,0.333333,0.5,1.0,1.0,0.428571
3,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,11,leiden_cpm,0.015,0.026659,0.462227,474,...,21.051883,110,12,9266,2024-05-20 11:36:20.510456,0.333333,0.5,1.0,1.0,0.428571
4,EXP,baseline,manhattan,umap_fuzzy_simplicial_set,11,leiden_cpm,0.02,0.026406,0.441483,609,...,18.903373,98,8,8601,2024-05-20 11:36:20.510456,0.333333,0.5,1.0,1.0,0.428571


In [7]:
# Distinct 'nns' values, ascending, as strings (passed to the heatmap below).
x_heatmap_profile = list(map(str, sorted(scan_data_filtered['nns'].unique())))
x_heatmap_profile

['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

In [8]:
# Distinct 'parameter' values, ascending, as strings (passed to the heatmap below).
rps = list(map(str, sorted(scan_data_filtered['parameter'].unique())))
rps

['0.0',
 '0.005',
 '0.01',
 '0.015',
 '0.02',
 '0.025',
 '0.03',
 '0.035',
 '0.04',
 '0.045',
 '0.05',
 '0.055',
 '0.06',
 '0.065',
 '0.07',
 '0.075',
 '0.08',
 '0.085',
 '0.09',
 '0.095',
 '0.1',
 '0.105',
 '0.11',
 '0.115',
 '0.12',
 '0.125',
 '0.13',
 '0.135',
 '0.14',
 '0.145',
 '0.15',
 '0.155',
 '0.16',
 '0.165',
 '0.17',
 '0.175',
 '0.18',
 '0.185',
 '0.19',
 '0.195',
 '0.2',
 '0.205',
 '0.21',
 '0.215',
 '0.22',
 '0.225',
 '0.23',
 '0.235',
 '0.24',
 '0.245',
 '0.25',
 '0.255',
 '0.26',
 '0.265',
 '0.27',
 '0.275',
 '0.28',
 '0.285',
 '0.29',
 '0.295',
 '0.3',
 '0.305',
 '0.31',
 '0.315',
 '0.32',
 '0.325',
 '0.33',
 '0.335',
 '0.34',
 '0.345',
 '0.35',
 '0.355',
 '0.36',
 '0.365',
 '0.37',
 '0.375',
 '0.38',
 '0.385',
 '0.39',
 '0.395',
 '0.4',
 '0.405',
 '0.41',
 '0.415',
 '0.42',
 '0.425',
 '0.43',
 '0.435',
 '0.44',
 '0.445',
 '0.45',
 '0.455',
 '0.46',
 '0.465',
 '0.47',
 '0.475',
 '0.48',
 '0.485',
 '0.49',
 '0.495',
 '0.5']

In [9]:
# Display the column labels available in the filtered scan statistics.
scan_data_filtered.columns

Index(['partition_type', 'dimensionality', 'metric', 'graph', 'nns',
       'clustering', 'parameter', 'silhouette_score', 'modularity',
       'nclusters', 'mean_cluster_size', 'median_cluster_size',
       'sd_cluster_size', 'max_cluster_size', 'min_cluster_size', 'ngenes',
       'nenriched_clusters', 'mean_enriched_cluster_size',
       'median_enriched_cluster_size', 'sd_enriched_cluster_size',
       'max_enriched_cluster_size', 'min_enriched_cluster_size',
       'nenriched_cluster_genes', 'datetime',
       'max_fraction_same_cluster_MCMsubset.tsv',
       'max_fraction_same_cluster_ProteosomeSubset.tsv',
       'max_fraction_same_cluster_Histone_subset.tsv',
       'max_fraction_same_cluster_MucocystSubsets.tsv',
       'max_fraction_same_cluster_RibosomalSubset.tsv'],
      dtype='object')

In [10]:
# Build one heatmap tab per subset column listed in SUBSETS.
tabs = []

for SUBSET in SUBSETS:

    # Column label without the '.tsv' suffix; also passed to the heatmap as s_z.
    subset_column = SUBSET.replace('.tsv', '')

    # Select the three columns the heatmap needs, drop the '.tsv' suffix from
    # the subset column, and turn both axis columns into strings so they match
    # the string labels in rps / x_heatmap_profile. Every step returns a new
    # frame, so scan_data_filtered is left untouched.
    scan_data_filtered_hm = (
        scan_data_filtered
        .loc[:, ['parameter', 'nns', SUBSET]]
        .rename(columns={SUBSET: subset_column})
        .assign(
            parameter=lambda frame: frame['parameter'].map(str),
            nns=lambda frame: frame['nns'].map(str),
        )
    )

    hm_cds = bokeh.plotting.ColumnDataSource(scan_data_filtered_hm)
    # Constant per-row alpha columns added to the data source.
    hm_cds.data['fill_alpha'] = [0.7] * len(scan_data_filtered_hm)
    hm_cds.data['line_alpha'] = [0.7] * len(scan_data_filtered_hm)

    hm = bokeh_ui_utils.heatmap(
        hm_cds,
        bokeh.palettes.Inferno256,
        0,
        1,
        x_heatmap_profile,
        rps,
        s_z=subset_column,
        index_name='parameter',
        col_name='nns',
        plot_sizing_mode='inherit',
        hover_module=False,
    )

    # Tab title is the subset name without the shared column prefix.
    tab_title = subset_column.replace('max_fraction_same_cluster_', '')
    tabs.append(TabPanel(child=hm, title=tab_title))

tabbed_plot = Tabs(tabs=tabs)

In [11]:
# Render the tabbed set of heatmaps built in the previous cell.
show(tabbed_plot)