In [1]:
from matplotlib.pyplot import show as show_static
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import distinctipy
import glob

sys.path.append('../../')
from utils import dataframe_utils

### SET PLOTTING METRIC

In [2]:
# Distance metric used by the single-metric cells further down, which filter the
# scan results on `metric == plotting_metric`. Leave exactly one line uncommented.
# plotting_metric = 'manhattan'
# plotting_metric = 'clr'
plotting_metric = 'cosine'
# plotting_metric = 'minkowski_0.25'
# plotting_metric = 'minkowski_0.5'
# plotting_metric = 'minkowski_0.75'
# plotting_metric = 'minkowski_1.0'
# plotting_metric = 'minkowski_1.5'
# plotting_metric = 'minkowski_2.0'
# plotting_metric = 'minkowski_3.0'
# plotting_metric = 'minkowski_4.0'
# plotting_metric = 'minkowski_5.0'
# plotting_metric = 'euclidean'

In [3]:
# Earlier single-file inputs, kept for reference:
# df_stats = pd.read_csv('./scan_stats_mar2024.csv')
# df_stats = pd.read_csv('./scan_stats_v2.csv')
# df_stats = pd.read_csv('./scan_stats_clr.csv')
# df_stats = pd.read_csv('./scan_stats_metrics2.csv')
# df_stats = pd.read_csv('./scan_stats_sklearn_distance.csv')
# df_stats = pd.read_csv('./scan_stats_sklearn_distance_v2.csv')

pattern = './scan_stats_apr02/*'

# Sort the matches: glob returns files in arbitrary order, and the row order of
# df_stats follows the file order.
stats_files = sorted(glob.glob(pattern))

if not stats_files:
    # Fail here with a clear message instead of leaving df_stats as None and
    # crashing later on the first column access.
    raise FileNotFoundError(f'No scan-stats files match pattern {pattern!r}')

# Read every file, then concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
df_stats = pd.concat([pd.read_csv(s_file) for s_file in stats_files], ignore_index=True)

In [4]:
def compute_fraction_clusters_enriched(row):
    """Return the share of a partition's clusters that are enriched.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'nenriched_clusters' and 'nclusters'.

    Returns:
        'nenriched_clusters' divided by 'nclusters'.
    """
    enriched_count = row['nenriched_clusters']
    total_count = row['nclusters']
    return enriched_count / total_count

In [5]:
# Fraction of clusters that are enriched, computed column-wise. This is the same
# ratio compute_fraction_clusters_enriched returns for a single row, without a
# Python-level call per row. Rows with nclusters == 0 yield inf/NaN here.
df_stats['fraction_clusters_enriched'] = df_stats['nenriched_clusters'] / df_stats['nclusters']

In [6]:
df_stats['metric'].unique()

array(['minkowski_0.5', 'clr', 'minkowski_4.0', 'manhattan',
       'minkowski_1.0', 'minkowski_2.0', 'euclidean', 'minkowski_5.0',
       'cosine', 'minkowski_3.0'], dtype=object)

In [7]:
df_stats.columns

Index(['partition_type', 'dimensionality', 'metric', 'graph', 'nns',
       'clustering', 'parameter', 'silhouette_score', 'modularity',
       'nclusters', 'mean_cluster_size', 'median_cluster_size',
       'sd_cluster_size', 'nenriched_clusters', 'mean_enriched_cluster_size',
       'median_enriched_cluster_size', 'sd_enriched_cluster_size',
       'nenriched_cluster_genes', 'datetime', 'fraction_clusters_enriched'],
      dtype='object')

In [8]:
# Type of the first stored value in every column, keyed by column name
{name: type(column.values[0]) for name, column in df_stats.items()}

{'partition_type': str,
 'dimensionality': str,
 'metric': str,
 'graph': str,
 'nns': numpy.int64,
 'clustering': str,
 'parameter': numpy.float64,
 'silhouette_score': numpy.float64,
 'modularity': numpy.float64,
 'nclusters': numpy.int64,
 'mean_cluster_size': numpy.float64,
 'median_cluster_size': numpy.float64,
 'sd_cluster_size': numpy.float64,
 'nenriched_clusters': numpy.int64,
 'mean_enriched_cluster_size': numpy.float64,
 'median_enriched_cluster_size': numpy.float64,
 'sd_enriched_cluster_size': numpy.float64,
 'nenriched_cluster_genes': numpy.int64,
 'datetime': str,
 'fraction_clusters_enriched': numpy.float64}

In [9]:
# Numeric columns, taken from the column dtypes rather than from the Python type
# of the first value: this also recognises other integer/float widths and does
# not raise on an empty frame.
df_stats.select_dtypes(include='number').columns.tolist()

['nns',
 'parameter',
 'silhouette_score',
 'modularity',
 'nclusters',
 'mean_cluster_size',
 'median_cluster_size',
 'sd_cluster_size',
 'nenriched_clusters',
 'mean_enriched_cluster_size',
 'median_enriched_cluster_size',
 'sd_enriched_cluster_size',
 'nenriched_cluster_genes',
 'fraction_clusters_enriched']

In [10]:
def smallest_unit(number):
    """Return the place value of the last decimal digit in ``str(number)``.

    Examples: 0.25 -> 0.01, 3.0 -> 0.1, 7 -> 1. The textual form is parsed as a
    decimal number, so scientific notation is handled as well: 1e-05 -> 1e-05
    and 1.5e-07 -> 1e-08. A plain search for '.' in the string gets both wrong.

    Args:
        number: Any value whose ``str()`` form is a number (Python or NumPy
            int/float).

    Returns:
        ``10 ** -k`` where ``k`` is the number of decimal places, or ``1`` when
        the value has no fractional digits, is NaN/infinite, or its text cannot
        be parsed as a number.
    """
    # Local import keeps the cell self-contained.
    from decimal import Decimal, InvalidOperation

    try:
        exponent = Decimal(str(number)).as_tuple().exponent
    except InvalidOperation:
        # Text that is not a number (e.g. 'None'): treat like an integer.
        return 1

    # NaN and infinity report a non-integer exponent marker; values without
    # fractional digits report an exponent >= 0.
    if not isinstance(exponent, int) or exponent >= 0:
        return 1

    return 10 ** exponent

In [11]:
# resource: https://malouche.github.io/notebooks/scatter_bokeh2.html
from bokeh.plotting import figure, show
from bokeh.models import HoverTool
from bokeh.models import Panel, Tabs
from bokeh.plotting import output_file, save

import pandas as pd
import math

tabs = []

# Tab title -> frame plotted in that tab
bokeh_data_dict = {}

bokeh_data_dict['METRIC COMPARISON: UNFILTERED'] = df_stats

# Numeric columns, identified from the column dtypes rather than from the Python
# type of the first value, so other integer/float widths are recognised and an
# empty frame does not raise
filter_cols = df_stats.select_dtypes(include='number').columns.tolist()

# Second tab: rows with partition_type 'EXP' in the baseline / UMAP graph /
# Leiden CPM configuration whose mean and median cluster size both lie strictly
# between 10 and 40
bokeh_data_dict['METRIC COMPARISON: FILTERED (10 < mean_cluster_size < 40) (10 < median_cluster_size < 40)'] = df_stats.loc[
    # Optional extra filters:
    # (df_stats['metric'] == 'cosine') &
    # (df_stats['nns'] == 3) &
    # (df_stats['parameter'] == 0.405) &

    (df_stats['mean_cluster_size'] > 10) &
    (df_stats['median_cluster_size'] > 10) &
    (df_stats['mean_cluster_size'] < 40) &
    (df_stats['median_cluster_size'] < 40) &
    # (df_stats['nenriched_cluster_genes'] > 7000) &
    # (df_stats['modularity'] > 0.4) &

    (df_stats['dimensionality'] == 'baseline') &
    (df_stats['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats['clustering'] == 'leiden_cpm') &
    (df_stats['partition_type'] == 'EXP')
]

bokeh_x = 'modularity'
# bokeh_y = 'nenriched_clusters'
bokeh_y = 'fraction_clusters_enriched'
color_column = 'metric'  # Column to use for coloring the points

# Axis limits shared by every tab: the data range of the full frame, widened to
# the surrounding integers plus one unit of the extreme value's last decimal
# place. The extremes are looked up once instead of twice each.
x_data_min, x_data_max = df_stats[bokeh_x].min(), df_stats[bokeh_x].max()
y_data_min, y_data_max = df_stats[bokeh_y].min(), df_stats[bokeh_y].max()

x_min = math.floor(x_data_min) - smallest_unit(x_data_min)
x_max = math.ceil(x_data_max) + smallest_unit(x_data_max)
y_min = math.floor(y_data_min) - smallest_unit(y_data_min)
y_max = math.ceil(y_data_max) + smallest_unit(y_data_max)

# Fixed colour per metric so a metric looks the same in every tab
metric_colors_dict = {
    'minkowski_5.0': '#fa0217',
    'minkowski_3.0': '#fa029b',
    'minkowski_2.0': '#bc02fa',
    'clr': '#0207fa',
    'cosine': '#02e1fa',
    'minkowski_1.0': '#02fa28',
    'manhattan': '#e9fa02',
    'minkowski_0.5': '#fa9f02',
    'euclidean': '#02fab8',
    'minkowski_4.0': '#8bfa02'
}

# Tooltip rows are identical for every tab, so they are listed once up front
hover_tooltips = [
    ('Partition Type', '@partition_type'),
    ('Dimensionality', '@dimensionality'),
    ('Metric', '@metric'),
    ('Graph', '@graph'),
    ('Nearest Neighbors', '@nns'),
    ('Clustering', '@clustering'),
    ('Parameter', '@parameter'),
    ('Silhouette Score', '@silhouette_score'),
    ('Modularity', '@modularity'),
    ('Number of Clusters', '@nclusters'),
    ('Mean Cluster Size', '@mean_cluster_size'),
    ('Median Cluster Size', '@median_cluster_size'),
    ('Standard Deviation of Cluster Size', '@sd_cluster_size'),
    ('Number of Enriched Clusters', '@nenriched_clusters'),
    ('Mean Enriched Cluster Size', '@mean_enriched_cluster_size'),
    ('Median Enriched Cluster Size', '@median_enriched_cluster_size'),
    ('Standard Deviation of Enriched Cluster Size', '@sd_enriched_cluster_size'),
    ('Number of Enriched Cluster Genes', '@nenriched_cluster_genes'),
    ('Fraction of Clusters Enriched', '@fraction_clusters_enriched'),
]

# Build one figure (wrapped in a tab panel) per entry of bokeh_data_dict
for tab_name, bokeh_df in bokeh_data_dict.items():

    p = figure(
        plot_width=1000,
        plot_height=800,
        title='HOVER OVER POINTS TO SHOW DATA. CLICK ON METRICS IN THE LEGEND TO HIDE THEM.',
        sizing_mode='stretch_both',
        x_range=(x_min, x_max),
        y_range=(y_min, y_max),
    )

    # One scatter renderer per metric, so each metric gets its own legend entry
    for metric_name in sorted(bokeh_df[color_column].unique()):
        metric_rows = bokeh_df.loc[bokeh_df[color_column] == metric_name]
        p.scatter(
            bokeh_x,
            bokeh_y,
            size=3,
            color=metric_colors_dict[metric_name],
            legend_label=metric_name,
            source=metric_rows,
        )

    hover = HoverTool(tooltips=hover_tooltips)
    p.add_tools(hover)

    # Clicking a legend entry hides that metric's points
    p.legend.click_policy = "hide"
    p.legend.location = "bottom_left"

    p.xaxis.axis_label = bokeh_x
    p.yaxis.axis_label = bokeh_y

    p.min_border = 100

    tabs.append(Panel(child=p, title=tab_name))

# Keep a separate list of these panels; `tabs` is reassigned by the next cell
plot_tabs1 = list(tabs)


In [12]:
# resource: https://malouche.github.io/notebooks/scatter_bokeh2.html
from bokeh.plotting import figure, show
from bokeh.models import HoverTool
from bokeh.models import Panel, Tabs, RangeSlider, CustomJS, Column, Row, ColumnDataSource, CategoricalColorMapper, Legend

from bokeh.plotting import output_file, save

import pandas as pd
import math

tabs = []

# Tab title -> frame plotted in that tab
bokeh_data_dict = {}

bokeh_data_dict['DYNAMIC FILTERING: UNFILTERED'] = df_stats

# Columns that get a range slider: every numeric column, identified from the
# column dtypes rather than from the Python type of the first value, so other
# integer/float widths are recognised and an empty frame does not raise
filter_cols = df_stats.select_dtypes(include='number').columns.tolist()

# Second tab: rows with partition_type 'EXP' in the baseline / UMAP graph /
# Leiden CPM configuration whose mean and median cluster size both lie strictly
# between 10 and 40
bokeh_data_dict['DYNAMIC FILTERING: FILTERED (10 < mean_cluster_size < 40) (10 < median_cluster_size < 40)'] = df_stats.loc[
    # Optional extra filters:
    # (df_stats['metric'] == 'cosine') &
    # (df_stats['nns'] == 3) &
    # (df_stats['parameter'] == 0.405) &

    (df_stats['mean_cluster_size'] > 10) &
    (df_stats['median_cluster_size'] > 10) &
    (df_stats['mean_cluster_size'] < 40) &
    (df_stats['median_cluster_size'] < 40) &
    # (df_stats['nenriched_cluster_genes'] > 7000) &
    # (df_stats['modularity'] > 0.4) &

    (df_stats['dimensionality'] == 'baseline') &
    (df_stats['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats['clustering'] == 'leiden_cpm') &
    (df_stats['partition_type'] == 'EXP')
]

bokeh_x = 'modularity'
# bokeh_y = 'nenriched_clusters'
bokeh_y = 'fraction_clusters_enriched'
color_column = 'metric'  # Column to use for coloring the points

# Axis limits shared by every tab: the data range of the full frame, widened to
# the surrounding integers plus one unit of the extreme value's last decimal
# place. The extremes are looked up once instead of twice each.
x_data_min, x_data_max = df_stats[bokeh_x].min(), df_stats[bokeh_x].max()
y_data_min, y_data_max = df_stats[bokeh_y].min(), df_stats[bokeh_y].max()

x_min = math.floor(x_data_min) - smallest_unit(x_data_min)
x_max = math.ceil(x_data_max) + smallest_unit(x_data_max)
y_min = math.floor(y_data_min) - smallest_unit(y_data_min)
y_max = math.ceil(y_data_max) + smallest_unit(y_data_max)

# Fixed colour per metric so a metric looks the same in every tab
metric_colors_dict = {
    'minkowski_5.0': '#fa0217',
    'minkowski_3.0': '#fa029b',
    'minkowski_2.0': '#bc02fa',
    'clr': '#0207fa',
    'cosine': '#02e1fa',
    'minkowski_1.0': '#02fa28',
    'manhattan': '#e9fa02',
    'minkowski_0.5': '#fa9f02',
    'euclidean': '#02fab8',
    'minkowski_4.0': '#8bfa02'
}

# Maps each metric name to its colour when points are drawn from one shared source
color_mapper = CategoricalColorMapper(factors=list(metric_colors_dict.keys()), palette=list(metric_colors_dict.values()))

# Build one tab per entry of bokeh_data_dict: a scatter plot next to a column of
# range sliders (one slider per numeric column in filter_cols).
for tab_name, bokeh_df in bokeh_data_dict.items():

    # All points of a tab share one data source so the JS callback can address them
    bokeh_data_source = ColumnDataSource(bokeh_df)

    # Column name -> RangeSlider spanning that column's min..max in this tab's frame
    sliders = {}

    for variable in filter_cols:
        # The step is one unit of the last decimal place of the column maximum
        sliders[variable] = RangeSlider(start=bokeh_df[variable].min(),
                                            end=bokeh_df[variable].max(),
                                            value=(bokeh_df[variable].min(), bokeh_df[variable].max()),
                                            step=smallest_unit(bokeh_df[variable].max()),
                                            title=f"{variable} range")


    p = figure(plot_width=1000, plot_height=800, title='HOVER OVER POINTS TO SHOW DATA. USE THE SLIDERS TO FILTER THE PARTITIONS.', sizing_mode='stretch_both', x_range=(x_min, x_max), y_range=(y_min, y_max))

    # Points are coloured by metric through color_mapper
    p.circle(
            x=bokeh_x,
            y=bokeh_y,
            source=bokeh_data_source,
            color={'field': color_column, 'transform': color_mapper},
            size=5,
            alpha=1,
            line_color='black'
        )

    # Browser-side callback: collects the positions of all rows whose value lies
    # inside every slider's current range and sets them as the source's selected
    # indices. Rows outside the ranges are not removed from the source.
    # NOTE(review): the loop length comes from data['index'], presumably the
    # column added for the DataFrame index when the source is built - confirm.
    callback = CustomJS(args=dict(source=bokeh_data_source, sliders=sliders), code="""
        const data = source.data;
        const indices = [];
        const variables = Object.keys(sliders);
        
        for (let i = 0; i < data['index'].length; i++) {
            let include = true;
            
            for (const variable of variables) {
                const slider_range = sliders[variable].value;
                const value = data[variable][i];
                
                if (value < slider_range[0] || value > slider_range[1]) {
                    include = false;
                    break;
                }
            }
            
            if (include) {
                indices.push(i);
            }
        }
        
        source.selected.indices = indices;
        source.change.emit();
    """)

    # Attach the callback to all sliders
    for slider in sliders.values():
        slider.js_on_change('value', callback)

    # Add all sliders to a layout
    slider_layout = Column(*sliders.values())

    # Tooltip shown when hovering over a point; '@name' refers to a source column
    hover = HoverTool(tooltips=[
        ('Partition Type', '@partition_type'),
        ('Dimensionality', '@dimensionality'),
        ('Metric', '@metric'),
        ('Graph', '@graph'),
        ('Nearest Neighbors', '@nns'),
        ('Clustering', '@clustering'),
        ('Parameter', '@parameter'),
        ('Silhouette Score', '@silhouette_score'),
        ('Modularity', '@modularity'),
        ('Number of Clusters', '@nclusters'),
        ('Mean Cluster Size', '@mean_cluster_size'),
        ('Median Cluster Size', '@median_cluster_size'),
        ('Standard Deviation of Cluster Size', '@sd_cluster_size'),
        ('Number of Enriched Clusters', '@nenriched_clusters'),
        ('Mean Enriched Cluster Size', '@mean_enriched_cluster_size'),
        ('Median Enriched Cluster Size', '@median_enriched_cluster_size'),
        ('Standard Deviation of Enriched Cluster Size', '@sd_enriched_cluster_size'),
        ('Number of Enriched Cluster Genes', '@nenriched_cluster_genes'),
        ('Fraction of Clusters Enriched', '@fraction_clusters_enriched'),
    ])
    p.add_tools(hover)

    # No legend settings here: the single circle renderer defines no legend entries
    # p.legend.click_policy="hide"
    # p.legend.location = "bottom_left"

    p.xaxis.axis_label = bokeh_x
    p.yaxis.axis_label = bokeh_y

    p.min_border = 100

    slider_layout.margin = 100

    # Plot on the left, sliders on the right
    layout = Row(p, slider_layout)

    tabs.append(Panel(child=layout, title=tab_name))

# Copy of the panels built by this cell
plot_tabs2 = [t for t in tabs]

# FIXME add filtering
# https://docs.bokeh.org/en/latest/docs/user_guide/basic/data.html#customjsfilter

In [13]:
# Combine the metric-comparison tabs and the dynamic-filtering tabs into one layout
tabbed_plot = Tabs(tabs=(plot_tabs1 + plot_tabs2))

# Direct Bokeh output to a standalone HTML file next to the notebook
output_file('./clustering_analysis.html')

save(tabbed_plot)

show(tabbed_plot)

In [None]:
df_stats

In [None]:
# Drop rows that repeat an earlier row in every column except the run timestamp.
# The result is reassigned instead of using inplace=True, so frames that earlier
# cells still reference (e.g. the Bokeh tab data) are not modified underneath them.
df_stats = df_stats.drop_duplicates(subset=df_stats.columns.difference(['datetime']).tolist())

In [None]:
df_stats

In [None]:
dataframe_utils.sql_query_df({'scan_stats_df': df_stats}, 'select max(parameter) from scan_stats_df')

In [None]:
dataframe_utils.sql_query_df({'scan_stats_df': df_stats}, 'select min(parameter) from scan_stats_df')

In [None]:
dataframe_utils.sql_query_df({'scan_stats_df': df_stats}, 'select count(distinct parameter) from scan_stats_df')

In [None]:
def compute_num_clusters_fraction_of_genes_enriched(row, total_genes=20326):
    """Scale the number of enriched clusters by the fraction of genes they cover.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'nenriched_clusters' and 'nenriched_cluster_genes'.
        total_genes: Denominator of the gene fraction. Defaults to 20326,
            presumably the total number of genes in the dataset (TODO confirm);
            pass another value for a different gene universe.

    Returns:
        nenriched_clusters * (nenriched_cluster_genes / total_genes).
    """
    gene_fraction = row['nenriched_cluster_genes'] / total_genes
    return row['nenriched_clusters'] * gene_fraction

In [None]:
df_stats['num_clusters_fraction_of_genes_enriched'] = df_stats.apply(compute_num_clusters_fraction_of_genes_enriched, axis=1)

In [None]:
df_stats

In [None]:
19 * (19933/20326)

# DISTANCE METRIC COMPARISON

In [None]:
# One visually distinct colour per distance metric present in the scan results
num_colors = df_stats['metric'].nunique(dropna=False)
colors = distinctipy.get_colors(num_colors)

# Preview the generated palette
distinctipy.color_swatch(colors)

In [None]:
# Columns shown on the axes (set once; they do not change between metrics)
y = 'nenriched_clusters'
# y = 'num_clusters_fraction_of_genes_enriched'

# x = 'silhouette_score'
x = 'modularity'

plt.figure(figsize=(40, 40))

# Metric name -> colour, used to build the legend below
color_dict = {}

# Rows that take part in the comparison: the baseline / UMAP graph / Leiden CPM
# configuration, restricted to rows that have a parameter value
config_mask = (
    df_stats['parameter'].notna() &
    (df_stats['dimensionality'] == 'baseline') &
    (df_stats['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats['clustering'] == 'leiden_cpm')
)

# One scatter call per metric. All points of a metric share a colour, so drawing
# them parameter by parameter only adds one filter pass and one artist per
# (metric, parameter) pair without changing the picture.
for idx, m in enumerate(df_stats['metric'].unique()):
    metric_df = df_stats.loc[config_mask & (df_stats['metric'] == m)]

    plt.scatter(metric_df[x].values, metric_df[y].values, color=colors[idx], s=5)

    color_dict[m] = colors[idx]

plt.xlabel(x, fontsize=40)
plt.ylabel(y, fontsize=40)
plt.title(f'{y} VS. {x}', fontsize=40)

# Legend with one marker per metric, placed outside the axes on the right
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_dict[m], markersize=10) for m in color_dict]
plt.legend(handles, color_dict.keys(), title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=40)

plt.show()

In [None]:
df_stats.loc[
    (df_stats['mean_cluster_size'] > 9) & 
    (df_stats['mean_cluster_size'] < 30) &
    (df_stats['median_cluster_size'] > 9) & 
    (df_stats['median_cluster_size'] < 30)
    ]

In [None]:
# Split the scan results on partition_type: 'EXP' rows versus all other rows
# (the latter are used below as negative controls)
is_exp_partition = df_stats['partition_type'] == 'EXP'

df_stats_true = df_stats.loc[is_exp_partition]
df_stats_neg_ctrl = df_stats.loc[~is_exp_partition]

In [None]:
df_stats_true

In [None]:
df_stats_neg_ctrl

In [None]:
# 'EXP' rows for one fixed scan point (10 nearest neighbours, parameter 0.0105)
# of the selected metric
df_true_cluster = df_stats_true.loc[
    (df_stats_true['nns'] == 10) &
    # parameter is a float: compare with a tolerance, because exact equality
    # misses values that differ only by floating-point representation error
    np.isclose(df_stats_true['parameter'], 0.0105) &

    (df_stats_true['dimensionality'] == 'baseline') &
    (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats_true['clustering'] == 'leiden_cpm') &
    (df_stats_true['metric'] == plotting_metric)
]

In [None]:
# Non-'EXP' rows for the same fixed scan point (10 nearest neighbours,
# parameter 0.0105) of the selected metric
df_neg_ctrls = df_stats_neg_ctrl.loc[
    (df_stats_neg_ctrl['nns'] == 10) &
    # parameter is a float: compare with a tolerance, because exact equality
    # misses values that differ only by floating-point representation error
    np.isclose(df_stats_neg_ctrl['parameter'], 0.0105) &

    (df_stats_neg_ctrl['dimensionality'] == 'baseline') &
    (df_stats_neg_ctrl['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats_neg_ctrl['clustering'] == 'leiden_cpm') &
    (df_stats_neg_ctrl['metric'] == plotting_metric)
]

In [None]:
df_true_cluster

In [None]:
df_neg_ctrls

In [None]:
hist_clustering_metric = 'silhouette_score'
# hist_clustering_metric = 'modularity'

hist_data = df_neg_ctrls[hist_clustering_metric].values
hist_data

In [None]:
# hist_line_at_x = df_true_cluster[hist_clustering_metric].values[0]
# hist_line_at_x

In [None]:
# # Create the histogram
# plt.hist(hist_data, bins=30, edgecolor='black')

# # Add a vertical line at the specified x value
# plt.axvline(x=hist_line_at_x, color='red', linestyle='dashed', linewidth=2, label='experimental at x={}'.format(round(hist_line_at_x, 2)))

# # Add labels and title
# plt.xlabel(hist_clustering_metric)
# plt.ylabel('frequency')
# plt.title(f'{len(hist_data)} neg. controls vs. experimental {hist_clustering_metric} distribution')

# plt.legend()

# # Show the plot
# plt.show()

In [None]:
df_stats_true['silhouette_score'].mean()

In [None]:
df_stats_true['silhouette_score'].median()

In [None]:
df_stats_true['silhouette_score'].max()

In [None]:
df_stats_true['modularity'].mean()

In [None]:
df_stats_neg_ctrl['silhouette_score'].mean()

In [None]:
df_stats_neg_ctrl['silhouette_score'].median()

In [None]:
df_stats_neg_ctrl['silhouette_score'].max()

In [None]:
df_stats_neg_ctrl['modularity'].mean()

In [None]:
df_stats_neg_ctrl['modularity'].median()

In [None]:
df_stats_true['nns'].unique()

In [None]:
df_stats_true['parameter'].unique()

In [None]:
df_stats_true['parameter'].min()

In [None]:
df_stats_true.loc[df_stats_true['modularity'] > 0.8]

In [None]:
df_stats_true.loc[(df_stats_true['silhouette_score'] > 0.04) & (df_stats_true['nclusters'] > 100)]

In [None]:
df_stats_true.loc[df_stats_true['nclusters'] < 10]

In [None]:
df_stats_true.loc[(df_stats_true['metric'] != 'manhattan') & (df_stats_true['metric'] != 'euclidean')]

In [None]:
df_stats_true.loc[(df_stats_true['metric'] != 'cosine') & (df_stats_true['metric'] != 'euclidean') & (df_stats_true['nns'] == 12) & (df_stats_true['parameter'] == 0.095)]

In [None]:
df_stats_true.loc[(df_stats_true['metric'] != 'manhattan') & (df_stats_true['metric'] != 'euclidean')]

In [None]:
# For every parameter value, print the row(s) that reach the highest silhouette score
for p in df_stats_true['parameter'].unique():
    pdf = df_stats_true[df_stats_true['parameter'] == p]
    top_score = pdf['silhouette_score'].max()
    max_pdf = pdf[pdf['silhouette_score'] == top_score]
    print(max_pdf[['nns', 'silhouette_score', 'nclusters']])

In [None]:
# For every parameter value, keep the row with the highest silhouette score.
# The records are collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per call.
best_nn_columns = ['parameter', 'nns', 'silhouette_score', 'nclusters',
                   'nenriched_clusters', 'nenriched_cluster_genes']
best_nn_records = []

# Iterate over unique parameters
for p in df_stats_true['parameter'].unique():
    pdf = df_stats_true.loc[df_stats_true['parameter'] == p]

    # Nothing to pick from (no rows, e.g. for a NaN parameter, or only NaN
    # scores); idxmax cannot return a usable row label in that case
    if pdf['silhouette_score'].isna().all():
        continue

    # Find the row with the maximum silhouette_score
    max_pdf = pdf.loc[pdf['silhouette_score'].idxmax()]

    best_nn_records.append({'parameter': p, 'nns': max_pdf['nns'],
                            'silhouette_score': max_pdf['silhouette_score'],
                            'nclusters': max_pdf['nclusters'],
                            'nenriched_clusters': max_pdf['nenriched_clusters'],
                            'nenriched_cluster_genes': max_pdf['nenriched_cluster_genes']
                            })

best_nn_df = pd.DataFrame(best_nn_records, columns=best_nn_columns)

# Show only the best rows that have more than 50 clusters
best_nn_df[best_nn_df['nclusters'] > 50]

# CONSTRAINED NN PLOTTING PARAMETERS

In [None]:
# PLOTTING PARAMETERS
df = df_stats_true.loc[
    (df_stats_true['nns'] == 6) & 
    # (df_stats_true['parameter'] == 0.0005) & 

    (df_stats_true['dimensionality'] == 'baseline') &
    (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats_true['clustering'] == 'leiden_cpm') &
    (df_stats_true['metric'] == plotting_metric) &
    (df_stats_true['nclusters'] > 50)
]

df

In [None]:
df = df_stats.loc[
    (df_stats['nns'] > 4) & 
    (df_stats['nns'] < 6) & 
    (df_stats['parameter'] > 0.034) & 
    (df_stats['parameter'] < 0.036) &


    (df_stats['dimensionality'] == 'baseline') &
    (df_stats['graph'] == 'umap_fuzzy_simplicial_set') &
    (df_stats['clustering'] == 'leiden_cpm') &
    (df_stats['metric'] == 'manhattan')
]

In [None]:
df_optimal = df_stats_true.loc[
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm') 
    & (df_stats_true['metric'] == plotting_metric) 
    # & (df_stats_true['nenriched_cluster_genes'] > 5000) 
    # & (df_stats_true['nenriched_cluster_genes'] < 6000) 
    # & (df_stats_true['nclusters'] > 1000)
    # & (df_stats_true['nclusters'] < 1300)
    & (df_stats_true['nenriched_clusters'] > 350)
    # & (df_stats_true['modularity'] > 0.5)
    & (df_stats_true['mean_cluster_size'] < 50)
    & (df_stats_true['median_cluster_size'] < 50)
    & (df_stats_true['mean_enriched_cluster_size'] < 50)
    & (df_stats_true['median_enriched_cluster_size'] < 50)
    & (df_stats_true['mean_cluster_size'] > 10)
    & (df_stats_true['median_cluster_size'] > 10)
    & (df_stats_true['mean_enriched_cluster_size'] > 10)
    & (df_stats_true['median_enriched_cluster_size'] > 10)
].sort_values(by=['modularity', 'silhouette_score'], ascending=False)
df_optimal

In [None]:
plt.plot(df_optimal['modularity'].values, df_optimal['silhouette_score'].values)
plt.show()

In [None]:
plt.plot(df_stats_true['modularity'].values, df_stats_true['silhouette_score'].values)
plt.show()

In [None]:
df_stats_true.loc[
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm') 
    & (df_stats_true['metric'] == plotting_metric) 
    & (df_stats_true['nclusters'] > 1000)
    & (df_stats_true['nclusters'] < 1300)
]

In [None]:
df_stats_true.loc[
    (df_stats_true['num_clusters_fraction_of_genes_enriched'] == df_stats_true['num_clusters_fraction_of_genes_enriched'].max())
]

In [None]:
df_stats_true.loc[
    (df_stats_true['nenriched_cluster_genes'] == df_stats_true['nenriched_cluster_genes'].max())
]

# Upper limit of nenriched_cluster_genes: 10247/19152 genes 53.50%
# (2021) 6625 genes (32.59%)

In [None]:
df.loc[df['nenriched_clusters']==df['nenriched_clusters'].max()]

In [None]:
# 8NN local maxs (sklearn NN)
# parameter is a float, so rows are matched with a tolerance instead of exact equality
local_max_parameters = [0.950, 0.755, 0.530, 0.370, 0.225, 0.115, 0.045, 0.020]
df.loc[np.isclose(df['parameter'].to_numpy()[:, None], local_max_parameters).any(axis=1)]

In [None]:
# 10NN local maxs (sklearn NN)
# parameter is a float, so rows are matched with a tolerance instead of exact equality
df.loc[np.isclose(df['parameter'], 0.1)]

In [None]:
# 2NN local maxs
# parameter is a float, so rows are matched with a tolerance instead of exact equality
df.loc[np.isclose(df['parameter'], 0.280)]

In [None]:
# 3NN local maxs
# parameter is a float, so rows are matched with a tolerance instead of exact equality
local_max_parameters = [0.900, 0.605, 0.536, 0.485, 0.340, 0.075]
df.loc[np.isclose(df['parameter'].to_numpy()[:, None], local_max_parameters).any(axis=1)]

In [None]:
# 5NN local maxs
# parameter is a float, so rows are matched with a tolerance instead of exact
# equality. The tolerance also covers stored values such as 0.0499999999999999
# and 0.0349999999999999, which is why 0.05 and 0.035 can be written plainly.
local_max_parameters = [0.935, 0.795, 0.640, 0.465, 0.350, 0.295, 0.190, 0.125, 0.05, 0.035]
df.loc[np.isclose(df['parameter'].to_numpy()[:, None], local_max_parameters).any(axis=1)]

In [None]:
# 6NN local maxs
# parameter is a float, so rows are matched with a tolerance instead of exact equality
local_max_parameters = [0.800, 0.665, 0.435, 0.045]
df.loc[np.isclose(df['parameter'].to_numpy()[:, None], local_max_parameters).any(axis=1)]

In [None]:
y = 'nenriched_clusters'
# y = 'num_clusters_fraction_of_genes_enriched'

# x = 'silhouette_score'
x = 'modularity'

labels = 'parameter'
# labels = 'nns'

In [None]:
# Figure size in inches (alternative: x_in = 15, y_in = 10)
x_in, y_in = 50, 50
plt.figure(figsize=(x_in, y_in))

# Scatter of the two selected columns of df
plt.scatter(df[x].values, df[y].values)

# Label offsets: shift right by 0.75% of the plotted x-range, no vertical shift
label_adj_x = 0 + (df[x].max() - df[x].min()) * 0.0075
label_adj_y = 0

# Annotate every point with its value from the `labels` column, to three decimals
for x_val, y_val, txt in zip(df[x].values, df[y].values, df[labels].values):
    plt.text(
        x_val + label_adj_x,
        y_val + label_adj_y,
        f'{txt:.3f}',
        fontsize=8,
        ha='center',
        va='top',
        color='black',
    )

# Axis labels and title
plt.xlabel(x)
plt.ylabel(y)
plt.title(f'{y} VS. {x} for varying {labels}')

plt.show()

In [None]:
df_stats_true['parameter'].unique()

# NN CURVES FOR VARYING RP PLOTTING PARAMETERS

In [None]:
# Parameter values to draw, in ascending order
plot_ps = sorted(df_stats_true['parameter'].unique())

# FOR FINER STEP SCAN
# With more than 200 rows for the selected metric, keep only every 20th
# parameter value (starting with the 20th) so the plots stay readable. The
# sorted list is sliced directly instead of being re-sorted for every pick.
if (df_stats_true['metric'] == plotting_metric).sum() > 200:
    plot_ps = plot_ps[19::20]

In [None]:
import distinctipy

# One visually distinct colour per parameter value found in the 'EXP' rows
num_colors = df_stats_true['parameter'].nunique(dropna=False)

colors = distinctipy.get_colors(num_colors)

In [None]:
plt.figure(figsize=(20, 20))

# Columns on the axes
y = 'nenriched_clusters'
# y = 'num_clusters_fraction_of_genes_enriched'
x = 'modularity'
# x = 'silhouette_score'

# Filters common to every curve: fixed configuration, selected metric and
# more than 50 clusters (optionally also a fixed nns, e.g. nns == 6)
curve_mask = (
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm')
    & (df_stats_true['metric'] == plotting_metric)
    & (df_stats_true['nclusters'] > 50)
)

# One line per selected parameter value, drawn in row order
for idx, p in enumerate(plot_ps):
    df = df_stats_true.loc[curve_mask & (df_stats_true['parameter'] == p)]
    plt.plot(df[x].values, df[y].values, label=str(p), color=colors[idx])

# Axis labels and title
plt.xlabel(x)
plt.ylabel(y)
plt.title(f'{y} VS. {x}')

plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(20, 20))

# Columns on the axes
y = 'nenriched_cluster_genes'
# y = 'num_clusters_fraction_of_genes_enriched'
x = 'modularity'
# x = 'silhouette_score'

# Filters common to every curve: fixed configuration, selected metric and
# more than 50 clusters (optionally also a fixed nns, e.g. nns == 6)
curve_mask = (
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm')
    & (df_stats_true['metric'] == plotting_metric)
    & (df_stats_true['nclusters'] > 50)
)

# One line per selected parameter value, drawn in row order
for idx, p in enumerate(plot_ps):
    df = df_stats_true.loc[curve_mask & (df_stats_true['parameter'] == p)]
    plt.plot(df[x].values, df[y].values, label=str(p), color=colors[idx])

# Axis labels and title
plt.xlabel(x)
plt.ylabel(y)
plt.title(f'{y} VS. {x}')

plt.legend()
plt.show()

In [None]:
# Cosine, 3 nearest neighbours, parameter 0.5, more than 50 clusters
df_stats_true.loc[
        (df_stats_true['nns'] == 3) &
        # float column: compare with a tolerance rather than exact equality
        np.isclose(df_stats_true['parameter'], 0.5) &
        (df_stats_true['dimensionality'] == 'baseline') &
        (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
        (df_stats_true['clustering'] == 'leiden_cpm') &
        (df_stats_true['metric'] == 'cosine') &
        (df_stats_true['nclusters'] > 50)
    ]

In [None]:
# Cosine, 3 nearest neighbours, parameter 0.21, more than 50 clusters
df_stats_true.loc[
        (df_stats_true['nns'] == 3) &
        # float column: compare with a tolerance rather than exact equality
        np.isclose(df_stats_true['parameter'], 0.21) &
        (df_stats_true['dimensionality'] == 'baseline') &
        (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
        (df_stats_true['clustering'] == 'leiden_cpm') &
        (df_stats_true['metric'] == 'cosine') &
        (df_stats_true['nclusters'] > 50)
    ]

In [None]:
# Cosine, 3 nearest neighbours, parameter 0.405, more than 50 clusters
df_stats_true.loc[
        (df_stats_true['nns'] == 3) &
        # float column: compare with a tolerance rather than exact equality
        np.isclose(df_stats_true['parameter'], 0.405) &
        (df_stats_true['dimensionality'] == 'baseline') &
        (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
        (df_stats_true['clustering'] == 'leiden_cpm') &
        (df_stats_true['metric'] == 'cosine') &
        (df_stats_true['nclusters'] > 50)
    ]

In [None]:
df_stats_true.loc[
        # (df_stats_true['nns'] == 3) & 
        # (df_stats_true['parameter'] == 0.405) & 
        (df_stats_true['dimensionality'] == 'baseline') &
        (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set') &
        (df_stats_true['clustering'] == 'leiden_cpm') &
        (df_stats_true['metric'] == 'cosine') &
        (df_stats_true['mean_cluster_size'] > 10) &
        (df_stats_true['median_cluster_size'] > 10) &
        (df_stats_true['mean_cluster_size'] < 30) &
        (df_stats_true['median_cluster_size'] < 30) &
        (df_stats_true['nenriched_cluster_genes'] > 7000) &
        (df_stats_true['modularity'] > 0.4)
    ]

In [None]:
import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection='3d')

# Columns on the three axes
y = 'nenriched_clusters'
x = 'modularity'
z = 'nenriched_cluster_genes'

# Filters common to every curve: fixed configuration, selected metric and
# more than 50 clusters (optionally also a fixed nns, e.g. nns == 6)
curve_mask = (
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm')
    & (df_stats_true['metric'] == plotting_metric)
    & (df_stats_true['nclusters'] > 50)
)

# One 3D line per selected parameter value, drawn in row order
for idx, p in enumerate(plot_ps):
    df = df_stats_true.loc[curve_mask & (df_stats_true['parameter'] == p)]
    ax.plot(df[x].values, df[y].values, df[z].values, label=str(p), color=colors[idx])

# Axis labels and title
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
plt.title(f'{y} VS. {x} VS. {z}')

plt.legend()
plt.show()

# NN SCATTER FOR VARYING RP PLOTTING PARAMETERS

In [None]:
# Columns on the axes and the column used to label each point
y = 'nenriched_clusters'
# y = 'num_clusters_fraction_of_genes_enriched'
x = 'modularity'
# x = 'silhouette_score'
labels = 'nns'
# labels = 'parameter'

# Filters common to every figure: fixed configuration, selected metric and
# more than 50 clusters (optionally also a fixed nns, e.g. nns == 6)
scatter_mask = (
    (df_stats_true['dimensionality'] == 'baseline')
    & (df_stats_true['graph'] == 'umap_fuzzy_simplicial_set')
    & (df_stats_true['clustering'] == 'leiden_cpm')
    & (df_stats_true['metric'] == plotting_metric)
    & (df_stats_true['nclusters'] > 50)
)

# One figure per parameter value, in ascending order
for p in sorted(df_stats_true['parameter'].unique()):
    df = df_stats_true.loc[scatter_mask & (df_stats_true['parameter'] == p)]

    plt.scatter(df[x].values, df[y].values)

    # Shift the labels right by 3% of the plotted x-range
    label_adj = (df[x].max() - df[x].min()) * 0.03

    # Annotate every point with its value from the `labels` column
    for x_val, y_val, txt in zip(df[x].values, df[y].values, df[labels].values):
        plt.text(x_val + label_adj, y_val, str(txt),
                 fontsize=8, ha='center', va='top', color='black')

    # Axis labels and title
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(f'{y} VS. {x} for varying {labels}')

    # Print the parameter value this figure belongs to, then render it
    print(p)
    plt.show()