In [1]:
import copy
import json
import os
import re
import glob
import tqdm
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pdb
from sklearn.metrics import silhouette_score, pairwise_distances, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

import scipy.stats as st
import scipy.spatial
import scipy.cluster.hierarchy

import requests
import bs4

import umap
# import pymde

# import torch

import igraph as ig
import leidenalg as la

from Bio import SeqIO

import bokeh
from bokeh.plotting import show as show_interactive, output_file, output_notebook
from bokeh.layouts import column, row
from bokeh.models import (
    CustomJS,
    TextInput,
    LassoSelectTool,
    Select,
    MultiSelect,
    ColorBar,
    Legend,
    LegendItem,
    DataTable,
    DateFormatter,
    TableColumn,
    Button,
    HTMLTemplateFormatter,
    FactorRange,
)
from bokeh.events import SelectionGeometry
from bokeh.transform import linear_cmap, jitter

from matplotlib.pyplot import show as show_static

import networkx as nx

import subprocess

from pynndescent import NNDescent

from csv import DictWriter

from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CLR network and give the unnamed first CSV column its gene-ID name.
clr_df = (
    pd.read_csv('./clr_network_for_distances.csv')
    .rename(columns={'Unnamed: 0': 'TTHERM_ID'})
)
print(clr_df.shape)
clr_df.head()

(20326, 20327)


Unnamed: 0,TTHERM_ID,TTHERM_000000042,TTHERM_000000045,TTHERM_00000010,TTHERM_00000020,TTHERM_00000030,TTHERM_00000040,TTHERM_00000070,TTHERM_000001189,TTHERM_000001241,...,TTHERM_02091560,TTHERM_02094560,TTHERM_02096560,TTHERM_02105572,TTHERM_02272860,TTHERM_02293890,TTHERM_02385080,TTHERM_02555200,TTHERM_02607240,TTHERM_02653470
0,TTHERM_000000042,0.0,0.0,1.506963,2.200162,1.766007,0.04684,0.225868,1.825975,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.325549,0.0
1,TTHERM_000000045,0.0,0.0,0.863134,0.0,0.0,0.0,0.403905,0.203517,0.003814,...,0.0,0.0,0.0,0.0,0.326196,0.078071,0.019517,0.0,0.0,0.0
2,TTHERM_00000010,1.506963,0.863134,0.0,2.829012,2.49344,0.0,0.349718,1.912393,0.0,...,0.686328,0.0,1.474013,0.0,0.0,1.268221,1.130958,0.0,0.040282,0.143249
3,TTHERM_00000020,2.200162,0.0,2.829012,0.0,3.052325,0.497379,0.39899,2.813094,0.0,...,1.230303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TTHERM_00000030,1.766007,0.0,2.49344,3.052325,0.0,2.678441,0.0,2.85857,0.0,...,0.0,0.0,0.369241,0.0,0.0,0.0,0.0,0.471134,0.070106,0.0


In [3]:
max_zscore = clr_df.max(axis=None, numeric_only=True)
max_zscore

25.2832981872387

In [4]:
min_zscore = clr_df.min(axis=None, numeric_only=True)
min_zscore

0.0

In [5]:
zscore_arr = clr_df.loc[:,clr_df.columns[1:]].to_numpy()
zscore_arr

array([[0.        , 0.        , 1.50696259, ..., 0.        , 1.32554853,
        0.        ],
       [0.        , 0.        , 0.8631345 , ..., 0.        , 0.        ,
        0.        ],
       [1.50696259, 0.8631345 , 0.        , ..., 0.        , 0.04028212,
        0.14324939],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 1.83724257,
        2.20228469],
       [1.32554853, 0.        , 0.04028212, ..., 1.83724257, 0.        ,
        0.20059729],
       [0.        , 0.        , 0.14324939, ..., 2.20228469, 0.20059729,
        0.        ]])

In [6]:
zero_zscore_idxs = np.where(zscore_arr == 0)
zero_zscore_idxs

(array([    0,     0,     0, ..., 20325, 20325, 20325]),
 array([    0,     1,     8, ..., 20317, 20318, 20325]))

In [7]:
zero_zscore_idxs[1].shape

(177366646,)

In [8]:
# Zero z-scores are expected to turn into +inf here (they are overwritten after
# scaling), so silence NumPy's divide-by-zero RuntimeWarning for this statement
# instead of letting it leak into the notebook output.
with np.errstate(divide='ignore'):
    inverse_zscore_arr = 1 / zscore_arr

  inverse_zscore_arr = 1 / zscore_arr


In [9]:
inverse_zscore_arr

array([[        inf,         inf,  0.66358648, ...,         inf,
         0.75440467,         inf],
       [        inf,         inf,  1.15856799, ...,         inf,
                inf,         inf],
       [ 0.66358648,  1.15856799,         inf, ...,         inf,
        24.82490999,  6.98083234],
       ...,
       [        inf,         inf,         inf, ...,         inf,
         0.54429394,  0.4540739 ],
       [ 0.75440467,         inf, 24.82490999, ...,  0.54429394,
                inf,  4.9851121 ],
       [        inf,         inf,  6.98083234, ...,  0.4540739 ,
         4.9851121 ,         inf]])

In [10]:
np.count_nonzero(inverse_zscore_arr == float('inf'))

177366646

In [11]:
def min_max_scale_2d_arr(arr: np.ndarray):
    """Min-max scale ``arr`` while ignoring ``+inf`` when finding the maximum.

    The minimum is taken over every entry and the maximum over the entries
    that are not ``+inf``. Both are printed, then ``(arr - min) / (max - min)``
    is returned as a new array. Entries equal to ``+inf`` stay ``+inf``.

    Parameters
    ----------
    arr : np.ndarray
        Array to scale. It is not modified.

    Returns
    -------
    np.ndarray
        Scaled array with the same shape as ``arr``.

    Raises
    ------
    ValueError
        If ``arr`` has no entry different from ``+inf``.
    """
    # NumPy reductions replace the builtin max()/min(), which iterate element by
    # element in Python and are prohibitively slow on a large matrix. Boolean
    # masking already yields a 1-D view of the selected values, so no flattened
    # copy of the whole array is required.
    non_inf_mask = arr != float('inf')

    max_val = np.max(arr[non_inf_mask])
    min_val = np.min(arr)

    print(max_val)
    print(min_val)

    scaled_arr = (arr - min_val) / (max_val - min_val)

    return scaled_arr



In [12]:
scaled_inverse_zscore_arr = min_max_scale_2d_arr(inverse_zscore_arr)
scaled_inverse_zscore_arr

74455098.3625645
0.03955180184936206


array([[           inf,            inf, 8.38135591e-09, ...,
                   inf, 9.60112714e-09,            inf],
       [           inf,            inf, 1.50294099e-08, ...,
                   inf,            inf,            inf],
       [8.38135591e-09, 1.50294099e-08,            inf, ...,
                   inf, 3.32890007e-07, 9.32277399e-08],
       ...,
       [           inf,            inf,            inf, ...,
                   inf, 6.77914812e-09, 5.56741054e-09],
       [9.60112714e-09,            inf, 3.32890007e-07, ...,
        6.77914812e-09,            inf, 6.64233936e-08],
       [           inf,            inf, 9.32277399e-08, ...,
        5.56741054e-09, 6.64233936e-08,            inf]])

In [13]:
np.count_nonzero(scaled_inverse_zscore_arr == float('inf'))

177366646

In [14]:
np.where(scaled_inverse_zscore_arr == float('inf'))

(array([    0,     0,     0, ..., 20325, 20325, 20325]),
 array([    0,     1,     8, ..., 20317, 20318, 20325]))

In [15]:
# Entries whose z-score was zero became inf after inversion; give them the
# value 1 instead. zero_zscore_idxs is the (rows, cols) tuple from np.where, so
# a single fancy-index assignment replaces the Python loop over every pair.
scaled_inverse_zscore_arr[zero_zscore_idxs] = 1

In [16]:
scaled_inverse_zscore_arr

array([[1.00000000e+00, 1.00000000e+00, 8.38135591e-09, ...,
        1.00000000e+00, 9.60112714e-09, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.50294099e-08, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [8.38135591e-09, 1.50294099e-08, 1.00000000e+00, ...,
        1.00000000e+00, 3.32890007e-07, 9.32277399e-08],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 6.77914812e-09, 5.56741054e-09],
       [9.60112714e-09, 1.00000000e+00, 3.32890007e-07, ...,
        6.77914812e-09, 1.00000000e+00, 6.64233936e-08],
       [1.00000000e+00, 1.00000000e+00, 9.32277399e-08, ...,
        5.56741054e-09, 6.64233936e-08, 1.00000000e+00]])

In [17]:
np.where(scaled_inverse_zscore_arr == 0)

(array([15819, 19949]), array([19949, 15819]))

In [18]:
np.where(scaled_inverse_zscore_arr == float('inf'))

(array([], dtype=int64), array([], dtype=int64))

In [None]:
# Notes on candidate ways of turning the z-scores into distances:
#   * scale the z-scores linearly from 0 to 1
#   * use 1 / zscore as the distance metric
#   * use 1 / zscore for everything that is not zero, scale 0 to 1 linearly,
#     and assign 1 to the entries that were zero

# Linear inversion: reflects every score so that max_zscore maps to min_zscore
# and min_zscore maps to max_zscore.
inverted_zscore_arr = (max_zscore + min_zscore) - zscore_arr

In [None]:
inverted_zscore_arr

In [None]:
inverted_zscore_arr.shape

In [None]:
np.min(inverted_zscore_arr)

In [None]:
(inverted_zscore_arr.shape[0] * inverted_zscore_arr.shape[1]) - np.count_nonzero(inverted_zscore_arr)

In [None]:
zero_idxs = np.where(inverted_zscore_arr == 0)

In [None]:
zero_idxs

In [None]:
# Work on a copy so inverted_zscore_arr itself stays untouched. ndarray.copy()
# is sufficient for a plain numeric array, and `copy` is already imported in the
# notebook's import cell, so no mid-notebook import is needed.
nonzero_inverted_zscore_arr = inverted_zscore_arr.copy()

In [None]:
# zero_idxs is the (row_indices, col_indices) tuple returned by np.where.
# Iterating over that tuple yields the two index arrays themselves, not
# (row, col) pairs, so it must be used as a fancy index to hit every zero entry.
nonzero_inverted_zscore_arr[zero_idxs] = 1e-20

In [None]:
np.where(nonzero_inverted_zscore_arr == 0)

In [None]:
np.fill_diagonal(nonzero_inverted_zscore_arr, 0)

In [None]:
def shuffle_row(row, rng=None):
    """Return a new Series holding the values of ``row`` in random order.

    Parameters
    ----------
    row : pd.Series
        Row to permute. It is not modified; its index is reused unchanged.
    rng : np.random.Generator, optional
        Generator used for the permutation. When omitted, the global
        ``np.random`` state is used.

    Returns
    -------
    pd.Series
        Shuffled values carrying the original index labels.
    """
    shuffled_row = row.values.copy()
    if rng is None:
        np.random.shuffle(shuffled_row)
    else:
        rng.shuffle(shuffled_row)
    return pd.Series(shuffled_row, index=row.index)


def shuffle_rows(df, rng=None):
    """Return a copy of ``df`` with the values of each row independently shuffled.

    The first column is treated as the identifier column and is left in place;
    every other column takes part in the per-row shuffle.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shuffle. It is NOT modified, so re-running a cell that calls
        this function does not silently corrupt the loaded data.
    rng : np.random.Generator, optional
        Generator forwarded to ``shuffle_row`` for reproducible shuffles.

    Returns
    -------
    pd.DataFrame
        New frame with the same index and columns as ``df``.
    """
    columns_to_shuffle = df.columns[1:]
    shuffled_df = df.copy()
    shuffled_df[columns_to_shuffle] = df[columns_to_shuffle].apply(
        lambda row: shuffle_row(row, rng), axis=1
    )
    return shuffled_df

In [None]:
def get_geom_mean_expression(expression_df):
    """
    Collapse replicate chips into one geometric-mean column per condition.

    Every column after the first is assigned to the condition named by the text
    before its first underscore. For each retained condition the geometric mean
    across its chips is computed on ``value + 1`` and 1 is subtracted again, so
    that true zeros do not zero out the mean. The result has ``TTHERM_ID`` as
    its first column, followed by the retained conditions in the order in which
    they first appear in ``expression_df``.
    """
    # C2 and S12 got removed during quality control, so they are not listed.
    retained_conditions = [
        'Ll', 'Lm', 'Lh',
        'S0', 'S3', 'S6', 'S9', 'S15', 'S24',
        'C0', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14', 'C16', 'C18',
    ]

    # Group the chip columns by condition, keeping first-appearance order.
    chips_by_condition = {}
    for chip_column in list(expression_df.columns)[1:]:
        condition = chip_column.split('_')[0]
        chips_by_condition.setdefault(condition, []).append(expression_df[chip_column].values)

    # The +1 / -1 shift keeps true zeros from collapsing the geometric mean.
    condition_means = {}
    for condition, chips in chips_by_condition.items():
        if condition in retained_conditions:
            condition_means[condition] = st.mstats.gmean(np.array(chips) + 1, 0) - 1

    mean_expr_df = pd.DataFrame(condition_means)
    mean_expr_df['TTHERM_ID'] = expression_df['TTHERM_ID'].values

    # Move the identifier column, appended last, to the front.
    column_order = list(mean_expr_df.columns)
    return mean_expr_df[column_order[-1:] + column_order[:-1]]

def normalizer(array):
    """
    Normalizes the values of an array to range from zero to one.

    NumPy arrays and pandas objects are used as they are, so a ``pd.Series``
    comes back as a ``pd.Series`` with its index preserved. Any other sequence
    (for example a list) is first converted with ``np.asarray``.

    If all values are equal the range is zero and the division produces NaN.
    """
    if isinstance(array, (np.ndarray, pd.Series, pd.DataFrame)):
        values = array
    else:
        # Plain sequences do not support element-wise arithmetic.
        values = np.asarray(array)

    lowest = np.min(values)
    highest = np.max(values)

    normalized = (values - lowest) / (highest - lowest)

    return normalized

def normalize_expression_per_gene(expression_df):
    """
    Rescale each row (gene) of an expression table to the zero-to-one range.

    When a ``TTHERM_ID`` column is present, every column after the first is
    normalized row by row with ``normalizer`` and ``TTHERM_ID`` is put back as
    the first column of the result. Without a ``TTHERM_ID`` column all columns
    are normalized row by row.
    """
    if 'TTHERM_ID' not in expression_df.columns:
        return expression_df.apply(normalizer, axis=1)

    gene_ids = expression_df['TTHERM_ID'].values
    value_columns = list(expression_df.columns)[1:]

    norm_expression_df = expression_df[value_columns].apply(normalizer, axis=1)
    norm_expression_df['TTHERM_ID'] = gene_ids

    # The identifier was appended last; rotate it back to the front.
    ordered_columns = norm_expression_df.columns.tolist()
    return norm_expression_df[ordered_columns[-1:] + ordered_columns[:-1]]

In [None]:
def compute_pairwise_distance_matrix(data_df, metric, n_jobs=-1, p_minkowski=1):
    """Return the pairwise distance matrix of the rows of ``data_df``.

    ``metric`` and ``n_jobs`` are forwarded to ``pairwise_distances``.
    ``p_minkowski`` is only passed on (as ``p``) when ``metric`` is
    ``'minkowski'``; it is ignored for every other metric.
    """
    metric_options = {'p': p_minkowski} if metric == 'minkowski' else {}
    return pairwise_distances(data_df, metric=metric, n_jobs=n_jobs, **metric_options)

In [None]:
def compute_nns(data_df, nn, metric, random_state=42, n_jobs=-1, p_minkowski=1, distance_matrix=None):
    """Exact nearest neighbours from a precomputed distance matrix.

    For every sample the ``nn - 1`` closest other samples are looked up in
    ``distance_matrix``; the sample itself is then prepended as neighbour 0
    with distance 0, so each returned row has ``nn`` entries.

    Parameters
    ----------
    data_df, metric, random_state, p_minkowski
        Unused here; kept so the signature mirrors ``compute_anns``.
    nn : int
        Number of neighbours per sample, counting the sample itself.
    n_jobs : int
        Forwarded to ``NearestNeighbors``.
    distance_matrix : array-like of shape (n_samples, n_samples)
        Precomputed pairwise distances. Required.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``(nn_idxs, nn_dists)``, each of shape ``(n_samples, nn)``.

    Raises
    ------
    ValueError
        If ``distance_matrix`` is not supplied.
    """
    if distance_matrix is None:
        raise ValueError('compute_nns requires a precomputed distance_matrix.')

    neighbor_index = NearestNeighbors(
        n_neighbors=nn - 1, metric='precomputed', n_jobs=n_jobs
    ).fit(distance_matrix)
    # Querying without X returns the neighbours of the indexed points themselves.
    nn_dists, nn_idxs = neighbor_index.kneighbors(return_distance=True)

    # Add the node itself to the nearest-neighbour data as the first column.
    n_samples = nn_idxs.shape[0]
    self_idxs = np.arange(n_samples, dtype=nn_idxs.dtype).reshape(-1, 1)
    self_dists = np.zeros((n_samples, 1), dtype=nn_dists.dtype)

    return np.hstack([self_idxs, nn_idxs]), np.hstack([self_dists, nn_dists])

In [None]:
def compute_anns(data_df, nn, metric, random_state=42, n_jobs=-1, p_minkowski=1, distance_matrix=None):
    """Approximate nearest neighbours of the rows of ``data_df`` via NNDescent.

    The number of trees and iterations is derived from the number of rows.
    ``p_minkowski`` is only handed to NNDescent (as ``metric_kwds={'p': ...}``)
    when ``metric`` is ``'minkowski'``. ``distance_matrix`` is accepted but not
    used.

    Returns the ``(indices, distances)`` pair of the index's neighbor graph.
    """
    n_samples = data_df.shape[0]
    n_trees = min(64, 5 + int(round(n_samples ** 0.5 / 20.0)))
    n_iters = max(5, int(round(np.log2(n_samples))))

    index_options = {
        'n_neighbors': nn,
        'metric': metric,
        'random_state': random_state,
        'n_trees': n_trees,
        'n_iters': n_iters,
        'max_candidates': 60,
        # 'low_memory': low_memory,
        'n_jobs': n_jobs,
        'verbose': False,
        'compressed': False,
    }
    if metric == 'minkowski':
        index_options['metric_kwds'] = {'p': p_minkowski}

    knn_search_index = NNDescent(data_df, **index_options)
    neighbor_idxs, neighbor_dists = knn_search_index.neighbor_graph

    return neighbor_idxs, neighbor_dists

In [None]:
def compute_umap_graph(data_df, nn, metric, nn_idxs, nn_dists):
    """Build a weighted igraph graph from UMAP's fuzzy simplicial set.

    Parameters
    ----------
    data_df : array-like
        Sample matrix handed to ``fuzzy_simplicial_set``.
    nn : int
        Number of neighbours per sample.
    metric : str
        Metric name handed to ``fuzzy_simplicial_set``.
    nn_idxs, nn_dists : np.ndarray
        Precomputed neighbour indices and distances.

    Returns
    -------
    igraph.Graph
        Graph with one vertex per row/column of the fuzzy set matrix and one
        edge per stored entry, carrying the entry's value as ``weight``.
    """
    # Only the sparse membership matrix is needed; the remaining return values
    # (sigmas, rhos, dists) are discarded.
    fuzzy_set = umap.umap_.fuzzy_simplicial_set(
        data_df, nn, 42, metric, knn_indices=nn_idxs, knn_dists=nn_dists, return_dists=True
    )[0]

    # Take rows, columns and weights from the same COO view so that every
    # weight is guaranteed to line up with its edge.
    coo = fuzzy_set.tocoo()
    edge_list = list(zip(coo.row.tolist(), coo.col.tolist()))

    # Passing n explicitly keeps one vertex per sample even if the highest
    # numbered samples have no edges; otherwise igraph infers the vertex count
    # from the largest index in the edge list.
    g = ig.Graph(n=coo.shape[0], edges=edge_list, edge_attrs={'weight': coo.data.tolist()})

    return g

In [None]:
def compute_leiden_partition(graph, resolution_parameter, random_state=42):
    """Partition ``graph`` with Leiden (CPM) and return the membership array.

    The edge attribute ``'weight'`` is used as edge weight and ``random_state``
    is passed as the seed. The result is a NumPy array holding one module label
    per vertex.
    """
    # Alternative quality function tried previously: la.ModularityVertexPartition.
    partition = la.find_partition(
        graph,
        la.CPMVertexPartition,
        weights='weight',
        seed=random_state,
        resolution_parameter=resolution_parameter,
    )
    return np.array(partition.membership)

In [None]:
def compute_communities(parition, idx_labels):
    """Group labels by module.

    ``parition[i]`` is the module of item ``i`` and ``idx_labels[i]`` is its
    label. Returns a dict mapping each module to the list of its labels, with
    modules and labels in order of first appearance.
    """
    members_by_module = {}
    for position, module in enumerate(parition):
        members_by_module.setdefault(module, []).append(idx_labels[position])
    return members_by_module

In [None]:
def compute_silhouette_score(distance_matrix, parition):
    """Silhouette score of the labels in ``parition``.

    ``distance_matrix`` is passed to ``silhouette_score`` as a precomputed
    pairwise distance matrix.
    """
    score = silhouette_score(distance_matrix, parition, metric='precomputed')
    return score

In [None]:
def compute_modularity(graph, communities):
    """Weighted modularity of ``communities`` on an igraph ``graph``.

    The igraph graph is converted to a networkx graph that keeps every vertex
    (including isolated ones) and, when present, the ``'weight'`` edge
    attribute, so that ``weight='weight'`` below actually takes effect.

    Parameters
    ----------
    graph : igraph.Graph
        Graph whose vertices are numbered ``0 .. vcount() - 1``.
    communities : iterable of collections
        Partition of the vertex ids.

    Returns
    -------
    float
        Modularity as computed by networkx.
    """
    nx_g = nx.Graph()
    # An edge list alone would drop isolated vertices, and the communities
    # would then no longer be a valid partition of the graph's nodes.
    nx_g.add_nodes_from(range(graph.vcount()))

    edge_list = graph.get_edgelist()
    if 'weight' in graph.es.attributes():
        # Building the graph from the bare edge list loses the weights, which
        # makes networkx fall back to a weight of 1 for every edge.
        nx_g.add_weighted_edges_from(
            (source, target, weight)
            for (source, target), weight in zip(edge_list, graph.es['weight'])
        )
    else:
        nx_g.add_edges_from(edge_list)

    return nx.community.quality.modularity(nx_g, communities, weight='weight')

In [None]:
def format_parition_for_enrichment(df, parition):
    """Build the two-column table handed to the enrichment script.

    The result holds the ``TTHERM_ID`` values of ``df`` and, in
    ``leiden_label_full``, the module label from ``parition`` for each row.
    """
    enrichment_input = pd.DataFrame({'TTHERM_ID': df['TTHERM_ID'].values})
    enrichment_input['leiden_label_full'] = parition
    return enrichment_input

In [None]:
def remove_file(file_path):
    """Delete ``file_path`` if it exists; do nothing otherwise."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)

In [None]:
def compute_enrichment(df, parition):
    """Run the external enrichment script on a partition and load its result.

    The partition is written to ``./temp_scan_partition.csv``, then
    ``./fast_enrichment_analysis.py`` is run with ``python3`` and is expected
    to write ``./temp_scan_enrich.csv``, which is read back and returned. Both
    temporary files are removed afterwards, including when a step fails.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``TTHERM_ID`` column.
    parition : array-like
        Module label for each row of ``df``.

    Returns
    -------
    pd.DataFrame
        Contents of the enrichment CSV produced by the script.

    Raises
    ------
    subprocess.CalledProcessError
        If the enrichment script exits with a non-zero status.
    """
    edf = format_parition_for_enrichment(df, parition)

    temp_scan_file = './temp_scan_partition.csv'
    temp_enrich_file = './temp_scan_enrich.csv'

    try:
        edf.to_csv(temp_scan_file, index=False)

        # check=True surfaces a failing script right here instead of as a
        # confusing read error (or a stale result file) on the next line.
        subprocess.run(
            ['python3', './fast_enrichment_analysis.py', temp_scan_file, temp_enrich_file],
            check=True,
        )

        cedf = pd.read_csv(temp_enrich_file)
    finally:
        # Always clean up so a failed run cannot leave files for a later one.
        remove_file(temp_scan_file)
        remove_file(temp_enrich_file)

    return cedf

In [None]:
def compute_num_clusters(parition, communities=None):
    """Return the number of distinct module labels in ``parition``.

    When ``communities`` is given, its length must equal that number;
    otherwise a ``ValueError`` is raised.
    """
    num_modules = len(set(parition))

    if communities is not None and num_modules != len(communities):
        raise ValueError(f'The number of clusters/modules ({num_modules}) in the parition != the number of communities ({len(communities)}).')

    return num_modules

In [None]:
def compute_cluster_sizes(communities):
    """Return the member count of every community, in dict order."""
    return list(map(len, communities.values()))

In [None]:
def compute_enriched_cluster_sizes(communities, cedf):
    """Sizes of the communities whose module appears in ``cedf['module']``.

    Sizes are returned in the iteration order of ``communities``.
    """
    enriched_modules = set(cedf['module'].values)

    sizes = []
    for module, members in communities.items():
        if module in enriched_modules:
            sizes.append(len(members))
    return sizes

In [None]:
def compute_cluster_size_mean(cluster_sizes):
    """Return the arithmetic mean of the cluster sizes."""
    return np.mean(cluster_sizes)

def compute_cluster_size_median(cluster_sizes):
    """Return the median of the cluster sizes."""
    return np.median(cluster_sizes)

def compute_cluster_size_sd(cluster_sizes):
    """Return the standard deviation of the cluster sizes (``np.std`` defaults)."""
    return np.std(cluster_sizes)

In [None]:
def compute_num_enriched_clusters(cedf):
    """Return how many distinct values occur in ``cedf['module']``."""
    distinct_modules = set(cedf['module'].values)
    return len(distinct_modules)

In [None]:
def compute_num_enriched_cluster_genes(edf, parition):
    """Count the genes that belong to any module listed in ``edf['module']``.

    Parameters
    ----------
    edf : pd.DataFrame
        Enrichment table with a ``module`` column.
    parition : array-like
        Module label per gene. Lists are accepted as well as NumPy arrays.

    Returns
    -------
    int
        Total number of entries of ``parition`` whose label is one of the
        distinct modules in ``edf``.
    """
    # Comparing a plain list with an int yields a single False rather than an
    # element-wise mask, which would silently count zero genes.
    labels = np.asarray(parition)

    total_num_genes = 0
    for m in set(edf['module'].values):
        total_num_genes += int(np.count_nonzero(labels == int(m)))

    return total_num_genes
    

In [None]:
def write_to_csv(csv_file_path, data_item, header):
    """Append ``data_item`` as one row to a CSV file.

    ``header`` lists the field names. If the file does not exist yet it is
    created and the header row is written before the data row.
    """
    is_new_file = not os.path.isfile(csv_file_path)

    with open(csv_file_path, 'a', newline='') as csv_file:
        row_writer = DictWriter(csv_file, fieldnames=header)
        if is_new_file:
            row_writer.writeheader()
        row_writer.writerow(data_item)

# CLUSTER START

In [None]:
# 'EXP' leaves the expression data as loaded; 'NC' makes the scan loop shuffle
# every row before clustering.
partition_type = 'EXP'
num_iterations = 1

# Load the filtered expression table and name the unnamed first CSV column.
full_filtered_df = pd.read_csv(
    '../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv'
).rename(columns={'Unnamed: 0': 'TTHERM_ID'})

In [None]:
full_filtered_df.head()

In [None]:
# np.random.seed(42)
# X, _ = make_blobs(n_samples=10000, n_features=30, centers=350, cluster_std=1.0, random_state=42)  # Use only 2 features
# # Convert X to a DataFrame
# columns = ['feature' + str(i) for i in range(X.shape[1])]
# df = pd.DataFrame(X, columns=columns)
# raw_data = df.values

In [None]:
# NOTE(review): 47 is presumably the number of expression columns - confirm.
dimensions = 47
# Same number of points as the 20326 rows reported for clr_df.
n_hypercube_samples = 20326
# Fixed seed so that re-running the notebook reproduces the same sample.
hypercube_seed = 42

sampler = st.qmc.LatinHypercube(d=dimensions, seed=hypercube_seed)
# sampler = st.qmc.Sobol(d=dimensions)
hypercube_sample = sampler.random(n=n_hypercube_samples)

hypercube_sample.shape


In [None]:
plt.scatter(hypercube_sample[1234], hypercube_sample[20000])

In [None]:
def get_cpu_cores():
    """Return the number of CPU cores as an int.

    ``os.cpu_count()`` is used on every operating system. It returns ``None``
    when the count cannot be determined; in that case 1 is returned so that
    callers can always do arithmetic on the result.
    """
    return os.cpu_count() or 1

# Get and print the number of CPU cores
num_cores = get_cpu_cores()
print(f"Number of CPU cores: {num_cores}")

In [None]:
def floor_half_to_even(number):
    """Return twice the floor of ``number / 4``.

    For non-negative integers this is half of ``number`` rounded down to an
    even value, e.g. 8 -> 4, 10 -> 4, 6 -> 2, 3 -> 0.
    """
    whole_quarters = number // 4
    return whole_quarters * 2

# floor_half_to_even() yields 0 for fewer than 4 cores, and a worker count of 0
# is not usable, so always keep at least one worker.
num_workers = max(1, floor_half_to_even(num_cores))
num_workers

In [None]:
st.qmc.discrepancy(hypercube_sample, workers=num_workers)

In [None]:
curr_datetime = str(datetime.now())

In [None]:
# Distance metric name; handed to the pairwise-distance, nearest-neighbour and
# UMAP graph helpers.
metric = 'manhattan'
# Only consulted by the helpers when metric == 'minkowski'; it has no effect
# with the metric selected above.
p_minkowski = 0.5
# Forwarded as n_jobs to pairwise_distances and NNDescent.
n_jobs = -1
# Seed forwarded to NNDescent and to the Leiden partitioning.
random_state = 42

In [None]:
nn = 5
# nn = 6

In [None]:
rp = 0.035
# rp = 0.030

In [None]:
# num_iterations = 100
# partition_type = 'NC'

In [None]:
# Clustering scan: for each iteration, normalise the expression data, build a
# nearest-neighbour graph, partition it with Leiden, and collect quality and
# enrichment statistics for that partition.
for iteration in tqdm.tqdm(range(num_iterations)):
# for p_minkowski in np.arange(1.1, 2.1, 0.1):

    # 'NC' runs shuffle the values within every row before clustering.
    if partition_type == 'NC':
        full_filtered_df = shuffle_rows(full_filtered_df)

    full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)

    # Everything except the first (TTHERM_ID) column, as a plain array.
    raw_data = full_filtered_norm_df[list(full_filtered_norm_df.columns)[1:]].values
    # partition_type = 'TNC'
    # raw_data = pd.DataFrame(hypercube_sample)

    # Communities are expressed in terms of row positions, not gene IDs.
    idx_labels = list(range(raw_data.shape[0]))


    distance_matrix = compute_pairwise_distance_matrix(raw_data, metric, n_jobs, p_minkowski)
    # distance_matrix = nonzero_inverted_zscore_arr

    # Approximate neighbours (NNDescent); these are the ones the graph is built from.
    ann_idxs, ann_dists = compute_anns(raw_data, nn, metric, random_state, n_jobs, p_minkowski, distance_matrix)

    # Exact neighbours from the distance matrix; not used further inside this
    # loop (they are printed next to the approximate ones in later cells).
    nn_idxs, nn_dists = compute_nns(raw_data, nn, metric, random_state, n_jobs, p_minkowski, distance_matrix)

    nn_graph = compute_umap_graph(raw_data, nn, metric, ann_idxs, ann_dists)

    # One module label per row of raw_data.
    parition = compute_leiden_partition(nn_graph, rp, random_state)

    communities = compute_communities(parition, idx_labels)

    sil_score = compute_silhouette_score(distance_matrix, parition)

    modularity = compute_modularity(nn_graph, communities.values())

    # Runs the external enrichment script on this partition.
    enrichment_df = compute_enrichment(full_filtered_norm_df, parition)

    num_clusters = compute_num_clusters(parition, communities.values())

    num_enriched_clusters = compute_num_enriched_clusters(enrichment_df)

    num_enriched_cluster_genes = compute_num_enriched_cluster_genes(enrichment_df, parition)

    cluster_sizes = compute_cluster_sizes(communities)

    enriched_cluster_sizes = compute_enriched_cluster_sizes(communities, enrichment_df)

    # One record of parameters and statistics for this iteration.
    cluster_stats = {
    'partition_type': partition_type,

    'dimensionality': 'baseline',

    'metric': metric,
    # 'metric': 'clr',
    'graph': 'umap_fuzzy_simplicial_set',
    'nns': nn,

    'clustering': 'leiden_cpm',
    'parameter': rp,

    'silhouette_score': sil_score,
    'modularity': modularity,

    'nclusters': num_clusters,
    'mean_cluster_size': compute_cluster_size_mean(cluster_sizes),
    'median_cluster_size': compute_cluster_size_median(cluster_sizes),
    'sd_cluster_size': compute_cluster_size_sd(cluster_sizes),

    'nenriched_clusters': num_enriched_clusters,
    'mean_enriched_cluster_size': compute_cluster_size_mean(enriched_cluster_sizes),
    'median_enriched_cluster_size': compute_cluster_size_median(enriched_cluster_sizes),
    'sd_enriched_cluster_size': compute_cluster_size_sd(enriched_cluster_sizes),
    'nenriched_cluster_genes': num_enriched_cluster_genes,

    'datetime': curr_datetime
    }

    # NOTE: with the write below disabled, cluster_stats is overwritten on every
    # iteration and only the last iteration's record survives the loop.
    # write_to_csv('./scan_stats_v1.csv', cluster_stats, list(cluster_stats.keys()))

In [None]:
sil_score

In [None]:
modularity

In [None]:
num_clusters

In [None]:
num_enriched_clusters

In [None]:
num_enriched_cluster_genes

In [None]:
def get_gene_module_assignments(all_gene_labels, gene_list, parition):
    """Group the genes of ``gene_list`` by the module they were assigned to.

    Parameters
    ----------
    all_gene_labels : iterable of hashable
        Gene label for every position of ``parition``. Any iterable works
        (list, tuple, NumPy array, ...). If a label occurs more than once, its
        first position is used.
    gene_list : iterable of hashable
        Genes to look up.
    parition : sequence
        Module label per position, aligned with ``all_gene_labels``.

    Returns
    -------
    dict
        Maps each module to the list of requested genes in that module, in the
        order they appear in ``gene_list``.

    Raises
    ------
    ValueError
        If a gene of ``gene_list`` does not occur in ``all_gene_labels``.
    """
    # Build the label -> position lookup once instead of scanning the full
    # label list twice for every requested gene.
    first_position = {}
    for position, label in enumerate(all_gene_labels):
        first_position.setdefault(label, position)

    gene_module_assignments = {}

    for gene in gene_list:
        if gene not in first_position:
            raise ValueError(f'The gene {gene} is not in the list of all gene labels.')
        module_num = parition[first_position[gene]]
        gene_module_assignments.setdefault(module_num, []).append(gene)

    return gene_module_assignments

In [None]:
gene_list_1 = ["TTHERM_01055600", "TTHERM_01002870", "TTHERM_01002860", "TTHERM_00630470", "TTHERM_00624730", "TTHERM_00624720", "TTHERM_00527180", "TTHERM_00522600", "TTHERM_00378890", "TTHERM_00335830", "TTHERM_00221120"]

In [None]:
gene_list_2 = ["TTHERM_00420610", "TTHERM_00410210", "TTHERM_00313130", "TTHERM_00467390"]
#                                                                       MAYBE

In [None]:
gene_list_3 = ["TTHERM_01107420", "TTHERM_01004990", "TTHERM_00985020", "TTHERM_00899470", "TTHERM_00865150", "TTHERM_00858130", "TTHERM_00849480", "TTHERM_00829340", "TTHERM_00780750", "TTHERM_00716180", "TTHERM_00704030", "TTHERM_00691170", "TTHERM_00684590", "TTHERM_00670190", "TTHERM_00571880", "TTHERM_00561799", "TTHERM_00529890", "TTHERM_00526250", "TTHERM_00469140", "TTHERM_00455600", "TTHERM_00439330", "TTHERM_00439030", "TTHERM_00424700", "TTHERM_00316660", "TTHERM_00312120", "TTHERM_00301770", "TTHERM_00297130", "TTHERM_00292160", "TTHERM_00243710", "TTHERM_00113120", "TTHERM_000711791", "TTHERM_00069420", "TTHERM_00048890", "TTHERM_000463439", "TTHERM_000439109", "TTHERM_00037290", "TTHERM_000248319", "TTHERM_000086999", "TTHERM_01079170", "TTHERM_01005150", "TTHERM_00865050", "TTHERM_00773520", "TTHERM_00729230", "TTHERM_00704040", "TTHERM_00672040", "TTHERM_00667000", "TTHERM_00648920", "TTHERM_00614820", "TTHERM_00576890", "TTHERM_00572090", "TTHERM_00483610", "TTHERM_00446570", "TTHERM_00441870", "TTHERM_00219420", "TTHERM_00194810", "TTHERM_00161750", "TTHERM_00142290", "TTHERM_001000210", "TTHERM_00083540", "TTHERM_00058860", "TTHERM_00048980", "TTHERM_00046130", "TTHERM_000420919", "TTHERM_000383629", "TTHERM_00013120", "TTHERM_00011190", "TTHERM_01245640", "TTHERM_01197090", "TTHERM_01195950", "TTHERM_01016190", "TTHERM_00790790", "TTHERM_00585320", "TTHERM_00568050", "TTHERM_00554270", "TTHERM_00498190", "TTHERM_00487030", "TTHERM_00448570", "TTHERM_00277550", "TTHERM_00242370", "TTHERM_00143660", "TTHERM_00105150", "TTHERM_00092850", "TTHERM_000011759"]

In [None]:
get_gene_module_assignments(list(full_filtered_norm_df['TTHERM_ID'].values), gene_list_1, list(parition))

In [None]:
get_gene_module_assignments(list(full_filtered_norm_df['TTHERM_ID'].values), gene_list_2, list(parition))

In [None]:
get_gene_module_assignments(list(full_filtered_norm_df['TTHERM_ID'].values), gene_list_3, list(parition))

In [None]:
print(nn_dists[1234])
print(ann_dists[1234])
print(nn_dists.shape)
print(ann_dists.shape)

In [None]:
print(nn_idxs[1234])
print(ann_idxs[1234])
print(nn_idxs.shape)
print(ann_idxs.shape)