In [None]:
import os
import tqdm

import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, pairwise_distances, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

import scipy.stats as st

import umap

import igraph as ig
import leidenalg as la

import networkx as nx

import subprocess

from pynndescent import NNDescent

from csv import DictWriter

from datetime import datetime

In [None]:
clr_df = pd.read_csv('./clr_network_for_distances.csv')
clr_df.rename(columns={'Unnamed: 0':'TTHERM_ID'}, inplace=True)
print(clr_df.shape)
clr_df.head()

In [None]:
max_zscore = clr_df.max(axis=None, numeric_only=True)
max_zscore

In [None]:
min_zscore = clr_df.min(axis=None, numeric_only=True)
min_zscore

In [None]:
zscore_arr = clr_df.loc[:,clr_df.columns[1:]].to_numpy()
zscore_arr

In [None]:
inverted_zscore_arr = (max_zscore + min_zscore) - zscore_arr

In [None]:
inverted_zscore_arr

In [None]:
inverted_zscore_arr.shape

In [None]:
np.min(inverted_zscore_arr)

In [None]:
(inverted_zscore_arr.shape[0] * inverted_zscore_arr.shape[1]) - np.count_nonzero(inverted_zscore_arr)

In [None]:
zero_idxs = np.where(inverted_zscore_arr == 0)

In [None]:
zero_idxs

In [None]:
import copy

nonzero_inverted_zscore_arr = copy.deepcopy(inverted_zscore_arr)

In [None]:
for idx_pair in zero_idxs:
    nonzero_inverted_zscore_arr[idx_pair[0]][idx_pair[1]] = 1e-20

In [None]:
np.where(nonzero_inverted_zscore_arr == 0)

In [None]:
np.fill_diagonal(nonzero_inverted_zscore_arr, 0)

In [None]:
def shuffle_row(row):
    shuffled_row = row.values.copy()
    np.random.shuffle(shuffled_row)
    return pd.Series(shuffled_row, index=row.index)

def shuffle_rows(df):
    columns_to_shuffle = df.columns[1:]
    df[columns_to_shuffle] = df[columns_to_shuffle].apply(shuffle_row, axis=1)
    return df

In [None]:
def get_geom_mean_expression(expression_df):
    """
    
    Function to take an expression dataframe from the microarrays and collapse it into the means of
    all replicate chips.
    """
    # C2 and S12 got removed during quality control
    x = [
        'Ll', 
        'Lm', 
        'Lh', 
        'S0', 
        'S3', 
        'S6', 
        'S9', 
        # 'S12', 
        'S15', 
        'S24', 
        'C0', 
        # 'C2', 
        'C4', 
        'C6', 
        'C8', 
        'C10', 
        'C12', 
        'C14', 
        'C16', 
        'C18']
    
    # cols = expression_df.columns[1:]
    # x = [c for c in x if c in cols]
    
    condition_expr_dict = {c.split("_")[0]: [] for c in expression_df.columns[1:]}
    
    for c in list(expression_df.columns)[1:]:
        
        cond = c.split('_')[0]
        if cond in condition_expr_dict.keys():
            expr_list = condition_expr_dict.get(cond, [])

            # Need to avoid true zeros
            expr_list.append(expression_df[c].values)
            condition_expr_dict[cond] = expr_list
        
    condition_mean_dict = {c: (st.mstats.gmean(np.array(condition_expr_dict[c]) + 1, 0) - 1) for c in condition_expr_dict.keys() if c in x}
    
    mean_expr_df = pd.DataFrame(condition_mean_dict)
    mean_expr_df['TTHERM_ID'] = expression_df['TTHERM_ID'].values
    cols = list(mean_expr_df.columns)
    reorder = cols[-1:] + cols[:-1]
    mean_expr_df = mean_expr_df[reorder]
    
    return mean_expr_df

def normalizer(array):
    """
    Normalizes the values of an array to range from zero to one
    """
    
    a = np.array(array)
    
    normalized = (array - np.min(array)) / (np.max(array) - np.min(array))
    
    return normalized

def normalize_expression_per_gene(expression_df):
    """
    Function to normalize all gene expression to range from zero to one.
    """
    if 'TTHERM_ID' in expression_df.columns:
        ttids = expression_df['TTHERM_ID'].values
        data = expression_df[list(expression_df.columns)[1:]]
        
        norm_expression_df = data.apply(lambda row: normalizer(row), axis=1)
        norm_expression_df['TTHERM_ID'] = ttids
        
        columns = norm_expression_df.columns.tolist()
        
        rearrangment = columns[-1:] + columns[:-1]
        
        norm_expression_df = norm_expression_df[rearrangment]
        
    else:
        norm_expression_df = expression_df.apply(lambda row: normalizer(row), axis=1)
    
    return norm_expression_df

In [None]:
partition_type = 'EXP'
full_filtered_df = pd.read_csv('../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv')
full_filtered_df = full_filtered_df.rename(columns={'Unnamed: 0': 'TTHERM_ID'})

In [None]:
full_filtered_df.head()

In [None]:
# full_filtered_df = shuffle_rows(full_filtered_df)
# partition_type = 'NC'

In [None]:
full_filtered_df.head()

In [None]:
full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)
raw_data = full_filtered_norm_df[list(full_filtered_norm_df.columns)[1:]].values

In [None]:
def compute_pairwise_distance_matrix(data_df, metric, n_jobs=-1, p_minkowski=1):

    if metric == 'minkowski':
        pair_dists = pairwise_distances(data_df, metric=metric, n_jobs=n_jobs, p=p_minkowski)
    else:
        pair_dists = pairwise_distances(data_df, metric=metric, n_jobs=n_jobs)
    
    return pair_dists

In [None]:
def compute_nns(data_df, nn, metric, random_state=42, n_jobs=-1, p_minkowski=1, distance_matrix=None):
    
    if metric == 'clr':
        num_neighbors = NearestNeighbors(n_neighbors=nn, metric='precomputed', n_jobs=-1).fit(distance_matrix)
        nn_dists, nn_idxs = num_neighbors.kneighbors(return_distance=True)
        return nn_idxs, nn_dists

    n_trees = min(64, 5 + int(round((data_df.shape[0]) ** 0.5 / 20.0)))
    n_iters = max(5, int(round(np.log2(data_df.shape[0]))))

    if metric == 'minkowski':
        knn_search_index = NNDescent(
                data_df,
                n_neighbors=nn,
                metric=metric,
                metric_kwds={'p': p_minkowski},
                random_state=random_state,
                n_trees=n_trees,
                n_iters=n_iters,
                max_candidates=60,
                # low_memory=low_memory,
                n_jobs=n_jobs,
                verbose=False,
                compressed=False,
            )
    else:
        knn_search_index = NNDescent(
                    data_df,
                    n_neighbors=nn,
                    metric=metric,
                    # metric_kwds=metric_kwds,
                    random_state=random_state,
                    n_trees=n_trees,
                    n_iters=n_iters,
                    max_candidates=60,
                    # low_memory=low_memory,
                    n_jobs=n_jobs,
                    verbose=False,
                    compressed=False,
                )
    nn_idxs, nn_dists = knn_search_index.neighbor_graph

    return nn_idxs, nn_dists

In [None]:
def compute_umap_graph(data_df, nn, metric, nn_idxs, nn_dists):
    
    result, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(data_df, nn, 42, metric, knn_indices=nn_idxs, knn_dists=nn_dists, return_dists=True)

    sources, targets = result.nonzero()
    edge_list = zip(sources, targets)
    weights = result.data

    g = ig.Graph(edges=edge_list, edge_attrs={'weight': weights})
    
    return g

In [None]:
def compute_leiden_partition(graph, resolution_parameter, random_state=42):
        
        partition = la.find_partition(graph, la.CPMVertexPartition, resolution_parameter = resolution_parameter, seed=random_state, weights='weight')
        # partition = la.find_partition(g, la.ModularityVertexPartition, seed=42, weights='weight')

        leiden_modules = np.array(partition.membership)

        return leiden_modules

In [None]:
def compute_communities(parition, idx_labels):
    communities = {}

    for idx, membership in enumerate(parition):
        if membership not in communities:
            communities[membership] = []
        communities[membership].append(idx_labels[idx])

    return communities

In [None]:
def compute_silhouette_score(distance_matrix, parition):
    return silhouette_score(distance_matrix, parition, metric='precomputed')

In [None]:
def compute_modularity(graph, communities):
    nx_g = nx.Graph(graph.get_edgelist())
    return nx.community.quality.modularity(nx_g, communities, weight='weight')

In [None]:
def format_parition_for_enrichment(df, parition):
    edf = pd.DataFrame.from_dict({'TTHERM_ID': []})
    edf['TTHERM_ID'] = df['TTHERM_ID'].values
    edf[f'leiden_label_full'] = parition
    return edf

In [None]:
def remove_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)

In [None]:
def compute_enrichment(df, parition):
    edf = format_parition_for_enrichment(df, parition)

    temp_scan_file = './temp_scan_partition.csv'

    temp_enrich_file = './temp_scan_enrich.csv'

    edf.to_csv(temp_scan_file, index=False)

    subprocess.run(['python3', './fast_enrichment_analysis.py', temp_scan_file, temp_enrich_file])

    cedf = pd.read_csv(temp_enrich_file)
    
    remove_file(temp_scan_file)

    remove_file(temp_enrich_file)

    return cedf

In [None]:
def compute_num_clusters(parition, communities=None):
    if communities is None:
        return len(set(parition))
    
    if len(set(parition)) != len(communities):
        raise ValueError(f'The number of clusters/modules ({len(set(parition))}) in the parition != the number of communities ({len(communities)}).')
    
    return len(set(parition))

In [None]:
def compute_cluster_sizes(communities):
    return [len(community) for community in communities.values()]

In [None]:
def compute_enriched_cluster_sizes(communities, cedf):
    enriched_cluster_mods = set(cedf['module'].values)
    return [len(community) for mod, community in communities.items() if mod in enriched_cluster_mods]

In [None]:
def compute_cluster_size_mean(cluster_sizes):
    return np.mean(cluster_sizes)

def compute_cluster_size_median(cluster_sizes):
    return np.median(cluster_sizes)

def compute_cluster_size_sd(cluster_sizes):
    return np.std(cluster_sizes)

def compute_cluster_size_sd(cluster_sizes):
    return np.std(cluster_sizes)

In [None]:
def compute_num_enriched_clusters(cedf):
    return len(set(cedf['module'].values))

In [None]:
def compute_num_enriched_cluster_genes(edf, parition):
    total_num_genes = 0

    for m in set(edf['module'].values):
        num_genes = np.count_nonzero(parition == int(m))
        total_num_genes += num_genes
    
    return total_num_genes
    

In [None]:
def write_to_csv(csv_file_path, data_item, header):
    # Check if the CSV file exists and write header if it doesn't
    if not os.path.isfile(csv_file_path):
        with open(csv_file_path, 'w', newline='') as file:
            writer = DictWriter(file, fieldnames=header)
            writer.writeheader()

    with open(csv_file_path, 'a', newline='') as file:
        writer = DictWriter(file, fieldnames=header)
        writer.writerow(data_item)

# SCAN START

In [None]:
curr_datetime = str(datetime.now())

In [None]:
idx_labels = list(range(raw_data.shape[0]))

# metric = 'manhattan'
p_minkowski = None
metrics = [f'minkowski_{str(p)}' for p in np.array([0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5])] + ['clr', 'manhattan', 'euclidean', 'cosine']
n_jobs = -1
random_state = 42

In [None]:
scan_nns = np.arange(2, 13, 1)
# scan_nns = [3]
scan_nns

In [None]:
scan_rps = np.arange(0.1, 1.1, 0.1)
# scan_rps = [0.6]
scan_rps

In [None]:
scan_dict = {}

In [None]:
for metric_p in metrics:
    metric_p_split = metric_p.split('_')

    metric = metric_p

    if metric_p_split[0] == 'minkowski':
        metric = metric_p_split[0]
        p_minkowski = float(metric_p_split[1])

    print(metric_p)
    print()

    if metric != 'clr':
        distance_matrix = compute_pairwise_distance_matrix(raw_data, metric, n_jobs, p_minkowski)
    else:
        distance_matrix = nonzero_inverted_zscore_arr

    for idx, nn in enumerate(scan_nns):     
        print(idx+1,'of',len(scan_nns))     
        print('NNs: ', nn)

        scan_dict[nn] = {}

        nn_idxs, nn_dists = compute_nns(raw_data, nn, metric, random_state, n_jobs, p_minkowski, distance_matrix)
        scan_dict[nn]['nn_idxs'] = nn_idxs
        scan_dict[nn]['nn_dists'] = nn_dists

        nn_graph = compute_umap_graph(raw_data, nn, metric, nn_idxs, nn_dists)
        scan_dict[nn]['nn_graph'] = nn_graph

        for rp in tqdm.tqdm(scan_rps):

            scan_dict[nn][rp] = {}
            
            parition = compute_leiden_partition(nn_graph, rp, random_state)
            scan_dict[nn][rp]['partition'] = parition

            communities = compute_communities(parition, idx_labels)
            scan_dict[nn][rp]['communities'] = communities

            sil_score = compute_silhouette_score(distance_matrix, parition)
            scan_dict[nn][rp]['sil_score'] = sil_score

            modularity = compute_modularity(nn_graph, communities.values())
            scan_dict[nn][rp]['modularity'] = modularity

            enrichment_df = compute_enrichment(full_filtered_norm_df, parition)
            scan_dict[nn][rp]['enrichment_df'] = enrichment_df

            num_clusters = compute_num_clusters(parition, communities.values())
            scan_dict[nn][rp]['num_clusters'] = num_clusters

            num_enriched_clusters = compute_num_enriched_clusters(enrichment_df)
            scan_dict[nn][rp]['num_enriched_clusters'] = num_enriched_clusters

            num_enriched_cluster_genes = compute_num_enriched_cluster_genes(enrichment_df, parition)
            scan_dict[nn][rp]['num_enriched_cluster_genes'] = num_enriched_cluster_genes

            cluster_sizes = compute_cluster_sizes(communities)
            scan_dict[nn][rp]['cluster_sizes'] = cluster_sizes

            enriched_cluster_sizes = compute_enriched_cluster_sizes(communities, enrichment_df)
            scan_dict[nn][rp]['enriched_cluster_sizes'] = enriched_cluster_sizes

            cluster_stats = {
            'partition_type': partition_type,

            'dimensionality': 'baseline',

            'metric': metric_p,
            'graph': 'umap_fuzzy_simplicial_set',
            'nns': nn,

            'clustering': 'leiden_cpm',
            'parameter': rp,

            'silhouette_score': sil_score,
            'modularity': modularity,

            'nclusters': num_clusters,
            'mean_cluster_size': compute_cluster_size_mean(cluster_sizes),
            'median_cluster_size': compute_cluster_size_median(cluster_sizes),
            'sd_cluster_size': compute_cluster_size_sd(cluster_sizes),

            'nenriched_clusters': num_enriched_clusters,
            'mean_enriched_cluster_size': compute_cluster_size_mean(enriched_cluster_sizes),
            'median_enriched_cluster_size': compute_cluster_size_median(enriched_cluster_sizes),
            'sd_enriched_cluster_size': compute_cluster_size_sd(enriched_cluster_sizes),
            'nenriched_cluster_genes': num_enriched_cluster_genes,

            'datetime': curr_datetime
            }

            write_to_csv('./scan_stats_metrics2.csv', cluster_stats, list(cluster_stats.keys()))

In [47]:
pd.read_csv('./scan_stats_metrics2.csv')

Unnamed: 0,partition_type,dimensionality,metric,graph,nns,clustering,parameter,silhouette_score,modularity,nclusters,mean_cluster_size,median_cluster_size,sd_cluster_size,nenriched_clusters,mean_enriched_cluster_size,median_enriched_cluster_size,sd_enriched_cluster_size,nenriched_cluster_genes,datetime
0,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.1,-0.031154,0.93165,4220,4.816588,5.0,1.969317,377,5.007958,5.0,1.97798,1888,2024-02-09 08:11:06.867723
1,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.2,-0.015308,0.8749,5202,3.907343,4.0,1.282879,414,4.070048,4.0,1.259715,1685,2024-02-09 08:11:06.867723
2,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.3,-0.006321,0.834022,5909,3.439838,3.0,1.035163,447,3.621924,4.0,1.022242,1619,2024-02-09 08:11:06.867723
3,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.4,0.004782,0.775205,6926,2.934739,3.0,0.827838,457,3.133479,3.0,0.819427,1432,2024-02-09 08:11:06.867723
4,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.5,0.007417,0.719616,7887,2.577152,2.0,0.749693,447,2.796421,3.0,0.73041,1250,2024-02-09 08:11:06.867723
5,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.6,0.010133,0.706833,8108,2.506907,2.0,0.710813,453,2.721854,3.0,0.710926,1233,2024-02-09 08:11:06.867723
6,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.7,-0.000273,0.654823,9007,2.256689,2.0,0.663614,439,2.530752,3.0,0.499053,1111,2024-02-09 08:11:06.867723
7,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.8,-0.000273,0.654823,9007,2.256689,2.0,0.663614,439,2.530752,3.0,0.499053,1111,2024-02-09 08:11:06.867723
8,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,0.9,-0.000273,0.654823,9007,2.256689,2.0,0.663614,439,2.530752,3.0,0.499053,1111,2024-02-09 08:11:06.867723
9,EXP,baseline,minkowski_0.25,umap_fuzzy_simplicial_set,2,leiden_cpm,1.0,-0.003539,0.429413,12903,1.575293,2.0,0.494299,330,2.0,2.0,0.0,660,2024-02-09 08:11:06.867723
