In [None]:
def shuffle_row(row):
    """Return a new Series holding ``row``'s values in random order.

    The index labels keep their original order; only the values move.
    The shuffle runs on a copy using NumPy's global random state, so the
    input row itself is not modified.
    """
    permuted_values = row.values.copy()
    np.random.shuffle(permuted_values)  # in-place permutation of the copy
    result = pd.Series(permuted_values, index=row.index)
    return result

def shuffle_rows(df):
    """Return a copy of ``df`` with the values of every row independently shuffled.

    The first column is treated as an identifier and left in place; the
    values in all remaining columns are permuted within each row using NumPy's
    global random state.

    Unlike a label-wise in-place assignment on ``df``, this works on a copy so
    the caller's frame is never modified, and it shuffles the underlying array
    directly instead of going through a slow row-wise ``DataFrame.apply``.

    Args:
        df: DataFrame whose first column is an ID column and whose remaining
            columns hold the values to shuffle.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    columns_to_shuffle = df.columns[1:]
    shuffled_df = df.copy()

    # Nothing to permute when only the ID column (or no column) is present.
    if len(columns_to_shuffle) == 0:
        return shuffled_df

    values = shuffled_df[columns_to_shuffle].to_numpy(copy=True)
    if values.size == 0:
        return shuffled_df

    # Iterating a 2-D array yields row views, so each shuffle edits `values`.
    for row_values in values:
        np.random.shuffle(row_values)

    shuffled_df[columns_to_shuffle] = values
    return shuffled_df

# CLUSTER START

In [None]:
# Initial run settings; both are reassigned in a later cell before the main loop.
num_iterations = 1
partition_type = 'EXP'

# Load the filtered expression table and give the unnamed first CSV column
# the name used for gene identifiers in the rest of the notebook.
full_filtered_df = pd.read_csv(
    '../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv'
).rename(columns={'Unnamed: 0': 'TTHERM_ID'})

In [None]:
# Preview the first rows of the loaded expression table.
full_filtered_df.head()

In [None]:
# np.random.seed(42)
# X, _ = make_blobs(n_samples=10000, n_features=30, centers=350, cluster_std=1.0, random_state=42)  # Synthetic blobs: 30 features, 350 centers
# # Convert X to a DataFrame
# columns = ['feature' + str(i) for i in range(X.shape[1])]
# df = pd.DataFrame(X, columns=columns)
# raw_data = df.values

In [None]:
# Latin hypercube sample in the unit hypercube, usable as structure-free input.
# NOTE(review): 47 dimensions and 20326 points presumably mirror the number of
# expression columns and genes in the real table — confirm against the data.
dimensions = 47

# Seed the engine so the sample, and everything derived from it (scatter plot,
# discrepancy), is reproducible across kernel restarts.
sampler = st.qmc.LatinHypercube(d=dimensions, seed=42)
# sampler = st.qmc.Sobol(d=dimensions, seed=42)
hypercube_sample = sampler.random(n=20326)

hypercube_sample.shape


In [None]:
# Scatter the coordinates of sample 1234 against those of sample 20000.
# NOTE(review): integer indexing selects rows (individual samples), not
# dimensions; if two dimensions across all samples were intended, column
# slices would be needed — confirm.
plt.scatter(x=hypercube_sample[1234], y=hypercube_sample[20000])

In [None]:
def get_cpu_cores():
    """Return the number of logical CPU cores as a positive ``int``.

    ``os.cpu_count()`` works on every platform Python supports, so no per-OS
    branching is needed. It returns ``None`` when the count cannot be
    determined; in that case this falls back to 1 so that callers doing
    arithmetic on the result always receive a number rather than a message
    string.

    Returns:
        int: Detected core count, or 1 if it could not be determined.
    """
    detected = os.cpu_count()
    return detected if detected else 1

# Get and print the number of CPU cores
# (num_cores is reused further down to derive num_workers).
num_cores = get_cpu_cores()
print(f"Number of CPU cores: {num_cores}")

In [None]:
def floor_half_to_even(number):
    """Return half of ``number`` rounded down to the nearest even value.

    Computed as twice the floored quarter, e.g. 8 -> 4, 10 -> 4, 3 -> 0.
    """
    quarters = number // 4
    return quarters * 2

# Use about half the cores (rounded down to an even count), but never fewer
# than one: with fewer than 4 cores floor_half_to_even returns 0, and
# scipy.stats.qmc.discrepancy raises for workers=0.
num_workers = max(1, floor_half_to_even(num_cores))
num_workers

In [None]:
# Discrepancy of the hypercube sample, computed with num_workers parallel workers.
st.qmc.discrepancy(hypercube_sample, workers=num_workers)

In [None]:
# Timestamp taken once here; the main loop stores this same string under
# 'datetime' in every cluster_stats record.
curr_datetime = str(datetime.now())

In [None]:
# Shared settings passed to the distance / neighbor / clustering helpers.
random_state = 42   # passed to compute_anns, compute_nns and compute_leiden_partition
n_jobs = -1         # NOTE(review): presumably "use all cores" — confirm in the helpers
metric = 'cosine'
p_minkowski = None  # NOTE(review): presumably only used with a Minkowski metric — confirm

In [None]:
# Neighbor count passed to compute_anns, compute_nns and compute_umap_graph;
# recorded as 'nns' in cluster_stats.
nn = 8

In [None]:
# Parameter passed to compute_leiden_partition and recorded as 'parameter' in
# cluster_stats (presumably the Leiden CPM resolution — confirm in the helper).
rp = 0.030

In [None]:
# Override the earlier values ('EXP', 1): run 100 iterations with the 'NC'
# partition type, which makes the main loop shuffle every row before clustering.
partition_type = 'NC'
num_iterations = 100

In [None]:
# Main scan: each iteration builds a normalized expression matrix (row-shuffled
# first when partition_type is 'NC'), clusters it, and collects summary
# statistics into cluster_stats.
# The normalize_* / compute_* helpers are defined outside this section, so any
# note below about what they return is an assumption to confirm in their source.
for iteration in tqdm.tqdm(range(num_iterations)):
# for p_minkowski in np.arange(1.1, 2.1, 0.1):
    
    # 'NC' runs shuffle every row first. The shuffled frame is rebound to
    # full_filtered_df, so later iterations (and later cells) start from
    # already-shuffled data rather than the values read from the CSV.
    if partition_type == 'NC':
        full_filtered_df = shuffle_rows(full_filtered_df)
        
    full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)
    
    # Every column except the first (presumably the TTHERM_ID column — confirm
    # the helper keeps column order) as a plain NumPy array.
    raw_data = full_filtered_norm_df[list(full_filtered_norm_df.columns)[1:]].values
    # partition_type = 'TNC'
    # raw_data = pd.DataFrame(hypercube_sample)

    # Positional labels 0..n_rows-1, one per row of raw_data.
    idx_labels = list(range(raw_data.shape[0]))


    distance_matrix = compute_pairwise_distance_matrix(raw_data, metric, n_jobs, p_minkowski)
    # distance_matrix = nonzero_inverted_zscore_arr

    # Two neighbor searches with identical arguments; only the ann_* result
    # feeds the graph below, while nn_* is inspected in later cells.
    ann_idxs, ann_dists = compute_anns(raw_data, nn, metric, random_state, n_jobs, p_minkowski, distance_matrix)

    nn_idxs, nn_dists = compute_nns(raw_data, nn, metric, random_state, n_jobs, p_minkowski, distance_matrix)

    nn_graph = compute_umap_graph(raw_data, nn, metric, ann_idxs, ann_dists)

    # NOTE: 'parition' (sic) is also read by later cells, so the spelling must
    # stay consistent across the notebook.
    parition = compute_leiden_partition(nn_graph, rp, random_state)

    # communities is used as a mapping below (.values() is called on it).
    communities = compute_communities(parition, idx_labels)

    sil_score = compute_silhouette_score(distance_matrix, parition)

    modularity = compute_modularity(nn_graph, communities.values())

    enrichment_df = compute_enrichment(full_filtered_norm_df, parition)

    num_clusters = compute_num_clusters(parition, communities.values())

    num_enriched_clusters = compute_num_enriched_clusters(enrichment_df)

    num_enriched_cluster_genes = compute_num_enriched_cluster_genes(enrichment_df, parition)

    cluster_sizes = compute_cluster_sizes(communities)

    enriched_cluster_sizes = compute_enriched_cluster_sizes(communities, enrichment_df)

    # One flat record describing this iteration: settings first, then metrics.
    cluster_stats = {
    'partition_type': partition_type,

    'dimensionality': 'baseline',

    'metric': metric,
    # 'metric': 'clr',
    'graph': 'umap_fuzzy_simplicial_set',
    'nns': nn,

    'clustering': 'leiden_cpm',
    'parameter': rp,

    'silhouette_score': sil_score,
    'modularity': modularity,

    # Size statistics over all clusters.
    'nclusters': num_clusters,
    'mean_cluster_size': compute_cluster_size_mean(cluster_sizes),
    'median_cluster_size': compute_cluster_size_median(cluster_sizes),
    'sd_cluster_size': compute_cluster_size_sd(cluster_sizes),

    # The same size statistics, computed from enriched_cluster_sizes.
    'nenriched_clusters': num_enriched_clusters,
    'mean_enriched_cluster_size': compute_cluster_size_mean(enriched_cluster_sizes),
    'median_enriched_cluster_size': compute_cluster_size_median(enriched_cluster_sizes),
    'sd_enriched_cluster_size': compute_cluster_size_sd(enriched_cluster_sizes),
    'nenriched_cluster_genes': num_enriched_cluster_genes,

    # Same timestamp for every iteration (curr_datetime is set once, before the loop).
    'datetime': curr_datetime
    }

    # NOTE(review): with the write below commented out, cluster_stats is
    # overwritten on every iteration and only the final iteration's values
    # remain available after the loop.
    # write_to_csv('./scan_stats_v1.csv', cluster_stats, list(cluster_stats.keys()))

In [None]:
# Silhouette score left over from the final loop iteration.
sil_score

In [None]:
# Modularity left over from the final loop iteration.
modularity

In [None]:
# Cluster count left over from the final loop iteration.
num_clusters

In [None]:
# Enriched-cluster count left over from the final loop iteration.
num_enriched_clusters

In [None]:
# Count of genes in enriched clusters, left over from the final loop iteration.
num_enriched_cluster_genes

In [None]:
def get_gene_module_assignments(all_gene_labels: list, gene_list: list, parition: list):
    """Group the genes in ``gene_list`` by the module each one was assigned to.

    Args:
        all_gene_labels: Gene identifiers, positionally aligned with
            ``parition`` (entry ``i`` of both refers to the same gene).
        gene_list: Genes to look up.
        parition: Module label for each position in ``all_gene_labels``. The
            parameter keeps its existing spelling so keyword callers still work.

    Returns:
        dict mapping module label -> list of the genes from ``gene_list`` that
        fall in that module, in the order they appear in ``gene_list``.

    Raises:
        ValueError: If a gene in ``gene_list`` is not in ``all_gene_labels``.
    """
    # Index the labels once instead of scanning the whole list twice per gene.
    # setdefault keeps the first position of a repeated label, like list.index.
    label_positions = {}
    for position, label in enumerate(all_gene_labels):
        label_positions.setdefault(label, position)

    gene_module_assignments = {}
    for gene in gene_list:
        if gene not in label_positions:
            raise ValueError(f'The gene {gene} is not in the list of all gene labels.')
        module_num = parition[label_positions[gene]]
        gene_module_assignments.setdefault(module_num, []).append(gene)

    return gene_module_assignments

In [None]:
# First hand-entered gene set; looked up in the partition in a later cell.
gene_list_1 = ["TTHERM_01055600", "TTHERM_01002870", "TTHERM_01002860", "TTHERM_00630470", "TTHERM_00624730", "TTHERM_00624720", "TTHERM_00527180", "TTHERM_00522600", "TTHERM_00378890", "TTHERM_00335830", "TTHERM_00221120"]

In [None]:
# Second hand-entered gene set; looked up in the partition in a later cell.
gene_list_2 = ["TTHERM_00420610", "TTHERM_00410210", "TTHERM_00313130", "TTHERM_00467390"]
#                                                                       MAYBE
# NOTE(review): the MAYBE marker lines up with the last ID (TTHERM_00467390);
# presumably it flags that gene as a tentative member of the set — confirm.

In [None]:
# Third hand-entered gene set; looked up in the partition in a later cell.
# NOTE(review): the IDs mix 8-digit and 9-digit numeric suffixes. Any ID missing
# from the TTHERM_ID column makes get_gene_module_assignments raise ValueError.
gene_list_3 = ["TTHERM_01107420", "TTHERM_01004990", "TTHERM_00985020", "TTHERM_00899470", "TTHERM_00865150", "TTHERM_00858130", "TTHERM_00849480", "TTHERM_00829340", "TTHERM_00780750", "TTHERM_00716180", "TTHERM_00704030", "TTHERM_00691170", "TTHERM_00684590", "TTHERM_00670190", "TTHERM_00571880", "TTHERM_00561799", "TTHERM_00529890", "TTHERM_00526250", "TTHERM_00469140", "TTHERM_00455600", "TTHERM_00439330", "TTHERM_00439030", "TTHERM_00424700", "TTHERM_00316660", "TTHERM_00312120", "TTHERM_00301770", "TTHERM_00297130", "TTHERM_00292160", "TTHERM_00243710", "TTHERM_00113120", "TTHERM_000711791", "TTHERM_00069420", "TTHERM_00048890", "TTHERM_000463439", "TTHERM_000439109", "TTHERM_00037290", "TTHERM_000248319", "TTHERM_000086999", "TTHERM_01079170", "TTHERM_01005150", "TTHERM_00865050", "TTHERM_00773520", "TTHERM_00729230", "TTHERM_00704040", "TTHERM_00672040", "TTHERM_00667000", "TTHERM_00648920", "TTHERM_00614820", "TTHERM_00576890", "TTHERM_00572090", "TTHERM_00483610", "TTHERM_00446570", "TTHERM_00441870", "TTHERM_00219420", "TTHERM_00194810", "TTHERM_00161750", "TTHERM_00142290", "TTHERM_001000210", "TTHERM_00083540", "TTHERM_00058860", "TTHERM_00048980", "TTHERM_00046130", "TTHERM_000420919", "TTHERM_000383629", "TTHERM_00013120", "TTHERM_00011190", "TTHERM_01245640", "TTHERM_01197090", "TTHERM_01195950", "TTHERM_01016190", "TTHERM_00790790", "TTHERM_00585320", "TTHERM_00568050", "TTHERM_00554270", "TTHERM_00498190", "TTHERM_00487030", "TTHERM_00448570", "TTHERM_00277550", "TTHERM_00242370", "TTHERM_00143660", "TTHERM_00105150", "TTHERM_00092850", "TTHERM_000011759"]

In [None]:
# Module membership of gene_list_1 in the partition from the last loop iteration.
get_gene_module_assignments(
    all_gene_labels=list(full_filtered_norm_df['TTHERM_ID'].values),
    gene_list=gene_list_1,
    parition=list(parition),
)

In [None]:
# Module membership of gene_list_2 in the partition from the last loop iteration.
get_gene_module_assignments(
    all_gene_labels=list(full_filtered_norm_df['TTHERM_ID'].values),
    gene_list=gene_list_2,
    parition=list(parition),
)

In [None]:
# Module membership of gene_list_3 in the partition from the last loop iteration.
get_gene_module_assignments(
    all_gene_labels=list(full_filtered_norm_df['TTHERM_ID'].values),
    gene_list=gene_list_3,
    parition=list(parition),
)

In [None]:
# Compare the two neighbor searches for row 1234: distances first, then the
# overall array shapes, one item per line.
print(
    nn_dists[1234],
    ann_dists[1234],
    nn_dists.shape,
    ann_dists.shape,
    sep='\n',
)

In [None]:
# Compare the two neighbor searches for row 1234: neighbor indices first, then
# the overall array shapes, one item per line.
print(
    nn_idxs[1234],
    ann_idxs[1234],
    nn_idxs.shape,
    ann_idxs.shape,
    sep='\n',
)