# Analyse measures of dependence from CWatM data

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append("..")

from pathlib import Path

from tqdm import tqdm
import numpy as np
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import src.data.cwatm_data as cwatm_data
import src.visualization.visualize as visualize

In [None]:
# Folder holding the bivariate measure tables computed from the CWatM data.
CWATM_MEASURES_FOLDER = Path("../data/processed") / "bivariate_metrics" / "CWatM"

# Names of the regions for which separate measure tables exist.
REGIONS = [
    "dry cold",
    "dry warm",
    "wet cold",
    "wet warm",
]

## Load data

In [None]:
def _read_measures_csv(filename):
    """Read one measures CSV from CWATM_MEASURES_FOLDER, indexed by (input, output)."""
    csv_path = CWATM_MEASURES_FOLDER.joinpath(filename)
    return pd.read_csv(csv_path, index_col=["input", "output"])


# One table for the global data set and one per region.
measures_df = _read_measures_csv("measures_global.csv")
measures_dc_df = _read_measures_csv("measures_dry cold.csv")
measures_dw_df = _read_measures_csv("measures_dry warm.csv")
measures_wc_df = _read_measures_csv("measures_wet cold.csv")
measures_ww_df = _read_measures_csv("measures_wet warm.csv")

measures_df

## Process measures results

This processing consists of:
1. Dropping uninteresting variable pairs (those with a `NaN` pearson or a MIC of `0`)
2. Selecting the relevant measures for this study:
    - pearson
    - spearman
    - MIC
    - MAS
    - MEV
3. Computing the rank for each measure
4. Computing the p-value of the top scoring 10% of variable pairs (by MIC)
    - Using the shuffled (permuted) measures

With the above data as a basis, we will continue by:

1. Using the Benjamini and Hochberg procedure to control FDR at `alpha = 0.05` (for MIC)
2. On the statistically significant variable pairs:
    - Computing the non-linearity score MIC - pearson^2

Tables and visuals:
- Table of the top scoring 10% variable pairs with the 5 measures and MIC-pearson^2. With rank and p-value.
- (Pearson) correlation matrix of the 5 measures and MIC-pearson^2
- Scatter plot of p-values and the Benjamini and Hochberg line

In [None]:
# Run the same processing step on the global table and on each regional table,
# rebinding every name to its processed result.
(measures_df,
 measures_dc_df,
 measures_dw_df,
 measures_wc_df,
 measures_ww_df) = map(
    cwatm_data.process_measures_df,
    (measures_df, measures_dc_df, measures_dw_df, measures_wc_df, measures_ww_df),
)

measures_df

In [None]:
# Rank tables for the global data set and for each region, in that order.
(ranks_measures_df,
 ranks_measures_dc_df,
 ranks_measures_dw_df,
 ranks_measures_wc_df,
 ranks_measures_ww_df) = map(
    cwatm_data.compute_ranks_df,
    (measures_df, measures_dc_df, measures_dw_df, measures_wc_df, measures_ww_df),
)

ranks_measures_df

In [None]:
# Every region is compared against shuffled measures stored in the same folder.
_shuffled_measures_folder = CWATM_MEASURES_FOLDER.joinpath("shuffled")

# The global p-values are currently not computed:
# p_values_measures_df = cwatm_data.compute_p_values_complete(
#     shuffled_data_path=CWATM_MEASURES_FOLDER.joinpath("shuffled"),
#     region="global",
#     actual_df=measures_df
# )
p_values_measures_dc_df = cwatm_data.compute_p_values_complete(
    shuffled_data_path=_shuffled_measures_folder, region="dry cold", actual_df=measures_dc_df,
)
p_values_measures_dw_df = cwatm_data.compute_p_values_complete(
    shuffled_data_path=_shuffled_measures_folder, region="dry warm", actual_df=measures_dw_df,
)
p_values_measures_wc_df = cwatm_data.compute_p_values_complete(
    shuffled_data_path=_shuffled_measures_folder, region="wet cold", actual_df=measures_wc_df,
)
p_values_measures_ww_df = cwatm_data.compute_p_values_complete(
    shuffled_data_path=_shuffled_measures_folder, region="wet warm", actual_df=measures_ww_df,
)

p_values_measures_ww_df

In [None]:
def _control_mic_fdr(p_values_df):
    """Apply the Benjamini-Hochberg helper to the MIC p-values of one table at alpha = 0.05."""
    return cwatm_data.control_FDR_benjamini_hochberg(
        p_values_series_in=p_values_df["MIC_p-value"],
        alpha=0.05,
    )


# The global table is skipped because its p-values are not computed:
# significant_p_values_series, benjamini_hochberg_data = cwatm_data.control_FDR_benjamini_hochberg(
#     p_values_series_in=p_values_measures_df["MIC_p-value"],
#     alpha=0.05
# )
significant_p_values_dc_series, benjamini_hochberg_dc_data = _control_mic_fdr(p_values_measures_dc_df)
significant_p_values_dw_series, benjamini_hochberg_dw_data = _control_mic_fdr(p_values_measures_dw_df)
significant_p_values_wc_series, benjamini_hochberg_wc_data = _control_mic_fdr(p_values_measures_wc_df)
significant_p_values_ww_series, benjamini_hochberg_ww_data = _control_mic_fdr(p_values_measures_ww_df)

significant_p_values_ww_series

In [None]:
def plot_benjamini_hochberg_results(bh_data):
    """Plot sorted p-values against the Benjamini-Hochberg critical line.

    Reads ``bh_data["data"]`` (columns ``rank``, ``p_value`` and
    ``bh_critical_value``), ``bh_data["alpha"]`` and
    ``bh_data["threshold_p_value"]``. Prints how many p-values are at or below
    the threshold and shows the figure.
    """
    plt.figure(figsize=(10, 6))

    table = bh_data["data"]
    ranks = table['rank']
    p_values = table['p_value']
    alpha = bh_data["alpha"]

    # P-values as markers, the critical values as a line, alpha as a reference.
    plt.plot(ranks, p_values, marker='x', linestyle='none', label='P-values')
    plt.plot(ranks, table['bh_critical_value'], color='red', label='BH Critical Value')
    plt.axhline(y=alpha, color='grey', linestyle='--', label=f'Alpha = {alpha}')

    # Points at or below the threshold p-value are drawn on top in green.
    is_significant = p_values <= bh_data['threshold_p_value']
    print(sum(is_significant))
    plt.scatter(
        ranks[is_significant],
        p_values[is_significant],
        color='green',
        label='Significant',
        zorder=5,
    )

    plt.xlabel('Rank of P-value')
    plt.ylabel('P-value')
    plt.title('Benjamini-Hochberg Procedure')
    plt.legend()
    plt.grid(True)
    plt.show()


# Show the procedure for the "wet warm" region.
plot_benjamini_hochberg_results(benjamini_hochberg_ww_data)

In [None]:
def _non_linearity_and_ranks(selected_df):
    """Return the frame with the non-linearity score added, and its rank table."""
    scored_df = cwatm_data.compute_non_linearity(selected_df)
    return scored_df, cwatm_data.compute_ranks_df(scored_df)


# The global significance mask only exists when its cell has been run;
# otherwise the complete global table is used.
_global_selection = (
    measures_df[significant_p_values_series]
    if "significant_p_values_series" in locals()
    else measures_df
)
significant_measures_df, significant_ranks_measures_df = _non_linearity_and_ranks(_global_selection)

significant_measures_dc_df, significant_ranks_measures_dc_df = _non_linearity_and_ranks(
    measures_dc_df[significant_p_values_dc_series]
)
significant_measures_dw_df, significant_ranks_measures_dw_df = _non_linearity_and_ranks(
    measures_dw_df[significant_p_values_dw_series]
)
significant_measures_wc_df, significant_ranks_measures_wc_df = _non_linearity_and_ranks(
    measures_wc_df[significant_p_values_wc_series]
)
significant_measures_ww_df, significant_ranks_measures_ww_df = _non_linearity_and_ranks(
    measures_ww_df[significant_p_values_ww_series]
)

## Visualize

In [None]:


# Value and rank columns kept in the summary table (p-value columns are left out).
_summary_table_columns = [
    "pearson", "pearson_rank",
    "spearman", "spearman_rank",
    "MIC", "MIC_rank",
    "MAS", "MAS_rank",
    "MEV", "MEV_rank",
    "MIC - p^2", "MIC - p^2_rank",
]

top_10percent_table_dc_df = (
    significant_measures_dc_df
    .join([significant_ranks_measures_dc_df, p_values_measures_dc_df], how="left")
    [_summary_table_columns]
    .convert_dtypes()
)

# Summarise the top 10% of rows by MIC, rounded to three decimals.
_n_top_rows = int(0.1 * len(top_10percent_table_dc_df))
(top_10percent_table_dc_df
 .sort_values("MIC", ascending=False)
 .head(n=_n_top_rows)
 .round(3)
 .info())

In [None]:
def make_top_n_inputs_by_measure_for_each_output(input_df: pd.DataFrame,
                                                 top_n: int = 6,
                                                 with_value: bool = True):
    """Tabulate, per output variable, the inputs scoring highest on each measure.

    Parameters
    ----------
    input_df:
        Measures table whose index has an ``output`` level (the remaining
        level is used as the input name) and which has the columns
        ``pearson``, ``spearman`` and ``MIC``.
    top_n:
        Number of ranks (rows) reported per output.
    with_value:
        When True, a ``value`` sub-column with the measure rounded to two
        decimals is added next to each ``input`` sub-column.

    Returns
    -------
    pd.DataFrame
        Rows are indexed by ``(output, rank)`` and columns by
        ``(measure, subcol)``. Cells stay NaN where fewer than ``top_n`` pairs
        exist for an output, or where an output does not occur in
        ``input_df`` at all.
    """
    outputs = ["evap-total", "potevap", "tws", "qtot", "qr"]
    stats = ["pearson", "spearman", "MIC"]
    subcols = ["input", "value"] if with_value else ["input"]

    ind_tuples = [(output, rank + 1) for output in outputs for rank in range(top_n)]
    index = pd.MultiIndex.from_tuples(ind_tuples, names=["output", "rank"])

    col_tuples = [(stat, subcol) for stat in stats for subcol in subcols]
    columns = pd.MultiIndex.from_tuples(col_tuples, names=["measure", "subcol"])

    result_df = pd.DataFrame(index=index, columns=columns)

    # A filtered table may not contain every output; `xs` would raise KeyError
    # for those, so they are skipped and their rows stay empty.
    available_outputs = set(input_df.index.get_level_values("output"))

    for output in outputs:
        if output not in available_outputs:
            continue
        df_output = input_df.xs(output, level='output')
        for stat in stats:
            # Highest raw value first; only the first `top_n` rows are reported.
            top = df_output.sort_values(by=stat, ascending=False).iloc[:top_n]
            for rank, (input_var, value) in enumerate(zip(top.index.values, top[stat].values), start=1):
                result_df.loc[(output, rank), (stat, "input")] = input_var
                if with_value:
                    result_df.loc[(output, rank), (stat, "value")] = np.round(value, 2)

    return result_df

# Global table of the top inputs per output (input names only, no values).
top_inputs_by_measure_for_each_output = make_top_n_inputs_by_measure_for_each_output(
    input_df=significant_measures_df,
    with_value=False,
)

# Save the table both as CSV and as a rendered PNG.
_global_table_csv = "../data/processed/bivariate_metrics/CWatM/summary_tables/top_inputs_by_measure_for_each_output.csv"
_global_table_png = "../reports/tables/CWatM_data/top_inputs_by_measure_for_each_output_global.png"

top_inputs_by_measure_for_each_output.to_csv(_global_table_csv)
dfi.export(
    top_inputs_by_measure_for_each_output,
    _global_table_png,
    table_conversion="matplotlib",
    dpi=300,
)

display(top_inputs_by_measure_for_each_output)

In [None]:
def make_top_n_inputs_by_measure_for_each_output_with_regions(global_df: pd.DataFrame,
                                                              region_dc_df: pd.DataFrame,
                                                              region_dw_df: pd.DataFrame,
                                                              region_wc_df: pd.DataFrame,
                                                              region_ww_df: pd.DataFrame,
                                                              top_n: int = 6,
                                                              with_value: bool = True):
    """Build the top-inputs table for the global and regional data side by side.

    Each frame is summarised with
    ``make_top_n_inputs_by_measure_for_each_output`` and labelled with its
    region name. The labelled tables are joined on their shared index, and the
    columns are arranged as ``(measure, region, subcol)`` with the measures in
    the order pearson, spearman, MIC and the regions in the order global,
    dry cold, dry warm, wet cold, wet warm.
    """
    region_order = ["global", "dry cold", "dry warm", "wet cold", "wet warm"]
    source_frames = [global_df, region_dc_df, region_dw_df, region_wc_df, region_ww_df]

    labelled_tables = []
    for region_name, frame in zip(region_order, source_frames):
        table = make_top_n_inputs_by_measure_for_each_output(frame, top_n=top_n, with_value=with_value)
        # Wrapping the single table in concat adds "region" as the outermost column level.
        labelled_tables.append(pd.concat([table], keys=[region_name], names=["region"], axis=1))

    first_table, *other_tables = labelled_tables
    combined = first_table.join(other_tables)

    # Put the measure outermost, then impose the display order on both levels.
    combined = combined.reorder_levels(["measure", "region", "subcol"], axis="columns")
    combined = combined.sort_index(axis="columns", level="measure")
    combined = combined.reindex(columns=["pearson", "spearman", "MIC"], level="measure")
    combined = combined.reindex(columns=region_order, level="region")

    return combined

# Global frame first, then the four regional frames, in the order the builder expects.
_significant_frames = (
    significant_measures_df,
    significant_measures_dc_df,
    significant_measures_dw_df,
    significant_measures_wc_df,
    significant_measures_ww_df,
)

# Same table twice: once with the measure values, once with input names only.
top_inputs_by_measure_for_each_output_with_regions_and_value = (
    make_top_n_inputs_by_measure_for_each_output_with_regions(*_significant_frames, top_n=6, with_value=True)
)
top_inputs_by_measure_for_each_output_with_regions = (
    make_top_n_inputs_by_measure_for_each_output_with_regions(*_significant_frames, top_n=6, with_value=False)
)

# Save each table as CSV and as a rendered PNG.
top_inputs_by_measure_for_each_output_with_regions_and_value.to_csv(
    "../data/processed/bivariate_metrics/CWatM/summary_tables/top_inputs_by_measure_for_each_output_with_regions_and_value.csv"
)
dfi.export(
    top_inputs_by_measure_for_each_output_with_regions_and_value,
    "../reports/tables/CWatM_data/top_inputs_by_measure_for_each_output_with_regions_and_value.png",
    table_conversion="matplotlib",
    dpi=300,
)

top_inputs_by_measure_for_each_output_with_regions.to_csv(
    "../data/processed/bivariate_metrics/CWatM/summary_tables/top_inputs_by_measure_for_each_output_with_regions.csv"
)
dfi.export(
    top_inputs_by_measure_for_each_output_with_regions,
    "../reports/tables/CWatM_data/top_inputs_by_measure_for_each_output_with_regions.png",
    table_conversion="matplotlib",
    dpi=300,
)

display(top_inputs_by_measure_for_each_output_with_regions_and_value)
display(top_inputs_by_measure_for_each_output_with_regions)

In [None]:
# The combined table is a single DataFrame, so tuple-unpacking it would iterate
# over its column labels and fail. Slice out one sub-table per region instead,
# keeping the "region" column level so the sub-tables can be joined again.
_region_labels = top_inputs_by_measure_for_each_output_with_regions_and_value.columns.get_level_values("region")
table_global, table_dc, table_dw, table_wc, table_ww = (
    top_inputs_by_measure_for_each_output_with_regions_and_value.loc[:, _region_labels == region_name]
    for region_name in ["global", "dry cold", "dry warm", "wet cold", "wet warm"]
)

In [None]:
# Join the four regional tables onto the global one; as the last expression of
# the cell, the result is displayed.
table_global.join([table_dc, table_dw, table_wc, table_ww])

In [None]:
def plot_corr_matrix(input_df):
    """Show a heatmap of the Pearson correlations between the columns of ``input_df``.

    The ``pearson`` and ``spearman`` columns are squared on a copy before the
    correlations are computed; the caller's frame is not modified.
    """
    squared_df = input_df.copy()
    for column in ("pearson", "spearman"):
        squared_df[column] = squared_df[column] ** 2

    correlations = squared_df.corr(method="pearson").round(2)
    # Fixed colour range so the heatmaps of different tables are comparable.
    px.imshow(correlations, text_auto=True, zmin=-1, zmax=+1).show()


# One heatmap for the global table, then one per region.
for _frame in (
    significant_measures_df,
    significant_measures_dc_df,
    significant_measures_dw_df,
    significant_measures_wc_df,
    significant_measures_ww_df,
):
    plot_corr_matrix(_frame)


In [None]:
def plot_scatter_matrix(input_df):
    """Show a scatter-plot matrix of every column of ``input_df``.

    One dimension is created per column and the diagonal panels are hidden.
    The hover text of each point is the index entry of the corresponding row
    of ``input_df``.
    """
    dimensions = [dict(label=col, values=input_df[col]) for col in input_df.columns]
    fig = go.Figure(
        data=go.Splom(
            dimensions=dimensions,
            diagonal_visible=False,  # remove plots on diagonal
            # showupperhalf=False,
            # Take the labels from the plotted frame itself: the global
            # `measures_df` has different rows than the filtered or regional
            # frames passed in, which would mislabel the points.
            text=input_df.index.to_list(),
        )
    )
    fig.update_layout(
        title='Measures of dependence',
        width=900,
        height=600,
        hovermode="x",
    )
    fig.show()

# One scatter-plot matrix for the global table, then one per region
# (dry cold, dry warm, wet cold, wet warm).
plot_scatter_matrix(significant_measures_df)
plot_scatter_matrix(significant_measures_dc_df)
plot_scatter_matrix(significant_measures_dw_df)
plot_scatter_matrix(significant_measures_wc_df)
plot_scatter_matrix(significant_measures_ww_df)

In [None]:
# Look up the global measures of a single (input, output) pair.
significant_measures_df.loc[("albedoWater", "potevap")]


In [None]:
def plot_values_and_rank(input_df,
                         region,
                         n_top = 20):
    """Plot measure values and ranks for one table and title the figure with ``region``.

    Delegates to ``visualize.plot_measure_values_and_rank`` with a copy of
    ``input_df``, sorted by MIC and limited to ``n_top`` entries.
    """
    # Alternative selections that were tried:
    # measures=["pearson", "spearman", "mutual information (sklearn)", "normalized mutual information", "MIC"],
    # measures=["pearson", "spearman", "normalized mutual information", "MIC"],
    # measures=["pearson", "spearman", "MIC"],
    shown_measures = ["pearson", "spearman", "MIC", "MIC - p^2", "MAS"]

    figure = visualize.plot_measure_values_and_rank(
        measures_df=input_df.copy(),
        measures=shown_measures,
        sort_values_by="MIC",
        n_top=n_top,
    )
    figure.suptitle(region)


# One figure for the global table, then one per region.
for _region_name, _frame in (
    ("global", significant_measures_df),
    ("dry cold", significant_measures_dc_df),
    ("dry warm", significant_measures_dw_df),
    ("wet cold", significant_measures_wc_df),
    ("wet warm", significant_measures_ww_df),
):
    plot_values_and_rank(_frame, region=_region_name)


In [None]:
# import plotly.express as px

# # Create a 3D scatter plot
# fig = px.scatter_3d(
#     measures_df.abs(),
#     x="MIC",
#     y="pearson",
#     z="spearman",
#     color="MCN_general",
#     title="MIC metrics",
#     opacity=0.7
# )
# fig.show()
# # # Show the plot
# # fig.show()

In [None]:
# import os
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg


# MEASURE = "MIC"
# IMAGES_FOLDER = Path("../reports/figures/CWatM_data/scatterplots")

# X = measures_df.abs().dropna()

# low_threshold = np.percentile(X[MEASURE], q=90)
# upp_threshold = np.percentile(X[MEASURE], q=100)

# top_df = X[(X[MEASURE] >= low_threshold) & (X[MEASURE] <= upp_threshold)]

# top_df = X[(X["MIC"] >= low_threshold) & (X["MIC"] <= upp_threshold)]

# print(f"Number of samples meeting the condition: {len(top_df)} out of {len(X)}")

# image_filenames = [f"{pairs[0]}_{pairs[1]}.png" for pairs in top_df.sample(n=20).index.to_list()]
# image_filenames = [IMAGES_FOLDER.joinpath(f) for f in image_filenames]

# # Create a figure and subplots
# fig, axes = plt.subplots(nrows=4, ncols=5, figsize=(12, 8))

# # Flatten the axes array for easy iteration
# axes = axes.flatten()

# # Loop through images and display them
# for idx, ax in enumerate(axes):
#     if idx < len(image_filenames):
#         # Check if file exists
#         if os.path.exists(image_filenames[idx]):
#             # Read and display image
#             img = mpimg.imread(image_filenames[idx], format="png")
#             ax.imshow(img)
#             ax.axis('off')  # Hide axes ticks
#             ax.set_title(f'Image {idx+1}')
#         else:
#             ax.text(0.5, 0.5, 'Image not found', fontsize=12, ha='center')
#             ax.axis('off')
#     else:
#         # Hide any extra subplots if fewer images
#         ax.axis('off')

# # Adjust spacing between subplots
# fig.tight_layout()

# # Show the plot
# fig.show()

In [None]:
# import os
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg


# MEASURE = "MIC"
# IMAGES_FOLDER = Path("../reports/figures/CWatM_data/scatterplots")

# X = measures_df.abs().dropna()

# low_threshold = np.percentile(X[MEASURE], q=90)
# upp_threshold = np.percentile(X[MEASURE], q=100)

# top_df = X[(X[MEASURE] >= low_threshold) & (X[MEASURE] <= upp_threshold)]

# print(f"Number of samples meeting the condition: {len(top_df)} out of {len(X)}")

# # Number of samples to display
# top_n = 10  # Adjust this number based on how many images you want per column

# # 1. Sample top_n samples
# top_df_sorted_by_MIC = top_df.sample(n=top_n, random_state=21)

# def get_image_filenames(df):
#     image_filenames = [f"{pairs[0]}_{pairs[1]}.png" for pairs in df.index.to_list()]
#     image_filenames = [IMAGES_FOLDER.joinpath(f) for f in image_filenames]
#     return image_filenames

# # 3. Sort the images according to each column
# # Since we want the same images in each column but sorted differently, we sort 'top_df_sorted_by_MIC' by different columns
# images_sorted_by_pearson = top_df_sorted_by_MIC.sort_values(by='pearson', ascending=False)
# images_sorted_by_spearman = top_df_sorted_by_MIC.sort_values(by='spearman', ascending=False)
# images_sorted_by_MIC = top_df_sorted_by_MIC.sort_values(by='MIC', ascending=False)
# images_sorted_by_MEV = top_df_sorted_by_MIC.sort_values(by='MEV', ascending=False)
# images_sorted_by_MAS = top_df_sorted_by_MIC.sort_values(by='MAS', ascending=False)
# images_sorted_by_MCN = top_df_sorted_by_MIC.sort_values(by='MCN_general', ascending=False)

# # Get image filenames for each sorting
# image_filenames_pearson = get_image_filenames(images_sorted_by_pearson)
# image_filenames_spearman = get_image_filenames(images_sorted_by_spearman)
# image_filenames_MIC = get_image_filenames(images_sorted_by_MIC)
# image_filenames_MEV = get_image_filenames(images_sorted_by_MEV)
# image_filenames_MAS = get_image_filenames(images_sorted_by_MAS)
# image_filenames_MCN = get_image_filenames(images_sorted_by_MCN)

# # 4. Plot the images in a subplot grid

# # Number of rows and columns
# nrows = top_n
# ncols = 6

# # Create a figure and subplots
# fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, nrows * 1))

# # If axes is a 1D array when nrows=1 or ncols=1, make it 2D for consistent indexing
# if nrows == 1 or ncols == 1:
#     axes = axes.reshape(nrows, ncols)

# # List of image filenames sorted by each column
# sorted_images = [image_filenames_pearson, image_filenames_spearman, image_filenames_MIC, image_filenames_MEV, image_filenames_MAS, image_filenames_MCN]
# column_titles = ['Sorted by pearson', 'Sorted by spearman', 'Sorted by MIC', 'Sorted by MEV', 'Sorted by MAS', "Sorted by MCN"]

# # Loop through columns and rows to display images
# for col in range(ncols):
#     images = sorted_images[col]
#     for row in range(nrows):
#         ax = axes[row, col]
#         img_path = images[row]
#         if os.path.exists(img_path):
#             img = mpimg.imread(img_path)
#             ax.imshow(img)
#             ax.axis('off')  # Hide axes ticks
#             # Optionally, add image title or value
#             # ax.set_title(f'Image {row+1}')
#         else:
#             ax.text(0.5, 0.5, 'Image not found', fontsize=12, ha='center')
#             ax.axis('off')
#     # Set the column title
#     axes[0, col].set_title(column_titles[col], fontsize=16)

# # Adjust spacing between subplots
# fig.tight_layout()

# # Show the plot
# plt.show()


## Find trends

In [None]:
# MEASURES = [
#     "pearson",
#     "spearman",
#     "MIC",
#     "MAS",
#     "MEV",
#     "MCN_general"
# ]

In [None]:
# import matplotlib.cm as cm
# import matplotlib.pyplot as plt
# import numpy as np

# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.metrics import silhouette_samples, silhouette_score


# range_n_clusters = [2, 3, 4, 5]

# X = measures_df[MEASURES].dropna().abs().to_numpy()

# for n_clusters in range_n_clusters:
#     # Create a subplot with 1 row and 2 columns
#     fig, (ax1, ax2) = plt.subplots(1, 2)
#     fig.set_size_inches(18, 7)

#     # The 1st subplot is the silhouette plot
#     # The silhouette coefficient can range from -1, 1 but in this example all
#     # lie within [-0.1, 1]
#     ax1.set_xlim([-0.1, 1])
#     # The (n_clusters+1)*10 is for inserting blank space between silhouette
#     # plots of individual clusters, to demarcate them clearly.
#     ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

#     # Initialize the clusterer with n_clusters value and a random generator
#     # seed of 10 for reproducibility.
#     clusterer = KMeans(n_clusters=n_clusters, init="k-means++", random_state=23)
#     # clusterer = DBSCAN()

#     cluster_labels = clusterer.fit_predict(X)

#     # The silhouette_score gives the average value for all the samples.
#     # This gives a perspective into the density and separation of the formed
#     # clusters
#     silhouette_avg = silhouette_score(X, cluster_labels)
#     print(
#         "For n_clusters =",
#         n_clusters,
#         "The average silhouette_score is :",
#         silhouette_avg,
#     )

#     # Compute the silhouette scores for each sample
#     sample_silhouette_values = silhouette_samples(X, cluster_labels)

#     y_lower = 10
#     for i in range(n_clusters):
#         # Aggregate the silhouette scores for samples belonging to
#         # cluster i, and sort them
#         ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

#         ith_cluster_silhouette_values.sort()

#         size_cluster_i = ith_cluster_silhouette_values.shape[0]
#         y_upper = y_lower + size_cluster_i

#         color = cm.nipy_spectral(float(i) / n_clusters)
#         ax1.fill_betweenx(
#             np.arange(y_lower, y_upper),
#             0,
#             ith_cluster_silhouette_values,
#             facecolor=color,
#             edgecolor=color,
#             alpha=0.7,
#         )

#         # Label the silhouette plots with their cluster numbers at the middle
#         ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

#         # Compute the new y_lower for next plot
#         y_lower = y_upper + 10  # 10 for the 0 samples

#     ax1.set_title("The silhouette plot for the various clusters.")
#     ax1.set_xlabel("The silhouette coefficient values")
#     ax1.set_ylabel("Cluster label")

#     # The vertical line for average silhouette score of all the values
#     ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

#     ax1.set_yticks([])  # Clear the yaxis labels / ticks
#     ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

#     # # 2nd Plot showing the actual clusters formed
#     # colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
#     # ax2.scatter(
#     #     X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
#     # )

#     # Labeling the clusters
#     centers = clusterer.cluster_centers_
#     # Draw white circles at cluster centers
#     ax2.scatter(
#         centers[:, 0],
#         centers[:, 1],
#         marker="o",
#         c="white",
#         alpha=1,
#         s=200,
#         edgecolor="k",
#     )

#     for i, c in enumerate(centers):
#         ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

#     ax2.set_title("The visualization of the clustered data.")
#     ax2.set_xlabel("Feature space for the 1st feature")
#     ax2.set_ylabel("Feature space for the 2nd feature")

#     plt.suptitle(
#         "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
#         % n_clusters,
#         fontsize=14,
#         fontweight="bold",
#     )

# plt.show()

In [None]:
# MEASURES = [
#     # "pearson",
#     # "spearman",
#     # "mutual information (sklearn)",
#     # "normalized mutual information",
#     "MIC",
#     "MEV",
#     "MAS",
#     "MCN_general"
# ]

In [None]:
# from sklearn.manifold import TSNE


# X = measures_df[MEASURES].dropna()

# # Initialize t-SNE with desired parameters
# tsne = TSNE(n_components=3,        # Reduce to 3 dimensions for visualization
#             perplexity=20,         # Controls the balance between local and global aspects of the data
#             early_exaggeration=12, 
#             n_iter=1000,           # Number of iterations for optimization
#             random_state=23)

# # Fit and transform the data
# X_embedded = tsne.fit_transform(X)

# # Visualize
# plt.figure(figsize=(8, 6))
# scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=X_embedded[:, 2], cmap='viridis', alpha=0.7)
# plt.legend(*scatter.legend_elements(), title="Clusters")
# plt.title("t-SNE Visualization of High-Dimensional Data")
# plt.xlabel("t-SNE Feature 1")
# plt.ylabel("t-SNE Feature 2")
# plt.show()