---

# Livrable 1

---

In [None]:
import os 

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from itertools import combinations
from math import log

import scipy.stats as scs
from statsmodels.sandbox.stats.multicomp import multipletests

import warnings
warnings.filterwarnings('ignore')

### Étape 0: Chargement des données Aging_INDICES et du dossier Tractometry_allSubs

In [None]:
# Load the per-subject table; later cells read its 'sub' and 'group' columns.
Aging_INDICES = pd.read_csv('Aging_INDICES.csv', sep=",")

In [None]:
# Folder in which segmentation() looks for '<sub>_<metric>_tractometry.csv' files.
tracto_folder_path = './Tractometry_allSubs/'

### Étape 1: Filtrage des groupes "jeune" et "vieux"

In [None]:
# Young cohort: rows labelled "Y" in 'group', and their subject identifiers
data_young_sub = Aging_INDICES.loc[Aging_INDICES['group'] == 'Y']
list_young_sub = data_young_sub.loc[:, "sub"].tolist()

# Old cohort: rows labelled "O" in 'group', and their subject identifiers
data_old_sub = Aging_INDICES.loc[Aging_INDICES['group'] == 'O']
list_old_sub = data_old_sub.loc[:, "sub"].tolist()

### Étape 2: Fonction de segmentation

In [None]:
def segmentation(dti_metric, list_sub, folder_path=None):
    """Collect the CC_4 and CC_5 tractometry profiles of several subjects.

    For every subject the semicolon-separated file
    ``<sub>_<dti_metric>_tractometry.csv`` is read and its ``CC_4`` and
    ``CC_5`` columns are extracted.

    A subject is kept only when the file exists AND both columns are present,
    so the two returned lists always have the same length and stay aligned
    entry by entry. Skipped subjects are reported with a printed message.

    Args:
        dti_metric: Metric name used in the file name (e.g. 'FA', 'MD').
        list_sub: Iterable of subject identifiers.
        folder_path: Folder containing the CSV files. When omitted, the
            module-level ``tracto_folder_path`` is used.

    Returns:
        Tuple ``(list_CC4_val, list_CC5_val)``: two lists of NumPy arrays,
        one array per retained subject.
    """
    if folder_path is None:
        folder_path = tracto_folder_path

    list_CC4_val, list_CC5_val = [], []

    for sub in list_sub:
        file_path = os.path.join(folder_path, f'{sub}_{dti_metric}_tractometry.csv')

        try:
            data_sub = pd.read_csv(file_path, sep=";")
            # Resolve BOTH columns before appending anything: appending CC_4
            # first would leave the lists with different lengths whenever
            # only CC_5 is missing.
            cc4_values = data_sub['CC_4'].values
            cc5_values = data_sub['CC_5'].values
        except FileNotFoundError:
            print(f"Fichier {file_path} introuvable.")
            continue
        except KeyError:
            print(f"Colonnes 'CC_4' ou 'CC_5' manquantes dans {file_path}.")
            continue

        list_CC4_val.append(cc4_values)
        list_CC5_val.append(cc5_values)

    # NOTE(review): skipped subjects make the result shorter than list_sub;
    # callers that pair rows with another table by position should check this.
    return list_CC4_val, list_CC5_val

### Étape 3: Extraction pour chaque variable

In [None]:
# Each statement calls segmentation() for the young cohort, then the old one,
# and unpacks the (CC4, CC5) pair returned for each cohort.

# FA profiles
(list_CC4_FA_young_val, list_CC5_FA_young_val), (list_CC4_FA_old_val, list_CC5_FA_old_val) = (
    segmentation('FA', cohort) for cohort in (list_young_sub, list_old_sub)
)

# MD profiles
(list_CC4_MD_young_val, list_CC5_MD_young_val), (list_CC4_MD_old_val, list_CC5_MD_old_val) = (
    segmentation('MD', cohort) for cohort in (list_young_sub, list_old_sub)
)

# RD profiles
(list_CC4_RD_young_val, list_CC5_RD_young_val), (list_CC4_RD_old_val, list_CC5_RD_old_val) = (
    segmentation('RD', cohort) for cohort in (list_young_sub, list_old_sub)
)

# AD profiles
(list_CC4_AD_young_val, list_CC5_AD_young_val), (list_CC4_AD_old_val, list_CC5_AD_old_val) = (
    segmentation('AD', cohort) for cohort in (list_young_sub, list_old_sub)
)

### Étape 4: Fonction de binning des données

In [None]:
def bin_data(data, num_bins):
    """Average each subject's profile over ``num_bins`` consecutive chunks.

    Every entry of ``data`` is converted to a float64 array, cut into
    ``num_bins`` chunks with ``np.array_split`` (chunk sizes may differ by
    one), and each chunk is replaced by its mean.

    Returns:
        NumPy array built from one list of chunk means per subject.
    """
    rows = [
        [np.mean(chunk)
         for chunk in np.array_split(np.array(profile, dtype=np.float64), num_bins)]
        for profile in data
    ]
    return np.array(rows)

In [None]:
num_bins = 12

# Each statement bins the young cohort first, then the old cohort.

# FA, segment CC4
list_CC4_FA_young_binned, list_CC4_FA_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC4_FA_young_val, list_CC4_FA_old_val)
)

# FA, segment CC5
list_CC5_FA_young_binned, list_CC5_FA_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC5_FA_young_val, list_CC5_FA_old_val)
)

# MD, segment CC4
list_CC4_MD_young_binned, list_CC4_MD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC4_MD_young_val, list_CC4_MD_old_val)
)

# MD, segment CC5
list_CC5_MD_young_binned, list_CC5_MD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC5_MD_young_val, list_CC5_MD_old_val)
)

# RD, segment CC4
list_CC4_RD_young_binned, list_CC4_RD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC4_RD_young_val, list_CC4_RD_old_val)
)

# RD, segment CC5
list_CC5_RD_young_binned, list_CC5_RD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC5_RD_young_val, list_CC5_RD_old_val)
)

# AD, segment CC4
list_CC4_AD_young_binned, list_CC4_AD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC4_AD_young_val, list_CC4_AD_old_val)
)

# AD, segment CC5
list_CC5_AD_young_binned, list_CC5_AD_old_binned = (
    bin_data(profiles, num_bins) for profiles in (list_CC5_AD_young_val, list_CC5_AD_old_val)
)


In [None]:
def create_combined_dataframe(list_young_binned, list_old_binned, variable_name, cc_name):
    """Stack the young and old binned arrays into one labelled DataFrame.

    Young rows come first, followed by old rows. Bin columns are named
    'Bin_1', 'Bin_2', ...; three extra columns record the cohort ('Group'),
    the metric ('Variable') and the segment ('CC_Segment').
    """
    stacked = np.vstack([list_young_binned, list_old_binned])
    bin_labels = [f'Bin_{idx}' for idx in range(1, stacked.shape[1] + 1)]
    frame = pd.DataFrame(stacked, columns=bin_labels)

    n_young = len(list_young_binned)
    n_old = len(list_old_binned)
    frame['Group'] = ['Young'] * n_young + ['Old'] * n_old

    frame['Variable'] = variable_name
    frame['CC_Segment'] = cc_name
    return frame

# One labelled frame per (metric, segment) pair, then a single long table.

# FA, CC4
df_CC4_FA_combined = create_combined_dataframe(
    list_young_binned=list_CC4_FA_young_binned,
    list_old_binned=list_CC4_FA_old_binned,
    variable_name='FA',
    cc_name='CC4',
)

# FA, CC5
df_CC5_FA_combined = create_combined_dataframe(
    list_young_binned=list_CC5_FA_young_binned,
    list_old_binned=list_CC5_FA_old_binned,
    variable_name='FA',
    cc_name='CC5',
)

# MD, CC4
df_CC4_MD_combined = create_combined_dataframe(
    list_young_binned=list_CC4_MD_young_binned,
    list_old_binned=list_CC4_MD_old_binned,
    variable_name='MD',
    cc_name='CC4',
)

# MD, CC5
df_CC5_MD_combined = create_combined_dataframe(
    list_young_binned=list_CC5_MD_young_binned,
    list_old_binned=list_CC5_MD_old_binned,
    variable_name='MD',
    cc_name='CC5',
)

# RD, CC4
df_CC4_RD_combined = create_combined_dataframe(
    list_young_binned=list_CC4_RD_young_binned,
    list_old_binned=list_CC4_RD_old_binned,
    variable_name='RD',
    cc_name='CC4',
)

# RD, CC5
df_CC5_RD_combined = create_combined_dataframe(
    list_young_binned=list_CC5_RD_young_binned,
    list_old_binned=list_CC5_RD_old_binned,
    variable_name='RD',
    cc_name='CC5',
)

# AD, CC4
df_CC4_AD_combined = create_combined_dataframe(
    list_young_binned=list_CC4_AD_young_binned,
    list_old_binned=list_CC4_AD_old_binned,
    variable_name='AD',
    cc_name='CC4',
)

# AD, CC5
df_CC5_AD_combined = create_combined_dataframe(
    list_young_binned=list_CC5_AD_young_binned,
    list_old_binned=list_CC5_AD_old_binned,
    variable_name='AD',
    cc_name='CC5',
)

# Long table holding every metric/segment combination, re-indexed from 0
df_all_combined = pd.concat(
    [
        df_CC4_FA_combined, df_CC5_FA_combined,
        df_CC4_MD_combined, df_CC5_MD_combined,
        df_CC4_RD_combined, df_CC5_RD_combined,
        df_CC4_AD_combined, df_CC5_AD_combined,
    ],
    ignore_index=True,
)

### Étape 5: Création du jeu de données pour un segment et une variable

##### Cette étape consiste à filtrer les données en fonction d'un **segment** (par exemple, **CC4** ou **CC5**) et d'une **variable** (par exemple, **FA**, **MD**, **RD**, ou **AD**). Le but est de constituer un sous-ensemble de données spécifiques à cette combinaison afin de préparer les données pour une analyse ultérieure, telle que le clustering.

In [None]:
# Keep only the FA rows of segment CC4 ...
df_filtered = df_all_combined.loc[
    (df_all_combined['Variable'] == 'FA') & (df_all_combined['CC_Segment'] == 'CC4')
]
# ... then strip the label columns and renumber the rows from 0
df_filtered_clean = df_filtered.drop(columns=['Variable', 'CC_Segment', 'Group']).reset_index(drop=True)

In [None]:
# Put the bin columns next to the per-subject table; rows are paired by index label.
# NOTE(review): the bin rows are stacked young-first then old by
# create_combined_dataframe, and subjects skipped by segmentation() are absent.
# Confirm that Aging_INDICES has the same row order and length.
df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1)

# Remove identifiers, labels and the indices excluded from this analysis
df_used = df_used.drop(columns=[
    'sub', 'sex', 'group', 'age',
    'FAsigCC4', 'MDsigCC4', 'RDsigCC4',
    'FAsigCC5', 'ADsigCC5', 'RDsigCC5',
    'LH_GMvM1', 'LH_GMvS1', 'RH_GMvM1', 'RH_GMvS1',
    'GMt_M1', 'GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1',
])
df_used = df_used.drop(columns=['TGMv_all', 'ctxGMv_all', 'LH_GMv_all', 'RH_GMv_all'])
df_used

### Mann-Whitney U Test with FDR Correction on Imputed Data

In [None]:
def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    median_value = series.median()
    return series.fillna(median_value)

# Replace missing values of float64/int64 columns by the column median;
# columns of any other dtype are passed through unchanged.
df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)

# The group masks do not depend on the column being tested: build them once.
is_young = Aging_INDICES["group"] == "Y"
is_old = Aging_INDICES["group"] == "O"

results = []
p_values = []

for col in df_used.columns:
    col1 = df_used[col][is_young]  # group Y (young)
    col2 = df_used[col][is_old]    # group O (old)

    # Mann-Whitney U test. The alternative is stated explicitly because the
    # SciPy default for this argument has not been the same in every release,
    # which would make the p-values depend on the installed version.
    test_result = scs.mannwhitneyu(col1, col2, alternative='two-sided')

    results.append(test_result[0].round(4))  # U statistic
    p_values.append(test_result[1])          # raw p-value

# Multiple-testing correction (Benjamini-Hochberg FDR) at alpha = 5% and 1%
c1 = multipletests(p_values, alpha=0.05, method='fdr_bh')
c2 = multipletests(p_values, alpha=0.01, method='fdr_bh')

res = pd.DataFrame({
    "col_name": df_used.columns,      # tested column
    "U-stat": results,                # U statistics
    "pvalue": p_values,               # raw p-values
    "adjusted_p_value": c1[1],        # second element of the alpha = 5% result
    "sign_5%": c1[0],                 # first element of the alpha = 5% result
    "sign_1%": c2[0]                  # first element of the alpha = 1% result
})

res


---

## Commentaire : 
### Daniela
..............................................................................................

..............................................................................................

..............................................................................................

..............................................................................................

### Youssef

**I still need to complete the study for 12 bins.**

**I still need to complete the study on the variables -> I will replace the bins with statistical variables instead**

---

# Livrable 2

---

## 1. **Préparation des données**

### a. Importation des librairies nécessaires

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture


### b. Chargement et visualisation des données

In [None]:
# Preview the first rows of the feature table built in the previous section.
df_used.head()

## 2. Visualisation de la distribution des variables

In [None]:
# One 20-bin histogram per column of df_used, drawn on a 15x10 figure.
df_used.hist(figsize=(15, 10), bins=20)
plt.tight_layout()
plt.show()

## 3. **Analyse de clustering**

### a. Standardisation des données

In [None]:
# Standardise every column of df_used with a fitted StandardScaler
scaler = StandardScaler().fit(df_used)
df_scaled = scaler.transform(df_used)
# DataFrame view of the scaled values, keeping the original column names
df_scaled_df = pd.DataFrame(data=df_scaled, columns=df_used.columns)
df_scaled_df.head()

In [None]:
# Show the unscaled table again (df_used itself is not modified by the scaling cell).
df_used.head()

### b. Clustering avec **K-means**

1. **Choix du nombre de clusters (K)** avec l'inertie et le score de silhouette.

In [None]:
# Choice of the number of clusters K: record inertia and silhouette for K = 2..10
inertia = []
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    # n_init is set explicitly: its default is not the same in every
    # scikit-learn release, and the warning about it is hidden by the
    # notebook's global warnings filter, so results could silently differ
    # from one environment to another.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(df_scaled)
    inertia.append(kmeans.inertia_)  # inertia of the fitted model
    silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))  # silhouette score

In [None]:
# Elbow method: plot the inertia recorded for each K in K_range
plt.figure(figsize=(10,5))
plt.plot(K_range, inertia, marker='o')
plt.title("Méthode du coude : Inertie vs Nombre de Clusters (K)")
plt.xlabel("Nombre de Clusters (K)")
plt.ylabel("Inertie")
plt.show()

In [None]:
# Plot the silhouette score recorded for each K in K_range
plt.figure(figsize=(10,5))
plt.plot(K_range, silhouette_scores, marker='o')
plt.title("Score de Silhouette vs Nombre de Clusters (K)")
plt.xlabel("Nombre de Clusters (K)")
plt.ylabel("Score de Silhouette")
plt.show()

2. **Exécution du clustering avec K-means**

In [None]:
# Apply K-means with K=3.
# n_init is set explicitly because its default depends on the scikit-learn
# release (the corresponding warning is hidden by the global warnings filter).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(df_scaled)
# Store the label of each row next to its features.
# NOTE: df_used now carries a 'Cluster' column; re-running the scaling cell
# afterwards would treat it as a feature.
df_used['Cluster'] = kmeans.labels_
df_used

## 4. **Interprétation des clusters**

### a. Analyse des variables par cluster (K-means)

In [None]:
# Per-cluster mean of every column (K-means labels), shown with one row per variable
cluster_stats = df_used.groupby('Cluster').agg('mean')
cluster_stats.transpose()

### b. Visualisation des clusters

In [None]:
# Pair each subject's recorded group with its cluster label, ordered by cluster
df_clusters = pd.DataFrame(
    {'actual': Aging_INDICES["group"], 'predicted': df_used['Cluster']}
).sort_values(by='predicted')

# Contingency table: clusters as rows, recorded groups as columns
group_distribution = pd.crosstab(index=df_clusters['predicted'], columns=df_clusters['actual'])
group_distribution


In [None]:
# Stacked bar chart of the contingency table: one bar per cluster, one colour per group
group_distribution.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
plt.title("Répartition des groupes Young et Old dans chaque Cluster (K-means)")
plt.xlabel("Clusters")
plt.ylabel("Nombre d'individus")
plt.legend(title="Groupes")
plt.show()

## Code complet pour appliquer sur toutes les combinaisons 

In [None]:
# Metrics and segments to process
variables = ['FA', 'MD', 'RD', 'AD']
segments = ['CC4', 'CC5']
# Maps '<variable>_<segment>' to the cluster-vs-group contingency table
cluster_results = {}

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    return series.fillna(series.median())

for variable in variables:
    for segment in segments:

        # Bin columns of this metric/segment, renumbered from 0
        df_filtered = df_all_combined[(df_all_combined['Variable'] == variable) & (df_all_combined['CC_Segment'] == segment)]
        df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        df_filtered_clean = df_filtered_clean.reset_index(drop=True)

        # Rows are paired with Aging_INDICES by index label.
        # NOTE(review): the bin rows are stacked young-first then old; confirm
        # Aging_INDICES follows the same row order.
        df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
        df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)

        # NOTE(review): unlike the single-combination analysis, 'TGMv_all',
        # 'ctxGMv_all', 'LH_GMv_all' and 'RH_GMv_all' are kept here - confirm
        # this is intended.
        columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4',
                           'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1',
                           'RH_GMvM1', 'RH_GMvS1', 'GMt_M1','GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1']
        df_used = df_used.drop(columns=columns_to_drop, errors='ignore')

        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_used)
        df_scaled_df = pd.DataFrame(df_scaled, columns=df_used.columns)

        # Inertia / silhouette for K = 2..10. These lists are rebuilt for every
        # combination and are not read again inside this cell.
        inertia = []
        silhouette_scores = []
        K_range = range(2, 11)

        for k in K_range:
            # n_init is explicit: its default depends on the scikit-learn
            # release and the related warning is silenced in this notebook.
            kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
            kmeans.fit(df_scaled)
            inertia.append(kmeans.inertia_)  # inertia
            silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))  # silhouette score

        # Final model with a fixed K = 3
        kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
        kmeans.fit(df_scaled)

        df_used['Cluster'] = kmeans.labels_
        df_clusters = pd.DataFrame({
            'actual': Aging_INDICES["group"],
            'predicted': df_used['Cluster']
        })
        df_clusters = df_clusters.sort_values(by='predicted')

        # Contingency table: clusters as rows, recorded groups as columns
        group_distribution = pd.crosstab(df_clusters['predicted'], df_clusters['actual'])
        cluster_results[f'{variable}_{segment}'] = group_distribution

        group_distribution.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
        plt.title(f"Répartition des groupes Young et Old dans chaque Cluster (K-means) pour {variable} et {segment}")
        plt.xlabel("Clusters")
        plt.ylabel("Nombre d'individus")
        plt.legend(title="Groupes")
        plt.show()

# Text summary of every contingency table
for key, result in cluster_results.items():
    print(f"\nRépartition pour {key}:\n")
    print(result)


## Hierarchical Clustering : Agglomerative 

In [None]:
# Metrics and segments to process
variables = ['FA', 'MD', 'RD', 'AD']
segments = ['CC4', 'CC5']
# Maps '<variable>_<segment>' to the cluster-vs-group contingency table
cluster_results = {}

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    return series.fillna(series.median())

for variable in variables:
    for segment in segments:

        # Rows of the long table for this metric/segment
        df_filtered = df_all_combined[(df_all_combined['Variable'] == variable) & (df_all_combined['CC_Segment'] == segment)]
        
        # Keep only the bin columns and renumber the rows from 0
        df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        df_filtered_clean = df_filtered_clean.reset_index(drop=True)
        
        # Side-by-side concatenation; rows are paired by index label
        df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
        
        # Median imputation of float64/int64 columns only
        df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)
        
        # errors='ignore' below: names absent from df_used are skipped silently
        columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4', 
                           'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1', 
                           'RH_GMvM1', 'RH_GMvS1', 'GMt_M1','GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1']
        df_used = df_used.drop(columns=columns_to_drop, errors='ignore')
        
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_used)
        df_scaled_df = pd.DataFrame(df_scaled, columns=df_used.columns)
        
        # Silhouette score for each candidate number of clusters (2..10)
        silhouette_scores = []
        K_range = range(2, 11)
        
        for k in K_range:
            agg_clustering = AgglomerativeClustering(n_clusters=k)
            labels = agg_clustering.fit_predict(df_scaled)
            
            # The silhouette needs at least two distinct labels; -1 is
            # recorded otherwise so that this k can never win the argmax
            if len(set(labels)) > 1:  
                silhouette_avg = silhouette_score(df_scaled, labels)
                silhouette_scores.append(silhouette_avg)
            else:
                silhouette_scores.append(-1) 
        
        best_k = K_range[np.argmax(silhouette_scores)]
        print(f"Meilleur nombre de clusters pour {variable} et {segment}: {best_k}")
        
        # NOTE(review): best_k is only printed; the final model below uses a
        # hard-coded 4 clusters - confirm this is intended.
        agg_clustering = AgglomerativeClustering(n_clusters=4)
        labels = agg_clustering.fit_predict(df_scaled)
        df_used['Cluster'] = labels
        
        # Recorded group vs assigned cluster, ordered by cluster
        df_clusters = pd.DataFrame({
            'actual': Aging_INDICES["group"], 
            'predicted': df_used['Cluster']    
        })
        df_clusters = df_clusters.sort_values(by='predicted')
        
        # Contingency table: clusters as rows, recorded groups as columns
        group_distribution = pd.crosstab(df_clusters['predicted'], df_clusters['actual'])
        cluster_results[f'{variable}_{segment}'] = group_distribution
        
        group_distribution.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
        plt.title(f"Répartition des groupes Young et Old dans chaque Cluster (Agglomerative) pour {variable} et {segment}")
        plt.xlabel("Clusters")
        plt.ylabel("Nombre d'individus")
        plt.legend(title="Groupes")
        plt.show()

# Text summary of every contingency table
for key, result in cluster_results.items():
    print(f"\nRépartition pour {key}:\n")
    print(result)

## GaussianMixture Clustering 

In [None]:
from sklearn.mixture import GaussianMixture

# Metrics and segments to process
variables = ['FA', 'MD', 'RD', 'AD']
segments = ['CC4', 'CC5']
# Number of mixture components used for the final model of each combination
n_clusters = 3
# Maps '<variable>_<segment>' to the cluster-vs-group contingency table
cluster_results = {}

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    return series.fillna(series.median())

for variable in variables:
    for segment in segments:

        # Bin columns of this metric/segment, renumbered from 0, placed next
        # to the per-subject table (rows paired by index label)
        df_filtered = df_all_combined[(df_all_combined['Variable'] == variable) & (df_all_combined['CC_Segment'] == segment)]
        df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        df_filtered_clean = df_filtered_clean.reset_index(drop=True)
        df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
        # Median imputation of float64/int64 columns only
        df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)
        # errors='ignore' below: names absent from df_used are skipped silently
        columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4', 
                           'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1', 
                           'RH_GMvM1', 'RH_GMvS1', 'GMt_M1','GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1']
        df_used = df_used.drop(columns=columns_to_drop, errors='ignore')
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_used)
        df_scaled_df = pd.DataFrame(df_scaled, columns=df_used.columns)
        
        # Silhouette score for 2..10 components. 'inertia' stays empty in this
        # cell, and silhouette_scores is not read again inside this cell.
        inertia = []
        silhouette_scores = []
        K_range = range(2, 11)
        
        for k in K_range:
            gmm = GaussianMixture(n_components=k, random_state=42)
            gmm.fit(df_scaled)
            labels = gmm.predict(df_scaled)
            silhouette_scores.append(silhouette_score(df_scaled, labels))  # silhouette score
        
        # Final model with the fixed number of components set above
        gmm = GaussianMixture(n_components=n_clusters, random_state=42)
        gmm.fit(df_scaled)
        df_used['Cluster'] = gmm.predict(df_scaled)
        
        # Recorded group vs assigned component, ordered by component
        df_clusters = pd.DataFrame({
            'actual': Aging_INDICES["group"], 
            'predicted': df_used['Cluster']    
        })
        df_clusters = df_clusters.sort_values(by='predicted')
        # Contingency table: components as rows, recorded groups as columns
        group_distribution = pd.crosstab(df_clusters['predicted'], df_clusters['actual'])
        cluster_results[f'{variable}_{segment}'] = group_distribution
        
        group_distribution.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
        plt.title(f"Répartition des groupes Young et Old dans chaque Cluster (Gaussian Mixture) pour {variable} et {segment}")
        plt.xlabel("Clusters")
        plt.ylabel("Nombre d'individus")
        plt.legend(title="Groupes")
        plt.show()

"""

for key, result in cluster_results.items():
    print(f"\nRépartition pour {key}:\n")
    print(result)

"""

## DBSCAN and Spectral Clustering

In [None]:
from sklearn.cluster import DBSCAN, SpectralClustering
from sklearn.neighbors import NearestNeighbors

# Metrics and segments to process
variables = ['FA', 'MD', 'RD', 'AD']
segments = ['CC4', 'CC5']
# NOTE: nothing in this cell stores a result in cluster_results; it stays empty here.
cluster_results = {}

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    median_value = series.median()
    return series.fillna(median_value)

def dbscan_clustering(df_scaled, epsilon=0.5, min_samples=5):
    """Run DBSCAN on ``df_scaled`` and return the label of each row.

    ``epsilon`` is forwarded as ``eps`` and ``min_samples`` as ``min_samples``.
    """
    return DBSCAN(eps=epsilon, min_samples=min_samples).fit_predict(df_scaled)

def spectral_clustering(df_scaled, n_clusters=10):
    """Run Spectral Clustering on ``df_scaled`` and return the label of each row.

    The model uses the 'nearest_neighbors' affinity and a fixed random seed (42).
    """
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity='nearest_neighbors',
        random_state=42,
    )
    return model.fit_predict(df_scaled)

def plot_cluster_distribution(df_clusters, variable, segment, method):
    """Plot and return the cluster-vs-group contingency table.

    ``df_clusters`` must provide the columns 'predicted' (cluster label) and
    'actual' (recorded group). The table is drawn as a stacked bar chart whose
    title mentions ``method``, ``variable`` and ``segment``.
    """
    counts = pd.crosstab(df_clusters['predicted'], df_clusters['actual'])
    axes = counts.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
    axes.set_title(f"Répartition des groupes Young et Old dans chaque Cluster ({method}) pour {variable} et {segment}")
    axes.set_xlabel("Clusters")
    axes.set_ylabel("Nombre d'individus")
    axes.legend(title="Groupes")
    plt.show()
    return counts

# For every metric/segment pair: build the scaled feature table, draw the
# sorted neighbour-distance curve, then run DBSCAN and Spectral Clustering
# and plot how the recorded groups spread over the resulting clusters.
for variable in variables:
    for segment in segments:

        # Bin columns of this metric/segment, renumbered from 0, placed next
        # to the per-subject table (rows paired by index label)
        df_filtered = df_all_combined[(df_all_combined['Variable'] == variable) & (df_all_combined['CC_Segment'] == segment)]
        df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        df_filtered_clean = df_filtered_clean.reset_index(drop=True)
        df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
        # Median imputation of float64/int64 columns only
        df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)
        # errors='ignore' below: names absent from df_used are skipped silently
        columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4', 
                           'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1', 
                           'RH_GMvM1', 'RH_GMvS1', 'GMt_M1', 'GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1']
        df_used = df_used.drop(columns=columns_to_drop, errors='ignore')
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_used)

        # Sorted distance curve used to choose epsilon by eye
        neighbors = NearestNeighbors(n_neighbors=5)
        neighbors_fit = neighbors.fit(df_scaled)
        distances, indices = neighbors_fit.kneighbors(df_scaled)
        # NOTE(review): the query points are the fitted points themselves, so
        # column 0 is presumably each point's zero distance to itself and
        # column 4 the 4th other neighbour - confirm against the y-axis label.
        distances = np.sort(distances[:, 4])  # 4 is n_neighbors - 1
        plt.figure(figsize=(10, 6))
        plt.plot(distances)
        plt.title(f"Courbe pour déterminer epsilon pour {variable} et {segment}")
        plt.xlabel("Points triés par distance")
        plt.ylabel("Distance au 5e plus proche voisin")
        plt.show()

        epsilon = 0.5  # to be tuned from the distance curve
        min_samples = 5
        labels_dbscan = dbscan_clustering(df_scaled, epsilon, min_samples)
        
        # Spectral Clustering
        n_clusters = 4  # to be tuned as needed
        labels_spectral = spectral_clustering(df_scaled, n_clusters)

        df_used['DBSCAN_Cluster'] = labels_dbscan
        df_used['Spectral_Cluster'] = labels_spectral

        # Rows whose label is -1 are excluded before counting clusters
        valid_clusters = df_used[df_used['DBSCAN_Cluster'] != -1]
        valid_spectral = df_used[df_used['Spectral_Cluster'] != -1]
        
        # Plot only when more than one distinct label remains
        if len(set(valid_clusters['DBSCAN_Cluster'])) > 1:
            df_clusters_dbscan = pd.DataFrame({
                'actual': Aging_INDICES.loc[valid_clusters.index, "group"],
                'predicted': valid_clusters['DBSCAN_Cluster']    
            })
            # The returned table is kept in a variable but not stored in cluster_results
            group_distribution_dbscan = plot_cluster_distribution(df_clusters_dbscan, variable, segment, method='DBSCAN')

        if len(set(valid_spectral['Spectral_Cluster'])) > 1:
            df_clusters_spectral = pd.DataFrame({
                'actual': Aging_INDICES.loc[valid_spectral.index, "group"],
                'predicted': valid_spectral['Spectral_Cluster']    
            })
            # The returned table is kept in a variable but not stored in cluster_results
            group_distribution_spectral = plot_cluster_distribution(df_clusters_spectral, variable, segment, method='Spectral Clustering')

"""

for key, result in cluster_results.items():
    print(f"\nRépartition pour {key}:\n")
    print(result)
    
"""


## Commentaire : 

..............................................................................................

..............................................................................................

..............................................................................................

..............................................................................................

### Youssef

**I still need to conduct a study without the bins, using clusters, to see the impact of the dataframe bins on the 8 selected variables.**

---

# Livrable 3

---

In [None]:
# Metric and segment to process
variables = ['FA']
segments = ['CC5']
n_clusters = 3
# Maps '<variable>_<segment>' to the cluster-vs-group contingency table
cluster_results = {}

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    return series.fillna(series.median())

for variable in variables:
    for segment in segments:

        # Bin columns of this metric/segment, renumbered from 0, placed next
        # to the per-subject table (rows paired by index label)
        df_filtered = df_all_combined[(df_all_combined['Variable'] == variable) & (df_all_combined['CC_Segment'] == segment)]
        df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        df_filtered_clean = df_filtered_clean.reset_index(drop=True)
        df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
        df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)
        columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4',
                           'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1',
                           'RH_GMvM1', 'RH_GMvS1', 'GMt_M1','GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1']
        df_used = df_used.drop(columns=columns_to_drop, errors='ignore')
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_used)
        df_scaled_df = pd.DataFrame(df_scaled, columns=df_used.columns)

        # Inertia / silhouette for K = 2..10 (not read again inside this cell)
        inertia = []
        silhouette_scores = []
        K_range = range(2, 11)

        for k in K_range:
            # n_init is explicit: its default depends on the scikit-learn
            # release and the related warning is silenced in this notebook.
            kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
            kmeans.fit(df_scaled)
            inertia.append(kmeans.inertia_)  # inertia
            silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))  # silhouette score

        # Final model with the fixed number of clusters set above
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(df_scaled)

        df_used['Cluster'] = kmeans.labels_
        df_clusters = pd.DataFrame({
            'actual': Aging_INDICES["group"],
            'predicted': df_used['Cluster']
        })
        df_clusters = df_clusters.sort_values(by='predicted')
        group_distribution = pd.crosstab(df_clusters['predicted'], df_clusters['actual'])
        cluster_results[f'{variable}_{segment}'] = group_distribution

        group_distribution.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')
        plt.title(f"Répartition des groupes Young et Old dans chaque Cluster (K-means) pour {variable} et {segment}")
        plt.xlabel("Clusters")
        plt.ylabel("Nombre d'individus")
        plt.legend(title="Groupes")
        plt.show()

        # Put the identifying columns back in front of the features: the
        # resulting column order is sub, sex, group, <features>, Cluster.
        df_used = pd.concat([Aging_INDICES[['sub', 'sex', 'group']], df_used], axis=1, ignore_index=False)
        for cluster_num in range(n_clusters):  # one table per cluster
            print(f"\nDataFrame pour Cluster {cluster_num} ({variable}, {segment}):")
            # 'sex' and 'group' are already present and already in front, so
            # the rows are selected directly instead of dropping those columns
            # and merging them back on 'sub'. The index is renumbered from 0,
            # as the merge used to do. (Assumes 'sub' is unique in Aging_INDICES.)
            cluster_data = df_used[df_used['Cluster'] == cluster_num].reset_index(drop=True)
            display(cluster_data)

"""
# Affichage des résultats de la répartition des clusters
for key, result in cluster_results.items():
    print(f"\nRépartition pour {key}:\n")
    print(result)
"""


## ❓ Question: Quelles caractéristiques contribuent le plus à la séparation entre les clusters ?

In [None]:
pip install shap

In [None]:
import shap
import seaborn as sns

def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    median_value = series.median()
    return series.fillna(median_value)

def SHAP_features_importance(stat_df, kmeans, scaler):
    """Rank features by mean absolute SHAP value for a fitted K-means model.

    The explained function is ``kmeans.transform`` applied to the scaled data,
    i.e. one output per cluster. A bar chart of the per-feature importance and
    a SHAP summary plot are displayed.

    Args:
        stat_df: DataFrame of unscaled features (its columns name the features).
        kmeans: Fitted KMeans model.
        scaler: Fitted scaler used to transform ``stat_df``.

    Raises:
        ValueError: If the number of importances differs from the number of
            columns of ``stat_df``.
    """
    def model_output(data):
        return kmeans.transform(data)

    stat_df_scaled = scaler.transform(stat_df)
    # The whole scaled data set is used both as background and as the
    # samples to explain.
    explainer = shap.KernelExplainer(model_output, stat_df_scaled)
    shap_values = explainer.shap_values(stat_df_scaled)

    # Depending on the shap release, a multi-output model yields either a list
    # of (n_samples, n_features) arrays - one per output - or one
    # (n_samples, n_features, n_outputs) array. Bring both to the 3-D layout
    # so that the averaging below always runs over samples, then outputs.
    if isinstance(shap_values, list):
        shap_array = np.stack(shap_values, axis=-1)
    else:
        shap_array = np.asarray(shap_values)
    if shap_array.ndim == 2:
        # Single output: add a trailing output axis of length 1
        shap_array = shap_array[:, :, np.newaxis]

    feature_importance = np.mean(np.abs(shap_array), axis=0).mean(axis=1)
    if len(feature_importance) != stat_df.shape[1]:
        raise ValueError(f"La longueur des SHAP values ({len(feature_importance)}) ne correspond pas au nombre de colonnes du DataFrame ({stat_df.shape[1]}).")
    feature_importance_df = pd.DataFrame({
        'feature': stat_df.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance_df)
    plt.title('SHAP Feature Importance for KMeans Clustering')
    plt.show()

    # The summary plot receives the values exactly as returned by the explainer.
    shap.summary_plot(shap_values, features=stat_df_scaled, feature_names=stat_df.columns)

def run_shap_analysis(df_combined, Aging_INDICES, variable, segment, n_clusters=3):
    """Fit K-means for one metric/segment pair and show SHAP feature importances.

    Args:
        df_combined: Long table with bin columns plus 'Variable', 'CC_Segment'
            and 'Group' columns.
        Aging_INDICES: Per-subject table concatenated column-wise with the bins.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.
        n_clusters: Number of K-means clusters.
    """
    df_filtered = df_combined[(df_combined['Variable'] == variable) & (df_combined['CC_Segment'] == segment)]
    df_filtered_clean = df_filtered.drop(['Variable', 'CC_Segment', 'Group'], axis=1).reset_index(drop=True)
    # Rows are paired by index label
    df_used = pd.concat([Aging_INDICES, df_filtered_clean], axis=1, ignore_index=False)
    # Median imputation of float64/int64 columns only
    df_used = df_used.apply(lambda col: impute_median(col) if col.dtype in ['float64', 'int64'] else col)
    # errors='ignore' below: names absent from df_used are skipped silently
    columns_to_drop = ['sub', 'sex', 'group', 'age', 'FAsigCC4', 'MDsigCC4', 'RDsigCC4',
                       'FAsigCC5', 'ADsigCC5', 'RDsigCC5', 'LH_GMvM1', 'LH_GMvS1',
                       'RH_GMvM1', 'RH_GMvS1', 'GMt_M1', 'GMt_S1', 'LH_GM_M1S1',
                       'RH_GM_M1S1', 'TGMv_all','ctxGMv_all', 'RH_GMv_all','LH_GMv_all']
    df_used = df_used.drop(columns=columns_to_drop, errors='ignore')
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_used)
    # n_init is explicit: its default depends on the scikit-learn release and
    # the related warning is silenced in this notebook.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(df_scaled)
    SHAP_features_importance(df_used, kmeans, scaler)

run_shap_analysis(df_all_combined, Aging_INDICES, variable='FA', segment='CC5', n_clusters=10)


In [None]:
def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    median_value = series.median()
    return series.fillna(median_value)

def SHAP_cluster_analysis(df_used, kmeans, scaler, n_clusters):
    """Show, for each cluster, which features carry the largest SHAP values.

    SHAP values of ``kmeans.transform`` are computed once on the scaled data.
    For every cluster label in ``range(n_clusters)`` the rows assigned to that
    cluster (according to ``kmeans.labels_``) are selected, a summary plot is
    drawn and the ten features with the highest mean absolute SHAP value are
    printed.

    Args:
        df_used: DataFrame of unscaled features (its columns name the features).
        kmeans: Fitted KMeans model; its ``labels_`` must describe the rows of
            ``df_used``.
        scaler: Fitted scaler used to transform ``df_used``.
        n_clusters: Number of cluster labels to iterate over.
    """
    def model_output(data):
        return kmeans.transform(data)

    df_scaled = scaler.transform(df_used)
    explainer = shap.KernelExplainer(model_output, df_scaled)
    shap_values = explainer.shap_values(df_scaled)

    # Depending on the shap release, a multi-output model yields either a list
    # of (n_samples, n_features) arrays or one (n_samples, n_features,
    # n_outputs) array. A list cannot be indexed by row positions, so it is
    # converted to the 3-D layout first.
    if isinstance(shap_values, list):
        shap_values = np.stack(shap_values, axis=-1)

    for cluster in range(n_clusters):
        print(f"\nAnalyse du cluster {cluster} :")
        # Row positions of the samples assigned to this cluster
        cluster_indices = np.where(kmeans.labels_ == cluster)[0]
        shap.summary_plot(shap_values[cluster_indices], features=df_scaled[cluster_indices], feature_names=df_used.columns)
        # Mean |SHAP| over the cluster's samples, then over the model outputs
        feature_importance_cluster = np.mean(np.abs(shap_values[cluster_indices]), axis=0).mean(axis=1)
        feature_importance_df = pd.DataFrame({
            'feature': df_used.columns,
            'importance': feature_importance_cluster
        }).sort_values(by='importance', ascending=False)

        print(f"Top variables qui caractérisent le cluster {cluster} :\n")
        print(feature_importance_df.head(10))

def run_cluster_analysis(df_combined, Aging_INDICES, variable, segment, n_clusters=10):
    """Cluster the selected tract values with KMeans and explain the clusters with SHAP.

    Rows of ``df_combined`` matching ``variable`` and ``segment`` are joined
    column-wise with ``Aging_INDICES``, float64/int64 columns are
    median-imputed, a fixed list of columns is dropped, and the remaining
    features are standardized before fitting KMeans (``random_state=42``).
    The fitted model is then passed to ``SHAP_cluster_analysis``.

    Args:
        df_combined: DataFrame with 'Variable', 'CC_Segment' and 'Group' columns.
        Aging_INDICES: DataFrame concatenated (axis=1) with the selected rows.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.
        n_clusters: Number of KMeans clusters.
    """
    is_selected = (df_combined['Variable'] == variable) & (df_combined['CC_Segment'] == segment)
    tract_values = (
        df_combined[is_selected]
        .drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        .reset_index(drop=True)
    )
    features = pd.concat([Aging_INDICES, tract_values], axis=1, ignore_index=False)

    def fill_numeric(col):
        # Only float64 / int64 columns are imputed; every other column passes through.
        if col.dtype in ['float64', 'int64']:
            return impute_median(col)
        return col

    features = features.apply(fill_numeric)

    excluded_columns = [
        'sub', 'sex', 'group', 'age',
        'FAsigCC4', 'MDsigCC4', 'RDsigCC4',
        'FAsigCC5', 'ADsigCC5', 'RDsigCC5',
        'LH_GMvM1', 'LH_GMvS1', 'RH_GMvM1', 'RH_GMvS1',
        'GMt_M1', 'GMt_S1', 'LH_GM_M1S1', 'RH_GM_M1S1',
    ]
    # errors='ignore' tolerates columns from the list that are absent.
    features = features.drop(columns=excluded_columns, errors='ignore')

    scaler = StandardScaler()
    standardized = scaler.fit_transform(features)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(standardized)
    SHAP_cluster_analysis(features, kmeans, scaler, n_clusters)

# Run the analysis for FA and CC5
run_cluster_analysis(df_all_combined, Aging_INDICES, variable='FA', segment='CC5', n_clusters=10)


## Commentaire : 

..............................................................................................

..............................................................................................

..............................................................................................

..............................................................................................

### Youssef

**I'm still stuck here. I want to find which variables separate the clusters from one another, without taking age into account, and understand which variable is responsible for that separation. I also want to understand which variable drives the grouping within each cluster. Ideally, if one variable helps identify a cluster and another helps form it, I could better understand why the data is distributed this way. Daniela, ask me for more details :D**

## Rep 1

In [None]:
import seaborn as sns
import ppscore as pps  

# Columns of Aging_INDICES selected by prepare_data_for_clustering.
important_cols = ['sub', 'sex', 'group', 'JND_P', 'IHD', 'FA_body', 'MD_body','AD_body', 'RD_body']
# Alternative selection that additionally keeps TGMv_all, ctxGMv_all, LH_GMv_all and RH_GMv_all:
#important_cols = ['sub', 'sex', 'group', 'JND_P', 'IHD', 'FA_body', 'MD_body','AD_body', 'RD_body', 'TGMv_all', 'ctxGMv_all', 'LH_GMv_all','RH_GMv_all']

def impute_median(series):
    """Fill the missing entries of ``series`` with the series median and return the result."""
    fill_value = series.median()
    filled = series.fillna(fill_value)
    return filled

def heatmap(df, target=None):
    """Draw a heatmap of predictive power scores.

    Args:
        df: Long-format DataFrame with 'x', 'y' and 'ppscore' columns.
        target: When given, only rows whose 'y' equals this value are plotted;
            otherwise every row is used.
    """
    score_columns = ['x', 'y', 'ppscore']
    if target is not None:
        selected = df[df['y'] == target][score_columns]
        title = f"PPS matrix for target: {target}"
    else:
        selected = df[score_columns]
        title = 'PPS matrix'

    # One row per 'y' value, one column per 'x' value.
    score_grid = selected.pivot(columns='x', index='y', values='ppscore')

    plt.figure(figsize=(20, 8))
    sns.heatmap(
        score_grid,
        vmin=0,
        vmax=1,
        cmap="Blues",
        linewidths=0.5,
        annot=False,
        square=True,
        cbar_kws={"shrink": .5},
    )
    plt.title(title)
    plt.xlabel("Feature")
    plt.ylabel("Target")
    plt.show()

def prepare_data_for_clustering(df_combined, Aging_INDICES, variable, segment):
    """Build the feature table for one variable/segment and its standardized version.

    Args:
        df_combined: DataFrame with 'Variable', 'CC_Segment' and 'Group' columns.
        Aging_INDICES: DataFrame from which the ``important_cols`` columns are taken.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.

    Returns:
        Tuple ``(df_used, df_scaled)``: the median-imputed table (still holding
        'sub', 'sex' and 'group') and the StandardScaler output computed on the
        table without those three columns.
    """
    row_mask = (df_combined['Variable'] == variable) & (df_combined['CC_Segment'] == segment)
    tract_values = (
        df_combined[row_mask]
        .drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        .reset_index(drop=True)
    )
    df_used = pd.concat([Aging_INDICES[important_cols], tract_values], axis=1)

    def fill_numeric(col):
        # Only float64 / int64 columns are imputed; every other column passes through.
        if col.dtype in ['float64', 'int64']:
            return impute_median(col)
        return col

    df_used = df_used.apply(fill_numeric)

    clustering_features = df_used.drop(columns=['sub', 'sex', 'group'], errors='ignore')
    df_scaled = StandardScaler().fit_transform(clustering_features)
    return df_used, df_scaled

def apply_kmeans(df_used, df_scaled, n_clusters=10):
    """Fit KMeans on ``df_scaled`` and store the labels in ``df_used['cluster']``.

    ``df_used`` is modified in place and also returned. Labels are stored as
    strings of the form ``'Cluster_<k>'``.

    Args:
        df_used: DataFrame receiving the 'cluster' column.
        df_scaled: Data passed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters (KMeans uses ``random_state=42``).

    Returns:
        The same ``df_used`` object, with the 'cluster' column added.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42)
    # Temporary numeric column, removed once the string labels are built.
    df_used['Cluster'] = model.fit_predict(df_scaled)
    df_used['cluster'] = df_used['Cluster'].map('Cluster_{}'.format)
    del df_used['Cluster']
    return df_used

def analyze_pps(df_used):
    """Plot the PPS matrix and rank features by their PPS towards 'cluster'.

    Args:
        df_used: DataFrame containing a 'cluster' column; 'sub', 'sex' and
            'group' are removed before scoring when present.

    Returns:
        DataFrame with columns "Feature" and "PPS Score", sorted by descending score.
    """
    features = df_used.drop(columns=['sub', 'sex', 'group'], errors='ignore')
    heatmap(pps.matrix(features))

    # Score every column except the target itself.
    scored = [
        (name, pps.score(features, name, "cluster")["ppscore"])
        for name in features.columns
        if name != 'cluster'
    ]
    ranking = (
        pd.DataFrame(scored, columns=["Feature", "PPS Score"])
        .sort_values(by="PPS Score", ascending=False)
        .reset_index(drop=True)
    )

    print("\nTop 10 des caractéristiques les plus contributives à la séparation des clusters :\n")
    print(ranking.head(10))

    return ranking

def run_cluster_analysis(df_combined, Aging_INDICES, variable, segment, n_clusters=10):
    """Prepare the data, cluster it with KMeans and run the PPS analysis.

    Args:
        df_combined: Passed to ``prepare_data_for_clustering``.
        Aging_INDICES: Passed to ``prepare_data_for_clustering``.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.
        n_clusters: Number of clusters forwarded to ``apply_kmeans``. Defaults
            to 10, the value that was previously hard-coded.
    """
    df_used, df_scaled = prepare_data_for_clustering(df_combined, Aging_INDICES, variable, segment)
    df_used = apply_kmeans(df_used, df_scaled, n_clusters=n_clusters)
    analyze_pps(df_used)

# Run the clustering + PPS analysis with variable='FA' and segment='CC5'.
run_cluster_analysis(df_all_combined, Aging_INDICES, variable='FA', segment='CC5')


## Rep 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import ppscore as pps  # Assurez-vous d'installer ppscore : pip install ppscore
import matplotlib.pyplot as plt

# List of the columns that matter for the analysis
important_cols = ['sub', 'sex', 'group', 'JND_P', 'IHD', 'FA_body', 'MD_body',
                  'AD_body', 'RD_body', 'TGMv_all', 'ctxGMv_all', 'LH_GMv_all', 'RH_GMv_all']

# Fonction d'imputation de la médiane pour les valeurs manquantes
def impute_median(series):
    """Replace the missing values of ``series`` by its median; the input is not modified."""
    center = series.median()
    return series.fillna(value=center)

# Fonction pour générer et afficher la heatmap PPS
def heatmap(df, target=None):
    """Display predictive power scores as a heatmap.

    Args:
        df: Long-format DataFrame with 'x', 'y' and 'ppscore' columns.
        target: Optional 'y' value; when provided only its rows are shown.
    """
    wanted = ['x', 'y', 'ppscore']
    if target is not None:
        rows = df[df['y'] == target][wanted]
        title = f"PPS matrix for target: {target}"
    else:
        rows = df[wanted]
        title = 'PPS matrix'

    # Reshape to a grid: 'y' values as rows, 'x' values as columns.
    grid = rows.pivot(columns='x', index='y', values='ppscore')

    plt.figure(figsize=(20, 8))
    sns.heatmap(
        grid,
        vmin=0,
        vmax=1,
        cmap="Blues",
        linewidths=0.5,
        annot=False,
        square=True,
        cbar_kws={"shrink": .5},
    )
    plt.title(title)
    plt.xlabel("Feature")
    plt.ylabel("Target")
    plt.show()

# Préparation des données pour le clustering
def prepare_data_for_clustering(df_combined, Aging_INDICES, variable, segment):
    """Assemble the feature table for one variable/segment and standardize it.

    Args:
        df_combined: DataFrame with 'Variable', 'CC_Segment' and 'Group' columns.
        Aging_INDICES: DataFrame from which the ``important_cols`` columns are taken.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.

    Returns:
        Tuple ``(df_used, df_scaled)``: the median-imputed table (identifier
        columns included) and the standardized array computed without
        'sub', 'sex' and 'group'.
    """
    keep_row = (df_combined['Variable'] == variable) & (df_combined['CC_Segment'] == segment)
    tract_values = (
        df_combined[keep_row]
        .drop(['Variable', 'CC_Segment', 'Group'], axis=1)
        .reset_index(drop=True)
    )

    # Column-wise join with the selected Aging_INDICES columns.
    df_used = pd.concat([Aging_INDICES[important_cols], tract_values], axis=1)

    def fill_numeric(col):
        # Median imputation applies to float64 / int64 columns only.
        if col.dtype in ['float64', 'int64']:
            return impute_median(col)
        return col

    df_used = df_used.apply(fill_numeric)

    # Identifier / categorical columns are left out of the clustering input.
    clustering_input = df_used.drop(columns=['sub', 'sex', 'group'], errors='ignore')

    df_scaled = StandardScaler().fit_transform(clustering_input)
    return df_used, df_scaled

# Appliquer KMeans pour créer des clusters
def apply_kmeans(df_used, df_scaled, n_clusters=10):
    """Cluster ``df_scaled`` with KMeans and label the rows of ``df_used``.

    ``df_used`` is updated in place (a 'cluster' column holding strings such
    as ``'Cluster_0'`` is added) and returned.

    Args:
        df_used: DataFrame receiving the 'cluster' column.
        df_scaled: Data passed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters (KMeans uses ``random_state=42``).

    Returns:
        The same ``df_used`` object.
    """
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    df_used['Cluster'] = clusterer.fit_predict(df_scaled)

    # Turn the numeric labels into readable names, then discard the numeric column.
    df_used['cluster'] = df_used['Cluster'].map('Cluster_{}'.format)
    del df_used['Cluster']

    return df_used

# Analyse des caractéristiques avec PPS et affichage de la heatmap
def analyze_pps(df_used):
    """Show the PPS heatmap and rank each feature by its PPS towards 'cluster'.

    Args:
        df_used: DataFrame containing a 'cluster' column; 'sub', 'sex' and
            'group' are removed before scoring when present.

    Returns:
        DataFrame with columns "Feature" and "PPS Score", sorted by descending score.
    """
    # Identifier / categorical columns are excluded from the PPS computation.
    candidates = df_used.drop(columns=['sub', 'sex', 'group'], errors='ignore')

    # Full pairwise PPS matrix, displayed as a heatmap.
    heatmap(pps.matrix(candidates))

    # PPS of every feature (the target column excluded) towards 'cluster'.
    per_feature = [
        (column, pps.score(candidates, column, "cluster")["ppscore"])
        for column in candidates.columns
        if column != 'cluster'
    ]

    pps_scores_df = (
        pd.DataFrame(per_feature, columns=["Feature", "PPS Score"])
        .sort_values(by="PPS Score", ascending=False)
        .reset_index(drop=True)
    )

    # Print the ten highest-scoring features.
    print("\nTop 10 des caractéristiques les plus contributives à la séparation des clusters :\n")
    print(pps_scores_df.head(10))

    return pps_scores_df

# Fonction principale pour l'analyse des clusters
def run_cluster_analysis(df_combined, Aging_INDICES, variable, segment, n_clusters=10):
    """Prepare the data, cluster it with KMeans and run the PPS analysis.

    Args:
        df_combined: Passed to ``prepare_data_for_clustering``.
        Aging_INDICES: Passed to ``prepare_data_for_clustering``.
        variable: Value to match in the 'Variable' column.
        segment: Value to match in the 'CC_Segment' column.
        n_clusters: Number of clusters forwarded to ``apply_kmeans``. Defaults
            to 10, the value that was previously hard-coded.
    """
    # Build the feature table and its standardized version for the requested selection.
    df_used, df_scaled = prepare_data_for_clustering(df_combined, Aging_INDICES, variable, segment)

    # Assign a cluster label to every row.
    df_used = apply_kmeans(df_used, df_scaled, n_clusters=n_clusters)

    # Score how well each feature predicts the cluster label.
    analyze_pps(df_used)

# Run the script for variable 'FA' and segment 'CC5'
run_cluster_analysis(df_all_combined, Aging_INDICES, variable='FA', segment='CC5')
