CLARA Clustering multiprocessing

In [4]:
!pip install -U ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Using cached comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
   ----- ---------------------------------- 20.5/139.8 kB ? eta -:--:--
   ----------- --------------------------- 41.0/139.8 kB 495.5 kB/s eta 0:00:01
   ----------------- --------------------- 61.4/139.8 kB 469.7 kB/s eta 0:00:01
   ------------------------- ------------- 92.2/139.8 kB 581.0 kB/s eta 0:00:01
   ------------------------------------

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn_extra.cluster import CLARA
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score,normalized_mutual_info_score,silhouette_score,davies_bouldin_score,calinski_harabasz_score
import ray
import dill
import time

#Initiation du framework Ray pour gerer les taches paralleles
ray.init(ignore_reinit_error=True,  local_mode=False,object_store_memory=512 * 1024 ** 2)#512 Mo pour stocker les objets intermediaires dans des tâches parallelisees

#Chargement et preparation du dataset
def load_and_prepare_data(file_path):
    data = pd.read_csv(file_path) #Chargement du fichier CSV
    data.drop(columns=['CUST_ID'], inplace=True)  #Suppression de la colonnes d'identifiant inutile
    data.fillna(data.median(), inplace=True)  #Remplacement des valeurs manquantes par la mediane
    numeric_data = data.select_dtypes(include=[np.number])  #Selection des colonnes numeriques uniquement
    return (numeric_data - numeric_data.mean()) / numeric_data.std() #Normalisation du data

#Appliquation de l'algorithme CLARA Clustering
@ray.remote #transforme la fonction apply_clara en une tache qui peut etre excecutee en parallle
def apply_clara(data_array, k, sampling_size, random_state):
    data = pd.DataFrame(data_array)  
    clara = CLARA(n_clusters=k, n_sampling=sampling_size, random_state=random_state)
    labels = clara.fit_predict(data)#Effectuer le Clustering 
    return clara, labels

# Sauvegarde du modèle
def save_model(model, filename="model_clara.pkl"):
    with open(filename, 'wb') as file:
        dill.dump(model, file)
    
def clara_model(file, n_clusters):
    #Chargement des donnees
    data = load_and_prepare_data(file)
    data_array = data.to_numpy()

    random_states = [42, 43, 44, 45]

    #Chronometrer l'exécution
    start_time = time.time()

    sampling_size = max(int(0.05 * len(data)), n_clusters + 1)  #Taille des sous-echantillons(samples)

    #Creation des taches paralleles pour Ray, chaque tache s'excecute avec un state different
    
    tasks = [apply_clara.remote(data_array, n_clusters, sampling_size, state) for state in random_states]

    #Execution des taches en parallele avec Ray
    results = ray.get(tasks)

    end_time = time.time()

    temps_execution = end_time - start_time

    # Comparaison des résultats pour choisir le meilleur modèle
    best_score = -1
    best_clara, best_labels = None, None

    for clara_model_result, labels in results:
        score = silhouette_score(data, labels)
        if score > best_score:
            best_score = score
            best_clara, best_labels = clara_model_result, labels

    #Enregistrer Model CLARA Clustering
    save_model(best_clara, 'model_clara.pkl')

    #Evaluation des clusters
    metrics = evaluate_clustering(data, labels)

    ray.shutdown()

    return best_clara, labels, metrics, temps_execution

#Metriques d'evaluation
def evaluate_clustering(data, labels):
    #Usupervised
    silhouette = silhouette_score(data, labels)
    db_index = davies_bouldin_score(data, labels)
    ch_score = calinski_harabasz_score(data, labels)

    print(f"Silhouette Coefficient : {silhouette:.4f}")
    print(f"Davies-Bouldin Index : {db_index:.4f}")
    print(f"Calinski-Harabasz Index : {ch_score:.4f}")
    
    #Supervised (on n'a pas des labels deja classifier/Clustered)
    true_labels = None  
    if true_labels is not None:
        ari = adjusted_rand_score(true_labels, labels)
        nmi = normalized_mutual_info_score(true_labels, labels)
        print(f"Adjusted Rand Index (ARI) : {ari:.4f}")
        print(f"Normalized Mutual Information (NMI) : {nmi:.4f}")

model, labels, metrics, temps_execution = clara_model("credit_cards.csv", n_clusters=6)



2025-01-17 20:30:59,561	INFO worker.py:1821 -- Started a local Ray instance.


Silhouette Coefficient : 0.1757
Davies-Bouldin Index : 1.7529
Calinski-Harabasz Index : 1218.9400


In [14]:
# Chargement du modèle sauvegardé
def load_model(filename):
    with open(filename, 'rb') as file:
        model = dill.load(file)
    print("📦 Modèle CLARA chargé avec succès !")
    return model
clara_loaded = load_model('model_clara.pkl')
clara_loaded

📦 Modèle CLARA chargé avec succès !
