### K-means

In [7]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
import numpy as np
from sklearn.base import clone
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_iris, load_wine, load_digits, load_breast_cancer, fetch_openml

### Align labels 

In [2]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_clusters(y_true, y_pred):
    # Créer une matrice de confusion entre les vrais labels et les labels prédits (clusters)
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    # Utiliser l'algorithme Hongrois pour trouver la meilleure correspondance
    row_ind, col_ind = linear_sum_assignment(-conf_matrix)
    
    # Créer un mapping des clusters vers les vrais labels
    cluster_to_label_mapping = {row: col for row, col in zip(row_ind, col_ind)}
    
    # Appliquer ce mapping aux prédictions pour aligner les clusters avec les vrais labels
    aligned_labels = [cluster_to_label_mapping[cluster] for cluster in y_pred]
    
    return aligned_labels


In [3]:
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from sklearn.base import clone
from sklearn.cluster import KMeans
import numpy as np

# Ajoutez vos définitions de distance personnalisées ici si nécessaire
def hellinger_distance(p, q):
    p = np.abs(p)
    q = np.abs(q)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def pearson_chi2(x, y):
    return np.sum((x - y) ** 2 / (x + y))

def vicis_symmetric_1(x, y):
    total = 0
    for xi, yi in zip(x, y):
        if min(xi, yi) != 0:
            total += (xi - yi) ** 2 / (min(xi, yi) ** 2)
        else:
            total += (xi - yi) ** 2  # Handle the case when min(xi, yi) == 0
    return total

def hassanat_distance(x, y, epsilon=1e-10):
    return np.mean(np.abs(x - y) / (x + y + epsilon))

def test(X, y, k):
    kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    param_grid_gini = {'nu':  np.arange(1.1, 5.1, 0.1)}  # Recherche sur nu

    precision_results = {distance: [] for distance in [
        'gini', 'generalized_gini', 'euclidean', 'manhattan', 'minkowski', 'cosine',
        'canberra', 'hellinger', 'jensen_shannon', 'pearson_chi2', 'vicis_symmetric_1', 'hassanat'
    ]}
    recall_results = {distance: [] for distance in precision_results.keys()}
    steps_results = {distance: [] for distance in precision_results.keys()}
    best_nu_values = []  # Pour stocker les meilleurs nu

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for _ in range(500):
            kmeans_initial = KMeans(n_clusters=k, init='random', max_iter=300, n_init=1, algorithm='lloyd')
            kmeans_initial.fit(X_train)
            initial_centroids = kmeans_initial.cluster_centers_

            # Créer des instances de KMeans avec les centroïdes initialisés
            kmeans_models = {
                'gini': KMeans(n_clusters=k, metric="gini", init=initial_centroids, algorithm='lloyd'),
                'generalized_gini': KMeans(n_clusters=k, metric="gini", init=initial_centroids, algorithm='lloyd'),
                'euclidean': KMeans(n_clusters=k, metric="euclidean", init=initial_centroids, algorithm='lloyd'),
                'manhattan': KMeans(n_clusters=k, metric="manhattan", init=initial_centroids, algorithm='lloyd'),
                'minkowski': KMeans(n_clusters=k, metric="minkowski", p=3, init=initial_centroids, algorithm='lloyd'),
                'cosine': KMeans(n_clusters=k, metric="cosine", init=initial_centroids, algorithm='lloyd'),
                'canberra': KMeans(n_clusters=k, metric='canberra', init=initial_centroids, algorithm='lloyd'),
                'hellinger': KMeans(n_clusters=k, metric="hellinger", init=initial_centroids, algorithm='lloyd'),
                'jensen_shannon': KMeans(n_clusters=k, metric='jensen_shannon', init=initial_centroids, algorithm='lloyd'),
                'pearson_chi2': KMeans(n_clusters=k, metric='pearson_chi2', init=initial_centroids, algorithm='lloyd'),
                'vicis_symmetric_1': KMeans(n_clusters=k, metric='vicis_symmetric_1', init=initial_centroids, algorithm='lloyd'),
                'hassanat': KMeans(n_clusters=k, metric="hassanat", init=initial_centroids, algorithm='lloyd'),
            }
            
            # GridSearch de paramètre nu pour generalized_gini
            grid_gini = GridSearchCV(estimator=clone(kmeans_models['generalized_gini']), param_grid=param_grid_gini, scoring=make_scorer(f1_score, average='weighted'), cv=kf)
            grid_gini.fit(X_train, y_train)
            best_ginikmeans = grid_gini.best_estimator_

            # Enregistrer le meilleur nu trouvé
            best_nu_values.append(grid_gini.best_params_['nu'])

            # Fit sur les données d'entraînement et récupérer le nombre de steps (iterations)
            for distance, model in kmeans_models.items():
                model.fit(X_train, y_train)
                steps_results[distance].append(model.n_iter_)  # Nombre de steps avant convergence

            # Prédictions
            y_preds = {}
            for distance, model in kmeans_models.items():
                y_preds[distance] = model.predict(X_test)

            # Évaluation des performances
            for distance, y_pred in y_preds.items():
                y_pred_aligned = align_clusters(y_test, y_pred)
                precision_results[distance].append(precision_score(y_test, y_pred_aligned, average='weighted'))
                recall_results[distance].append(recall_score(y_test, y_pred_aligned, average='weighted'))

    # Moyennes et affichage des résultats
    print(f"Meilleur nu pour Gini : {np.mean(best_nu_values)}")

    for distance in precision_results.keys():
        precision_mean = np.mean(precision_results[distance])
        recall_mean = np.mean(recall_results[distance])
        steps_mean = np.mean(steps_results[distance])

        print(f"Distance {distance}:")
        print(f"  Precision: {precision_mean}")
        print(f"  Recall: {recall_mean}")
        print(f"  Steps to Convergence: {steps_mean}")

    return precision_results, recall_results, steps_results


### Iris dataset

In [4]:
iris = load_iris()
X = iris.data
y = iris.target
test(X, y, k=3)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Meilleur nu pour Gini : 2.7125000000000012
Distance gini:
  Precision: 0.616678648081423
  Recall: 0.61084
  Steps to Convergence: 2.361
Distance generalized_gini:
  Precision: 0.616678648081423
  Recall: 0.61084
  Steps to Convergence: 2.361
Distance euclidean:
  Precision: 0.6031524266439763
  Recall: 0.5978666666666665
  Steps to Convergence: 1.0
Distance manhattan:
  Precision: 0.6005012838217401
  Recall: 0.59408
  Steps to Convergence: 1.796
Distance minkowski:
  Precision: 0.5989537289410151
  Recall: 0.5958666666666667
  Steps to Convergence: 1.49
Distance cosine:
  Precision: 0.6182793728602554
  Recall: 0.6219733333333334
  Steps to Convergence: 2.461
Distance canberra:
  Precision: 0.6338511111111111
  Recall: 0.6293333333333333
  Steps to Convergence: 4.165
Distance hellinger:
  Precision: 0.5844926017262713
  Recall: 0.5639466666666667
  Steps to Convergence: 300.0
Distance jensen_shannon:
  Precision: 0.6529991019304985
  Recall: 0.6248400000000001
  Steps to Convergence:

({'gini': [0.07526881720430106,
   0.017543859649122806,
   0.907187323146576,
   0.07777777777777778,
   0.888888888888889,
   0.9071873231465762,
   0.907187323146576,
   0.907187323146576,
   0.017543859649122806,
   0.888888888888889,
   0.9071873231465762,
   0.907187323146576,
   0.888888888888889,
   0.07526881720430106,
   0.9071873231465762,
   0.9071873231465762,
   0.888888888888889,
   0.07526881720430106,
   0.888888888888889,
   0.07526881720430106,
   0.07526881720430106,
   0.9071873231465762,
   0.017543859649122806,
   0.07526881720430106,
   0.907187323146576,
   0.017543859649122806,
   0.9071873231465762,
   0.07526881720430106,
   0.017543859649122806,
   0.017543859649122806,
   0.9071873231465762,
   0.9071873231465762,
   0.017543859649122806,
   0.9071873231465762,
   0.9071873231465762,
   0.9071873231465762,
   0.07777777777777778,
   0.907187323146576,
   0.9071873231465762,
   0.017543859649122806,
   0.017543859649122806,
   0.888888888888889,
   0.907187