In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from pyckmeans import CKmeans
from pyckmeans import MultiCKMeans
import ClusterEnsembles as CE
from sklearn.metrics import jaccard_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import calinski_harabasz_score

In [None]:
# Load the iris data set into a DataFrame: four measurement columns plus the
# integer class label in 'target'.
iris = load_iris()
df = pd.DataFrame(
    iris.data,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
).assign(target=iris.target)
def define_species(df, target_column):
    """Create or overwrite ``df['Species']`` with one ``"cluster_<label>"`` string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    target_column : iterable
        One label per row of ``df``; each label is converted with ``str``.

    Returns
    -------
    None
    """
    labels_as_text = map(str, target_column)
    df['Species'] = ["cluster_" + text for text in labels_as_text]
# Label every row with its true class so the first plot is coloured by ground truth.
define_species(df, df['target'].to_numpy())

In [None]:
# Project the four iris features onto two principal components and plot the samples,
# coloured by the labels currently stored in df['Species'] (the true classes when the
# notebook is run top to bottom).
data = iris.data
target = iris.target
X_reduced = PCA(n_components=2).fit_transform(data)
# Title and axis labels let the figure stand on its own; the trailing ';' hides the repr.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title="Iris data: true classes (PCA projection)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# KMeans from sklearn to cluster the iris data with k = [2, 3, 4, 5].
# random_state pins the centroid initialisation so that "Restart & Run All" reproduces
# the same labels; n_init is passed explicitly because its default value differs
# between scikit-learn releases.
KMEANS_SEED = 42
KMEANS_N_INIT = 10

KMeans_k2 = KMeans(n_clusters=2, n_init=KMEANS_N_INIT, random_state=KMEANS_SEED)
k2_model = KMeans_k2.fit(iris.data)

KMeans_k3 = KMeans(n_clusters=3, n_init=KMEANS_N_INIT, random_state=KMEANS_SEED)
k3_model = KMeans_k3.fit(iris.data)

KMeans_k4 = KMeans(n_clusters=4, n_init=KMEANS_N_INIT, random_state=KMEANS_SEED)
k4_model = KMeans_k4.fit(iris.data)

KMeans_k5 = KMeans(n_clusters=5, n_init=KMEANS_N_INIT, random_state=KMEANS_SEED)
k5_model = KMeans_k5.fit(iris.data)


In [None]:
# Display the result of KMeans with k = 3 on the PCA projection.
define_species(df, k3_model.labels_)
# Title and axis labels distinguish this figure from the other scatter plots.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title="KMeans clustering (k = 3)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# Compare the true classes with the result of KMeans with k = 3.
# jaccard_score compares label *values*, but KMeans numbers its clusters arbitrarily,
# so each cluster is first renamed to the true class it shares the most samples with.
# Without this step the score depends on the random cluster numbering.
cluster_to_class = pd.crosstab(k3_model.labels_, iris.target).idxmax(axis=1)
k3_labels_aligned = pd.Series(k3_model.labels_).map(cluster_to_class).to_numpy()
print("Jacard Score = ", jaccard_score(np.array(df['target']), k3_labels_aligned, average='weighted'))
# NOTE(review): the two scores below are computed on the 2-D PCA projection, although
# KMeans was fitted on the original four features -- confirm this is intended.
print("DBI = ", davies_bouldin_score(X_reduced, k3_model.labels_))
print("silhouette_score = " , silhouette_score(X_reduced, k3_model.labels_))
# Contingency table of true classes (rows) against the raw cluster ids (columns).
pd.crosstab(iris.target, k3_model.labels_)

In [None]:
# Ensemble clustering using CKmeans with a single k = 3 (n_rep=100).
ckm = CKmeans(k=3, n_rep=100)
ckm.fit(iris.data)
ckm_res = ckm.predict(iris.data)
define_species(df, ckm_res.cl)
# Title and axis labels distinguish this figure from the other scatter plots.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title="CKmeans consensus clustering (k = 3)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# Contingency table: true classes (rows) against the CKmeans cluster ids (columns).
pd.crosstab(index=iris.target, columns=ckm_res.cl)

In [None]:
# Ensemble clustering using MultiCKMeans with several k = [2, 3, 4, 5].
# The k list is defined once so the loop below cannot drift out of sync with it.
k_values = [2, 3, 4, 5]
mckm = MultiCKMeans(k=k_values, n_rep=100)
mckm.fit(iris.data)
mckm_res = mckm.predict(iris.data)

# Keep one 'Species' labelling per entry of k_values. define_species overwrites the
# column on every pass, so a copy is stored to keep each snapshot independent of df.
# NOTE(review): assumes ckmeans_results is ordered like k_values -- confirm.
df_species = []
for i in range(len(k_values)):
    define_species(df, mckm_res.ckmeans_results[i].cl)
    df_species.append(df['Species'].copy())

# plot clustering metrics against k
# BIC, DB: lower is better
# SIL, CH: higher is better
mckm_res.plot_metrics(figsize=(10,5))

In [None]:
# Display the first stored MultiCKMeans labelling (df_species[0]).
df['Species'] = df_species[0]
# The title is derived from the labels actually plotted so the four per-k figures can be told apart.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title=f"MultiCKMeans consensus clustering ({df['Species'].nunique()} clusters)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# Display the second stored MultiCKMeans labelling (df_species[1]).
df['Species'] = df_species[1]
# The title is derived from the labels actually plotted so the four per-k figures can be told apart.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title=f"MultiCKMeans consensus clustering ({df['Species'].nunique()} clusters)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# Display the third stored MultiCKMeans labelling (df_species[2]).
df['Species'] = df_species[2]
# The title is derived from the labels actually plotted so the four per-k figures can be told apart.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title=f"MultiCKMeans consensus clustering ({df['Species'].nunique()} clusters)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
# Display the fourth stored MultiCKMeans labelling (df_species[3]).
df['Species'] = df_species[3]
# The title is derived from the labels actually plotted so the four per-k figures can be told apart.
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Species']).set(
    title=f"MultiCKMeans consensus clustering ({df['Species'].nunique()} clusters)",
    xlabel="PC 1",
    ylabel="PC 2",
);

In [None]:
def align_cluster_labels(true_labels, cluster_labels):
    """Rename every cluster to the true class it shares the most samples with.

    Cluster ids are arbitrary, so metrics that compare label values (such as
    ``jaccard_score``) only make sense after this renaming. Several clusters may
    be mapped to the same class; on a tie the first class in sorted order wins.

    Parameters
    ----------
    true_labels : array-like of shape (n_samples,)
        Ground-truth class labels.
    cluster_labels : array-like of shape (n_samples,)
        Cluster ids produced by a clustering algorithm.

    Returns
    -------
    numpy.ndarray
        ``cluster_labels`` with each id replaced by its majority true class.
    """
    cluster_labels = np.asarray(cluster_labels)
    # Rows are cluster ids, columns are true classes, cells are sample counts.
    overlap = pd.crosstab(cluster_labels, np.asarray(true_labels))
    cluster_to_class = overlap.idxmax(axis=1)
    return pd.Series(cluster_labels).map(cluster_to_class).to_numpy()


def useEnsembleClustering(algo_name, best_k, ensemble_input, k_list):
    """Run one ClusterEnsembles solver for several cluster counts and print its scores.

    Ensemble clustering using ``algo_name`` (the notebook uses 'cspa', 'mcla',
    'hbgf' and 'nmf') from ClusterEnsembles. The consensus labels for every ``k``
    are compared with the true iris classes, and for ``k == best_k`` the
    contingency table against the true classes is printed as well.

    Reads the notebook globals ``iris`` (true labels) and ``X_reduced`` (PCA
    projection used for the internal metrics).

    Parameters
    ----------
    algo_name : str
        Passed to ``CE.cluster_ensembles`` as ``solver``.
    best_k : int
        Cluster count for which the contingency table is printed.
    ensemble_input : array-like
        Base clusterings passed to ``CE.cluster_ensembles``; the notebook stacks
        the KMeans label vectors for k = 2, 3, 4, 5.
    k_list : iterable of int
        Cluster counts, each passed to ``CE.cluster_ensembles`` as ``nclass``.

    Returns
    -------
    None
        All results are printed.
    """
    print(f'The result of {algo_name}')
    scores = []
    for k in k_list:
        label_pred = CE.cluster_ensembles(ensemble_input, nclass=k, solver=algo_name)
        if k == best_k:
            print(pd.crosstab(iris.target, label_pred))
        # jaccard_score compares label values, so the arbitrary cluster ids must be
        # mapped onto the true classes first; the other metrics ignore the numbering.
        label_aligned = align_cluster_labels(iris.target, label_pred)
        # TODO: round to 3 decimal places; also compute mean and variance.
        # NOTE(review): the internal metrics use the 2-D PCA projection, not the
        # original four features -- confirm this is intended.
        scores.append([
            jaccard_score(iris.target, label_aligned, average='weighted'),
            davies_bouldin_score(X_reduced, label_pred),
            silhouette_score(X_reduced, label_pred),
            normalized_mutual_info_score(iris.target, label_pred, average_method='geometric'),
            adjusted_rand_score(iris.target, label_pred),
            fowlkes_mallows_score(iris.target, label_pred),
            calinski_harabasz_score(X_reduced, label_pred),
        ])
    print("jaccard_score, davies_bouldin_score, silhouette_score, normalized_mutual_info_score, adjusted_rand_score, fowlkes_mallows_score, calinski_harabasz_score")
    for element in scores:
        print(element)
# Stack the four KMeans label vectors row-wise: one base clustering per row.
ensemble_input = np.stack(
    [model.labels_ for model in (k2_model, k3_model, k4_model, k5_model)]
)
useEnsembleClustering(algo_name='cspa', best_k=3, ensemble_input=ensemble_input, k_list=[2, 3, 4, 5])

In [None]:
# Same evaluation with the 'mcla' solver.
useEnsembleClustering(algo_name='mcla', best_k=3, ensemble_input=ensemble_input, k_list=[2, 3, 4, 5])

In [None]:
# Same evaluation with the 'hbgf' solver.
useEnsembleClustering(algo_name='hbgf', best_k=3, ensemble_input=ensemble_input, k_list=[2, 3, 4, 5])

In [None]:
# Same evaluation with the 'nmf' solver.
useEnsembleClustering(algo_name='nmf', best_k=3, ensemble_input=ensemble_input, k_list=[2, 3, 4, 5])