In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scripts import dist, calculate_centroid, ICV, split_in_clusters

In [2]:
def split_in_clusters(cluster_df: pd.DataFrame) -> dict:
    """Split a clustered dataframe into one dataframe per cluster.

    Parameters
    ----------
    cluster_df : pd.DataFrame
        Data points with a 'cluster' column holding each row's cluster label.

    Returns
    -------
    dict
        Maps every cluster label present in the 'cluster' column to the rows
        carrying that label, with the 'cluster' column removed. Keys are
        inserted in ascending label order.
    """
    # Use the labels that actually occur instead of range(n_clusters): labels
    # are not guaranteed to be 0..k-1 (1-based labels, gaps, a -1 noise label),
    # and range() would then produce empty frames and skip real clusters.
    # Rows with a missing label belong to no cluster and are left out.
    labels = sorted(set(cluster_df['cluster'].dropna().tolist()))
    return {
        label: cluster_df.loc[cluster_df['cluster'] == label].drop(columns=['cluster'])
        for label in labels
    }

Calculating the intra-cluster variance using the mean Euclidean distance from each point to the centroid:

$ICV = \sum_{x \in C}{dist(x, centroid)^2}$ (**TODO:** correct this function)

In [4]:
# Not used yet.
def ICV(cluster: pd.DataFrame) -> list:
    """Return each point's mean distance to the other points of its cluster.

    Every row of ``cluster`` is treated as one data point. For each point the
    distance to every other row is measured with ``dist`` and averaged.

    Parameters
    ----------
    cluster : pd.DataFrame
        The data points of a single cluster, one row per point.

    Returns
    -------
    list
        One value per row of ``cluster``, in row order: the mean of ``dist``
        from that row to all other rows. The value is NaN when the cluster
        holds a single point, since there is nothing to average.
    """
    # Convert once; the row list does not change between iterations.
    points = cluster.values.tolist()

    average_distances = []
    for i, current in enumerate(points):
        # Skip the point itself by position, not by value: a duplicate row is
        # a distinct data point and must still contribute to the mean.
        distances_from_current = [
            dist(current, point) for j, point in enumerate(points) if j != i
        ]
        if distances_from_current:
            average_distances.append(np.mean(distances_from_current))
        else:
            # Single-point cluster: stay explicit instead of letting
            # np.mean([]) emit a RuntimeWarning.
            average_distances.append(np.nan)

    return average_distances

In [5]:
def evalutate_clusters(clustered_df: pd.DataFrame) -> tuple:
    """Score a clustering with three internal validation metrics.

    Parameters
    ----------
    clustered_df : pd.DataFrame
        Feature columns plus a 'cluster' column holding each row's label.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)``, in that order,
        each computed on the feature columns against the 'cluster' labels.
    """
    # Build the feature matrix and label vector once and reuse them for all
    # three metrics instead of re-dropping the column for every call.
    features = clustered_df.drop(columns=['cluster'])
    labels = clustered_df['cluster']

    s = silhouette_score(features, labels)
    c = calinski_harabasz_score(features, labels)
    d = davies_bouldin_score(features, labels)
    return s, c, d

# Cluster evaluation pipeline

See the [scikit-learn docs](https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient) for an explanation of the scores.

In [9]:

# Load the clustered data; the helpers below rely on its 'cluster' column.
clustered_df = pd.read_csv('../data/country-data-pca-w-clusters.csv')

# One dataframe per cluster label.
cluster_dict = split_in_clusters(clustered_df)

# Silhouette, Calinski-Harabasz and Davies-Bouldin scores, in that order.
s, c, d = evalutate_clusters(clustered_df)
print(
    f"Silhouette score: {s} (between -1 and 1)\n"
    f"Calinski-Harabasz score: {c} (the higher the score the better)\n"
    f"Davies-Bouldin score: {d} (the closer to zero the better)"
)

Silhouette score: 0.3759586560872584 (between -1 and 1)
Calinski-Harabasz score: 70.59187091546146 (the higher the score the better)
Davies-Bouldin score: 0.8537992597234965 (the closer to zero the better)
