In [1]:
from mitools import clustering as cl
from pandas import DataFrame
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import pandas as pd

# Load Data

In [2]:
# Dimensions of the synthetic dataset: row count, column count, and the
# number of blob centers requested from make_blobs.
n_samples, n_features, n_centers = 1_000, 16, 5

In [3]:
# Generate the synthetic points; the second value returned by make_blobs is
# ignored here.
data, _ = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_centers,
    random_state=0,  # fixed seed so the generated points are the same on every run
)
# Wrap the raw array in a DataFrame with readable row and column labels.
data = DataFrame(
    data,
    index=[f"sample_{row}" for row in range(n_samples)],
    columns=[f"feature_{col}" for col in range(n_features)],
)

In [None]:
# Display the generated feature table (rows are samples, columns are features).
data

# Search N Clusters

In [None]:
# Upper bound of the cluster-count sweep. It is bound once so the search and
# the plot that visualises it always use the same value.
max_clusters = 25

# Run the sweep with K-Means as the clustering method; the helper returns two
# results that are unpacked as scores and inertia.
kmeans_scores, kmeans_inertia = cl.clustering_ncluster_search(
    data,
    max_clusters=max_clusters,
    clustering_method=cl.kmeans_clustering,
)
cl.plot_clustering_ncluster_search(
    kmeans_scores,
    kmeans_inertia,
    max_clusters=max_clusters,
    algorithm_name="K-Means Clustering",
)
plt.show()

In [None]:
# Upper bound of the cluster-count sweep. It is bound once so the search and
# the plot that visualises it always use the same value.
max_clusters = 25

# Run the sweep with agglomerative clustering as the method; the helper returns
# two results that are unpacked as scores and inertia.
aggcl_scores, aggcl_inertia = cl.clustering_ncluster_search(
    data,
    max_clusters=max_clusters,
    clustering_method=cl.agglomerative_clustering,
)
cl.plot_clustering_ncluster_search(
    aggcl_scores,
    aggcl_inertia,
    max_clusters=max_clusters,
    algorithm_name="Agglomerative Clustering",
)
plt.show()

# Define N Clusters

In [7]:
# Use the number of generated blob centers as the cluster count for both
# algorithms.
n_clusters = n_centers

# Each helper returns a pair; the second element holds the per-sample labels
# that are later attached to the DataFrame index.
kmeans, kmeans_labels = cl.kmeans_clustering(
    data,
    n_clusters=n_clusters,
)
aggcl, aggcl_labels = cl.agglomerative_clustering(
    data,
    n_clusters=n_clusters,
)

# Add Labels to DataFrame Index

In [8]:
# Attach both label sets to the row index as additional levels.
#
# Level 0 (and its name) is read through get_level_values / names so this cell
# is idempotent: on the first run the index is flat and level 0 is the index
# itself; on a re-run the index is already a MultiIndex and only its sample-id
# level is reused, instead of nesting the whole MultiIndex as a new level.
data.index = pd.MultiIndex.from_arrays(
    [
        data.index.get_level_values(0),
        [f"cluster_{c}" for c in kmeans_labels],
        [f"cluster_{c}" for c in aggcl_labels],
    ],
    names=[data.index.names[0], 'kmeans_cluster', 'agg_cluster'],
)

In [None]:
# Display the table again; its index now carries the 'kmeans_cluster' and
# 'agg_cluster' levels.
data

# Evaluate Clusters

In [None]:
# Compute centroids for the groups defined by the 'kmeans_cluster' index level
# (presumably one centroid per K-Means cluster -- confirm against mitools),
# then display them.
centroids = cl.get_clusters_centroids(data, 'kmeans_cluster')
centroids

In [None]:
# Distances computed from the centroids alone (presumably pairwise between
# centroids -- confirm against mitools), then displayed.
distances_between_centroids = cl.get_distances_between_centroids(centroids)
distances_between_centroids

In [None]:
distances_to_cetroids = cl.get_distances_to_centroids(data, centroids, 'kmeans_cluster')
distances_to_cetroids

In [None]:
_ = cl.plot_dfs_col_distribution([g for _, g in distances_to_cetroids.groupby('kmeans_cluster')], column=0, normed=False, bins=None, colors=None)

In [None]:
# Display the cluster sizes reported by cl.get_clusters_size for the
# 'kmeans_cluster' labelling.
cl.get_clusters_size(data, 'kmeans_cluster')

In [None]:
# Cosine similarities among the samples that K-Means labelled 'cluster_0',
# requested in the non-vector form (as_vector=False). Samples are identified
# by index level 0.
cluster_cosine_sims = cl.get_cosine_similarities(
    data.loc[data.index.get_level_values('kmeans_cluster') == 'cluster_0'],
    id_level=0,
    as_vector=False,
)
cluster_cosine_sims

In [None]:
# Same 'cluster_0' selection, this time requesting the vector form
# (as_vector=True). Note that this rebinds cluster_cosine_sims.
cluster_cosine_sims = cl.get_cosine_similarities(
    data.loc[data.index.get_level_values('kmeans_cluster') == 'cluster_0'],
    id_level=0,
    as_vector=True,
)
cluster_cosine_sims

In [None]:
# Plot the distribution of column 0 of the 'cluster_0' cosine similarities.
# The return value is bound to `_` so the cell does not echo it.
_ = cl.plot_df_col_distribution(
    cluster_cosine_sims,
    column=0,
    normed=False,
    bins=100,
    color="green",
)

In [None]:
# Build a mapping from each K-Means cluster label to the vector-form cosine
# similarities of that cluster's samples.
clusters_cosine_sims = {
    label: cl.get_cosine_similarities(members, id_level=0, as_vector=True)
    for label, members in data.groupby('kmeans_cluster')
}
# Show the entry for 'cluster_0' as a spot check.
clusters_cosine_sims['cluster_0']

In [None]:
# Plot column 0 of every cluster's cosine similarities as separate
# distributions. The dict view is materialised as a list so the helper receives
# the same kind of argument as in its other call in this notebook (a list of
# frames); unlike a list, a dict view cannot be indexed.
_ = cl.plot_dfs_col_distribution(
    list(clusters_cosine_sims.values()),
    column=0,
    normed=False,
    bins=None,
    colors=None,
)

In [None]:
# Scatter the samples on the first two features, grouped by K-Means cluster.
ax = cl.plot_clusters(
    data,
    'kmeans_cluster',
    'feature_0',
    'feature_1',
)
# Overlay a dashed ellipse per cluster on the same axes.
ax = cl.add_clusters_ellipse(
    ax,
    data,
    'kmeans_cluster',
    'feature_0',
    'feature_1',
    linestyle='--',
)
# Overlay the centroids as large translucent markers; zorder=99 is passed so
# they are drawn on top of the other artists.
ax = cl.add_clusters_centroids(
    ax,
    centroids,
    0,
    'feature_0',
    'feature_1',
    s=5000,
    marker='o',
    alpha=0.66,
    zorder=99,
    edgecolor='k',
    linewidth=2,
    linestyle='--',
)
plt.show()