In [1]:
import os
import numpy as np

import time
from sklearn.metrics import davies_bouldin_score

from cuml import KMeans as cuml_kmeans
from cuml.cluster import DBSCAN as cuml_dbscan
from cuml import AgglomerativeClustering as cuml_agglomerative
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score, cython_silhouette_samples

import joblib
import pandas as pd

import json


import plotly
import plotly.express as px

# reset GPU
import subprocess
from numba import cuda
import threading

In [2]:
def get_simple_fraction(f, detail_level=1000.):
	"""Truncate ``f`` to a multiple of ``1 / detail_level``.

	The value is scaled by ``detail_level``, cut to an integer with ``int()``
	(which drops the fractional part, i.e. truncates toward zero) and scaled
	back. With the default of 1000 this keeps three decimal places.
	"""
	scaled = detail_level * f
	whole_steps = int(scaled)
	return whole_steps / detail_level

In [3]:
def find_the_best_eps(eps_chosen, n_clusters, silhouette_scores, davies_bouldin_scores, n_noise_points):
	"""Choose the eps value that wins the most of four per-candidate criteria.

	All five arguments are parallel one-dimensional sequences: position ``i``
	in each of them describes the same candidate eps. A candidate receives one
	vote for every criterion on which it is (or ties for) the best:

	* largest ``n_clusters``
	* smallest ``n_noise_points``
	* largest ``silhouette_scores``
	* smallest ``davies_bouldin_scores``

	The Calinski-Harabasz index is not among the parameters and takes no part
	in the vote.

	Returns:
		The element of ``eps_chosen`` at the position with the most votes.
		``argmax`` resolves equal vote counts in favour of the lowest position.

	Raises:
		ValueError: if ``eps_chosen`` is empty.
	"""
	if len(eps_chosen) == 0:
		# Fail with a message about the real cause instead of NumPy's
		# "zero-size array to reduction operation" error further down.
		raise ValueError("find_the_best_eps: no candidate eps values were supplied")

	# (metric values, reducer that picks the best value of that metric)
	# Converting to arrays makes the element-wise comparison explicit even
	# when the caller hands in plain Python lists.
	criteria = (
		(np.asarray(n_clusters), np.amax),
		(np.asarray(n_noise_points), np.amin),
		(np.asarray(silhouette_scores), np.amax),
		(np.asarray(davies_bouldin_scores), np.amin),
	)

	data_interested = []
	for values, pick_best in criteria:
		winners = np.argwhere(values == pick_best(values))[:, 0]
		data_interested.extend(winners.tolist())

	most_frequent_data_point = np.bincount(data_interested).argmax()  # first of most frequent data point

	return eps_chosen[most_frequent_data_point]


In [4]:
def cluster_analysis_dbscan(pre_cluster_DataFrame, min_samples, eps_range=(0.001, 10, 0.2)):
	"""Scan a range of DBSCAN eps values and return the one ``find_the_best_eps`` picks.

	Args:
		pre_cluster_DataFrame: the samples to cluster. It is handed unchanged
			to cuML DBSCAN, ``cython_silhouette_score`` and
			``davies_bouldin_score``, so every column it holds is used.
		min_samples: forwarded to cuML DBSCAN as ``min_samples``.
		eps_range: ``(start, stop, step)`` sequence whose three entries are
			passed to ``np.arange`` to build the candidate eps values.

	Returns:
		The result of ``find_the_best_eps`` over the candidates that produced
		at least two distinct labels.

	Raises:
		ValueError: if no candidate eps produced at least two distinct labels.
	"""
	all_eps_ranges = np.arange(eps_range[0], eps_range[1], eps_range[2])

	real_eps = []
	n_clusters = []
	n_noise_points = []
	silhouette_scores = []
	davies_bouldin_scores = []

	# Fit and score one eps at a time so only the current label array is kept.
	for eps_val in all_eps_ranges:
		dbscan_rapids = cuml_dbscan(eps=eps_val, min_samples=min_samples, verbose=0)
		# assume that the min. number of neighbors in N-D can be used here, too.
		labels = dbscan_rapids.fit(pre_cluster_DataFrame).labels_

		if len(np.unique(labels)) == 1:
			continue  # silhouette_score needs at least 2 clusters

		real_eps.append(eps_val)
		# Largest label + 1 is used as the cluster count for this eps.
		n_clusters.append(np.max(labels) + 1)
		# Rows labelled -1 are counted as noise points.
		n_noise_points.append(np.sum(labels == -1))
		silhouette_scores.append(cython_silhouette_score(pre_cluster_DataFrame, labels))
		davies_bouldin_scores.append(davies_bouldin_score(pre_cluster_DataFrame, labels))

	if not real_eps:
		raise ValueError(
			"cluster_analysis_dbscan: no eps in the given range produced at least two distinct labels"
		)

	return find_the_best_eps(real_eps, n_clusters, silhouette_scores, davies_bouldin_scores, n_noise_points)



In [38]:
def plot_cluster_dbscan_result(final_df, cluster_max):
	"""Scatter-plot the labelled points, annotate them, and export the figure.

	Args:
		final_df: DataFrame with the columns ``x``, ``y`` and ``prediction``.
		cluster_max: iterable of label values to annotate. Despite its name it
			is iterated as a collection of labels, not used as a count.

	For every label other than -1, one annotation showing the label is placed
	at the mean x / mean y of the rows carrying that label. For the label -1,
	every individual row gets its own 'X' annotation.

	Side effects:
		Writes ``post-clustering.webp``, ``post-clustering.png`` and
		``post-clustering.html`` under exactly those relative file names.
	"""
	fig = px.scatter(final_df, x="x", y="y", color="prediction", title='clustering')

	fig.update_layout(autosize=True, width=900, height=900)
	# Replaces the placeholder title given to px.scatter above.
	fig.update_layout(title='DistilBERT, Korean Food Training Dataset, DBSCAN Clustering')

	for each in cluster_max:
		# Rows that carry the current label; filtered once and reused below.
		members = final_df[final_df['prediction'] == each]

		if each != -1:
			# Take the mean straight from each column. The earlier form,
			# `[['x']].mean()[0]`, looked up a label-indexed Series by integer
			# position, which pandas has deprecated.
			fig.add_annotation(dict(font=dict(color='White', size=15),
									x=members['x'].mean(),
									y=members['y'].mean(),
									showarrow=False,
									text=str(each),
									textangle=0,
									bgcolor="Blue",
									opacity=0.74))
		else:
			# Label -1: mark every row individually instead of one centroid.
			for noise_x, noise_y in zip(members['x'], members['y']):
				fig.add_annotation(dict(font=dict(color='White', size=10),
										x=noise_x,
										y=noise_y,
										showarrow=False,
										text='X',
										textangle=0,
										bgcolor="White",
										opacity=0.64))

	fig.write_image('post-clustering.webp')
	fig.write_image('post-clustering.png')
	plotly.offline.plot(fig, filename = 'post-clustering.html', auto_open=False)

In [6]:
def get_euclidean(p1, p2):
	"""Distance between ``p1`` and ``p2`` using their components 0 and 1 only.

	Each per-component difference is first reduced with ``np.linalg.norm``;
	the two results are then squared, summed and square-rooted. Any component
	beyond index 1 is not read.
	"""
	delta_first = np.linalg.norm(p1[0] - p2[0])
	delta_second = np.linalg.norm(p1[1] - p2[1])
	return np.sqrt(delta_first ** 2 + delta_second ** 2)

In [8]:
# Load the input table that the rest of the notebook clusters.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that mount exists.
df = pd.read_csv("/mnt/d/works/paperworks/2023/aging/dtrain_manifold.csv")

In [9]:
# Preview the loaded table.
df

Unnamed: 0,x,y
0,-117.752180,-44.765297
1,3.223371,94.655080
2,-80.087160,-69.563360
3,-75.593560,65.009050
4,15.957396,125.094986
...,...,...
2332,22.260605,-0.802777
2333,-12.615958,-71.338540
2334,-17.663586,16.931108
2335,-31.762358,-33.692554


In [28]:
# Search eps over np.arange(0.001, 50, 0.2) with min_samples=5.
# Only the x / y coordinate columns are passed, so any other column present in
# df (for example a 'prediction' column left over from a previous run) cannot
# end up being clustered on when this cell is executed again.
the_best_eps = cluster_analysis_dbscan(df[['x', 'y']], 5, eps_range=[0.001, 50, 0.2])

In [29]:
# Show the eps value selected by the search.
the_best_eps

3.801

In [30]:
# Final DBSCAN run with the selected eps and the same min_samples as the search.
clusterer = cuml_dbscan(eps=the_best_eps, min_samples=5, verbose=0)
# Fit on the x / y coordinate columns only, so that re-running this cell after
# df has gained other columns (such as 'prediction') yields the same labels.
label_outputs = clusterer.fit_predict(df[['x', 'y']])

In [31]:
# Inspect the per-row labels returned by fit_predict.
label_outputs

0         0
1         1
2         2
3         3
4         4
       ... 
2332    148
2333     -1
2334     26
2335     -1
2336    150
Length: 2337, dtype: int32

In [32]:
# Store each row's label in a new 'prediction' column. This mutates df in
# place, so df carries the extra column from this cell onwards.
df['prediction'] = label_outputs

In [33]:
# Preview the table again, now including the 'prediction' column.
df

Unnamed: 0,x,y,prediction
0,-117.752180,-44.765297,0
1,3.223371,94.655080,1
2,-80.087160,-69.563360,2
3,-75.593560,65.009050,3
4,15.957396,125.094986,4
...,...,...,...
2332,22.260605,-0.802777,148
2333,-12.615958,-71.338540,-1
2334,-17.663586,16.931108,26
2335,-31.762358,-33.692554,-1


In [34]:
# Count the distinct label values. The -1 label is counted too when it is
# present, so this figure is not the number of clusters alone.
cluster_max = np.unique(label_outputs).shape[0]
cluster_max

163

In [35]:
# Shape of the subset of rows labelled -1 (the label plot_cluster_dbscan_result
# treats as noise); the first entry is the row count.
df[df.prediction==-1].shape

(669, 3)

In [40]:
# Shape of the subset of rows assigned to label 7; the first entry is the row count.
df[df.prediction==7].shape

(347, 3)

In [36]:
# List the distinct label values as a plain Python list.
df['prediction'].unique().tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 -1,
 7,
 151,
 8,
 115,
 159,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 108,
 98,
 19,
 20,
 21,
 55,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 58,
 35,
 36,
 153,
 37,
 38,
 39,
 40,
 89,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 56,
 57,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 161,
 68,
 69,
 70,
 71,
 72,
 111,
 73,
 74,
 75,
 76,
 77,
 86,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 101,
 87,
 88,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 99,
 100,
 102,
 103,
 104,
 105,
 106,
 107,
 155,
 109,
 156,
 110,
 112,
 150,
 113,
 114,
 116,
 157,
 117,
 149,
 118,
 119,
 160,
 120,
 121,
 122,
 123,
 124,
 125,
 139,
 126,
 127,
 133,
 128,
 129,
 130,
 131,
 132,
 146,
 134,
 135,
 158,
 136,
 137,
 138,
 140,
 141,
 142,
 143,
 144,
 145,
 147,
 148,
 152,
 154]

In [39]:
# Draw and export the figure. The second argument is the list of distinct
# labels, which plot_cluster_dbscan_result iterates to place its annotations.
plot_cluster_dbscan_result(df[['x', 'y', 'prediction']], df['prediction'].unique().tolist())

# Save the table together with its labels; index=False leaves the row index out.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/mnt/d/works/paperworks/2023/aging/post-clustering.csv', index=False)