# 2-1. Selecting best deep clustering models
Silhouette scores (computed with both Euclidean and dynamic time warping (DTW) distances) and the Davies-Bouldin (DB) index are used to select the best candidate clustering models. This notebook shows the names of the models with the best scores. The scores are saved in the data folder so that they are recorded.

OUTPUT PATH: data

In [None]:
pip install n2d

In [None]:
pip install fastdtw

In [None]:
import n2d
import numpy as np
import pandas as pd
from tqdm import tqdm
import easydict
# import umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.spatial.distance import cdist
from tslearn.metrics import cdist_dtw
import seaborn as sns
import tensorflow as tf
from tqdm import tqdm

from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

In [None]:
# Log-return tables, read from the two CSV files named below.
df_lr = pd.read_csv("coin_data/coin_log_return.csv")
df_lr_f = pd.read_csv("coin_data/coin_log_return_filtered.csv")

# Running sum down the rows (axis 0) of each log-return table.
df_cum, df_cum_f = (frame.cumsum(axis="index") for frame in (df_lr, df_lr_f))

In [None]:
# Load every embedding CSV found directly inside `path`.
import glob
import os

path = "embedding_data" # use your path

# glob.glob yields matches in arbitrary, filesystem-dependent order. Sorting
# makes the file order (and therefore the row order of every score table
# built from it) reproducible from run to run and machine to machine.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# li[i] holds the table read from all_files[i]; the two lists stay aligned
# so they can be iterated together with zip().
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

In [None]:
# Report how many embedding CSV files were found.
n_embedding_files = len(all_files)
print(n_embedding_files)

240


In [None]:
# Score the clustering of every embedding with silhouette_score
# (swap in davies_bouldin_score where marked to compute the DB index instead).
import os

SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Numbers of clusters to evaluate: k = 2, ..., 9.
division = list(range(2, 10))

name_list = []
score_list = []
cluster_list = []

# Data space to compare: the transposed cumulative log-return table.
# It does not depend on the embedding or on k, so it is built once.
X = df_cum.T

for n, hle in zip(tqdm(all_files), li):
    # Model name = file name without its first four characters and without
    # its last four characters (the ".csv" extension matched by the glob).
    # os.path.basename also copes with "\\"-separated paths.
    name = os.path.basename(n)[4:-4]

    for k in division:

        # clustering for checking the cluster scores
        f_c = AgglomerativeClustering(n_clusters=k).fit(hle)
        preds = f_c.labels_

        # score
        score = silhouette_score(X, preds) # silhouette_score / davies_bouldin_score

        name_list.append(name)
        score_list.append(score)
        cluster_list.append(k)

# method name
method = "DeepClustering-cf"
measuring_data_space = "r" # r / f

# save a csv (one row per (model, k) pair)
df = pd.DataFrame(
    {
        "name": name_list,
        # The original read an undefined `ch_score_list` here (NameError);
        # the scores collected by the loop live in `score_list`.
        "silhouette_score": score_list, # silhouette_score / davies_bouldin_score
        "num_clusters": cluster_list,
    }
)

df.to_csv(f"data/silhouette_score_{method}_{measuring_data_space}.csv") # silhouette_score / davies_bouldin_score

In [None]:
# Score the clustering of every embedding with a silhouette score computed
# on pairwise DTW distances between the series in the data space.
#
# `method` and `measuring_data_space` are only read here (for the output file
# name); they must already be defined, e.g. by the silhouette-score cell.
# NOTE(review): this cell measures on df_cum_f, while the earlier cell sets
# measuring_data_space = "r" and measures on df_cum -- confirm the label
# matches the data before saving.
import os

SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Numbers of clusters to evaluate: k = 2, ..., 9.
division = list(range(2, 10))

name_list = []
s_score_list = []
cluster_list = []

# Data space to compare
X = np.array(df_cum_f.T)

# Pairwise DTW distance matrix between the rows of X.
# The original called `dtw.distance_matrix_fast`, but no `dtw` module is
# imported in this notebook (NameError); tslearn's `cdist_dtw`, which is
# imported, provides the pairwise DTW matrix instead. The matrix depends only
# on X, so it is computed once rather than once per (file, k) pair.
sklearn_X = cdist_dtw(X)

for n, hle in zip(tqdm(all_files), li):
    # Model name = file name without its first four characters and without
    # its last four characters (the ".csv" extension matched by the glob).
    name = os.path.basename(n)[4:-4]

    for k in division:

        # clustering for checking the cluster scores
        f_c = AgglomerativeClustering(n_clusters=k).fit(hle)
        preds = f_c.labels_

        # Silhouette score on the precomputed DTW distances
        s_score = silhouette_score(sklearn_X, preds, metric="precomputed")
        # print(f"{k}-div Silhouette score:", s_score, end="")

        name_list.append(name)
        s_score_list.append(s_score)
        cluster_list.append(k)

# save (column and file names are kept unchanged so existing readers of the
# CSV keep working)
df = pd.DataFrame(
    {
        "name": name_list,
        "silhouette_score_fastdtw": s_score_list,
        "num_clusters": cluster_list,
    }
)

df.to_csv(f"data/silhouette_score_fastdtw_{method}_{measuring_data_space}.csv",index=False)

In [None]:
# Load the saved DTW silhouette scores and show the 20 rows with the highest
# score among the 4-cluster results.
df_dtw_deep_cf_r = pd.read_csv("data/silhouette_score_fastdtw_DeepClustering-cf_r.csv")
(
    df_dtw_deep_cf_r
    .loc[lambda frame: frame["num_clusters"] == 4]
    .sort_values(by="silhouette_score_fastdtw", ascending=False)
    .head(20)
)

Unnamed: 0.1,Unnamed: 0,name,silhouette_score_fastdtw,num_clusters
658,658,"umap20_arch[200, 200, 50]_c7",0.378983,4
1906,1906,"umap10_arch[100, 100, 50, 20]_c20",0.37599,4
90,90,"umap10_arch[300, 300, 100]_c6",0.374605,4
1730,1730,"umap10_arch[200, 200, 100, 10]_c10",0.37056,4
818,818,"umap10_arch[200, 200, 50]_c20",0.369079,4
1194,1194,"umap10_arch[100, 100, 20]_c8",0.367357,4
1794,1794,"umap10_arch[100, 100, 50, 20]_c4",0.361392,4
1890,1890,"umap10_arch[100, 100, 50, 20]_c10",0.358407,4
762,762,"umap10_arch[200, 200, 50]_c10",0.357792,4
642,642,"umap10_arch[200, 200, 50]_c7",0.356046,4


In [None]:
# Load the saved score table and show the 20 rows with the highest
# silhouette_score among the 4-cluster results.
df_deep_cf_r = pd.read_csv("data/score_DeepClustering-cf_r.csv")
(
    df_deep_cf_r
    .loc[lambda frame: frame["num_clusters"] == 4]
    .sort_values(by="silhouette_score", ascending=False)
    .head(20)
)

Unnamed: 0.1,Unnamed: 0,name,silhouette_score,dbindex_score,num_clusters
1706,1706,"umap20_arch[200, 200, 50]_c7",0.305608,0.997223,4
34,34,"umap10_arch[200, 200, 50]_c20",0.299111,0.869228,4
1642,1642,"umap20_arch[100, 100, 20]_c20",0.283904,0.897764,4
210,210,"umap10_arch[100, 100, 50, 20]_c20",0.281921,0.96978,4
1906,1906,"umap10_arch[100, 100, 20]_c8",0.279073,0.91065,4
98,98,"umap10_arch[100, 100, 20]_c4",0.279047,1.229735,4
978,978,"umap10_arch[200, 200, 50]_c6",0.278316,0.976865,4
1010,1010,"umap10_arch[300, 300, 100]_c6",0.277503,0.914141,4
106,106,"umap10_arch[100, 100, 50, 20]_c4",0.274982,0.951195,4
498,498,"umap10_arch[200, 200, 100, 10]_c10",0.274002,1.130581,4


In [None]:
# Show the 20 rows with the lowest dbindex_score among the 4-cluster results
# (sorted ascending, smallest value first).
(
    df_deep_cf_r
    .loc[lambda frame: frame["num_clusters"] == 4]
    .sort_values(by="dbindex_score", ascending=True)
    .head(20)
)

Unnamed: 0.1,Unnamed: 0,name,silhouette_score,dbindex_score,num_clusters
34,34,"umap10_arch[200, 200, 50]_c20",0.299111,0.869228,4
1642,1642,"umap20_arch[100, 100, 20]_c20",0.283904,0.897764,4
1906,1906,"umap10_arch[100, 100, 20]_c8",0.279073,0.91065,4
1010,1010,"umap10_arch[300, 300, 100]_c6",0.277503,0.914141,4
1818,1818,"umap10_arch[200, 200, 50]_c10",0.273857,0.941074,4
106,106,"umap10_arch[100, 100, 50, 20]_c4",0.274982,0.951195,4
146,146,"umap10_arch[200, 200, 100, 10]_c6",0.239842,0.952904,4
730,730,"umap20_arch[100, 100, 20]_c9",0.24614,0.966737,4
210,210,"umap10_arch[100, 100, 50, 20]_c20",0.281921,0.96978,4
458,458,"umap10_arch[100, 100, 20]_c20",0.267586,0.972663,4


In [None]:
# Candidate models at 4 clusters: the union of the 20 names with the highest
# silhouette_score and the 20 names with the lowest dbindex_score.
four_cluster_rows = df_deep_cf_r[df_deep_cf_r["num_clusters"] == 4]

can1 = four_cluster_rows.sort_values(by="silhouette_score", ascending=False)["name"].head(20).to_list()
can2 = four_cluster_rows.sort_values(by="dbindex_score", ascending=True)["name"].head(20).to_list()

# A set removes the names that appear in both rankings.
candidates = set(can1) | set(can2)
print(len(candidates))
candidates

29


{'umap10_arch[100, 100, 20]_c20',
 'umap10_arch[100, 100, 20]_c4',
 'umap10_arch[100, 100, 20]_c8',
 'umap10_arch[100, 100, 50, 20]_c10',
 'umap10_arch[100, 100, 50, 20]_c20',
 'umap10_arch[100, 100, 50, 20]_c4',
 'umap10_arch[200, 200, 100, 10]_c10',
 'umap10_arch[200, 200, 100, 10]_c6',
 'umap10_arch[200, 200, 50]_c10',
 'umap10_arch[200, 200, 50]_c20',
 'umap10_arch[200, 200, 50]_c6',
 'umap10_arch[200, 200, 50]_c7',
 'umap10_arch[2000, 2000, 2000, 8000]_c5',
 'umap10_arch[300, 300, 100, 10]_c20',
 'umap10_arch[300, 300, 100, 10]_c9',
 'umap10_arch[300, 300, 100]_c20',
 'umap10_arch[300, 300, 100]_c6',
 'umap20_arch[100, 100, 20]_c20',
 'umap20_arch[100, 100, 20]_c3',
 'umap20_arch[100, 100, 20]_c8',
 'umap20_arch[100, 100, 20]_c9',
 'umap20_arch[100, 100, 50, 20]_c7',
 'umap20_arch[100, 100, 50, 20]_c8',
 'umap20_arch[200, 200, 100, 10]_c3',
 'umap20_arch[200, 200, 50]_c20',
 'umap20_arch[200, 200, 50]_c6',
 'umap20_arch[200, 200, 50]_c7',
 'umap20_arch[300, 300, 100, 10]_c2',
 'um