# 2-0. Deep clustering
N2D, a manifold-based clustering method, is used as the deep clustering approach, via the TensorFlow-based [n2d package](https://pypi.org/project/n2d/). Two types of autoencoder are used: overcomplete and undercomplete. The N2D clustering results are then obtained.

OUTPUT PATH: embedding_data

In [None]:
# Install the n2d package from PyPI.
# NOTE(review): this bare `pip` line is a notebook command, not Python; it
# presumably relies on IPython automagic and will not run in a plain script.
pip install n2d

In [None]:
import n2d
from utils.create_directory import create_directory
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
import easydict
import umap
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
import seaborn as sns
import tensorflow as tf

In [None]:
# Single seed value shared by the random number generators used in this notebook.
SEED = 0

# Seed NumPy's and TensorFlow's global random generators with the same value.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [None]:
# Load the two log-return tables: the full one and the filtered one.
r_coin, r_coin_f = map(
    pd.read_csv,
    ('coin_data/coin_log_return.csv', 'coin_data/coin_log_return_filtered.csv'),
)

In [None]:
# Experiment name.
exp = "overcomplete_autoencoder_me_non_filtered_1"

# Experiment data: the transposed log-return table, so each original column
# becomes one row (presumably one row per coin -- TODO confirm).
x = r_coin.transpose()
print(x.shape)

(146, 365)


In [None]:
# Hyper-parameter search space for the N2D runs.
param_grid = dict(
    # Latent dimension of the autoencoder.
    latent_dim=[*range(2, 11), 20],
    # Hidden-layer sizes of the autoencoder, one list per candidate.
    architecture=[
        # Candidates with smaller hidden layers.
        [300, 300, 100],
        [200, 200, 50],
        [100, 100, 20],
        [300, 300, 100, 10],
        [200, 200, 100, 10],
        [100, 100, 50, 20],
        # Candidates with larger hidden layers.
        [500, 500, 2000],
        [1000, 1000, 4000],
        [2000, 2000, 8000],
        [500, 500, 500, 2000],
        [1000, 1000, 1000, 4000],
        [2000, 2000, 2000, 8000],
    ],
    # Activation function of the autoencoder layers.
    act=["relu"],
    # Value passed to UMAP as `n_neighbors`.
    umap_neighbour=[10, 20],
)

# Every combination of the values above (10 * 12 * 1 * 2 = 240 settings).
arguments = ParameterGrid(param_grid)

In [None]:
# N2D clustering: for each hyper-parameter combination, build an autoencoder (F_a),
# a UMAP manifold learner (F_m) and an agglomerative clusterer (F_c), fit them on
# `x`, and save the resulting embedding to CSV.

# Label of the clustering method; not read inside this loop.
method = 'HC'

# Number of training epochs passed to every fit call.
epochs = 1000

# Every run writes into the same experiment directory, so create it once up front.
create_directory(f"embedding_data/{exp}")

# `j` is the run index; it is not read inside this loop.
for j, args in enumerate(arguments):
    # Wrap the plain dict so the parameters can be read as attributes.
    args = easydict.EasyDict(args)

    # File-name stem identifying this combination; shared by the weights file
    # and the embedding CSV.
    run_tag = f"umap{args.umap_neighbour}_arch{args.architecture}_c{args.latent_dim}"

    # F_a: autoencoder
    ae = n2d.AutoEncoder(
        input_dim=x.shape[-1],
        latent_dim=args.latent_dim,
        architecture=args.architecture,
        act=args.act
    )

    # F_m: umap arguments
    # TODO: look up and use the default values
    umap_args = {"metric":"euclidean", "n_components":2, "n_neighbors":args.umap_neighbour, "min_dist":0, 'random_state':SEED}

    # F_c: hierarchical clustering (agglomerative) arguments
    hc_args = {"n_clusters": 2, "linkage": "ward"}

    # F_m and F_c: manifold and cluster object
    hc_clust = n2d.manifold_cluster_generator(umap.UMAP, umap_args, AgglomerativeClustering, hc_args)

    # n2d object
    n2d_HC = n2d.n2d(ae, hc_clust)

    # train ae
    # NOTE(review): `patience` is presumably the early-stopping patience, and the
    # `weights/` directory presumably has to exist already -- confirm against n2d.
    n2d_HC.fit(x, weight_id = f"weights/{run_tag}", epochs=epochs, batch_size=256, loss='mse', optimizer='adam', patience=20)

    # save embeddings
    # `hle` is read from the manifold/cluster object after fitting; presumably the
    # UMAP embedding of the autoencoder's latent representation.
    hle = hc_clust.hle

    pd.DataFrame(hle).to_csv(
        f"embedding_data/{exp}/hle_{run_tag}.csv", index=False)