In [1]:
import itertools
import numpy as np
from tqdm import tqdm

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


In [2]:
from DL_Datasets import *
from DL_Models import *
from DL_ModelTrainer import ModelTrainer

# Build the dataset and the network, then hand both to ModelTrainer together
# with the path of a saved checkpoint.
# NOTE(review): ImageDataset and PowerOf2s256andAbove are not imported by name;
# presumably they come from the star imports of DL_Datasets / DL_Models -- confirm.
dataset = ImageDataset("heredenene")
model = PowerOf2s256andAbove()
mt = ModelTrainer(num_of_epochs=100, lr=0.001,
                  batch_size=16, loss_type="mse",
                  dataset=dataset, model=model,
                  ckpt_path="heredenene_PowerOf2s256andAbove_mse_05:16:19:01:53/min_loss:0.06544473022222519_epoch:19.pth")

# Calling the trainer returns a mapping (it is read through .keys()/.values()
# below).
# NOTE(review): presumably image path -> feature vector produced with the
# checkpointed weights; verify against ModelTrainer.__call__.
features = mt()
paths = list(features.keys())
# Stack the mapping's values into a single array; `reps` is the data that
# get_labels() later clusters.
# NOTE(review): assumes every value has the same shape so np.array yields a
# regular 2-D array -- TODO confirm.
reps = np.array(list(features.values()))

In [3]:
def get_models(method="", number_of_clusters=[None], max_iter=[200], DBSCAN_eps=[0.5], DBSCAN_min_samples=[5], HDBSCAN_min_cluster_size=[5], HDBSCAN_max_cluster_size=[None], verbose=0, random_state=None):
    """Build one unfitted clustering estimator per hyper-parameter combination.

    The grid that is expanded depends on ``method``:

    * ``"kmeans"``    -- ``number_of_clusters`` x ``max_iter`` -> ``KMeans``
    * ``"hierarchy"`` -- ``number_of_clusters`` -> ``AgglomerativeClustering``
    * ``"DBSCAN"``    -- ``DBSCAN_eps`` x ``DBSCAN_min_samples`` -> ``DBSCAN``
    * ``"gaussian"``  -- ``number_of_clusters`` x ``max_iter`` -> ``GaussianMixture``
    * ``"HDBSCAN"``   -- ``HDBSCAN_min_cluster_size`` x ``HDBSCAN_max_cluster_size``
      -> ``HDBSCAN``

    Parameters
    ----------
    method : str
        One of the five names above (case-sensitive).
    number_of_clusters, max_iter, DBSCAN_eps, DBSCAN_min_samples,
    HDBSCAN_min_cluster_size, HDBSCAN_max_cluster_size : iterable
        Candidate values for the corresponding estimator argument. Arguments
        that do not belong to the chosen ``method`` are ignored. The list
        defaults are only iterated, never mutated.
    verbose : int
        Forwarded to ``KMeans`` and ``GaussianMixture`` only.
    random_state : int, RandomState instance or None
        Forwarded to ``KMeans`` and ``GaussianMixture`` so that repeated fits
        are reproducible. The other estimators are built without it. ``None``
        (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    list
        Estimators in ``itertools.product`` order of the relevant grids. An
        unrecognised ``method`` yields an empty list and a ``UserWarning``.
    """
    combos = itertools.product

    if method == "kmeans":
        return [
            KMeans(n_clusters=k, max_iter=iters, verbose=verbose, random_state=random_state)
            for k, iters in combos(number_of_clusters, max_iter)
        ]

    if method == "hierarchy":
        # Agglomerative clustering has a single tunable here: the cluster count.
        return [AgglomerativeClustering(n_clusters=k) for k in number_of_clusters]

    if method == "DBSCAN":
        return [
            DBSCAN(eps=eps, min_samples=min_samples)
            for eps, min_samples in combos(DBSCAN_eps, DBSCAN_min_samples)
        ]

    if method == "gaussian":
        return [
            GaussianMixture(n_components=k, max_iter=iters, verbose=verbose, random_state=random_state)
            for k, iters in combos(number_of_clusters, max_iter)
        ]

    if method == "HDBSCAN":
        return [
            HDBSCAN(min_cluster_size=smallest, max_cluster_size=largest)
            for smallest, largest in combos(HDBSCAN_min_cluster_size, HDBSCAN_max_cluster_size)
        ]

    # Unknown method: keep returning an empty list for backward compatibility,
    # but say so -- otherwise the typo only surfaces later as an obscure
    # failure in whatever consumes the (empty) model list.
    import warnings

    warnings.warn(
        f"get_models: unknown method {method!r}; returning no models",
        stacklevel=2,
    )
    return []

In [4]:
def _min_max_scale(values):
    """Rescale ``values`` linearly to [0, 1]; a constant vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # All candidates tie on this metric (always true for a single model).
        # Dividing would give 0/0 = NaN, which would poison the combined score
        # and make argmax pick index 0 regardless of the other metrics.
        return np.zeros_like(values)
    return (values - lowest) / spread


def find_best_model(models, data):
    """Fit every candidate on ``data`` and return the one with the best score.

    Each model is fitted with ``fit_predict`` and its labelling is scored with
    the silhouette, Davies-Bouldin and Calinski-Harabasz indices. Each metric
    is min-max scaled across the candidates and the scaled values are combined
    as ``silhouette - davies_bouldin + calinski_harabasz`` (Davies-Bouldin is
    subtracted because lower is better for it). A metric on which all
    candidates tie contributes zero instead of NaN.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit_predict(data)``.
    data : array-like
        Samples passed unchanged to ``fit_predict`` and to the score functions.

    Returns
    -------
    object
        The element of ``models`` with the highest combined score (first one
        on ties). It has already been fitted on ``data`` during scoring.

    Raises
    ------
    ValueError
        If ``models`` is empty, or if a score function rejects a labelling.
    """
    models = list(models)
    if not models:
        raise ValueError("find_best_model needs at least one candidate model")

    silhouette_scores, db_scores, ch_scores = [], [], []
    for model in tqdm(models):
        labels = model.fit_predict(data)

        silhouette_scores.append(silhouette_score(data, labels))
        db_scores.append(davies_bouldin_score(data, labels))
        ch_scores.append(calinski_harabasz_score(data, labels))

    combined_scores = (
        _min_max_scale(silhouette_scores)
        - _min_max_scale(db_scores)
        + _min_max_scale(ch_scores)
    )
    return models[int(np.argmax(combined_scores))]

In [None]:
def get_labels():
    """Cluster the notebook-level ``reps`` array and return its labels.

    Candidate "hierarchy" models for 3 to 9 clusters are built with
    ``get_models``, the winner is chosen by ``find_best_model``, and that
    winner is fitted on ``reps`` once more to produce the returned labels.
    """
    candidates = get_models(
        "hierarchy",
        number_of_clusters=list(range(3, 10)),
        # Passed through for parity with the other methods; the "hierarchy"
        # branch of get_models only reads number_of_clusters.
        max_iter=[50, 75, 100],
        verbose=0,
    )
    winner = find_best_model(candidates, reps)
    return winner.fit_predict(reps)