In [None]:
import os
import shutil
import itertools
import numpy as np
from tqdm import tqdm

from helper_exceptions import *
from helper_functions import write_clusters, print_verbose

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
from DL_Datasets import *
from DL_Models import *
from DL_ModelTrainer import ModelTrainer

# Image dataset and network whose features are clustered further down.
dataset = ImageDataset("heredenene")
model = PowerOf2s256andAbove()

# Trainer configuration. ckpt_path names a previously saved checkpoint file;
# NOTE(review): presumably ModelTrainer restores weights from it - confirm.
mt = ModelTrainer(
    num_of_epochs=1,
    lr=0.001,
    batch_size=16,
    loss_type="mse",
    dataset=dataset,
    model=model,
    ckpt_path="heredenene_PowerOf2s256andAbove_mse_05:16:19:01:53/min_loss:0.06544473022222519_epoch:19.pth",
)

In [None]:
class DL_Clustering():
    """Clusters the feature vectors produced by a model trainer, batch by batch.

    For every batch one estimator is built per hyper-parameter combination of
    the chosen method, the candidates are ranked with a combination of the
    silhouette, Davies-Bouldin and Calinski-Harabasz scores, and the labels of
    the best candidate are passed to write_clusters.
    """

    # clustering back-ends that get_models knows how to build
    VALID_METHODS = ("kmeans", "hierarchy", "DBSCAN", "gaussian", "HDBSCAN")

    def __init__(self, model_trainer, method, batch_size, number_of_clusters=[10], max_iter=[200], DBSCAN_eps=[0.5],
                DBSCAN_min_samples=[5], HDBSCAN_min_cluster_size=[5], HDBSCAN_max_cluster_size=[None], option="",
                transfer="copy", overwrite=False, verbose=0):
        """stores the configuration and derives the result folder

        The list-valued arguments are hyper-parameter grids: every combination
        relevant to `method` becomes one candidate estimator. The default lists
        are shared between instances, so they are only read, never mutated.

        Args:
            model_trainer: object exposing `dataset` (with `root_dir` and `len()`),
                `train()` and `get_features(start, end)`
            method (str): one of VALID_METHODS
            batch_size (int): number of dataset items clustered together
            number_of_clusters (list): candidates for kmeans / hierarchy / gaussian
            max_iter (list): candidates for kmeans / gaussian
            DBSCAN_eps (list): candidates for DBSCAN `eps`
            DBSCAN_min_samples (list): candidates for DBSCAN `min_samples`
            HDBSCAN_min_cluster_size (list): candidates for HDBSCAN `min_cluster_size`
            HDBSCAN_max_cluster_size (list): candidates for HDBSCAN `max_cluster_size`
            option (str): "merge" works inside the dataset folder itself, any other
                value writes to a sibling "<images folder>_clustered" folder
            transfer (str): forwarded unchanged to write_clusters
            overwrite (bool): allow deleting an already existing result folder
            verbose (int): verbosity level forwarded to the helpers

        Raises:
            ValueError: if `method` is not one of VALID_METHODS
        """
        self.model_trainer = model_trainer
        self.method = method
        self.batch_size = batch_size
        self.number_of_clusters = number_of_clusters
        self.max_iter = max_iter
        self.DBSCAN_eps = DBSCAN_eps
        self.DBSCAN_min_samples = DBSCAN_min_samples
        self.HDBSCAN_min_cluster_size = HDBSCAN_min_cluster_size
        self.HDBSCAN_max_cluster_size = HDBSCAN_max_cluster_size
        self.option = option
        self.transfer = transfer
        self.overwrite = overwrite
        self.verbose = verbose

        if self.option == "merge":
            self.result_container_folder = self.model_trainer.dataset.root_dir
        else:
            base_folder, images_folder_name = os.path.split(self.model_trainer.dataset.root_dir)
            self.result_container_folder = os.path.join(base_folder, images_folder_name + "_clustered")

        self.arguman_check()

    def __str__(self, verbose=0):
        """casting to string method for printing/debugging object attributes

        Returns:
            str: object attribute information
        """
        attributes = vars(self)
        attr_strings = [f"{key}: {value}" for key, value in attributes.items()]
        return "-"*70 + "\n" + "\n".join(attr_strings) + "\n" + "-"*70

    def arguman_check(self, verbose=0):
        """validates the constructor arguments

        Raises:
            ValueError: if self.method is not one of VALID_METHODS
        """
        if self.method not in self.VALID_METHODS:
            raise ValueError(
                f"Unknown clustering method {self.method!r}; "
                f"expected one of {list(self.VALID_METHODS)}"
            )

    def get_models(self, verbose=0):
        """builds one unfitted estimator per hyper-parameter combination

        Args:
            verbose (int): forwarded to the estimators that accept it
                (KMeans and GaussianMixture)

        Returns:
            list: candidate estimators for self.method (empty for an unknown method)
        """
        if self.method == "kmeans":
            grid = itertools.product(self.number_of_clusters, self.max_iter)
            return [KMeans(n_clusters=n_clusters, max_iter=max_iter, verbose=verbose)
                    for n_clusters, max_iter in grid]
        if self.method == "hierarchy":
            return [AgglomerativeClustering(n_clusters=n_clusters)
                    for n_clusters in self.number_of_clusters]
        if self.method == "DBSCAN":
            grid = itertools.product(self.DBSCAN_eps, self.DBSCAN_min_samples)
            return [DBSCAN(eps=eps, min_samples=min_samples)
                    for eps, min_samples in grid]
        if self.method == "gaussian":
            grid = itertools.product(self.number_of_clusters, self.max_iter)
            return [GaussianMixture(n_components=n_clusters, max_iter=max_iter, verbose=verbose)
                    for n_clusters, max_iter in grid]
        if self.method == "HDBSCAN":
            grid = itertools.product(self.HDBSCAN_min_cluster_size, self.HDBSCAN_max_cluster_size)
            return [HDBSCAN(min_cluster_size=min_cluster_size, max_cluster_size=max_cluster_size)
                    for min_cluster_size, max_cluster_size in grid]
        return []

    @staticmethod
    def _min_max_normalize(scores):
        """rescales scores to [0, 1]; constant input maps to all zeros

        A plain (x - min) / (max - min) yields 0/0 = NaN when every candidate has
        the same score (always the case with a single candidate), which would
        turn the combined score into NaN and make the ranking meaningless.
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        lowest = scores.min()
        span = scores.max() - lowest
        if span == 0:
            return np.zeros_like(scores)
        return (scores - lowest) / span

    def _fit_and_rank(self, models, reps, verbose=0):
        """fits every candidate once and returns the best one with its labels

        Returns:
            tuple: (best_model, labels produced by best_model on reps)

        Raises:
            ValueError: if no candidate yields a labelling that can be scored
        """
        n_samples = len(reps)
        kept_models, kept_labels = [], []
        silhouette_scores, db_scores, ch_scores = [], [], []
        for model in tqdm(models):
            print(model)
            labels = model.fit_predict(reps)

            # the three metrics are only defined for 2 .. n_samples - 1 distinct
            # labels; sklearn raises ValueError otherwise, so skip such candidates
            n_labels = np.unique(labels).size
            if not 1 < n_labels < n_samples:
                if verbose > 0:
                    print(f"skipping {model}: {n_labels} cluster(s) for {n_samples} samples")
                continue

            kept_models.append(model)
            kept_labels.append(labels)
            silhouette_scores.append(silhouette_score(reps, labels))
            db_scores.append(davies_bouldin_score(reps, labels))
            ch_scores.append(calinski_harabasz_score(reps, labels))

        if not kept_models:
            raise ValueError("no candidate model produced a clustering that can be scored "
                             "(need between 2 and n_samples - 1 clusters)")

        # higher silhouette / Calinski-Harabasz is better, lower Davies-Bouldin is better
        combined_scores = (self._min_max_normalize(silhouette_scores)
                           - self._min_max_normalize(db_scores)
                           + self._min_max_normalize(ch_scores))
        best_idx = int(np.argmax(combined_scores))
        return kept_models[best_idx], kept_labels[best_idx]

    def find_best_model(self, models, reps, verbose=0):
        """picks the candidate with the best combined clustering score

        Args:
            models (list): unfitted estimators exposing fit_predict
            reps: feature matrix the estimators are fitted on
            verbose (int): verbosity level

        Returns:
            the fitted estimator with the highest combined score

        Raises:
            ValueError: if no candidate yields a labelling that can be scored
        """
        best_model, _ = self._fit_and_rank(models, reps, verbose=verbose)
        return best_model


    def calculate_template_similarity(self, verbose=0):
        """not implemented yet"""
        pass
    def merge_clusters_my_templates(self, verbose=0):
        """not implemented yet

        NOTE(review): the commented-out merge code in process() calls
        `merge_clusters_by_templates`; the name here looks like a typo - confirm
        before wiring the merge step back in.
        """
        pass

    def create_clusters(self, batch_idx, start, end, verbose=0):
        """clusters one batch of features and writes the clusters to disk

        Args:
            batch_idx (int): index of the batch, forwarded to write_clusters
            start (int): first dataset position, forwarded to get_features
            end (int): end dataset position, forwarded to get_features
            verbose (int): verbosity level
        """
        # get_features is read as a mapping from file path to feature vector
        features = self.model_trainer.get_features(start, end)
        paths = list(features.keys())
        reps = np.array(list(features.values()))

        models = self.get_models()
        # reuse the labels that were scored instead of fitting the winner a second
        # time: a refit costs time and, for randomly initialised estimators, may
        # produce labels different from the ones the model was selected for
        _, labels = self._fit_and_rank(models, reps)

        # group the file paths by cluster id, keeping first-seen order
        clusters = {}
        for file, cluster_id in zip(paths, labels):
            clusters.setdefault(cluster_id, []).append(file)
        clusters = list(clusters.values())

        write_clusters(clusters, batch_idx, self.result_container_folder, [], self.transfer, verbose=verbose-1)
        if verbose > 0:
            print("-"*70)

    def process(self, verbose=0):
        """trains the model, then clusters the dataset batch by batch

        Raises:
            OverwritePermissionException: if the result folder already exists and
                overwrite is False
        """
        self.model_trainer.train()

        if self.option != "merge":
            # creating result folder
            if os.path.exists(self.result_container_folder):
                if not self.overwrite:
                    raise OverwritePermissionException("Overwriting permission not granted to overwrite " + self.result_container_folder)
                shutil.rmtree(self.result_container_folder)
            os.makedirs(self.result_container_folder)

            dataset_size = len(self.model_trainer.dataset)
            for batch_idx, start in enumerate(range(0, dataset_size, self.batch_size)):
                # clamp so the last batch never asks for items past the dataset end
                end = min(start + self.batch_size, dataset_size)
                self.create_clusters(batch_idx, start, end, verbose=self.verbose-1)

            # if images are done in one batch terminate the code after organizing result folders
            if self.batch_size >= dataset_size:
                for file in os.listdir(self.result_container_folder):
                    new_file_name = file.replace("batch_0", "result")
                    os.rename(os.path.join(self.result_container_folder, file), os.path.join(self.result_container_folder, new_file_name))
                # the similarities file is not guaranteed to exist; only remove it when present
                similarities_file = os.path.join(self.result_container_folder, "image_similarities_result.json")
                if os.path.exists(similarities_file):
                    os.remove(similarities_file)
                print_verbose("f", "no merge needed to single batch", self.verbose)

        if self.option == "dontmerge":
            print_verbose("f", "finishing because of no merge request", self.verbose)

        # TODO: merge step below is disabled until merge_clusters_by_templates exists
        # # gets each batchs folder
        # batch_folder_paths = sorted([os.path.join(self.result_container_folder, f)
        #                             for f in os.listdir(self.result_container_folder)
        #                             if os.path.isdir(os.path.join(self.result_container_folder, f))])

        # # merge each batch to get which clusters folders should be merged together
        # template_cluster_folders_to_merge_list = self.merge_clusters_by_templates(batch_folder_paths, verbose=self.verbose-1)

        # if self.option != "merge":
        #     print_verbose("r", str(len(template_cluster_folders_to_merge_list) - 1) + " cluster found at result", self.verbose)
        # if self.option == "merge":
        #     print_verbose("m", str(len(template_cluster_folders_to_merge_list) - 1) + " cluster found at result", self.verbose)

        # # creating result folder and merging cluster folders
        # result_folder_path = os.path.join(self.result_container_folder, "results")
        # os.mkdir(result_folder_path)
        # for e, template_cluster_folders_to_merge in enumerate(template_cluster_folders_to_merge_list):
        #     cluster_folder_path = os.path.join(result_folder_path, "cluster_" + str(e))
        #     if e == len(template_cluster_folders_to_merge_list) - 1:
        #         cluster_folder_path = os.path.join(result_folder_path, "outliers")

        #     os.mkdir(cluster_folder_path)
        #     for template_cluster_folder in template_cluster_folders_to_merge:
        #         for file in os.listdir(template_cluster_folder):
        #             image_transfer(self.transfer, os.path.join(template_cluster_folder, file), os.path.join(cluster_folder_path, file))

        # # removing unnecessary files and folders after merging results
        # for folder in os.listdir(self.result_container_folder):
        #     if folder != "results":
        #         if os.path.isdir(os.path.join(self.result_container_folder, folder)):
        #             shutil.rmtree(os.path.join(self.result_container_folder, folder))
        #         if os.path.isfile(os.path.join(self.result_container_folder, folder)):
        #             os.remove(os.path.join(self.result_container_folder, folder))


    def __call__(self, verbose=0):
        """runs the whole pipeline; shorthand for process()"""
        self.process(verbose=verbose)

In [None]:
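# TODO:
# - check the functions line by line
# - add proper exceptions
# - do not mix up the batch sizes (training batch size vs. clustering batch size)
# - delete the checkpoint files
# - pass `verbose` through consistently
# - if the checkpoint is valid, skip training at the beginning of clustering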

# .

In [None]:
# K-means clustering over batches of 100 items; an existing result folder may be replaced.
dlc = DL_Clustering(
    model_trainer=mt,
    method="kmeans",
    batch_size=100,
    overwrite=True,
)

In [None]:
# Run the pipeline: DL_Clustering.__call__ delegates to process().
dlc()