In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import clone
import torch

from sklearn.metrics import roc_curve
from sklearn import metrics

from src.data_utility import delete_directory, prepare_data, distribute_classes, train_submodels, ism_post_process, evaluate_ism, clustclass_post_process
from src import utility_functions
import json
import os
from os.path import join
from datetime import datetime
import sqlite3
import pandas as pd
from tqdm import tqdm

In [None]:
%%capture
# Read the experiment settings once; `config` is also indexed later by
# dataset name to look up the scenario directories.
with open("./config/config.json", "r") as config_file:
    config = json.loads(config_file.read())

# Top-level options used throughout the notebook:
#   dataset_name -> key into `config` and part of the results DB filename
#   overwrite    -> read here; not used in the cells visible in this notebook section
#   m            -> distance measure; the code below branches on m == 'cosine'
dataset_name, overwrite, m = (
    config[option] for option in ('dataset_name', 'overwrite', 'distance_measure')
)

# ---------------------------------------------------------------------------
# Experiment driver: for every (iteration, n_classes, n_clusters) scenario,
# train and evaluate the "ISM" submodels, then the "clustclass" submodels,
# and write macro-averaged metrics to the per-dataset SQLite results DB.
# ---------------------------------------------------------------------------

RESULTS_DB_PATH = "./results/" + dataset_name + "_results.db"
N_ITERATIONS = 10
# (n_classes, n_clusters) pairs evaluated in every iteration.
SCENARIO_GRID = [(5000, 2), (10000, 5), (20000, 10), (50000, 25)]
# The accuracy lines compare predictions against labels laid out as
# 0,0,0,0,0,1,1,1,1,1,... i.e. 5 consecutive test samples per class.
TEST_SAMPLES_PER_CLASS = 5
KNN_BATCH_SIZE = 5000


def _scenario_paths(method, n_classes, n_clusters):
    """Return (data_dir, model_dir, super_dir) for one method/size scenario."""
    scenario = str(n_classes) + '_' + method + str(n_clusters)
    dataset_cfg = config[dataset_name]
    return (
        join(dataset_cfg["scenario_embs"], scenario),
        join(dataset_cfg["scenario_submodels"], scenario),
        join(dataset_cfg["scenario_embs"], str(n_classes)),
    )


def _execute_and_commit(db_path, sql, params):
    """Run one parameterised statement, commit, and always close the connection."""
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(sql, params)
        conn.commit()
    finally:
        conn.close()


def _ism_already_done(db_path, iteration, n_classes, n_clusters):
    """Return True if any matching results row has a non-NULL ism_end_timestamp.

    Unlike a `.item()` based check this tolerates several matching rows, which
    appear when an interrupted scenario is restarted (a new row is inserted on
    every start). The connection is closed on every path.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = pd.read_sql_query(
            "select ism_end_timestamp from results where iteration=? and n_classes=? and n_clusters=?",
            conn,
            params=(iteration, n_classes, n_clusters),
        )
    finally:
        conn.close()
    return bool(rows['ism_end_timestamp'].notna().any())


def _log_banner(method, n_classes, n_clusters, iteration=None):
    """Log the scenario header; the 'iter' line is emitted only when iteration is given."""
    utility_functions.pprint(("-------------------------------"), dataset_name)
    if iteration is not None:
        utility_functions.pprint(("iter = " , iteration), dataset_name)
    utility_functions.pprint(("dataset_name = ", dataset_name), dataset_name)
    utility_functions.pprint(("meth = ", method), dataset_name)
    utility_functions.pprint(("n_classes = ", n_classes), dataset_name)
    utility_functions.pprint(("n_clusters = ", n_clusters), dataset_name)


def _train_scenario(method, n_classes, n_clusters):
    """Prepare the data splits, distribute classes into parts and train the submodels.

    Returns (splits, parts) where `splits` is the tuple returned by
    `prepare_data`, forwarded positionally to the two project helpers.
    """
    splits = tuple(prepare_data(n_classes))
    parts = distribute_classes(method, n_classes, n_clusters, *splits)
    train_submodels(method, n_classes, parts, *splits)
    return splits, parts


def _nearest_center_classes(samples, centers, measure, batch_size=KNN_BATCH_SIZE):
    """Return, per sample, the index of the highest-scoring row of `centers`.

    Samples are processed in batches of `batch_size`. For `measure == 'cosine'`
    the score is `utility_functions.cos_sim`; otherwise `utility_functions.euc_sim`
    is used and the batch scores are min-max rescaled to [0, 0.9] with the
    batch-wide min and max before the argmax.
    (presumably both helpers return an (n_samples, n_centers) matrix — TODO confirm)
    """
    nearest = []
    for start in tqdm(range(0, len(samples), batch_size)):
        batch = torch.Tensor(samples[start:start + batch_size])
        if measure == 'cosine':
            batch_sim = utility_functions.cos_sim(batch, torch.Tensor(centers))
        else:
            batch_sim = utility_functions.euc_sim(batch, torch.Tensor(centers))
            v_min, v_max = batch_sim.min(), batch_sim.max()
            new_min, new_max = 0, 0.9
            batch_sim = (batch_sim - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
        # max(1) -> (values, indices); keep the indices along the centre axis.
        nearest.extend(batch_sim.max(1)[1].numpy())
    return np.array(nearest)


def _grouped_accuracy(predictions, n_classes, per_class=TEST_SAMPLES_PER_CLASS):
    """Fraction of predictions equal to the labels 0..n_classes-1, each repeated `per_class` times."""
    expected = np.array([[i] * per_class for i in range(n_classes)]).flatten()
    return np.sum(np.array(predictions) == expected) / (n_classes * per_class)


def _route_through_submodels(nearest_classes, parts, scenario_dir, n_clusters, n_samples):
    """Predict each sample with the submodel of the cluster owning its nearest class.

    `parts` maps a cluster id to the class ids it contains. For sample i the
    cluster is the one containing `nearest_classes[i]`, and the prediction is
    `parts[cluster][argmax]`, where `argmax` is read from that cluster's
    '<idx>_test_predicted_argmax.npz' file in `scenario_dir`.

    Raises KeyError if a nearest class is not contained in any part.
    """
    # One pass over the parts instead of scanning every part for every sample.
    class_to_cluster = {}
    for cluster_id in parts:
        for class_id in parts[cluster_id]:
            class_to_cluster.setdefault(int(class_id), cluster_id)
    clusters = [class_to_cluster[int(class_id)] for class_id in nearest_classes]

    argmax_softmax = {}
    for idx in range(n_clusters):
        with np.load(join(scenario_dir, str(idx) + '_test_predicted_argmax.npz')) as archive:
            argmax_softmax[idx] = archive['res']

    return [parts[clusters[i]][argmax_softmax[clusters[i]][i]] for i in range(n_samples)]


def _report_predictions(label, predictions, true_labels, iteration, n_classes, n_clusters, column_prefix=None):
    """Log accuracy and macro-averaged metrics; optionally persist them.

    When `column_prefix` is given, the columns `<prefix>_end_timestamp`,
    `<prefix>_recall`, `<prefix>_precision` and `<prefix>_fscore` of the
    matching results row(s) are updated. Returns the classification report dict.
    """
    predictions = np.array(predictions)
    utility_functions.pprint((label, _grouped_accuracy(predictions, n_classes)), dataset_name)
    report = metrics.classification_report(true_labels, predictions, output_dict=True, zero_division=0)
    macro = report['macro avg']
    # NOTE(review): every other pprint call passes `dataset_name` as the second
    # argument; this one passes the config sub-dict. Kept as-is because pprint's
    # contract is not visible here — confirm whether `dataset_name` was intended.
    utility_functions.pprint((macro), config[dataset_name])

    if column_prefix is not None:
        _execute_and_commit(
            RESULTS_DB_PATH,
            "UPDATE results set {p}_end_timestamp = ?, {p}_recall = ?, {p}_precision = ?, {p}_fscore = ? "
            "where iteration = ? and dataset_name = ? and n_classes = ? and n_clusters = ?".format(p=column_prefix),
            (datetime.now(), macro['recall'], macro['precision'], macro['f1-score'],
             iteration, dataset_name, n_classes, n_clusters),
        )
    return report


# `iteration` (not `iter`) so the builtin iter() is not shadowed in the kernel.
for iteration in range(N_ITERATIONS):
    for n_classes, n_clusters in SCENARIO_GRID:
        # ------------------------------ ISM ------------------------------
        method = "ISM"
        data_scenario_path, model_scenario_path, super_scenario_path = _scenario_paths(method, n_classes, n_clusters)

        # Scenario directories are shared across iterations, so start clean.
        delete_directory(data_scenario_path)
        delete_directory(model_scenario_path)

        # Resume support: skip the whole scenario (ISM and clustclass) when an
        # ISM end timestamp has already been recorded for it.
        # (ism_end_timestamp is presumably written by evaluate_ism — not visible here)
        if _ism_already_done(RESULTS_DB_PATH, iteration, n_classes, n_clusters):
            print("already done", method, iteration, n_classes, n_clusters)
            continue

        _execute_and_commit(
            RESULTS_DB_PATH,
            "INSERT INTO results(iteration, n_classes, n_clusters, dataset_name, ism_start_timestamp) "
            "VALUES (?, ?, ?, ?, ?)",
            (iteration, n_classes, n_clusters, dataset_name, datetime.now()),
        )

        _log_banner(method, n_classes, n_clusters)

        # Create necessary directories
        os.makedirs(super_scenario_path, exist_ok=True)
        os.makedirs(data_scenario_path, exist_ok=True)
        os.makedirs(model_scenario_path, exist_ok=True)

        # Prepare data, distribute classes, and train submodels
        splits, parts = _train_scenario(method, n_classes, n_clusters)
        (trainx, trainy, trainl, traincenterx, traincentery, traincenterl,
         testx, testy, testl, valx, valy, vall) = splits

        # Perform ISM post-processing and evaluation on the test set
        test_softmax_classes = ism_post_process(m, n_classes, parts, testx, 'test')
        evaluate_ism(iteration, m, n_classes, n_clusters, testl, test_softmax_classes)

        # With a single cluster there is nothing to route between.
        if n_clusters == 1:
            continue

        # Nearest class centre for every test sample (the "KNN" baseline).
        sim = _nearest_center_classes(testx, traincenterx, m)
        _report_predictions("KNN : ", sim, testl, iteration, n_classes, n_clusters, column_prefix="knn")

        # Route each sample to the submodel owning its nearest class.
        res = _route_through_submodels(sim, parts, data_scenario_path, n_clusters,
                                       n_classes * TEST_SAMPLES_PER_CLASS)
        _report_predictions("intellig ism: " , res, testl, iteration, n_classes, n_clusters,
                            column_prefix="intellig_ism")

        # --------------------------- clustclass ---------------------------
        method = "clustclass"
        data_scenario_path, model_scenario_path, super_scenario_path = _scenario_paths(method, n_classes, n_clusters)

        delete_directory(data_scenario_path)
        delete_directory(model_scenario_path)

        _log_banner(method, n_classes, n_clusters, iteration=iteration)

        # Create necessary directories
        os.makedirs(super_scenario_path, exist_ok=True)
        os.makedirs(data_scenario_path, exist_ok=True)
        os.makedirs(model_scenario_path, exist_ok=True)

        # Prepare data, distribute classes, and train submodels
        splits, parts = _train_scenario(method, n_classes, n_clusters)
        (trainx, trainy, trainl, traincenterx, traincentery, traincenterl,
         testx, testy, testl, valx, valy, vall) = splits

        # The validation outputs are not used below; the call and the names are
        # kept because the helper may have side effects and later cells may read
        # these variables — TODO confirm.
        (val_sim_classes, val_sim_values, val_sim_softmax,
         val_softmax_values, val_softmax_sims, val_softmax_classes) = clustclass_post_process(
            m, n_classes, parts, traincenterx, valx, 'val')

        (test_sim_classes, test_sim_values, test_sim_softmax,
         test_softmax_values, test_softmax_sims, test_softmax_classes) = clustclass_post_process(
            m, n_classes, parts, traincenterx, testx, 'test')

        _report_predictions("Clustered clustclass : ", test_softmax_classes, testl,
                            iteration, n_classes, n_clusters, column_prefix="maxmax_clustclass")

        # Nearest class centre for every test sample; logged only, not stored.
        sim = _nearest_center_classes(testx, traincenterx, m)
        _report_predictions("KNN clustclass : ", sim, testl, iteration, n_classes, n_clusters)

        res = _route_through_submodels(sim, parts, data_scenario_path, n_clusters,
                                       n_classes * TEST_SAMPLES_PER_CLASS)
        _report_predictions("intellig clustclass: " , res, testl, iteration, n_classes, n_clusters,
                            column_prefix="final_clustclass")

In [295]:
# Sanity check: nearest-class-centre accuracy on the *training* split.
#
# NOTE: this cell depends on kernel state left by the experiment cell:
# `trainx`, `traincenterx` and `n_classes` hold the values of the LAST scenario
# that was run, and `m` is the configured distance measure. Run that cell first.
TRAIN_SAMPLES_PER_CLASS = 20  # expected labels are 0..n_classes-1, each repeated 20 times
batch_size = 5000

sim = []
for start in tqdm(range(0, len(trainx), batch_size)):
    batch_x = torch.Tensor(trainx[start:start + batch_size])
    if m == 'cosine':
        batch_sim = utility_functions.cos_sim(batch_x, torch.Tensor(traincenterx))
    else:
        batch_sim = utility_functions.euc_sim(batch_x, torch.Tensor(traincenterx))
        # Min-max rescale the batch scores to [0, 0.9] using the batch-wide extremes.
        v_min, v_max = batch_sim.min(), batch_sim.max()
        new_min, new_max = 0, 0.9
        batch_sim = (batch_sim - v_min) / (v_max - v_min) * (new_max - new_min) + new_min

    # max(1) -> (values, indices); the indices are the predicted class ids.
    sim.extend(batch_sim.max(1)[1].numpy())

expected_labels = np.array([[i] * TRAIN_SAMPLES_PER_CLASS for i in range(n_classes)]).flatten()
print(("KNN : ", np.sum(np.array(sim) == expected_labels) / (n_classes * TRAIN_SAMPLES_PER_CLASS))) # knn acc


  0%|          | 0/80 [00:00<?, ?it/s]

100%|██████████| 80/80 [00:47<00:00,  1.70it/s]

('KNN : ', 0.9379075)



