In [None]:
# -------------------------------------------------------------------------------------
# 1. LIBRARY
# -------------------------------------------------------------------------------------

# https://numpy.org/devdocs/user/absolute_beginners.html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
# https://matplotlib.org/stable/gallery/mplot3d/scatter3d.html
# from mpl_toolkits.mplot3d import Axes3D
import os
num_cores = os.cpu_count()
import glob
# File processing
import sys
# import scipy.spatial.distance as sc

# KDTree
# from sklearn.neighbors import KDTree
from scipy.spatial import cKDTree

# Distance Calculation
import math, statistics
from scipy.spatial import distance

# from chebyshev import Chebyshev
import numpy.polynomial.chebyshev 

# Data Transformation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, normalize
from sklearn.decomposition import PCA

# Baselines (Undersampling)
# https://imbalanced-learn.org/
from imblearn.under_sampling import AllKNN
from imblearn.under_sampling import ClusterCentroids 
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import NearMiss 
from imblearn.under_sampling import NeighbourhoodCleaningRule 
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.under_sampling import TomekLinks 
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import StratifiedKFold # train_test_split, GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold # train_test_split, GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.model_selection import RepeatedKFold # train_test_split, GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.model_selection import KFold # train_test_split, GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Performance
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.metrics import make_scorer, cohen_kappa_score

# Complexity
import time
import tracemalloc

# Multiprocessing
import multiprocessing
from joblib import Parallel, delayed
from pathlib import Path

# Warning
import warnings

# -------------------------------------------------------------------------------------
# 2. FUNCTIONS
# -------------------------------------------------------------------------------------

def custom_distance(x, y):
    """Return the cross-class Euclidean distance between two labelled rows.

    The final element of each vector is treated as its class label and all
    preceding elements as its features.

    Parameters
    ----------
    x, y : array-like supporting slicing and subtraction (e.g. numpy arrays)
        Feature values followed by the label in the last position.

    Returns
    -------
    float
        ``np.inf`` when both rows carry the same label, otherwise the L2 norm
        of the difference between the feature parts.
    """
    labels_differ = x[-1] != y[-1]
    if labels_differ:
        return np.linalg.norm(x[:-1] - y[:-1])
    # Same class: make the pair "infinitely far" so it is never selected.
    return np.inf

def find_minimum_distance(d, i):
    """Find the nearest sample to sample ``i`` that carries a different label.

    Works on the module-level ``tempdata`` (feature rows, with a ``shape``
    attribute) and ``templabel`` (one label per row); both must be assigned
    before this function is called.

    Metric reference:
    https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    Parameters
    ----------
    d : int
        Metric selector: 0 = Cityblock (Minkowski p=1), 1 = Chebyshev,
        2 = Correlation, 3 = Cosine, 4 = Euclidean (Minkowski p=2),
        5 = Minkowski p=3.
    i : int
        Row index of the query sample.

    Returns
    -------
    int
        Row index of the closest differently-labelled sample, or -1 when
        ``d`` is not a known selector or no candidate is closer than the
        initial bound of 999999999999999.0.
    """
    # Selector -> (scipy distance function, extra positional arguments).
    # The extra argument is the Minkowski order where one is needed.
    metric_table = {
        0: (distance.minkowski, (1,)),    # Cityblock
        1: (distance.chebyshev, ()),      # Chebyshev
        2: (distance.correlation, ()),    # Correlation
        3: (distance.cosine, ()),         # Cosine
        4: (distance.minkowski, (2,)),    # Euclidean
        5: (distance.minkowski, (3,)),    # Minkowski p=3
    }
    closest_idx = -1
    if d not in metric_table:
        return closest_idx
    metric, extra_args = metric_table[d]

    closest_dist = 999999999999999.0
    for j in range(tempdata.shape[0]):
        if templabel[i] != templabel[j]:
            candidate = metric(tempdata[i], tempdata[j], *extra_args)
            # Strict comparison: on ties the earliest row index wins.
            if candidate < closest_dist:
                closest_dist, closest_idx = candidate, j
    return closest_idx

def mcov(i):
    """Locate the nearest differently-labelled neighbour of sample ``i``.

    Uses Minkowski distance of order 2 (Euclidean) over the module-level
    ``tempdata`` / ``templabel`` arrays, which must be assigned beforehand.

    Metric reference:
    https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    Parameters
    ----------
    i : int
        Row index of the query sample.

    Returns
    -------
    tuple
        ``(i, neighbour_index, templabel[i], templabel[neighbour_index],
        distance)``. When no row has a different label, ``neighbour_index``
        is -1, the fourth item is therefore ``templabel[-1]``, and the
        distance stays at the initial bound of 999999999999999.0.
    """
    neighbour = -1
    shortest = 999999999999999.0
    n_rows = tempdata.shape[0]
    for j in range(n_rows):
        if templabel[i] != templabel[j]:
            gap = distance.minkowski(tempdata[i], tempdata[j], 2)
            # Strict comparison keeps the earliest row on ties.
            if gap < shortest:
                shortest, neighbour = gap, j
    return i, neighbour, templabel[i], templabel[neighbour], shortest

def plot_data_distribution_comparison(X1, y1, n1, X2, y2, n2, dataset):
    """Scatter-plot the first two features of ``X2`` per class and save a PDF.

    Only ``X2``/``y2`` are drawn; ``X1`` and ``y1`` are not read by this
    function, and ``n1`` is used only in the output file name.

    Parameters
    ----------
    X1, y1 : unused here
    n1 : str
        Name of the first data set (file-name prefix).
    X2 : 2-D array
        Samples to plot; columns 0 and 1 become the x and y coordinates.
    y2 : array
        Class label per row of ``X2``; each label is also used as an index
        into the module-level ``colors`` list.
    n2 : str
        Name of the plotted data set (shown in the title and file name).
    dataset : int
        Zero-based data set index; displayed one-based and zero-padded to
        two digits.

    Side effects
    ------------
    Writes ``<n1>-<n2>-data-distribution-<NN>.pdf`` to the working
    directory, shows the figure, then closes it.
    """
    dataset_tag = str(dataset+1).zfill(2)

    plt.figure(figsize=(8, 6))
    for label in np.unique(y2):
        in_class = (y2 == label)
        plt.scatter(
            X2[in_class, 0],
            X2[in_class, 1],
            color=colors[label],
            label=f'Class {label}',
            s=1,
            alpha=0.7,
        )

    # Axes are fixed to the unit square.
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.title("Data Distribution (D"+dataset_tag+"): "+n2, fontsize=16, fontweight='bold')
    plt.legend(loc='upper right')
    plt.savefig(
        n1+"-"+n2+"-data-distribution-"+dataset_tag+".pdf",
        format="pdf",
        dpi=None,
        facecolor="w",
        edgecolor="w",
        orientation="portrait",
        transparent=True,
        bbox_inches="tight",
        pad_inches=0.02,
        metadata=None,
    )
    plt.show()
    plt.close()
    

def plot_class_distribution_comparison(X1, y1, n1, X2, y2, n2, dataset):
    """Draw side-by-side bars of per-class proportions for two label sets.

    Only the label arrays are used; ``X1`` and ``X2`` are not read.

    Parameters
    ----------
    X1, X2 : unused here
    y1, y2 : array
        Class labels of the first and second data set.
    n1, n2 : str
        Display names used in the legend, title and output file name.
    dataset : int
        Zero-based data set index; displayed one-based and zero-padded to
        two digits.

    Side effects
    ------------
    Writes ``<n1>-<n2>-class-distribution-<NN>.pdf`` to the working
    directory, shows the figure, then closes it.
    """
    dataset_tag = str(dataset+1).zfill(2)

    # Every class that occurs in either label set gets one pair of bars.
    classes = np.union1d(np.unique(y1), np.unique(y2))
    positions = np.arange(len(classes))
    bar_width = 0.35

    # Class counts normalised to proportions of each set
    first_share = np.array([np.sum(y1 == c) for c in classes]) / len(y1)
    second_share = np.array([np.sum(y2 == c) for c in classes]) / len(y2)

    plt.figure(figsize=(8, 6))

    first_bars = plt.bar(positions - bar_width/2, first_share, width=bar_width, label=n1)
    second_bars = plt.bar(positions + bar_width/2, second_share, width=bar_width, label=n2)

    # Print each proportion just above its bar
    for bars, shares in ((first_bars, first_share), (second_bars, second_share)):
        for bar, share in zip(bars, shares):
            plt.text(
                bar.get_x() + bar.get_width()/2,
                bar.get_height() + 0.01,
                f'{share:.2f}',
                ha='center',
                va='bottom',
                fontsize=9,
            )

    plt.xlabel("Class")
    plt.ylabel("Proportion")
    plt.title("Class Distribution (D"+dataset_tag+"): "+n1+" vs. "+n2+" ", fontsize=16, fontweight='bold')
    plt.xticks(positions, classes)
    plt.ylim(0, 1)
    plt.legend(loc='upper right')
    plt.savefig(
        n1+"-"+n2+"-class-distribution-"+dataset_tag+".pdf",
        format="pdf",
        dpi=None,
        facecolor="w",
        edgecolor="w",
        orientation="portrait",
        transparent=True,
        bbox_inches="tight",
        pad_inches=0.02,
        metadata=None,
    )
    plt.show()
    plt.close()
    
# -------------------------------------------------------------------------------------
# 3. PARAMETERS
# -------------------------------------------------------------------------------------

# Switches (1 = on, 0 = off)
pca = 0                 # apply PCA to the features before normalization
dis_mod = 0             # build the DIS reduced training set
bdis_mod = 0            # build the BDIS (k-d tree) reduced training set
mcov_mod = 1            # synthesize the FMCOV / AMCOV / MCOV training sets
bmcov_mod = 1           # synthesize the k-d tree based BFMCOV / BAMCOV / BMCOV sets
training = 1
performance_save = 1    # NOTE: 1 also deletes every *.csv, *.txt and *.pdf in the working directory at start-up
figure_save = 1

# Values
start_set = 0           # index of the first data set to process
start_type = 0
max_type = 7            # number of training-set types (highest type index + 1);
                        # in the older 20-entry list below, 18 = MCOV and 19 = BMCOV
max_k = 5               # second dimension of the per-type performance arrays
kdratio = 0.02          # k-d tree query size as a fraction of the sample count
max_distance = 1
reduction_load = 0
output_dataset_save = 1 # write the reduced data sets to text files
encoder = 1             # label = 1, onehot = 2
pca_components = 0.99   # passed to PCA(n_components=...)
normalization = 1       # min-max scale the features into (0.01, 0.99)
cross_validation = 1    # 1 = keep the whole set for training, 0 = stratified train/test split
cross_validation_fold = 5
parallel = 1            # 1 = run the nearest-neighbour search through joblib
model = 'knn'           # knn, svm, nn

d = 4                   # distance selector for find_minimum_distance (4 = Euclidean)
train_ratio = 0.3       # train fraction when rows * columns < 100000
train_ratio_100k = 0.1  # train fraction when rows * columns < 500000
train_ratio_500k = 0.01 # train fraction for anything larger

# Plot colours; class label n is drawn with colors[n]
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    '#393b79', '#637939', '#8c6d31', '#843c39', '#7b4173',
    '#5254a3', '#9c9ede', '#cedb9c', '#e7ba52', '#e7969c',
    '#a55194', '#de9ed6', '#f7b6d2', '#c7c7c7', '#dbdb8d',
    '#9edae5', '#ffbb78', '#ff9896', '#c5b0d5', '#c49c94',
    '#f2b701', '#f28e2b', '#e15759', '#76b7b2', '#59a14f',
    '#edc948', '#af7aa1', '#ff9da7', '#9c755f', '#bab0ac',
    '#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f',
    '#edc948', '#b07aa1', '#ff9da7', '#9c755f', '#bab0ac',
    '#6b6ecf', '#b5cf6b',
]

# Labels
# Subscript = _, Superscript = ^, use {} for multiple letters
label_title = (
    'Accuracy',
    'Precision',
    'Recall',
    'F1-Score',
    'AUC (One-vs-Rest)',
    'AUC (One-vs-One)',
    'Kappa',
)
# NOTE(review): 'Crowndsourced' looks like a typo for 'Crowdsourced'; the
# string is left unchanged because it may appear in generated output.
label_dataset = (
    'Synthetic',
    'Banknote',
    'Car',
    'Crowndsourced Mapping',
    'Letter Recognition',
    'Optical Digits Recognition',
    'Pen Digits Recognition',
    'Statlog Landsat',
    'Tic-Tac-Toe',
    'Tezpur Android Malware',
    'Wave Form',
)
# Older 20-entry method list, kept for reference:
# label_traintype = ('ORG','CNN','ENN','RENN','All KNN','TL','OSS','NCL','NM$_1$','NM$_2$','NM$_3$','IHT','CC','RUS','DIS$_{1}$','BDIS$_{1}$','IBP$_{1}$','BIBP$_{1}$','MCOV','BMCOV')
label_traintype = (
    'ORG',
    'PFMCOV',
    'BFMCOV',
    'PAMCOV',
    'BAMCOV',
    'PMCOV',
    'BMCOV',
)

# -------------------------------------------------------------------------------------
# 4. LOADING DATA SETS
# -------------------------------------------------------------------------------------

# Reset performance files (Cross Validation)
# WARNING: this removes every *.csv, *.txt and *.pdf file found in the
# current working directory, not only files written by this script.
if performance_save == 1:
    for stale_pattern in ('*.csv', '*.txt', '*.pdf'):
        for stale_file in glob.glob(stale_pattern):
            os.remove(stale_file)

# Define the location of the dataset
# https://archive.ics.uci.edu/ml/datasets.php
# http://www.timeseriesclassification.com/dataset.php

# DATA SETS
pca = 0

# One row per data set:
#   (source URL, header row passed to pandas, column separator,
#    index of the label column, rows sampled per class; 0 = use all rows)
# The trailing "<n>A<m>C" tags are the original author's annotations; several
# look copy-pasted between rows - TODO confirm before relying on them.
dataset_table = [
    ("https://piyabute.com/data/research/syntactic-wave.csv", None, "\t", -1, 0),               # 6A4C
    ("https://piyabute.com/data/research/banknote_authentication.csv", None, ",", -1, 0),       # 6A4C
    ("https://piyabute.com/data/research/car.data.csv", None, ",", -1, 0),                      # 6A4C
    ("https://piyabute.com/data/research/crowdsourced_mapping.custom.csv", 1, ",", 0, 0),       # 6A4C
    ("https://piyabute.com/data/research/letter-recognition.data.csv", None, ",", 0, 0),        # 8A5C
    ("https://piyabute.com/data/research/optdigits.custom.csv", None, ",", -1, 0),              # 64A10C
    ("https://piyabute.com/data/research/pendigits.custom.csv", None, ",", -1, 0),              # 16A2C
    ("https://piyabute.com/data/research/statlog_landsat.custom.csv", None, ",", -1, 0),        # 36A6C
    ("https://piyabute.com/data/research/tic-tac-toe.data.csv", None, ",", -1, 0),              # 6A4C
    ("https://piyabute.com/data/research/TUANDROMD.csv", 0, ",", -1, 0),                        # 36A6C
    ("https://piyabute.com/data/research/waveform-+noise.v2.data.csv", None, ",", -1, 0),       # 6A4C
]

# Split the table into the parallel lists indexed by data set number.
files, file_header, file_separator, file_label, file_big = (
    list(column) for column in zip(*dataset_table)
)

max_set = len(files)

print("Done")

In [None]:
# -------------------------------------------------------------------------------------
# 5. LOOP DATASETS
# -------------------------------------------------------------------------------------

kd_query_miss = np.zeros((max_set),dtype = np.intc)

for dataset in range(start_set,max_set):

    # -------------------------------------------------------------------------------------
    # Initialize performance variables
    # -------------------------------------------------------------------------------------
    # perf_confusion_matrix=np.zeros(10,4,10,10)
    # perf_confusion_matrix = np.zeros((max_set,max_type,max_k),dtype=np.float64)
    reduction = np.zeros((max_type),dtype = np.intc)
    perf_fit_time = np.zeros((max_type,max_k),dtype = np.float64)
    perf_score_time = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_accuracy = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_precision = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_recall = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_f1 = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_aucovr = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_aucovo = np.zeros((max_type,max_k),dtype = np.float64)
    perf_train_kappa = np.zeros((max_type,max_k),dtype = np.float64)
    time_training = np.zeros((max_type,max_k),dtype = np.float64)
    time_reduction = np.zeros((max_type),dtype = np.float64)
    mem_training_current = np.zeros((max_type,max_k),dtype = np.float64)
    mem_training_peak = np.zeros((max_type,max_k),dtype = np.float64)
    k_array = np.zeros((max_k),dtype = np.intc)
    d_array = np.zeros((max_set),dtype = np.intc)
    l_array = np.zeros((max_type),dtype = np.intc)
    l_reduction = np.zeros((max_type),dtype = np.float64)
    
    # -------------------------------------------------------------------------------------
    # Load data file to DataFrame
    # -------------------------------------------------------------------------------------

    # Read the current data set into a DataFrame, choosing the reader from
    # the file-name suffix: gzip / zip compressed CSV, Excel, or plain CSV.
    inputfile = files[dataset]
    print("Dataset:",dataset)
    print("Data Source:",inputfile)
    if inputfile.endswith("gz"):
        df = pd.read_csv(inputfile, header = file_header[dataset], sep = file_separator[dataset], compression="gzip")
    elif inputfile.endswith(".zip"):
        # The previous test compared the last 3 characters with the
        # 4-character string ".zip" and therefore could never match.
        df = pd.read_csv(inputfile, header = file_header[dataset], sep = file_separator[dataset], compression="zip")
    elif inputfile.endswith((".xls", "xlsx")):
        # Excel files carry no separator, so only the header row is passed.
        df = pd.read_excel(inputfile, header = file_header[dataset])
    else:
        df = pd.read_csv(inputfile, header = file_header[dataset], sep = file_separator[dataset])

    # -------------------------------------------------------------------------------------
    # Pre-process DataFrame
    # -------------------------------------------------------------------------------------
        
    # Drop specific rows
    if (inputfile == "https://piyabute.com/data/research/TUANDROMD.txt"): df.drop(df.index[2533], inplace=True)
    if (inputfile == "https://piyabute.com/data/research/yeast.data"): df.drop(labels=None, axis=0, inplace=True)
   
    # Stratified sampling
    if (file_big[dataset] > 0): 
        # Generate column index
        df.columns = df.columns.map(str)
        df = df.groupby(df.columns[file_label[dataset]], group_keys=False).apply(lambda x: x.sample(file_big[dataset]))
    
    # Categorical to Numerical
    
    if (encoder == 1):
        # Initialize the LabelEncoder
        labelencoder = LabelEncoder()
        # Loop through each column in the DataFrame and apply label encoding
        for column in df.columns:
            df[column] = labelencoder.fit_transform(df[column])
    elif (encoder == 2): # Incomplete!
        # Convert the label column into one hot encoding
        one_hot = pd.get_dummies(df.iloc[:, file_label[dataset]])
        # Drop the label column
        last_column_name = df.columns[file_label[dataset]]
        df.drop(columns=[last_column_name], inplace=True)
        df = df.join(one_hot)
        
    # Fill missing values (slow for big data) with the mode (most frequent value) of the column
    for column in df.columns:
        df.fillna({column: df[column].mode()[0]}, inplace=True)

    # Convert dataframe to numpy
    raw = df.to_numpy()

    # Extract the label attribute
    trainsetlabel = raw[:,file_label[dataset]]
    trainsetlabel = trainsetlabel.astype(int)

    # Remove the label attribute
    trainsetdata = np.delete(raw,file_label[dataset],1)
    
    # Calculate PCA
    if (pca == 1):
        postpca = PCA(n_components=pca_components)
        traindata = postpca.fit_transform(trainsetdata)
        trainlabel = trainsetlabel
    else:
        traindata = trainsetdata
        trainlabel = trainsetlabel
    
    # Normalize normal attributes using MinMaxScaler
    if (normalization == 1):
        scaler = MinMaxScaler(feature_range=(0.01, 0.99))
        scaler.fit(traindata)
        traindata = scaler.transform(traindata)
        trainlabel = trainlabel - np.amin(trainlabel)
 
    # -------------------------------------------------------------------------------------
    # Prepare initial training set and test set
    # -------------------------------------------------------------------------------------

    # Construct the training set and test set
    if (cross_validation == 1):
        testdata = []
        testlabel = []
        trainsize = traindata.shape[0]
        traindimension = traindata.shape[1]
        reduction[0] = trainsize
    else:
        if ((traindata.shape[0]*traindata.shape[1])<100000):
            traindata, testdata, trainlabel, testlabel = train_test_split(traindata, trainlabel, train_size=train_ratio, stratify=trainlabel, random_state=42)
        elif ((traindata.shape[0]*traindata.shape[1])<500000):
            traindata, testdata, trainlabel, testlabel = train_test_split(traindata, trainlabel, train_size=train_ratio_100k, stratify=trainlabel, random_state=42)
        else:
            traindata, testdata, trainlabel, testlabel = train_test_split(traindata, trainlabel, train_size=train_ratio_500k, stratify=trainlabel, random_state=42)
        trainsize = traindata.shape[0]
        traindimension = traindata.shape[1]
        testsize = testdata.shape[0]
        testdimension = testdata.shape[1]
        reduction[0] = trainsize     

    print("Preprocessing: Attribute =",traindimension,"(PCA)" if pca==1 else "(Non-PCA)","(Norm)" if normalization==1 else "(Non-Norm)")
    print("Preprocessing: Sample    =",trainsize)
    print("Preprocessing: Class     =", len(np.unique(trainlabel)), np.unique(trainlabel))
    reduction_class = np.zeros((max_type,len(np.unique(trainlabel))),dtype = np.float64)

    # Print class balance
    if (cross_validation == 1):
        unique_train, counts_train = np.unique(trainlabel, return_counts=True)
        print("Preprocessing: Train     =",dict(zip(unique_train, counts_train)))
    else:
        unique_test, counts_test = np.unique(testlabel, return_counts=True)
        print("Preprocessing: Test      =",dict(zip(unique_test, counts_test)))

    # -------------------------------------------------------------------------------------
    # Generate DISTANCE-BASED INSTANCE SELECTION (DIS) training sets 
    # -------------------------------------------------------------------------------------

    # DIS: for every sample find its nearest differently-labelled sample
    # (find_minimum_distance) and keep the unique set of those neighbours as
    # the reduced training set.
    if (dis_mod == 1):
        tempset = raw
        tempdata = traindata
        templabel = trainlabel
        scatter_x = []
        scatter_y = []
    
        # Start clock
        time_start = time.perf_counter()
    
        # Parallel or serialized reduction
        if (parallel == 1):
            if __name__ == "__main__":
                d = 4 # 4 = Euclidean
                ProcessList = Parallel(n_jobs=num_cores)(delayed(find_minimum_distance)(d,i) for i in range(tempdata.shape[0]))
        else:
            ProcessList = [0]*tempdata.shape[0]
            for i in range(0,tempdata.shape[0]):
                # find_minimum_distance takes (metric selector, row index);
                # the selector was missing here, which raised a TypeError.
                ProcessList[i] = find_minimum_distance(d,i)
                print(i,templabel[i],ProcessList[i],templabel[ProcessList[i]])
    
        # Remove duplicates
        dis_vec = np.unique(ProcessList)
    
        # Stop clock
        time_stop = time.perf_counter()
    
        # Save execution time
        # NOTE(review): slot 14 is the DIS position in the older 20-entry
        # method list; time_reduction has only max_type entries, so with
        # max_type = 7 this index is out of range - confirm the intended slot.
        time_reduction[14] = time_stop - time_start
    
        # Construct reduced data set
        disdata = tempdata[dis_vec]
        dislabel = templabel[dis_vec]
        
        # Level 1 only: the selected rows form the reduced set, the rest
        # remain in tempdata / templabel
        disdata1 = disdata
        dislabel1 = dislabel
        tempsetbefore = tempdata.shape[0]
        tempdata1 = np.delete(tempdata,dis_vec,0)
        templabel1 = np.delete(templabel,dis_vec,0)
        tempdata = tempdata1
        templabel = templabel1
        tempsetafter = tempdata1.shape[0]
    
        print("DIS Reduction: Time = ","{:0.2f}".format(time_stop - time_start)," seconds / Size = ",disdata.shape[0]," / Original = ",tempsetbefore," / Remain = ", tempsetafter, sep = "")
    
        # Save output data sets (appended: features followed by the label)
        if (output_dataset_save == 1):
            if (pca == 1):
                outputfile = Path(inputfile).stem+"_pca_dis.txt"
            else:
                outputfile = Path(inputfile).stem+"_dis.txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, np.c_[disdata1, np.array(dislabel1)], delimiter=",", fmt="%1.4f")            
                print("Saving reduced dataset (DIS) to "+outputfile, sep=" ")

    # -------------------------------------------------------------------------------------
    # Generate BOOSTING DISTANCED-BASED INSTANCE SELECTION (BDIS) training sets 
    # -------------------------------------------------------------------------------------

    if (bdis_mod == 1):
        tempset = raw
        tempdata = traindata
        templabel = trainlabel
        scatter_x = []
        scatter_y = []
    
        # Initialize arrays to store the nearest instance and its distance for each instance
        bdis_vec = np.zeros(len(tempdata), dtype=np.int32)
        bdis_dis = np.ones(len(tempdata)) * np.inf
    
        # Start clock
        time_start = time.perf_counter()
    
        # Create a cKDTree object from the feature data
        tree = cKDTree(tempdata)

        # Define the k-d query size
        kdsize = max(len(np.unique(templabel)),int(len(templabel)*kdratio))
        
        # Loop through each instance in the feature data
        for i in range(len(tempdata)):
            # Find the distance and index of the nearest neighbor
            min_dis, min_vec = tree.query(tempdata[i], k=kdsize)
    
            # Check if the nearest neighbor belongs to a different class
            for j in range(1, len(min_vec)):
                if (templabel[i] != templabel[min_vec[j]]):
                    bdis_vec[i] = min_vec[j]
                    bdis_dis[i] = min_dis[j]
                    break
    
        # Remove duplicates
        bdis_vec = np.unique(bdis_vec)
    
        # Stop clock
        time_stop = time.perf_counter()
    
        # Save execution time
        time_reduction[15] = time_stop - time_start
    
        # Construct reduced data set
        bdisdata = tempdata[bdis_vec]
        bdislabel = templabel[bdis_vec]
    
        # Level 1 only
        bdisdata1 = bdisdata
        bdislabel1 = bdislabel
        tempsetbefore = tempdata.shape[0]
        tempdata1 = np.delete(tempdata,bdis_vec,0)
        templabel1 = np.delete(templabel,bdis_vec,0)
        tempdata = tempdata1
        templabel = templabel1
        tempsetafter = tempdata1.shape[0]
    
        print("BDIS Reduction (k=",kdsize,"): Time = ","{:0.2f}".format(time_stop - time_start)," seconds / Size = ",bdisdata.shape[0]," / Original = ",tempsetbefore," / Remain = ", tempsetafter, sep = "")
    
        # Save output data sets
        if (output_dataset_save == 1):
            if (pca == 1):
                outputfile = Path(inputfile).stem+"_pca_bdis.txt"
            else:
                outputfile = Path(inputfile).stem+"_bdis.txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, np.c_[bdisdata1, np.array(bdislabel1)], delimiter=",", fmt="%1.4f")            
                print("Saving reduced dataset (BDIS) to "+outputfile, sep=" ")

    # -------------------------------------------------------------------------------------
    # Generate MULTI-CLASS CONTOUR PRESERVING CLASSIFICATION (MCOV) training sets
    # -------------------------------------------------------------------------------------

    if (mcov_mod == 1):
        tempset = raw
        tempdata = traindata
        templabel = trainlabel
        scatter_x = []
        scatter_y = []

        # Start clock FMCOV
        time_start = time.perf_counter()
    
        # Parallel or serialized reduction
        if (parallel == 1):
            if __name__ == "__main__":
                ProcessList = Parallel(n_jobs=num_cores)(delayed(mcov)(i) for i in range(tempdata.shape[0]))

        # Synthesize FMCOV
        fmcovkappa = 0.40
        fmcovdata = np.array([(1 - fmcovkappa) * tempdata[x[0]] + fmcovkappa * tempdata[x[1]] for x in ProcessList])       
        fmcovlabel = np.array([x[2] for x in ProcessList])

        # Remove duplicates
        fmcov = np.column_stack((fmcovdata,fmcovlabel))
        fmcov = np.unique(fmcov, axis=0)

        # Split fmcov back into fmcovdata and fmcovlabel
        fmcovdata = fmcov[:, :-1]
        fmcovlabel = fmcov[:, -1].astype(int)

        # Stop clock FMCOV
        time_stop = time.perf_counter()
        
        # Save execution time FMCOV
        time_reduction[1] = time_stop - time_start

        # Start clock AMCOV
        time_start = time.perf_counter()

        # Synthesize AMCOV
        amcovkappa = 0.40
        amcovdist = {}

        for x in ProcessList:
            key = (x[1])  # Create a tuple key
            if key not in amcovdist or x[4] < amcovdist[key]:
                amcovdist[key] = x[4]

        amcovdata, amcovlabel = zip(*[
            (
                amcovkappa * amcovdist[x[1]] / x[4] * tempdata[x[0]] +
                (1 - (amcovkappa * amcovdist[x[1]] / x[4])) * tempdata[x[1]],
                x[3]
            )
            for x in ProcessList if x[4] != 0
        ])
        
        # Convert them back to numpy arrays
        amcovdata = np.array(amcovdata)
        amcovlabel = np.array(amcovlabel)

        amcov = np.column_stack((amcovdata, amcovlabel))
        amcov = np.unique(amcov, axis=0)
        
        # Split amcov back into amcovdata and amcovlabel
        amcovdata = amcov[:, :-1]
        amcovlabel = amcov[:, -1].astype(int)
       
        # Stop clock AMCOV
        time_stop = time.perf_counter()
        
        # Save execution time AMCOV
        time_reduction[2] = time_stop - time_start

        # Save execution time MCOV
        time_reduction[3] = time_reduction[1] + time_reduction[2]
   
        mcovdata = np.concatenate((fmcovdata,amcovdata), axis=0)
        mcovlabel = np.concatenate((fmcovlabel,amcovlabel), axis=0)
        tempsetbefore = tempdata.shape[0]
        tempsetafter = mcovdata.shape[0]
    
        print("MCOV Synthesization: Time = ","{:0.2f}".format(time_stop - time_start)," seconds / Size = ",mcovdata.shape[0]," / Original = ",tempsetbefore," / Remain = ", tempsetafter, sep = "")
    
    # -------------------------------------------------------------------------------------
    # Generate BOOSTING MULTI-CLASS CONTOUR PRESERVING CLASSIFICATION (BMCOV) training sets
    # -------------------------------------------------------------------------------------

    # BMCOV: same synthesis as MCOV, but the nearest differently-labelled
    # neighbour is looked up among the kdsize nearest points returned by a
    # k-d tree instead of by exhaustive search. Samples whose kdsize nearest
    # points all share their label are counted in kd_query_miss and skipped.
    if (bmcov_mod == 1):

        tempset = raw
        tempdata = traindata
        templabel = trainlabel
        scatter_x = []
        scatter_y = []
    
        # Start clock BFMCOV
        time_start = time.perf_counter()
    
        # Create a cKDTree object from the feature data
        tree = cKDTree(tempdata)

        # Define the k-d query size: at least one neighbour per class,
        # otherwise kdratio of the sample count
        kdsize = max(len(np.unique(templabel)),int(len(templabel)*kdratio))
        
        # One entry per sample that has a differently-labelled neighbour:
        # [sample index, neighbour index, sample label, neighbour label, distance]
        ProcessList = []
        
        for i in range(len(tempdata)):
            # Find the distances and indices of the kdsize nearest neighbours
            min_dis, min_vec = tree.query(tempdata[i], k=kdsize)
    
            # Take the closest neighbour that belongs to a different class
            # (position 0 is skipped; it is normally the query point itself)
            hit = False
            for j in range(1, len(min_vec)):
                if (templabel[i] != templabel[min_vec[j]]):
                    ProcessList.append([np.int64(i),np.int64(min_vec[j]),templabel[i],templabel[min_vec[j]],min_dis[j]])
                    hit = True
                    break
            if not hit:
                kd_query_miss[dataset] += 1
    
        # Synthesize BFMCOV: move each sample a fraction bfmcovkappa of the
        # way towards its neighbour and keep the sample's own label
        bfmcovkappa = 0.40
        bfmcovdata = np.array([(1 - bfmcovkappa) * tempdata[x[0]] + bfmcovkappa * tempdata[x[1]] for x in ProcessList])
        bfmcovlabel = np.array([x[2] for x in ProcessList])

        # Remove duplicates
        bfmcov = np.column_stack((bfmcovdata,bfmcovlabel))
        bfmcov = np.unique(bfmcov, axis=0)

        # Split bfmcov back into bfmcovdata and bfmcovlabel
        bfmcovdata = bfmcov[:, :-1]
        bfmcovlabel = bfmcov[:, -1].astype(int)
        
        # Stop clock BFMCOV
        time_stop = time.perf_counter()
        
        # Save execution time BFMCOV
        time_reduction[4] = time_stop - time_start

        # Start clock BAMCOV
        time_start = time.perf_counter()

        # Synthesize BAMCOV
        bamcovkappa = 0.40
        bamcovdist = {}

        # Shortest distance at which each neighbour index was selected
        for x in ProcessList:
            key = (x[1])  # neighbour index
            if key not in bamcovdist or x[4] < bamcovdist[key]:
                bamcovdist[key] = x[4]

        # Create a dictionary to store phi(i) mappings (sample -> neighbour)
        phi_dict = {x[0]: x[1] for x in ProcessList}
        
        # Each point lies between the sample and its neighbour, weighted by
        # bamcovkappa * (shortest distance to that neighbour) / (this distance),
        # and carries the neighbour's label. Zero-distance pairs and mutual
        # nearest pairs are left out.
        bamcovdata, bamcovlabel = zip(*[
            (
                bamcovkappa * bamcovdist[x[1]] / x[4] * tempdata[x[0]] +
                (1 - (bamcovkappa * bamcovdist[x[1]] / x[4])) * tempdata[x[1]],
                x[3]
            )
            for x in ProcessList if x[4] != 0 and phi_dict.get(x[1], -1) != x[0]  # phi(phi(i)) != i
        ])
        
        # Convert them back to numpy arrays.
        # These must be the BAMCOV tuples built just above; the previous code
        # converted amcovdata / amcovlabel from the MCOV section instead,
        # which discarded the BAMCOV result (and failed when mcov_mod == 0).
        bamcovdata = np.array(bamcovdata)
        bamcovlabel = np.array(bamcovlabel)

        bamcov = np.column_stack((bamcovdata, bamcovlabel))
        bamcov = np.unique(bamcov, axis=0)
        
        # Split bamcov back into bamcovdata and bamcovlabel
        bamcovdata = bamcov[:, :-1]
        bamcovlabel = bamcov[:, -1].astype(int)
       
        # Stop clock BAMCOV
        time_stop = time.perf_counter()
        
        # Save execution time BAMCOV
        time_reduction[5] = time_stop - time_start

        # Save execution time BMCOV
        time_reduction[6] = time_reduction[4] + time_reduction[5]
   
        # BMCOV is the union of the BFMCOV and BAMCOV points
        bmcovdata = np.concatenate((bfmcovdata,bamcovdata), axis=0)
        bmcovlabel = np.concatenate((bfmcovlabel,bamcovlabel), axis=0)
        tempsetbefore = tempdata.shape[0]
        tempsetafter = bmcovdata.shape[0]
    
        # NOTE(review): the time printed here covers the BAMCOV phase only
        # (time_start was reset above); the combined time is time_reduction[6].
        print("BMCOV Synthesization: kd = ",kdsize," / Time = ","{:0.2f}".format(time_stop - time_start)," seconds / Size = ",bmcovdata.shape[0]," / Original = ",tempsetbefore," / Remain = ", tempsetafter, sep = "")
    
    # -------------------------------------------------------------------------------------
    # Generate plots and save output and meta
    # -------------------------------------------------------------------------------------
        
    if (mcov_mod == 1):
        # Compare the original training set against each MCOV variant: first
        # all data-distribution plots, then all class-distribution plots.
        _mcov_plots = (
            ("PMCOV", mcovdata, mcovlabel),
            ("PFMCOV", fmcovdata, fmcovlabel),
            ("PAMCOV", amcovdata, amcovlabel),
        )
        for _plot in (plot_data_distribution_comparison, plot_class_distribution_comparison):
            for _tag, _xs, _ys in _mcov_plots:
                _plot(traindata, trainlabel, "ORG", _xs, _ys, _tag, dataset)

        # Per-class sample counts of every variant
        unique_mcov, counts_mcov = np.unique(mcovlabel, return_counts=True)
        unique_fmcov, counts_fmcov = np.unique(fmcovlabel, return_counts=True)
        unique_amcov, counts_amcov = np.unique(amcovlabel, return_counts=True)

        # Append each reduced dataset (features + label column) to its own
        # per-dataset file; the file number is dataset+1, zero-padded to 2 digits.
        _dataset_id = str(dataset+1).zfill(2)
        for _name, _xs, _ys in (
            ("fmcov", fmcovdata, fmcovlabel),
            ("amcov", amcovdata, amcovlabel),
            ("mcov", mcovdata, mcovlabel),
        ):
            outputfile = "data-" + _name + "-" + _dataset_id + ".txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, np.c_[_xs, np.array(_ys)], delimiter=",", fmt="%1.4f")
                print("Saving reduced dataset (" + _name.upper() + ") to " + outputfile, sep=" ")

        # Append the class counts of each variant as one tab-separated row
        for _name, _counts in (
            ("fmcov", counts_fmcov),
            ("amcov", counts_amcov),
            ("mcov", counts_mcov),
        ):
            outputfile = "class-balance-" + _name + ".txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, _counts.reshape(1, _counts.shape[0]), delimiter="\t", fmt="%0.0f")
                print("Saving class balance (" + _name.upper() + ") to " + outputfile, sep=" ")
    
    if (bmcov_mod == 1):
        # Compare the original training set against each BMCOV variant: first
        # all data-distribution plots, then all class-distribution plots.
        _bmcov_plots = (
            ("BMCOV", bmcovdata, bmcovlabel),
            ("BFMCOV", bfmcovdata, bfmcovlabel),
            ("BAMCOV", bamcovdata, bamcovlabel),
        )
        for _plot in (plot_data_distribution_comparison, plot_class_distribution_comparison):
            for _tag, _xs, _ys in _bmcov_plots:
                _plot(traindata, trainlabel, "ORG", _xs, _ys, _tag, dataset)

        # Calculate output meta (per-class sample counts of every variant)
        unique_bmcov, counts_bmcov = np.unique(bmcovlabel, return_counts=True)
        unique_bfmcov, counts_bfmcov = np.unique(bfmcovlabel, return_counts=True)
        unique_bamcov, counts_bamcov = np.unique(bamcovlabel, return_counts=True)

        # Append each reduced dataset (features + label column) to its own
        # per-dataset file; the file number is dataset+1, zero-padded to 2 digits.
        _dataset_id = str(dataset+1).zfill(2)
        for _name, _xs, _ys in (
            ("bfmcov", bfmcovdata, bfmcovlabel),
            ("bamcov", bamcovdata, bamcovlabel),
            ("bmcov", bmcovdata, bmcovlabel),
        ):
            outputfile = "data-" + _name + "-" + _dataset_id + ".txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, np.c_[_xs, np.array(_ys)], delimiter=",", fmt="%1.4f")
                print("Saving reduced dataset (" + _name.upper() + ") to " + outputfile, sep=" ")

        # Append the class counts of each variant as one tab-separated row
        for _name, _counts in (
            ("bfmcov", counts_bfmcov),
            ("bamcov", counts_bamcov),
            ("bmcov", counts_bmcov),
        ):
            outputfile = "class-balance-" + _name + ".txt"
            with open(outputfile, "ab") as file:
                np.savetxt(file, _counts.reshape(1, _counts.shape[0]), delimiter="\t", fmt="%0.0f")
                print("Saving class balance (" + _name.upper() + ") to " + outputfile, sep=" ")
 
    # -------------------------------------------------------------------------------------
    # Generate final training sets
    # -------------------------------------------------------------------------------------

    # https://imbalanced-learn.org/
    for traintype in range(0,max_type):
        # Pick the (features, labels) pair evaluated in this iteration.  The
        # branches are mutually exclusive; for any other traintype value
        # xready/tready keep whatever they held before.
        if traintype == 0:    # Original
            xready, tready = traindata, trainlabel
            temp, reduction_class_org = np.unique(tready, return_counts=True)
        elif traintype == 1:  # FMCOV
            xready, tready = fmcovdata, fmcovlabel
        elif traintype == 2:  # BFMCOV
            xready, tready = bfmcovdata, bfmcovlabel
        elif traintype == 3:  # AMCOV
            xready, tready = amcovdata, amcovlabel
        elif traintype == 4:  # BAMCOV
            xready, tready = bamcovdata, bamcovlabel
        elif traintype == 5:  # MCOV
            xready, tready = mcovdata, mcovlabel
        elif traintype == 6:  # BMCOV
            xready, tready = bmcovdata, bmcovlabel

        # Class labels present in the selected set and their sample counts
        unique_tready, counts_tready = np.unique(tready, return_counts=True)

        # -------------------------------------------------------------------------------------
        # Generate BDIS and BIBP using OverSampling
        # -------------------------------------------------------------------------------------

        # Random oversampling so that every class reaches a minimum size.
        # Both paths perform the same steps and only differ in that minimum:
        # types 16/17 (DIS and IBP) use cross_validation_fold*max_k*4, every
        # other type uses cross_validation_fold (one sample per fold).
        time_start = time.perf_counter()

        unique_tready, counts_tready = np.unique(tready, return_counts=True)
        print("Oversampling (Pre) : Set=",dataset,"; Type=",traintype," ",label_traintype[traintype],"; sample=", len(tready)," ({:0.2f}".format(len(tready)/len(trainlabel)*100), "%) ", dict(zip(unique_tready, counts_tready)), sep = "");

        if (traintype == 16 or traintype ==17):
            _class_floor = cross_validation_fold*max_k*4
        else:
            _class_floor = cross_validation_fold

        # Target size per class: the current count, raised to the floor
        mydict = {
            _cls: np.maximum(_count, _class_floor)
            for _cls, _count in zip(unique_tready, counts_tready)
        }

        # mymodel_smote = SMOTE(n_jobs=-1,sampling_strategy=mydict)
        mymodel_ros = RandomOverSampler(sampling_strategy=mydict)
        xready, tready = mymodel_ros.fit_resample(xready, tready)
        unique_tready, counts_tready = np.unique(tready, return_counts=True)

        # The oversampling time is added to the reduction time of this type
        time_stop = time.perf_counter()
        time_reduction[traintype] = time_reduction[traintype] + time_stop - time_start

        print("Oversampling (Post): Set=",dataset,"; Type=",traintype," ",label_traintype[traintype],"; sample=", len(tready)," ({:0.2f}".format(len(tready)/len(trainlabel)*100), "%) ", dict(zip(unique_tready, counts_tready)), sep = "");
                   
        # Warn (but carry on) when the prepared set lost one or more of the
        # class labels present in the original training labels.
        _classes_kept = len(np.unique(tready))
        _classes_expected = len(np.unique(trainlabel))
        if _classes_kept < _classes_expected:
            print("Training: Cannot maintain all class labels!")
            # continue

        # Total size of the prepared training set
        reduction[traintype] = len(tready)

        # Fixing empty class: only the classes that are present get a count
        # written; entries of absent classes are left untouched.
        for _cls, _count in zip(unique_tready, counts_tready):
            reduction_class[traintype][_cls] = _count
    
        # -------------------------------------------------------------------------------------
        # Plot training sets
        # -------------------------------------------------------------------------------------

        if (figure_save == 1):
            # Scatter plot of the prepared training set, one colour per class,
            # saved as instance-<dataset+1>-<traintype+1>.pdf.  All axes are
            # fixed to the range [0, 1].
            if (traindimension == 2):
                scatter_x = xready[:,0]
                scatter_y = xready[:,1]
                scatter_x_min = 0
                scatter_x_max = 1
                scatter_y_min = 0
                scatter_y_max = 1

                plt.figure(figsize=(8, 6))
                for label in np.unique(tready):
                    mask = tready == label
                    plt.scatter(scatter_x[mask], scatter_y[mask],
                                color=colors[label], label=f'Class {label}', s=1, alpha=0.7)

                plt.xlabel('X')
                plt.ylabel('Y')
                plt.xlim([scatter_x_min,scatter_x_max])
                plt.ylim([scatter_y_min,scatter_y_max])
                plt.title(label_traintype[traintype],fontsize=16,fontweight='bold')
                plt.legend(loc='upper right')
                plt.savefig("instance-"+str(dataset+1).zfill(2)+"-"+str(traintype+1).zfill(2)+".pdf", format="pdf", dpi=None, facecolor="w", edgecolor="w", orientation="portrait", transparent=True, bbox_inches="tight", pad_inches=0.02, metadata=None)
                plt.show()
                plt.close()

            if (traindimension == 3):
                scatter_x = xready[:,0]
                scatter_y = xready[:,1]
                scatter_z = xready[:,2]
                scatter_x_min = 0
                scatter_x_max = 1
                scatter_y_min = 0
                scatter_y_max = 1
                scatter_z_min = 0
                scatter_z_max = 1

                # A 3D plot needs 3D axes: pyplot has no zlabel()/zlim(), and
                # the third positional argument of plt.scatter is the marker
                # size, so the z values must go through Axes3D.scatter.
                fig3d = plt.figure(figsize=(8, 6))
                ax3d = fig3d.add_subplot(projection='3d')
                for label in np.unique(tready):
                    mask = tready == label
                    ax3d.scatter(scatter_x[mask], scatter_y[mask], scatter_z[mask],
                                 color=colors[label], label=f'Class {label}', s=1, alpha=0.7)
                ax3d.set_xlabel('X')
                ax3d.set_ylabel('Y')
                ax3d.set_zlabel('Z')
                ax3d.set_xlim([scatter_x_min,scatter_x_max])
                ax3d.set_ylim([scatter_y_min,scatter_y_max])
                ax3d.set_zlim([scatter_z_min,scatter_z_max])
                ax3d.set_title(label_traintype[traintype],fontsize=16,fontweight='bold')
                # Show the class legend, as the 2D plot does
                ax3d.legend(loc='upper right')
                plt.savefig("instance-"+str(dataset+1).zfill(2)+"-"+str(traintype+1).zfill(2)+".pdf", format="pdf", dpi=None, facecolor="w", edgecolor="w", orientation="portrait", transparent=True, bbox_inches="tight", pad_inches=0.02, metadata=None)
                plt.show()
                plt.close()

        # -------------------------------------------------------------------------------------
        # LAZY LEARNING ALGORITHMS
        # -------------------------------------------------------------------------------------

        # -------------------------------------------------------------------------------------
        # Record performance
        # -------------------------------------------------------------------------------------

        # Execute loop only once for non-knn.  The iteration count is kept in
        # its own variable: overwriting max_k (previously with 0, which made
        # the loop run zero times) would also change every later use of max_k.
        k_iterations = max_k if (model == 'knn') else 1

        # The number of folds cannot exceed the size of the smallest class.
        # Computed once up front so the value is defined for the summary
        # printed after the loop even when every iteration is skipped.
        smallest_class = min(counts_tready)
        cross_validation_fold_final = min(cross_validation_fold, smallest_class)

        # Custom scorer for Cohen's Kappa
        kappa_scorer = make_scorer(cohen_kappa_score)

        scoring = {"accuracy":"accuracy",
                   "precision":"precision_weighted",
                   "recall":"recall_weighted",
                   "f1":"f1_weighted",
                   "aucovr":"roc_auc_ovr_weighted",
                   "aucovo":"roc_auc_ovo_weighted",
                   "kappa": kappa_scorer}

        for k in range(0,k_iterations,1):

            # Stratified cross-validation needs at least two samples per class.
            # Checked before tracing starts so that skipping never leaves
            # tracemalloc running; skipped entries are recorded as NaN.
            if smallest_class < 2:
                print(f"Skipping traintype={traintype} due to insufficient samples (min={min(counts_tready)})")
                cross_validation_error = 1
                for _perf in (perf_fit_time, perf_score_time,
                              perf_train_accuracy, perf_train_precision,
                              perf_train_recall, perf_train_f1,
                              perf_train_aucovr, perf_train_aucovo,
                              perf_train_kappa):
                    _perf[traintype,k] = np.nan
                continue
            cross_validation_error = 0

            # Training Cross validation
            tracemalloc.start()
            time_start = time.perf_counter()
            if (model == 'knn'):
                # Iteration k evaluates the odd neighbourhood size 2k+1
                clf = KNeighborsClassifier(n_neighbors=(2*k)+1,n_jobs=-1)
            elif (model == 'svm'):
                clf = svm.SVC(kernel='linear', C=1, random_state=0,probability=True)
            elif (model == 'nn'):  # Neural Network Model
                # NOTE(review): input_dim and num_classes are defined outside
                # this block, and it looks like this model is handed to
                # cross_validate without a scikit-learn wrapper - confirm it
                # is accepted as an estimator.
                clf = Sequential([
                    Dense(64, activation='relu', input_shape=(input_dim,)),
                    Dense(32, activation='relu'),
                    Dense(num_classes, activation='softmax')
                ])
                clf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

            skf = StratifiedKFold(n_splits=cross_validation_fold_final, random_state=0, shuffle=True)
            scores = cross_validate(clf, xready, tready, scoring=scoring, cv=skf, return_train_score=True, n_jobs=-1)

            time_stop = time.perf_counter()
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            mem_training_current[traintype,k] = current
            mem_training_peak[traintype,k] = peak
            time_training[traintype,k] = time_stop - time_start

            # Record the fold averages, ignoring folds that produced NaN
            perf_fit_time[traintype,k] = np.nanmean(scores["fit_time"])
            perf_score_time[traintype,k] = np.nanmean(scores["score_time"])
            perf_train_accuracy[traintype,k] = np.nanmean(scores["train_accuracy"])
            perf_train_precision[traintype,k] = np.nanmean(scores["train_precision"])
            perf_train_recall[traintype,k] = np.nanmean(scores["train_recall"])
            perf_train_f1[traintype,k] = np.nanmean(scores["train_f1"])
            perf_train_aucovr[traintype,k] = np.nanmean(scores["train_aucovr"])
            perf_train_aucovo[traintype,k] = np.nanmean(scores["train_aucovo"])
            perf_train_kappa[traintype,k] = np.nanmean(scores["train_kappa"])

            if (model == 'knn'):
                print("Training: D=", dataset,
                      "; Type=",label_traintype[traintype],
                      "; cv=",cross_validation_fold_final,
                      "; d=",d,
                      "; k=",(2*k)+1,
                      "; Train=",reduction[traintype],
                      "; Acc=","{:0.2f}".format(perf_train_accuracy[traintype,k]),
                      "; Pre=","{:0.2f}".format(perf_train_precision[traintype,k]),
                      "; Recall=","{:0.2f}".format(perf_train_recall[traintype,k]),
                      "; F1=","{:0.2f}".format(perf_train_f1[traintype,k]),
                      "; AUC OVR=","{:0.2f}".format(perf_train_aucovr[traintype,k]),
                      "; AUC OVO=","{:0.2f}".format(perf_train_aucovo[traintype,k]),sep = "")
            elif (model == 'svm2'):
                # NOTE(review): no classifier is created for 'svm2' above, so
                # this per-iteration print looks intentionally disabled.
                print("Training: D=", dataset,
                      "; Type=",label_traintype[traintype],
                      "; cv=",cross_validation_fold_final,
                      "; d=",d,
                      "; Train=",reduction[traintype],
                      "; Acc=","{:0.2f}".format(sum(perf_train_accuracy[traintype])/len(perf_train_accuracy[traintype])),
                      "; Pre=","{:0.2f}".format(sum(perf_train_precision[traintype]/len(perf_train_precision[traintype]))),
                      "; Recall=","{:0.2f}".format(sum(perf_train_recall[traintype]/len(perf_train_recall[traintype]))),
                      "; F1=","{:0.2f}".format(sum(perf_train_f1[traintype]/len(perf_train_f1[traintype]))),
                      "; AUC OVR=","{:0.2f}".format(sum(perf_train_aucovr[traintype]/len(perf_train_aucovr[traintype]))),
                      "; AUC OVO=","{:0.2f}".format(sum(perf_train_aucovo[traintype]/len(perf_train_aucovo[traintype]))), sep = "")

        # -------------------------------------------------------------------------------------
        # Save training performance and execution time
        # -------------------------------------------------------------------------------------

        if (performance_save == 1):
            # Append this training type's row of every metric table to its own
            # tab-separated file (one value per evaluated k).
            # The row is shaped from its own length rather than from max_k:
            # max_k may have been reset for non-knn models, in which case
            # reshape(1,max_k) no longer matches the row that was allocated.
            for _fname, _table, _fmt in (
                ("cross_fit_time.txt", perf_fit_time, "%1.8f"),
                ("cross_score_time.txt", perf_score_time, "%1.8f"),
                ("cross_train_accuracy.txt", perf_train_accuracy, "%1.4f"),
                ("cross_train_precision.txt", perf_train_precision, "%1.4f"),
                ("cross_train_recall.txt", perf_train_recall, "%1.4f"),
                ("cross_train_f1.txt", perf_train_f1, "%1.4f"),
                ("cross_train_aucovr.txt", perf_train_aucovr, "%1.4f"),
                ("cross_train_aucovo.txt", perf_train_aucovo, "%1.4f"),
                ("cross_train_kappa.txt", perf_train_kappa, "%1.4f"),
                ("training_time.txt", time_training, "%1.4f"),
                ("training_mem_current.txt", mem_training_current, "%0.0f"),
                ("training_mem_peak.txt", mem_training_peak, "%0.0f"),
            ):
                with open(_fname, "ab") as file:
                    # atleast_2d turns the 1-D row into shape (1, n) so that
                    # savetxt writes it as a single line.
                    np.savetxt(file, np.atleast_2d(_table[traintype]), delimiter="\t", fmt=_fmt)
            print("Saving: Set=",dataset,"; type=",traintype,sep = "")
                  
        # Print average scores: the NaN-ignoring mean of this training type's
        # row for each metric.  The kNN and SVM lines are identical except for
        # the extra "k=average" field; other models print nothing here.
        if model in ('knn', 'svm'):
            _summary = ["Training: D=", dataset,
                        "; Type=", label_traintype[traintype],
                        "; cv=", cross_validation_fold_final,
                        "; d=", d]
            if model == 'knn':
                _summary.append("; k=average")
            _summary += ["; Train=", reduction[traintype]]
            for _caption, _table in (
                ("; Acc=", perf_train_accuracy),
                ("; Pre=", perf_train_precision),
                ("; Recall=", perf_train_recall),
                ("; F1=", perf_train_f1),
                ("; AUC OVR=", perf_train_aucovr),
                ("; AUC OVO=", perf_train_aucovo),
            ):
                _summary += [_caption, "{:0.2f}".format(np.nanmean(_table[traintype],axis=0))]
            print(*_summary, sep = "")
        # Blank line between training types
        print()

    # -------------------------------------------------------------------------------------
    # Save data reduction performance
    # -------------------------------------------------------------------------------------

    if (performance_save == 1):
        # Append the per-dataset reduction statistics, one tab-separated file
        # per statistic.  Each payload is built lazily, right before it is
        # written, so the files are produced strictly one after the other.
        _reduction_outputs = (
            ("class-balance.txt", "%0.0f",
             lambda: counts_train.reshape(1,counts_train.shape[0])),
            ("class-balance_percentage.txt", "%0.4f",
             lambda: (counts_train/traindata.shape[0]).reshape(1,counts_train.shape[0])),
            ("reduction-size.txt", "%0.0f",
             lambda: reduction.reshape(1,reduction.shape[0])),
            ("reduction-class.txt", "%0.0f",
             lambda: np.transpose(reduction_class)),
            ("reduction-time.txt", "%1.4f",
             lambda: time_reduction.reshape(1,max_type)),
        )
        for _fname, _fmt, _make_rows in _reduction_outputs:
            with open(_fname, "ab") as file:
                np.savetxt(file, _make_rows(), delimiter="\t", fmt=_fmt)

# -------------------------------------------------------------------------------------
# Save dataset-level data reduction performance
# -------------------------------------------------------------------------------------

# Append the contents of kd_query_miss, tab-separated and formatted without
# decimals, to the dataset-level output file.
outputfile = "reduction_kd_query_miss.txt"
with open(outputfile, mode="ab") as file:
    np.savetxt(file, kd_query_miss, fmt="%0.0f", delimiter="\t")

# Completion message followed by a separator line
print("Evaluation: Dataset", dataset, "is done!")
print("------------------------------------------------------------------------------\n\r")
