In [2]:
%pip install JPype1

Note: you may need to restart the kernel to use updated packages.


In [3]:
import jpype
from jpype.types import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import normalized_mutual_info_score, accuracy_score
import argparse
import glob
import os
import numpy as np
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
import sys
import time


In [4]:
ELKI_JAR = "/home/esrp2024/tmp/elki-bundle-0.8.0.jar"
#DATA_FOLDER = "Clustering_pipeline/sub_sampling_cv(nick)"  # Folder containing CSV files
# 1. JVM Management
if jpype.isJVMStarted():
    print("JVM already running. Kernel restart recommended.")
else:
    jpype.startJVM(
        "--add-opens=java.base/java.lang=ALL-UNNAMED",
        "--add-opens=java.base/java.util=ALL-UNNAMED",
        classpath=[ELKI_JAR], 
        convertStrings=True
    )

# Load ELKI classes
wcss = jpype.JClass('elki.clustering.kmeans.quality.WithinClusterVariance')
LloydKMeans = jpype.JClass('elki.clustering.kmeans.LloydKMeans')
StaticArrayDatabase = jpype.JClass('elki.database.StaticArrayDatabase')
ArrayAdapterDatabaseConnection = jpype.JClass('elki.datasource.ArrayAdapterDatabaseConnection')
EuclideanDistance = jpype.JClass('elki.distance.minkowski.EuclideanDistance')
KMeansPlusPlus = jpype.JClass('elki.clustering.kmeans.initialization.KMeansPlusPlus')
RandomFactory = jpype.JClass('elki.utilities.random.RandomFactory')
NumberVector = jpype.JClass('elki.data.NumberVector')
RandomlyChosen = jpype.JClass('elki.clustering.kmeans.initialization.RandomlyChosen')
ElkanKMeans  = jpype.JClass('elki.clustering.kmeans.ElkanKMeans')
YinYangKMeans = jpype.JClass('elki.clustering.kmeans.YinYangKMeans')
AnnulusKMeans = jpype.JClass('elki.clustering.kmeans.AnnulusKMeans')
HamerlyKMeans = jpype.JClass('elki.clustering.kmeans.HamerlyKMeans')
ShallotKMeans = jpype.JClass('elki.clustering.kmeans.ShallotKMeans')
ExponionKMeans = jpype.JClass('elki.clustering.kmeans.ExponionKMeans')
BestOfMultipleKMeans = jpype.JClass('elki.clustering.kmeans.BestOfMultipleKMeans')
BetulaLloydKMeans = jpype.JClass('elki.clustering.kmeans.BetulaLloydKMeans')
BisectingKMeans = jpype.JClass('elki.clustering.kmeans.BisectingKMeans')
CompareMeans = jpype.JClass('elki.clustering.kmeans.CompareMeans')
FuzzyCMeans = jpype.JClass('elki.clustering.kmeans.FuzzyCMeans')
GMeans = jpype.JClass('elki.clustering.kmeans.GMeans')
HartiganWongKMeans = jpype.JClass('elki.clustering.kmeans.HartiganWongKMeans')
KDTreeFilteringKMeans = jpype.JClass('elki.clustering.kmeans.KDTreeFilteringKMeans')
KDTreePruningKMeans = jpype.JClass('elki.clustering.kmeans.KDTreePruningKMeans')
KMeansMinusMinus = jpype.JClass('elki.clustering.kmeans.KMeansMinusMinus')
KMediansLloyd = jpype.JClass('elki.clustering.kmeans.KMediansLloyd')
MacQueenKMeans = jpype.JClass('elki.clustering.kmeans.MacQueenKMeans')
SimplifiedElkanKMeans = jpype.JClass('elki.clustering.kmeans.SimplifiedElkanKMeans')
SingleAssignmentKMeans = jpype.JClass('elki.clustering.kmeans.SingleAssignmentKMeans')
XMeans = jpype.JClass('elki.clustering.kmeans.XMeans')
SortMeans = jpype.JClass('elki.clustering.kmeans.SortMeans')
ParallelLloydKMeans = jpype.JClass('elki.clustering.kmeans.parallel.ParallelLloydKMeans')
KMeansQualityMeasure = jpype.JClass('elki.clustering.kmeans.quality.KMeansQualityMeasure')
SimplifiedElkanKMeans = jpype.JClass('elki.clustering.kmeans.SimplifiedElkanKMeans')
WithinClusterVariance = JClass("elki.clustering.kmeans.quality.WithinClusterVariance")
KDTreePruningKMeansSplit = JClass("elki.clustering.kmeans.KDTreePruningKMeans$Split")

In [5]:
def process_file(file):
    print(file)
    df = pd.read_csv(file)  # Use the file variable from the loop
    # Drop the first row (which might contain labels or bad data)
    df = df.drop(index=0).reset_index(drop=True)

    # Assume the last column is the target/label column
    true_labels = df.iloc[:, -1].values  # Extract labels from last column
    df = df.iloc[:, :-1]                 # Drop last column (features only remain)

    # Count number of unique labels
    k_centroids = len(np.unique(true_labels))
    print(f"k = {k_centroids}")
    """
    # Extract ground truth labels if available
    if 'target' in df.columns:
        true_labels = df['target'].values  # Ground truth labels for evaluation
        df = df.drop(columns=['target'])   # Drop label column before clustering
        df = df.drop(index=0).reset_index(drop=True)
        k_centroids =len(np.unique(true_labels))
        print(f"k=",k_centroids)
    else:
        true_labels = None
    """

    # Ensure only numeric columns are used
    data_values = df.select_dtypes(include=['number']).values.astype(float)

    # Convert to 2D Java array (double[][])
    java_data = jpype.JArray(JDouble, 2)(data_values)

    # Prepare ELKI database
    adapter = ArrayAdapterDatabaseConnection(java_data)
    database = StaticArrayDatabase(adapter, [])
    database.initialize()

    # Find the correct relation of type NumberVector
    relation = None
    for rel in database.getRelations():
        if NumberVector.class_.isAssignableFrom(rel.getDataTypeInformation().getRestrictionClass()):
            relation = rel
            break

    if relation is None:
        raise ValueError("No valid Relation<NumberVector> found in the database!")

    # Debugging: Print the number of relations in the database
    #print("Total Relations in Database:", len(database.getRelations()))

    # Debugging: Check if relation exists and print details
    if relation:
        #print("✅ Relation successfully found!")
        #print("Total Data Points in Relation:", relation.size())

        # Fetch a few data points from relation (convert from ELKI to Python)
        iter_dbid = relation.iterDBIDs()  # Get all IDs
        sample_points = []
        for _ in range(5):  # Print first 5 points
            if not iter_dbid.valid():
                break  # Stop if there are no more data points
            obj = relation.get(iter_dbid)
            sample_points.append([obj.doubleValue(i) for i in range(obj.getDimensionality())])
            iter_dbid.advance()

    return relation, data_values, true_labels, k_centroids

In [6]:
def get_centroids_pos(result):
    centroids = []
    for cluster in result.getAllClusters():
        if not cluster.isNoise():
            model = cluster.getModel()  # ✅ Extract KMeansModel
            if model is not None:
                centroid = centroid = list(model.getMean()) 
                centroids.append(centroid)  # ✅ Store centroid
    centroids = np.array(centroids)
    return centroids


In [7]:
def clustering(relation,data_values,kmeans):     
   
    
    
    # Start the timer (tic)
    start_time = time.perf_counter()
    result = kmeans.run(relation)

    end_time = time.perf_counter()
    clustering_time = end_time - start_time

    # Extract cluster assignments
    cluster_labels = np.full(len(data_values), -1)  # Initialize with -1 (unassigned)
    # Build the cluster labels array based on the relation's DBIDs.
    dbid_range = relation.getDBIDs()        # This is an IntegerDBIDRange for all data points.
    num_dbids = dbid_range.size()             # Should be 150 for the iris dataset.
    cluster_labels = np.full(num_dbids, -1)   # Initialize with -1 (unassigned)

    for cluster_id, cluster in enumerate(result.getAllClusters()):
        #print("Processing cluster_ID:", cluster_id)
        if not cluster.isNoise():
            # Loop over all DBIDs (in the same order as the data_values)
            for i in range(num_dbids):
                dbid = dbid_range.get(i)  # Get the DBID corresponding to the i-th data point.
                # Check if this DBID is in the cluster's set.
                if cluster.getIDs().contains(dbid):
                    cluster_labels[i] = cluster_id
                    #print(f"Assigned cluster {cluster_id} to index {i}")

    centroids_pos = get_centroids_pos(result)
    
    return result, cluster_labels,centroids_pos, clustering_time

In [8]:
def calculate_inertia(relation, result):
    SquaredErrors = jpype.JClass('elki.evaluation.clustering.internal.SquaredErrors')
    NoiseHandling = jpype.JClass('elki.evaluation.clustering.internal.NoiseHandling')
    available_constants = [const.name() for const in NoiseHandling.values()]
    #print("Available NoiseHandling constants:", available_constants)

    # Suppose you have:
    #   database:    a StaticArrayDatabase
    #   relation:    a Relation<NumberVector>
    #   clustering:  the result of LloydKMeans.run(relation)

    # 1. Choose a distance and noise handling
    distance = EuclideanDistance()
    # List all enum values if you're unsure which is available:
    #   print([v.name() for v in NoiseHandling.values()])
    noise_handling = NoiseHandling.valueOf("MERGE_NOISE")  # or another valid enum

    # 2. Create the evaluator
    sse_evaluator = SquaredErrors(distance, noise_handling)

    # 3. Compute SSE (sum of squared errors)
    sse = sse_evaluator.evaluateClustering(relation, result)
    return sse

In [9]:
def compute_accuracy(true_labels, predicted_labels):
    if true_labels is None:
         return None
    # Create a contingency (confusion) matrix
    C = confusion_matrix(true_labels, predicted_labels)
    # print(C)
    # Use the Hungarian algorithm to maximize the total correct assignments
    #row_ind and col_ind are 2 numpy array
    row_ind, col_ind = linear_sum_assignment(-C)  # We use negative because we want to maximize
    #print("row_ind ", row_ind)
    #print("col_ind ",col_ind)
    # Sum the counts from the optimal assignment
    total_correct = C[row_ind, col_ind].sum()
    #print("total_correct ", total_correct)
    # Calculate accuracy as the ratio of correctly assigned samples
    accuracy = total_correct / np.sum(C)
    #print ("sum(C) ", np.sum(C))
    #print(accuracy)
    return accuracy


In [10]:
def print_evaluation_scores(true_labels, predicted_labels):
    nmi_score = normalized_mutual_info_score(true_labels, predicted_labels) if true_labels is not None else None
    accuracy_score= compute_accuracy(true_labels, predicted_labels)
    print(f"NMI: ",nmi_score)
    print(f"accuracy_score: ",accuracy_score)

In [11]:
def choose_best_algorithm_and_write_centroids(algorithms_map, output_file="centroids.txt"):
    """
    Evaluates a dictionary of algorithms (each mapped to a tuple of 
    (time, accuracy, centroids)) to find the best algorithm based on 
    the lowest time/accuracy ratio. After picking the best algorithm, 
    this function writes the positions of its k centroids into a text file.

    Parameters:
        algorithms_map (dict): Dictionary where each key is an algorithm name and 
                               each value is a tuple (time, accuracy, centroids).
                               For example:
                               {
                                   "Algorithm A": (120, 0.85, [(x1, y1), (x2, y2), ...]),
                                   "Algorithm B": (100, 0.80, [(x1, y1), (x2, y2), ...]),
                                   ...
                               }
        output_file (str): The file path to write the centroids' positions (default "centroids.txt").

    Returns:
        tuple: (best_algo_name, best_ratio) where best_algo_name is the name of the best algorithm,
               and best_ratio is its time/accuracy ratio.
    """
    best_algo_name = None
    best_ratio = 0
    
    # Iterate over each algorithm to compute the time/accuracy ratio.
    for algo_name, metrics in algorithms_map.items():
        # Since metrics is a tuple: (time, accuracy, centroids)
        time_val, accuracy_val, centroids = metrics
        
        # Skip algorithms with missing metrics or accuracy of zero.
        if time_val is None or accuracy_val is None or accuracy_val == 0:
            continue
        
        ratio = accuracy_val
        print(f"Algorithm: {algo_name}, Time: {time_val}, Accuracy: {accuracy_val}, Ratio: {ratio:.3f}")
        """ what is this find max :)))))
        if ratio < best_ratio:
            best_ratio = ratio
            best_algo_name = algo_name
        """
        if ratio > best_ratio:
            best_ratio=ratio 
            best_algo_name = algo_name
            
    if best_algo_name is None:
        print("No valid algorithm found.")
        return None, None

    # Retrieve centroids from the best algorithm's tuple.
    _, _, centroids_best = algorithms_map[best_algo_name]
    
    if centroids_best is None:
        print("Best algorithm does not contain centroid information.")
        return best_algo_name, best_ratio

    # Write the centroid positions to the specified text file.
    with open(output_file, "w") as file:
        #file.write(f"Centroids for {best_algo_name} (Ratio: {best_ratio:.3f}):\n")
        for idx, centroid in enumerate(centroids_best, start=1):
            file.write(f"{centroid}\n")
    
    print(f"Centroid positions written to {output_file}")
    print(f"\nThe best algorithm is {best_algo_name} with a time/accuracy ratio of {best_ratio:.3f}.")
    return best_algo_name, best_ratio



        


In [12]:
max_iter=300 
algorithm_constructors = [
        lambda k_centroids: LloydKMeans(EuclideanDistance(), k_centroids, max_iter, RandomlyChosen(RandomFactory(42))),
        lambda k_centroids: LloydKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: ElkanKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: SimplifiedElkanKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: YinYangKMeans(k_centroids, max_iter, RandomlyChosen(RandomFactory(42)), 5),
        lambda k_centroids: AnnulusKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: HamerlyKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: ShallotKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: ExponionKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: BestOfMultipleKMeans(10,LloydKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),wcss()),
        lambda k_centroids: CompareMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: FuzzyCMeans(k_centroids, 10, max_iter, 0.0001, 2,False, KMeansPlusPlus(RandomFactory(42)) ),             
        lambda k_centroids: HartiganWongKMeans( k_centroids,KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: KDTreeFilteringKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), KDTreePruningKMeans.Split.SSQ,40),
        lambda k_centroids: KMeansMinusMinus(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)),0.05, True),
        lambda k_centroids: KDTreePruningKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), KDTreePruningKMeansSplit.SSQ,40),
        lambda k_centroids: KMediansLloyd(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: MacQueenKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: SimplifiedElkanKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True),
        lambda k_centroids: SingleAssignmentKMeans(EuclideanDistance(), k_centroids,KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: SortMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),
        lambda k_centroids: ParallelLloydKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)))
]


    

In [None]:
"""
lambda k_centroids: GMeans(EuclideanDistance(),0.05, k_centroids, k_centroids,max_iter,
                                AnnulusKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42)), True), 
                                KMeansPlusPlus(RandomFactory(42)), RandomFactory(42)),
 lambda k_centroids: XMeans(
        EuclideanDistance(),  # Distance function
        k_centroids,  # Minimum number of clusters
        k_centroids * 2,  # Maximum number of clusters (XMeans can return up to 2*k)
        max_iter,  # Maximum iterations
        LloydKMeans(EuclideanDistance(), k_centroids, max_iter, KMeansPlusPlus(RandomFactory(42))),  # Inner K-Means variant
        KMeansPlusPlus(RandomFactory(42)),  # Initialization method
        WithinClusterVariance(),  # Correct quality measure
        RandomFactory(42)  # Random factory
        ),
"""

In [18]:
def main(input_dir, output_dir="results"):
    # Create results directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    

    # Automatically find all CSV files in the provided directory.
    csv_files = glob.glob(os.path.join(input_dir, '*.csv'))
    if not csv_files:
        print("No CSV files found in the specified directory.")
        return
    
    # Process each CSV file individually

    for file in csv_files:
        #process the file 

        #uncomment to mass process
        relation,data_values,true_labels,k_centroids = process_file(file)

        # uncomment to run any single dataset
        #relation,data_values,true_labels,k_centroids = process_file("/home/esrp2024/tmp/train/train_dataset_1485.csv")
        
        #list of algors
        #algorithms_list=["LloydKMeans","KMeansPlusPlus","ElkanKmeans","YinYangKMeans","AnnulusKMeans","HamerlyKMeans"]
        #empty map
        algorithms_map = {}

        #for algorithm_name in algorithms_list:
        for constructor in algorithm_constructors:
            kmeans = constructor(k_centroids)
            # Get the base name of the file (i.e., the file name without the path)
            #basename = os.path.basename(file)
            
            
            
            result,predicted_labels,centroids_pos,clustering_time = clustering(relation, data_values,kmeans)
            #inertia=calculate_inertia(relation, result)
            acc=compute_accuracy(true_labels, predicted_labels)
            
            #map
            algorithms_map[kmeans] = (clustering_time, acc, centroids_pos)
            
         # Generate a unique output file for each dataset inside the "results" folder
        dataset_name = os.path.basename(file).replace('.csv', '')  # Extract dataset name
        centroids_filename = os.path.join(output_dir, f"centroids_{dataset_name}.csv")  # Save best centroids

        # Choose the best algorithm and write only the best centroids to a dataset-specific file
        choose_best_algorithm_and_write_centroids(algorithms_map, output_file=centroids_filename)

        print(f"Best centroids saved to {centroids_filename}")

        

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Process multiple CSV datasets from a directory.')
    parser.add_argument('--input_dir', type=str, default='cv/train_cv_w_header', 
                        help='Directory containing CSV files (default is "sub_sampling_cv(nick)")')
    parser.add_argument('--output_dir', type=str, default='cv/train_cv_result', 
                        help='Directory to save the results (default is "results")')
    args, unknown = parser.parse_known_args()
    main(args.input_dir, args.output_dir)


cv/train_cv_w_header/train_cv_dataset_46591.csv
k = 2
Algorithm: elki.clustering.kmeans.LloydKMeans@74f6c5d8, Time: 0.000747166108340025, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.LloydKMeans@19b89d4, Time: 0.0006053256802260876, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.ElkanKMeans@2415fc55, Time: 0.0013804808259010315, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.SimplifiedElkanKMeans@93081b6, Time: 0.0011626733466982841, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.YinYangKMeans@15a04efb, Time: 0.0007271845825016499, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.AnnulusKMeans@26adfd2d, Time: 0.000694845337420702, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.HamerlyKMeans@6950ed69, Time: 0.0006415559910237789, Accuracy: 0.9551971326164874, Ratio: 0.955
Algorithm: elki.clustering.kmeans.Shal

In [19]:
import os
import pandas as pd

# Input and output folders
input_folder = "cv/train_cv_w_header"  # Folder containing CSV files
output_folder = "cv/train_cv_no_header"
#output_folder2 = "test_w_exact_true_labels"
# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process all .csv files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)
        true_labels_path = os.path.join(output_folder2, "exact_true_labels_" + filename)
        # Read without assuming header
        df = pd.read_csv(input_path, header=None)

        # Drop first row and last column
        df = df.drop(index=0).reset_index(drop=True)
        true_labels = df.iloc[:, -1].values  # Extract labels from last column
        df = df.iloc[:, :-1]

        # Save to output folder with the same filename
        df.to_csv(output_path, index=False, header=False)
        #pd.Series(true_labels).to_csv(true_labels_path, index=False, header=False)
        print(f"Processed: {filename}")

Processed: train_cv_dataset_46591.csv
Processed: train_cv_dataset_46542.csv
Processed: train_cv_dataset_46540.csv
Processed: train_cv_dataset_46876.csv


In [25]:
def count_csv_files(folder_path):
    return sum(1 for file in os.listdir(folder_path) if file.endswith('.csv'))
folder = "/home/esrp2024/tmp/results_for_test_data"
print(f"CSV files found: {count_csv_files(folder)}")

CSV files found: 45
