# Bachelor Thesis: Extending K-Means Clustering with Ptolemy’s Inequality
## Performance Evaluation 

## 1. Introduction
In this notebook, we evaluate the performance of the proposed K-Means Extension with Ptolemy's Inequality by applying it to various synthetic and real-world datasets. 
Several versions of the K-Means clustering algorithm are compared:
- **Classical Lloyd's K-Means**
- **Elkan's Algorithm**
- **Elkan's Algorithm using Ptolemy for Upper Bounds**
- **Elkan's Algorithm using Ptolemy for Lower Bounds**
- **Elkan's Algorithm using Ptolemy for Upper and Lower Bounds**

These algorithms are tested on the following datasets:
- **Gaussian Blobs**: Synthetic dataset with isotropic Gaussian blobs, useful for testing clustering algorithms based on Euclidean distance.
- **Iris**: Classic dataset containing measurements of iris flowers, with three different species.
- **Wine**: Dataset with chemical analysis results of wines grown in the same region in Italy, but derived from three different cultivars.
- **Moons**: Synthetic dataset with two interleaving half circles, demonstrating the need for clustering algorithms to handle non-linear separations.
- **Circles**: Synthetic dataset with a large circle containing a smaller circle, testing the algorithm's ability to manage concentric shapes.
- **Classification**: Generated dataset with clusters, providing control over feature distribution and separation between classes.
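- **Sparse Blobs**: High-dimensional synthetic dataset with sparse data points, challenging the algorithm's efficiency in higher dimensions.
- **Random**: Uniformly distributed synthetic data without inherent cluster structure, generated in low (10), medium (100), and high (300) dimensions.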




## 2. Setup and imports
Here we import our custom K-Means algorithms.

In [2]:
import numpy as np
import pandas as pd
from means import Kmeans, DataGenerator
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_circles, make_classification, make_gaussian_quantiles, load_iris, load_wine
from sklearn.preprocessing import StandardScaler


## 3. Dataset Generation

In [15]:
def generate_datasets(uniform_seed=42):
    """Build every benchmark dataset and return them keyed by name.

    Parameters
    ----------
    uniform_seed : int or None, default 42
        Seed for the generator that draws the three ``Random_*`` datasets.
        A fixed seed makes those datasets identical on every run; pass
        ``None`` to draw fresh, non-reproducible data. The other synthetic
        datasets keep their own fixed seeds and are unaffected.

    Returns
    -------
    dict
        Maps dataset name to its feature matrix, inserted in this order:
        ``Gaussian_low/medium/high``, ``Iris``, ``Wine``, ``Moons``,
        ``Circles``, ``Random_low/medium/high``. Labels are discarded
        because only the features are clustered.
    """
    datasets = {}

    # The low / medium / high variants share these feature counts.
    dimension_levels = (("low", 10), ("medium", 100), ("high", 300))

    # Gaussian (Blobs): same sample count, cluster count and seed per level.
    for level, n_features in dimension_levels:
        generator = DataGenerator(n_samples=10000, n_features=n_features, n_clusters=4, random_state=43)
        blob_data, _ = generator.generate_data()
        datasets[f"Gaussian_{level}"] = blob_data

    # Iris
    datasets['Iris'] = load_iris().data

    # Wines
    datasets['Wine'] = load_wine().data

    # Moons
    X_moons, _ = make_moons(n_samples=10000, noise=0.1, random_state=42)
    datasets['Moons'] = X_moons

    # Circles
    X_circles, _ = make_circles(n_samples=15000, noise=0.1, factor=0.5, random_state=42)
    datasets['Circles'] = X_circles

    # Random: uniform values in [0, 10) drawn from a dedicated, seeded
    # generator so the benchmark does not depend on the global NumPy state.
    rng = np.random.default_rng(uniform_seed)
    for level, n_features in dimension_levels:
        datasets[f"Random_{level}"] = rng.uniform(0, 10, size=(5000, n_features))

    return datasets

datasets = generate_datasets()

## 5. Performance Measurement

In [None]:
# Column-oriented store for the benchmark: one list per output column,
# all lists grow in lockstep (one entry per fitted model).
results = {column: [] for column in ('Dataset', 'Method', 'K', 'Distance_Evaluations')}

# Algorithm variants under comparison
methods = ['classic', 'Elkan', 'Ptolemy_upper', 'Ptolemy_lower', 'Ptolemy']
# Cluster counts to evaluate for each variant
k_values = [3, 20, 100]

for name, data in datasets.items():
    print(f"Processing dataset: {name}")

    for k in k_values:
        for method in methods:
            # Fit a fresh model for this (dataset, k, method) combination
            model = Kmeans(k=k, method=method)
            model.fit(data)

            # Record the run, one value per result column
            for column, value in (
                ('Dataset', name),
                ('Method', method),
                ('K', k),
                ('Distance_Evaluations', model.distance_evaluations),
            ):
                results[column].append(value)

            # Report the run
            print(f"{method} (k={k}) - Distance evaluations: {model.distance_evaluations}")

# Turn the collected columns into a table
results_df = pd.DataFrame(results)

# Show the table as the cell output
results_df

In [16]:
# (Re)initialise the shared result store; process_dataset appends to these lists.
# Re-running this cell discards every result collected so far.
results = {
    'Dataset': [],
    'Method': [],
    'K': [],
    'Distance_Evaluations': []
}
methods = ['classic', 'Elkan', 'Ptolemy_upper', 'Ptolemy_lower', 'Ptolemy'] # Algorithm variants to benchmark
k_values = [3, 20, 100]  # Cluster counts to benchmark

def process_dataset(name, data, k_values, methods):
    """Benchmark every (k, method) combination on a single dataset.

    For each value in ``k_values`` and each entry in ``methods`` a new
    ``Kmeans`` model is fitted on ``data``. Each run appends one entry to
    every list of the module-level ``results`` dictionary and prints a
    one-line summary. Nothing is returned.

    Parameters
    ----------
    name : label stored in the ``'Dataset'`` column and shown in the output.
    data : passed unchanged to ``Kmeans.fit``.
    k_values : iterable of cluster counts.
    methods : iterable of method names understood by ``Kmeans``.
    """
    print(f"Processing dataset: {name}")

    for n_clusters in k_values:
        for variant in methods:
            # A fresh model per combination
            fitted = Kmeans(k=n_clusters, method=variant)
            fitted.fit(data)

            # Append this run to the shared column lists
            for column, value in (
                ('Dataset', name),
                ('Method', variant),
                ('K', n_clusters),
                ('Distance_Evaluations', fitted.distance_evaluations),
            ):
                results[column].append(value)

            # Summary line for this run
            print(f"{variant} (k={n_clusters}) - Distance evaluations: {fitted.distance_evaluations}")


In [17]:
# Benchmark all methods and k values on the 10-feature uniform random dataset.
dataset_name = 'Random_low'
dataset_data = datasets[dataset_name]

# Appends to the shared `results` store; running this cell twice records duplicate rows.
process_dataset(dataset_name, dataset_data, k_values, methods)

Processing dataset: Random_low
classic (k=3) - Distance evaluations: 840000
Elkan (k=3) - Distance evaluations: 879890
Ptolemy_upper (k=3) - Distance evaluations: 869932
Ptolemy_lower (k=3) - Distance evaluations: 882837
Ptolemy (k=3) - Distance evaluations: 872355
classic (k=20) - Distance evaluations: 6000000
Elkan (k=20) - Distance evaluations: 10330209
Ptolemy_upper (k=20) - Distance evaluations: 2863457
Ptolemy_lower (k=20) - Distance evaluations: 11052738
Ptolemy (k=20) - Distance evaluations: 2964126
classic (k=100) - Distance evaluations: 23500000
Elkan (k=100) - Distance evaluations: 11836355
Ptolemy_upper (k=100) - Distance evaluations: 2284908
Ptolemy_lower (k=100) - Distance evaluations: 20236336
Ptolemy (k=100) - Distance evaluations: 3996726


In [18]:
# Benchmark all methods and k values on the 100-feature uniform random dataset.
dataset_name = 'Random_medium'
dataset_data = datasets[dataset_name]

# Appends to the shared `results` store; running this cell twice records duplicate rows.
process_dataset(dataset_name, dataset_data, k_values, methods)

Processing dataset: Random_medium
classic (k=3) - Distance evaluations: 1080000
Elkan (k=3) - Distance evaluations: 1110000
Ptolemy_upper (k=3) - Distance evaluations: 1102015
Ptolemy_lower (k=3) - Distance evaluations: 1110000
Ptolemy (k=3) - Distance evaluations: 1094784
classic (k=20) - Distance evaluations: 7900000
Elkan (k=20) - Distance evaluations: 3300000
Ptolemy_upper (k=20) - Distance evaluations: 2504815
Ptolemy_lower (k=20) - Distance evaluations: 3300000
Ptolemy (k=20) - Distance evaluations: 2146340
classic (k=100) - Distance evaluations: 9000000
Elkan (k=100) - Distance evaluations: 7499766
Ptolemy_upper (k=100) - Distance evaluations: 5660515
Ptolemy_lower (k=100) - Distance evaluations: 7499786
Ptolemy (k=100) - Distance evaluations: 4391286


In [19]:
# Benchmark all methods and k values on the 300-feature uniform random dataset.
dataset_name = 'Random_high'
dataset_data = datasets[dataset_name]

# Appends to the shared `results` store; running this cell twice records duplicate rows.
process_dataset(dataset_name, dataset_data, k_values, methods)

Processing dataset: Random_high
classic (k=3) - Distance evaluations: 525000
Elkan (k=3) - Distance evaluations: 645000
Ptolemy_upper (k=3) - Distance evaluations: 645000
Ptolemy_lower (k=3) - Distance evaluations: 645000
Ptolemy (k=3) - Distance evaluations: 645000
classic (k=20) - Distance evaluations: 3100000
Elkan (k=20) - Distance evaluations: 2200000
Ptolemy_upper (k=20) - Distance evaluations: 2130270
Ptolemy_lower (k=20) - Distance evaluations: 2200000
Ptolemy (k=20) - Distance evaluations: 1638100
classic (k=100) - Distance evaluations: 7000000
Elkan (k=100) - Distance evaluations: 6934999
Ptolemy_upper (k=100) - Distance evaluations: 6330177
Ptolemy_lower (k=100) - Distance evaluations: 6934999
Ptolemy (k=100) - Distance evaluations: 3794620


In [22]:
# Snapshot the results collected so far as a table (one row per dataset / method / k run).
results_df = pd.DataFrame(results)

## 6. Export to Excel

In [23]:
# Calculate speedup for each variant compared to a baseline method (Elkan by default)
def calculate_speedup(df, baseline='Elkan'):
    """Return, for every row of ``df``, its speedup over the baseline method.

    Speedup is ``baseline distance evaluations / row distance evaluations``,
    where the baseline is the first row with ``Method == baseline`` that
    shares the row's ``Dataset`` and ``K``. Values above 1 mean fewer
    distance evaluations than the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'Dataset'``, ``'Method'``, ``'K'`` and
        ``'Distance_Evaluations'``.
    baseline : str, default ``'Elkan'``
        Method name used as the reference.

    Returns
    -------
    list of float
        One value per row, in row order. Baseline rows get ``1.0``. Rows
        whose (dataset, k) group has no baseline run get ``nan``. A row
        with zero evaluations gets ``inf`` (``nan`` if the baseline count
        is zero as well).
    """
    rows = list(zip(
        df['Dataset'].tolist(),
        df['K'].tolist(),
        df['Method'].tolist(),
        df['Distance_Evaluations'].tolist(),
    ))

    # One pass to index the reference counts; setdefault keeps the first
    # baseline row per (dataset, k) if a run was recorded more than once.
    baseline_evals = {}
    for dataset, k, method, evaluations in rows:
        if method == baseline:
            baseline_evals.setdefault((dataset, k), evaluations)

    speedups = []
    for dataset, k, method, evaluations in rows:
        if method == baseline:
            speedups.append(1.0)  # The baseline is 1x by definition
            continue

        reference = baseline_evals.get((dataset, k))
        if reference is None:
            # No baseline run for this group: nothing to compare against.
            speedups.append(float('nan'))
        elif evaluations == 0:
            speedups.append(float('inf') if reference else float('nan'))
        else:
            speedups.append(reference / evaluations)
    return speedups

# Add the per-row speedup relative to the Elkan run of the same dataset and k.
results_df['Speedup'] = calculate_speedup(results_df)

In [24]:
# Write the results table to disk; index=False leaves the DataFrame index out of the sheet.
results_df.to_excel('clustering_performance_results2.xlsx', index=False)

In [25]:
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows

# Load the previously exported, unformatted results
file_path = 'clustering_performance_results2.xlsx'
df = pd.read_excel(file_path)

# Rearrange columns if needed (currently they seem to be in a logical order)
# e.g., df = df[['Dataset', 'K', 'Method', 'Distance_Evaluations', 'Speedup']]

# Write a second copy that will receive the formatting
output_file_path = 'clustering_performance_results_formatted2.xlsx'
df.to_excel(output_file_path, index=False, sheet_name='Results')

# Re-open that copy with openpyxl to style it
wb = load_workbook(output_file_path)
ws = wb.active

# Size each column to its longest rendered value. Values are measured via
# str() so numeric cells count as well; empty cells are ignored.
for col in ws.columns:
    column = col[0].column_letter  # Get the column name
    max_length = max(
        (len(str(cell.value)) for cell in col if cell.value is not None),
        default=0,
    )
    adjusted_width = (max_length + 2)
    ws.column_dimensions[column].width = adjusted_width

# Define styles
header_font = Font(bold=True)
center_aligned_text = Alignment(horizontal="center")
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

# Apply styles to header
for cell in ws[1]:
    cell.font = header_font
    cell.alignment = center_aligned_text

# Locate the Speedup column by its header instead of a fixed position, so
# reordering or adding columns does not colour the wrong one.
speedup_col = next((cell.column for cell in ws[1] if cell.value == 'Speedup'), None)

# Colour Speedup cells: green above 1 (better than baseline), red below 1.
if speedup_col is not None:
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=speedup_col, max_col=speedup_col):
        for cell in row:
            # Empty or non-numeric cells cannot be compared; leave them unstyled.
            if not isinstance(cell.value, (int, float)):
                continue
            if cell.value > 1:
                cell.fill = green_fill
            elif cell.value < 1:
                cell.fill = red_fill

# Save the workbook
wb.save(output_file_path)

output_file_path

'clustering_performance_results_formatted2.xlsx'

In [None]:
# Regenerate the 10-feature Gaussian data (same parameters as "Gaussian_low")
# for a side-by-side comparison of all variants with k=4.
low_dim_generator = DataGenerator(n_samples=10000, n_features=10, n_clusters=4, random_state=43)
low_dim_data, low_dim_labels = low_dim_generator.generate_data()

# Fit one model per variant. Every fit, including the first, is announced
# with its method name so any output produced during fitting can be attributed.
comparison_models = []
for method_name in ['classic', 'Elkan', 'Ptolemy_upper', 'Ptolemy_lower', 'Ptolemy']:
    print(method_name)
    fitted_model = Kmeans(k=4, method=method_name)
    fitted_model.fit(low_dim_data)
    comparison_models.append(fitted_model)

# Keep the numbered names: the plotting cell refers to model1..model5.
model1, model2, model3, model4, model5 = comparison_models


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of a clustering result on a given axes
def plot_clusters(data, centroids, labels, title, ax):
    """Draw the clustered points and their centroids on ``ax``.

    Only the first two columns of ``data`` and ``centroids`` are plotted.
    Each distinct value in ``labels`` gets its own viridis colour and a
    legend entry ``Cluster <label + 1>``; centroids are drawn as large red
    ``X`` markers. The axes title is set to ``title`` and a legend is added.
    """
    cluster_ids = np.unique(labels)
    palette = plt.cm.viridis(np.linspace(0, 1, len(cluster_ids)))

    for position in range(len(cluster_ids)):
        cluster_id = cluster_ids[position]
        members = data[labels == cluster_id]
        ax.scatter(
            members[:, 0],
            members[:, 1],
            s=30,
            color=palette[position],
            label=f'Cluster {cluster_id+1}',
        )

    ax.scatter(centroids[:, 0], centroids[:, 1], s=200, color='red', marker='X', label='Centroids')
    ax.set_title(title)
    ax.legend()

# Requires low_dim_data and the fitted model1..model5 from the fitting cell.
# Each variant gets its own label so the three Ptolemy variants can be told
# apart in both the figure and the printed summary.
panels = [
    ('Classic', model1),
    ('Elkan', model2),
    ('Ptolemy Upper', model3),
    ('Ptolemy Lower', model4),
    ('Ptolemy', model5),
]

fig, axs = plt.subplots(1, len(panels), figsize=(18, 6))

for ax, (variant_label, fitted) in zip(axs, panels):
    plot_clusters(
        low_dim_data,
        np.array(list(fitted.centroids.values())),
        np.array(fitted.labels),
        f'{variant_label} Method',
        ax,
    )

plt.show()

for variant_label, fitted in panels:
    print(f"{variant_label} Distance evaluations: " + str(fitted.distance_evaluations))


In [None]:
# Smaller run: 1000 samples instead of the 10000 used in the comparison cells.
# NOTE(review): this rebinds low_dim_generator, low_dim_data, low_dim_labels and
# model1, so re-running the plotting cell afterwards would show this model and
# data in the first panel. Confirm whether that is intended.
low_dim_generator = DataGenerator(n_samples=1000, n_features=10, n_clusters=4, random_state=43)
low_dim_data, low_dim_labels = low_dim_generator.generate_data()

# Fit only the variant that uses Ptolemy for both bounds.
model1 = Kmeans(k=4, method='Ptolemy')
model1.fit(low_dim_data)