In [None]:
'''

import library

evaluate_model

evaluate_kmeans

getData

'''

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.datasets import make_blobs

In [None]:
def evaluate_model(Y_actual, Y_pred):
    """Return the accuracy of ``Y_pred`` measured against ``Y_actual``.

    Parameters
    ----------
    Y_actual : array-like
        Ground-truth labels.
    Y_pred : array-like
        Predicted labels, in the same order as ``Y_actual``.

    Returns
    -------
    float
        The value of ``accuracy_score(Y_actual, Y_pred)``.

    Notes
    -----
    Labels are compared as given. If ``Y_pred`` holds raw cluster ids
    (e.g. K-Means output), map them onto the class labels before calling
    this function; otherwise the score only reflects how the arbitrary
    cluster numbering happens to line up with the class ids.
    """
    # Precision and F1 used to be computed here and then thrown away; only
    # the accuracy was ever returned, so only the accuracy is computed now.
    return accuracy_score(Y_actual, Y_pred)

In [None]:
# Function to evaluate K-Means clustering
def evaluate_kmeans(X, y_true, n_clusters, random_state=None):
    """Fit K-Means on ``X`` and report its SSE, elapsed time and labels.

    Parameters
    ----------
    X : array-like
        Samples to cluster; passed straight to ``KMeans.fit_predict``.
    y_true : array-like
        Not used by this function. Kept in the signature so existing
        positional calls keep working.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. When omitted, the notebook-level
        ``random_state`` variable is used if it exists (the previous
        behaviour); if it does not exist, ``KMeans`` runs unseeded.

    Returns
    -------
    tuple
        ``(sse, time_taken, y_pred)``: the fitted model's ``inertia_``,
        the elapsed seconds for construction + fit + labelling, and the
        cluster id assigned to each row of ``X``.
    """
    if random_state is None:
        # Backward compatibility: earlier versions read the seed from a
        # notebook global. Look it up defensively so that running this cell
        # before the configuration cell no longer raises NameError.
        random_state = globals().get("random_state")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)

    # Fit and label in a single pass instead of fit() followed by predict().
    y_pred = kmeans.fit_predict(X)

    time_taken = time.perf_counter() - start_time

    # inertia_ is the sum of squared distances of samples to their closest
    # cluster centre, i.e. the SSE reported by the notebook.
    sse = kmeans.inertia_

    return sse, time_taken, y_pred

In [None]:
# Function to generate a synthetic blob dataset for clustering experiments
def getData(n_clusters, random_state, n_samples=1500, n_features=2):
    """Generate a synthetic dataset with ``make_blobs``.

    Parameters
    ----------
    n_clusters : int
        Forwarded to ``make_blobs`` as ``centers``.
    random_state : int or None
        Seed forwarded to ``make_blobs``.
    n_samples : int, optional
        Number of samples to generate. Defaults to 1500, the value that was
        previously hard-coded.
    n_features : int, optional
        Number of features per sample. Defaults to 2, the value that was
        previously hard-coded.

    Returns
    -------
    The return value of ``make_blobs`` (unpacked by callers as ``X, y``).
    """
    return make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=n_clusters,
        random_state=random_state,
    )

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
def getDiseaseData():
    """Load the Breast Cancer dataset and return standardized features.

    Returns
    -------
    tuple
        ``(X, y_true)`` where ``X`` is the feature matrix after
        ``StandardScaler().fit_transform`` and ``y_true`` holds the dataset's
        target labels (0 = malignant, 1 = benign).
    """
    # return_X_y=True hands back the (data, target) pair directly.
    raw_features, labels = load_breast_cancer(return_X_y=True)

    # Standardize the features (important for K-Means)
    scaled_features = StandardScaler().fit_transform(raw_features)

    return scaled_features, labels


In [None]:
def get_table(table, dataset_sizes):
    # Return the list results
    dataset_sizes = ["train % -->"] + dataset_sizes

    # Create a pandas DataFrame
    df = pd.DataFrame(table, columns=dataset_sizes)

    # Use pandas styling for a nice table display
    styled_df = df.style.set_table_styles(
        [{'selector': 'thead th',
          'props': [('background-color', '#4CAF50'),
                    ('color', 'white'),
                    ('font-weight', 'bold')]},
        {'selector': 'tbody tr:nth-child(odd)',
          'props': [('background-color', '#f2f2f2')]},
        {'selector': 'tbody tr:nth-child(even)',
          'props': [('background-color', '#ffffff')]},
          {'selector': 'td', 'props': [('text-align', 'left')]}]
    )

    # Display the table
    return(styled_df.hide(axis="index"))

In [None]:
n_clusters = 3
random_state = 42

# Fraction of the full dataset used as the training pool. Every entry of
# dataset_sizes below is a fraction of this pool, so "1" means 70% of the data.
train_fraction = 0.7

# X, y_true = getData(n_clusters, random_state)

X, y_true = getDiseaseData()

# Evaluate K-Means for different dataset sizes
dataset_sizes = ["0.5", "0.6", "0.7", "0.8", "0.9", "1"]
results = {}

# One row per metric; the first element of each row is its label.
table = [
    ['Sum of Squared Errors'],
    ['Clustering Accuracy'],
    ['Time Taken'],
]

for size in dataset_sizes:
    # Split the dataset
    X_subset, _, y_subset, _ = train_test_split(
        X, y_true, train_size=(train_fraction * float(size)), random_state=random_state
    )
    y_subset = np.asarray(y_subset)

    # Evaluate K-Means
    sse, time_taken, y_pred = evaluate_kmeans(X_subset, y_subset, n_clusters)

    # K-Means cluster ids are arbitrary: cluster 0 is not "class 0". Comparing
    # them to the true labels directly gives numbers that depend only on how
    # the ids happen to be numbered. Map every cluster to the most frequent
    # true label among its members first, then score the mapped labels.
    # NOTE: the mapping uses the true labels, so this is a purity-style
    # (optimistic) accuracy, not a held-out classification accuracy.
    y_aligned = np.empty_like(y_subset)
    for cluster_id in np.unique(y_pred):
        members = y_pred == cluster_id
        member_labels, member_counts = np.unique(y_subset[members], return_counts=True)
        y_aligned[members] = member_labels[np.argmax(member_counts)]

    accuracy = evaluate_model(y_subset, y_aligned)

    table[0].append(sse)
    table[1].append(accuracy)
    table[2].append(time_taken)

    results[size] = {
        'SSE': sse,
        'Time Taken': time_taken,
        'Accuracy': accuracy,
        'Predictions': y_pred  # raw cluster ids, before label alignment
    }

for size, metrics in results.items():
    print(f"Dataset Size: {int(float(size) * 100)}%")
    print(f"  Sum of Squared Errors (SSE): {metrics['SSE']}")
    print(f"  Time Taken: {metrics['Time Taken']:.4f} seconds")
    print(f"  Clustering Accuracy: {metrics['Accuracy']:.4f}")

get_table(table, dataset_sizes)

Dataset Size: 50%
  Sum of Squared Errors (SSE): 3573.4565192822743
  Time Taken: 0.0030 seconds
  Clustering Accuracy: 0.0603
Dataset Size: 60%
  Sum of Squared Errors (SSE): 4373.1160094039815
  Time Taken: 0.0033 seconds
  Clustering Accuracy: 0.6513
Dataset Size: 70%
  Sum of Squared Errors (SSE): 5038.327061201475
  Time Taken: 0.0032 seconds
  Clustering Accuracy: 0.0576
Dataset Size: 80%
  Sum of Squared Errors (SSE): 5884.807559106865
  Time Taken: 0.0041 seconds
  Clustering Accuracy: 0.1321
Dataset Size: 90%
  Sum of Squared Errors (SSE): 6806.481803178522
  Time Taken: 0.0112 seconds
  Clustering Accuracy: 0.8380
Dataset Size: 100%
  Sum of Squared Errors (SSE): 6943.834610351634
  Time Taken: 0.0100 seconds
  Clustering Accuracy: 0.7487


train % -->,0.5,0.6,0.7,0.8,0.9,1
Sum of Squared Errors,3573.456519,4373.116009,5038.327061,5884.807559,6806.481803,6943.83461
Clustering Accuracy,0.060302,0.651261,0.057554,0.132075,0.837989,0.748744
Time Taken,0.00302,0.003256,0.003158,0.004094,0.011159,0.010032
