<a href="https://colab.research.google.com/github/mmgaber/ClassDecomposition/blob/main/ClassDecomp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

def cluster_data(X, y, num_subclasses_per_class, clustering_model):
    clustered_data = []

    unique_classes = np.unique(y)

    for class_label in unique_classes:
        # Select data points belonging to the current class
        class_indices = np.where(y == class_label)[0]
        class_data = X[class_indices]

        # Apply clustering to the data of the current class
        clustering_model.n_clusters = num_subclasses_per_class.get(class_label, 2)
        labels = clustering_model.fit_predict(class_data)

        # Create subclass labels indicative of the main class and cluster number
        subclass_labels = np.array([f"{class_label}_cluster_{cluster}" for cluster in range(clustering_model.n_clusters)])

        # Append the clustered data with indicative subclass labels to the result
        clustered_data.extend(list(zip(class_data, subclass_labels[labels])))

    # Separate clustered data into X_clustered and y_clustered
    X_clustered, y_clustered = zip(*clustered_data)

    return np.array(X_clustered), np.array(y_clustered)

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Specify the number of subclasses for each class
num_subclasses_per_class = {0: 2, 1: 3, 2: 2}

# Choose a clustering technique (e.g., KMeans)
clustering_model = KMeans()

# Preprocess the training data
X_train_clustered, y_train_clustered = cluster_data(X_train, y_train, num_subclasses_per_class, clustering_model)

# Train a classifier (K-nearest neighbors as an example) at the subclass level
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train_clustered, y_train_clustered)

# During testing, aggregate predictions at the class level
X_test_clustered, _ = cluster_data(X_test, y_test, num_subclasses_per_class, clustering_model)
y_pred_subclass = classifier.predict(X_test_clustered)

# Convert subclass predictions to class predictions
y_pred_class = np.array([int(label.split('_')[0]) for label in y_pred_subclass])

# Print subclass and equivalent class predictions
print("Subclass Predictions:")
print(y_pred_subclass)
print("Equivalent Class Predictions:")
print(y_pred_class)

# Output the confusion matrix at the class level
conf_matrix = confusion_matrix(y_test, y_pred_class)
print("\nConfusion Matrix (Class Level):")
print(conf_matrix)


Subclass Predictions:
['0_cluster_0' '0_cluster_0' '0_cluster_1' '0_cluster_0' '0_cluster_1'
 '0_cluster_0' '0_cluster_1' '0_cluster_0' '0_cluster_1' '0_cluster_1'
 '1_cluster_1' '1_cluster_1' '1_cluster_1' '1_cluster_0' '1_cluster_1'
 '1_cluster_0' '1_cluster_1' '1_cluster_2' '1_cluster_0' '2_cluster_1'
 '2_cluster_1' '2_cluster_0' '2_cluster_1' '2_cluster_0' '2_cluster_0'
 '2_cluster_0' '2_cluster_1' '2_cluster_1' '2_cluster_0' '2_cluster_1']
Equivalent Class Predictions:
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2]

Confusion Matrix (Class Level):
[[2 4 4]
 [6 3 0]
 [2 2 7]]


