### Some imports

In [None]:
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans
from keras.datasets import mnist

warnings.filterwarnings('ignore')

### An existing k-means algorithm with three different distance metrics

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances

class KMeansWithCustomDistance(KMeans):
    """KMeans subclass that stores a distance-metric name and can look up the
    nearest cluster center under that metric.

    Parameters
    ----------
    n_clusters : number of clusters, forwarded to ``KMeans``.
    init : initialisation method, forwarded to ``KMeans``.
    distance : one of ``'euclidean'``, ``'manhattan'`` or ``'cosine'``
        (default ``'euclidean'``).
    **kwargs : any further keyword arguments, forwarded to ``KMeans``.

    NOTE(review): ``_pairwise_distances_argmin`` is only defined here; nothing
    in this class calls it, and scikit-learn's ``KMeans.fit``/``predict`` do
    not appear to call a method of this name, so the chosen metric may not
    influence the clustering itself -- confirm against the installed version.
    """

    def __init__(self, n_clusters, init, distance='euclidean', **kwargs):
        super().__init__(n_clusters=n_clusters, init=init, **kwargs)
        self.distance = distance

    def _pairwise_distances_argmin(self, X):
        """Return, for every row of X, the index of the closest cluster center.

        Raises
        ------
        ValueError
            If ``self.distance`` is not one of the supported metric names.
        """
        if self.distance == 'euclidean':
            return np.argmin(euclidean_distances(X, self.cluster_centers_), axis=1)
        elif self.distance == 'manhattan':
            return np.argmin(manhattan_distances(X, self.cluster_centers_), axis=1)
        elif self.distance == 'cosine':
            # cosine_distances is already a distance (1 - cosine similarity);
            # subtracting it from 1 would turn it back into a similarity and
            # argmin would then select the *least* similar center.
            return np.argmin(cosine_distances(X, self.cluster_centers_), axis=1)
        else:
            raise ValueError(f"Invalid distance metric: {self.distance}")

### Get MNIST data

In [None]:
# HW4 asks for an 80/20 split of the 70000 MNIST images (70000 * 0.8 = 56000
# training samples). For now only 5600 samples per split are used (10% of that).
N_SAMPLES = 5600

# load_data() returns 60000 training images and 10000 test images.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train, y_train = X_train[:N_SAMPLES], y_train[:N_SAMPLES]
X_test, y_test = X_test[:N_SAMPLES], y_test[:N_SAMPLES]

# Flatten every 28x28 image into a 784-dimensional vector and divide by 255.
# The test images receive exactly the same preprocessing as the training
# images; otherwise a model fitted on X_train cannot be applied to X_test.
X_train = X_train.reshape(len(X_train), -1).astype('float32') / 255.0
X_test = X_test.reshape(len(X_test), -1).astype('float32') / 255.0

kmeans = KMeansWithCustomDistance(n_clusters=10, init='k-means++', distance='manhattan')

### Get clusters

In [None]:
# Fit the model on the training images, then collect the fitted centers and
# the cluster id assigned to every training image.
kmeans.fit(X_train)
centers_manhattan = kmeans.cluster_centers_
labels_manhattan = kmeans.predict(X_train)

print("Cluster labels (Manhattan):", labels_manhattan)
print("Cluster centers (Manhattan):", centers_manhattan)


### To see the diagram, uncomment the cell below

In [None]:
#from sklearn.decomposition import PCA

# pca = PCA(n_components=2)
# X_train_pca = pca.fit_transform(X_train)
# 
# kmeans.fit(X_train_pca)
# labels_manhattan_pca = kmeans.predict(X_train_pca)
# 
# centers_pca_manhattan = kmeans.cluster_centers_
# print(centers_pca_manhattan)
# 
# plt.figure(figsize=(10, 10))
# plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=labels_manhattan_pca, cmap='viridis')
# plt.scatter(centers_pca_manhattan[:, 0], centers_pca_manhattan[:, 1], c='red', s=100, alpha=0.5)
# plt.title('Cluster Centers and Data Points (PCA)')
# plt.show()

## P.S.
But these are not the real labels of the images: the values in kmeans.labels_ are only cluster ids. For example, an image with the value 6 in kmeans.labels_ has features similar to the other images with the value 6, but that does not mean it shows the digit 6. The cluster id carries no further meaning.

To match the cluster ids with the real labels, we can do the following:

- Group the images that were assigned to the same cluster
- Compute the frequency distribution of their actual labels (using np.bincount)
- Find the most frequent label (using np.argmax) and assign it to the cluster.

In [None]:
def infer_cluster_labels(kmeans, actual_labels):
    """
    Associates most probable label with each cluster in KMeans model

    kmeans: fitted model exposing ``n_clusters`` and ``labels_``
    actual_labels: true labels (non-negative integers) in the same order as
        ``kmeans.labels_``
    returns: dictionary of clusters assigned to each label; a cluster that
        has no members is left out instead of raising an error
    """
    actual_labels = np.asarray(actual_labels)
    assignments = np.asarray(kmeans.labels_)

    inferred_labels = {}

    # Loop through the clusters
    for cluster in range(kmeans.n_clusters):
        # true labels of the points that fell into this cluster
        members = actual_labels[assignments == cluster].ravel()

        # an empty cluster has no majority label (argmax of an empty
        # histogram would raise), so there is nothing to record for it
        if members.size == 0:
            continue

        # determine most common label and register the cluster under it
        majority_label = np.argmax(np.bincount(members))
        inferred_labels.setdefault(majority_label, []).append(cluster)

    return inferred_labels

def infer_data_labels(X_labels, cluster_labels):
    """
    Translate the cluster id of every sample into the label that cluster stands for.

    X_labels: cluster id of each sample
    cluster_labels: dictionary mapping a label to the list of clusters assigned to it
    returns: uint8 array with one predicted label per sample; samples whose
        cluster is not listed in cluster_labels keep the value 0
    """
    # Invert the mapping once: cluster id -> label. If a cluster were listed
    # under several labels, the one iterated last wins.
    label_of_cluster = {}
    for label, member_clusters in cluster_labels.items():
        for member in member_clusters:
            label_of_cluster[member] = label

    predicted_labels = np.zeros(len(X_labels), dtype=np.uint8)

    for position, cluster in enumerate(X_labels):
        if cluster in label_of_cluster:
            predicted_labels[position] = label_of_cluster[cluster]

    return predicted_labels

In [None]:
# Re-fit the model, read the cluster id of every training image, derive the
# cluster -> label mapping and translate cluster ids into predicted labels.
kmeans.fit(X_train)
X_clusters_labels = kmeans.predict(X_train)
cluster_labels = infer_cluster_labels(kmeans, y_train)
predicted_labels_with_argmax = infer_data_labels(X_clusters_labels, cluster_labels)

# First 20 predictions followed by the first 20 true labels, one per line.
print(predicted_labels_with_argmax[:20], y_train[:20], sep='\n')

### To see the KMeans performance for different numbers of clusters, uncomment the cell below

In [None]:
# from sklearn.metrics import accuracy_score
# 
# clusters = [10, 16, 36, 64, 144, 256]
# acc_list = []
# 
# for n_clusters in clusters:
#     estimator = KMeansWithCustomDistance(n_clusters=n_clusters, init='k-means++', distance='manhattan')
#     estimator.fit(X_train)
# 
#     # Determine predicted labels
#     cluster_labels = infer_cluster_labels(estimator, y_train)
#     prediction = infer_data_labels(estimator.labels_, cluster_labels)
#     
#     acc = accuracy_score(y_train, prediction)
#     acc_list.append(acc)
#     print('Accuracy: {}\n'.format(acc))

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
# pd.set_option('display.max_columns', 7)

# label_counts[c][l] counts the samples with true label l that ended up in
# cluster c, so the rows of the table are clusters and the columns are labels.
row_labels = [f'Cluster{i}' for i in range(10)]
column_labels = [f'Label{i}' for i in range(10)]

accuracy_scores_for_trainin_error = []
confusion_matrices_for_trainin_error = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kf.split yields index arrays into X_train for every fold. Only the training
# part of each fold is evaluated below (training error); the held-out part is
# extracted but not used in this cell.
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    kmeans.fit(X_train_fold)

    cluster_labels_fold = infer_cluster_labels(kmeans, y_train_fold)
    X_cluster_labels_fold = kmeans.predict(X_train_fold)
    predicted_labels_fold = infer_data_labels(X_cluster_labels_fold, cluster_labels_fold)

    clusters = kmeans.labels_
    label_counts = np.zeros((10, 10))

    # kmeans.labels_ is aligned with X_train_fold, so each cluster id has to be
    # paired with y_train_fold (indexing the full y_train by fold position
    # would pair it with the label of an unrelated sample).
    np.add.at(label_counts, (clusters, y_train_fold), 1)

    print(pd.DataFrame(label_counts, index=row_labels, columns=column_labels))
    print()

    # Comment out this loop to hide the per-cluster label distributions.
    for i in range(10):
        plt.bar(range(10), label_counts[i])
        plt.title(f"Label distribution in cluster {i}")
        plt.xlabel("Label")
        plt.show()

    accuracy = accuracy_score(y_train_fold, predicted_labels_fold)
    cm = confusion_matrix(y_train_fold, predicted_labels_fold)

    accuracy_scores_for_trainin_error.append(accuracy)
    confusion_matrices_for_trainin_error.append(cm)

In [None]:
# One heatmap per fold; the fold number (starting at 1) and its accuracy are
# shown in the x-axis caption.
fold_results = zip(accuracy_scores_for_trainin_error, confusion_matrices_for_trainin_error)
for fold_number, (score, fold_matrix) in enumerate(fold_results, start=1):
    plt.figure(figsize=(5, 5))
    sns.heatmap(fold_matrix, annot=True, fmt='d', cmap='Greens')
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel(f'{fold_number}. Fold cross validation Score: {score:.3f}')

plt.show()

# Calculate average accuracy across all folds
avg_accuracy = np.mean(accuracy_scores_for_trainin_error)
print(f"Average accuracy: {avg_accuracy}")

### IMPORTANT NOTE
> As can be seen from the tables I created, when k-fold cross-validation is applied, the values in the distributions are close to each other. To avoid this, I will try two separate approaches.
> The first is the approach I am trying to implement here. The second is to compute the result without applying k-fold. When I try this, you will see that the distribution looks better.

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
# pd.set_option('display.max_columns', 7)

# The count table built from label_counts is indexed as
# label_counts[cluster][label]: rows are clusters, columns are true labels.
row_labels = [f'Cluster{i}' for i in range(10)]
column_labels = [f'Label{i}' for i in range(10)]

accuracy_scores_for_trainin_error = []
confusion_matrices_for_trainin_error = []

# NOTE(review): kf is created but not used in the visible remainder of the
# notebook; kept in case a later cell relies on it.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fit a fresh model on the whole training sample (no k-fold splitting).
kmeans = KMeansWithCustomDistance(n_clusters=10, init='k-means++', distance='manhattan')
kmeans.fit(X_train)

cluster_labels = infer_cluster_labels(kmeans, y_train)
X_cluster_labels = kmeans.predict(X_train)
predicted_labels = infer_data_labels(X_cluster_labels, cluster_labels)

clusters = kmeans.labels_
label_counts = np.zeros((10, 10))

In [None]:
# Tally every (cluster id, true label) pair: the first axis of label_counts is
# the cluster id, the second axis is the true label. np.add.at accumulates
# repeated index pairs, matching a one-by-one "+= 1" loop.
np.add.at(label_counts, (clusters, y_train), 1)

conjucted_matrix = pd.DataFrame(label_counts, index=row_labels, columns=column_labels)

### Training Error — Confusion Matrix & Accuracy

In [None]:
# Show the DataFrame that wraps label_counts (sample counts per cluster id and true label).
print(conjucted_matrix)

### All label distribution in clusters

In [None]:
# Draw one bar chart per cluster showing how many samples of each true label
# it contains. Comment out this loop to hide the charts.
for i in range(10):
    plt.bar(range(10), label_counts[i])
    plt.title(f"Label distribution in cluster {i}")
    plt.xlabel("Label")
    plt.show()

In [None]:
# 1-nearest-neighbour classifier fitted on the training sample and applied to
# the test sample.
# NOTE(review): predict() needs X_test in the same flattened, scaled layout as
# X_train -- verify the preprocessing in the data-loading cell.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = knn.predict(X_test)