In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.spatial import distance
import math
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import pairwise_distances
import seaborn as sns


In [None]:
import pickle, os

def load_dataset(name_file):

    desired_directory = '/content/drive/MyDrive/Phase 3/' # Replace with your desired directory path

    file_path = os.path.join(desired_directory, name_file)

    with open(file_path, 'rb') as f:
        data_dict = pickle.load(f)

    return data_dict


In [None]:
loaders_dict = load_dataset(f"dataset-flowers102-features.pkl")
x_train = loaders_dict["x_train"]
x_test = loaders_dict["x_test"]
y_train = loaders_dict["y_train"]
y_test = loaders_dict["y_test"]
print(f"x_train:{x_train.shape}, y_train:{y_train.shape}")
print(f"x_test:{x_test.shape}, y_test:{y_test.shape}")


x_train:(4094, 512), y_train:(4094,)
x_test:(4095, 512), y_test:(4095,)


# Implementing K-means:

In [None]:
# Clustering with K-Means and centroid
n_clusters = 50

scaler = StandardScaler()
x_train_normalized = scaler.fit_transform(x_train)
x_test_normalized = scaler.transform(x_test)


kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(x_train_normalized)

cluster_labels_train = kmeans.predict(x_train_normalized)
clusters = [[] for _ in range(n_clusters)]
labels_train = [[] for _ in range(n_clusters)]

for i in range(len(x_train)):
    cluster_id = cluster_labels_train[i]
    clusters[cluster_id].append(x_train[i])
    labels_train[cluster_id].append(y_train[i])

centroids = kmeans.cluster_centers_

# Ensure centroids is 2D
if len(centroids.shape) == 1:
    centroids = centroids.reshape(1, -1)

# Calculate the distance of each point to each centroid
distances_to_centroids = cdist(x_train_normalized, centroids, 'euclidean')

# Find the closest centroid for each data point
closest_centroid_indices = np.argmin(distances_to_centroids, axis=1)

# Reassign points to clusters based on closest centroid
new_cluster_labels_train = closest_centroid_indices



***Display centroids and distances and clustering...***

In [None]:
print("Centroids shape:", centroids.shape)
print(type(centroids))

Centroids shape: (50, 512)
<class 'numpy.ndarray'>


In [None]:
# 1. Print the first few rows of x_train_normalized
print("First few rows of x_train_normalized:")
df_x_train_normalized = pd.DataFrame(x_train_normalized[:5], columns=[f"Feature_{i}" for i in range(x_train_normalized.shape[1])])
print(df_x_train_normalized)

# 2. Print the centroids
print("\nCluster centroids:")
df_centroids = pd.DataFrame(centroids, columns=[f"Feature_{i}" for i in range(centroids.shape[1])])
print(df_centroids)

# 3. Calculate and print the distances from the first few points in x_train_normalized to the centroids using Euclidean distance
first_few_points = x_train_normalized[:5]
distances_to_centroids_euclidean_few = cdist(first_few_points, centroids, 'euclidean')

print("\nDistances to centroids for the first few points using Euclidean distance:")
df_distances_euclidean_few = pd.DataFrame(distances_to_centroids_euclidean_few, columns=[f"Centroid_{i}" for i in range(centroids.shape[0])])
print(df_distances_euclidean_few)

# Calculate and print the distances using Manhattan distance
distances_to_centroids_manhattan_few = cdist(first_few_points, centroids, 'cityblock')

print("\nDistances to centroids for the first few points using Manhattan distance:")
df_distances_manhattan_few = pd.DataFrame(distances_to_centroids_manhattan_few, columns=[f"Centroid_{i}" for i in range(centroids.shape[0])])
print(df_distances_manhattan_few)

First few rows of x_train_normalized:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0  -0.724619  -0.399964  -0.866279  -0.721773  -0.721808   0.351396   
1  -0.080654  -0.870273   2.029314  -0.778844  -0.751627  -0.788599   
2  -0.724619   0.933212   1.241145  -0.251500  -0.505377   0.697888   
3   0.090268  -0.834964   1.368826   1.334334  -0.711382   0.317856   
4  -0.724619  -0.862510  -0.866279   0.429373  -0.316709  -0.154648   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_502  Feature_503  \
0  -0.742039  -0.485091  -0.741201   2.301722  ...    -0.664913    -0.275243   
1  -0.521359   2.740462   0.369321  -0.774194  ...    -0.664913    -0.832949   
2  -0.401839  -0.647586  -0.730270  -0.748674  ...    -0.367257    -0.828994   
3  -0.731979   1.062754  -0.691257  -0.774194  ...    -0.663871    -0.832949   
4  -0.428446  -0.869966  -0.373382  -0.774194  ...    -0.303365    -0.830888   

   Feature_504  Feature_505  Feature_506  Feature_507 

In [None]:
# Print the centroid values
for cluster_id, centroid in enumerate(centroids):
    print(f"Centroid for Cluster {cluster_id}: {centroid}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  1.85077831e-01 -4.35167134e-01  6.47650719e-01 -3.34776118e-02
  9.57347453e-01 -6.90471947e-01 -2.75565445e-01 -5.09522855e-01
  3.83386672e-01 -6.39990568e-01  3.04688752e-01 -1.22992815e-02
  1.02456674e-01  5.81324637e-01 -5.87710261e-01  8.01205754e-01]
Centroid for Cluster 9: [ 5.77044249e-01 -4.07129079e-01 -2.78301805e-01 -1.50345722e-02
 -6.38078511e-01  3.66374031e-02  1.86131343e-01  1.84117369e-02
  1.92258298e-01 -3.94790888e-01  5.83982527e-01 -2.14183927e-01
  7.07708299e-01 -2.47961566e-01 -6.23325884e-01 -6.35434091e-01
 -6.56925499e-01  8.55454445e-01  3.01677268e-02 -5.56461632e-01
 -3.80869031e-01 -5.90160310e-01 -3.70652765e-01 -9.83116105e-02
 -1.85486078e-01 -3.67747562e-05  5.93591690e-01 -3.84228766e-01
  1.43162921e-01 -1.32555395e-01  1.18156575e-01  1.09238946e+00
 -1.37958899e-01 -1.27161015e-03 -4.57504392e-01 -1.89928338e-01
  9.43428427e-02 -1.22778647e-01 -4.11444068e-01  6.87995732e-01


In [None]:
print("x_train_normalized shape:", x_train_normalized.shape)

print("labels_train:")
for idx, label in enumerate(labels_train):
    print(f"{idx}: {label}")


# Count the number of data points in each cluster
counts = np.bincount(cluster_labels_train)

# Print the counts
print("Data points in each cluster:")
for i, count in enumerate(counts):
    print(f"Cluster {i}: {count} data points")


x_train_normalized shape: (4094, 512)
labels_train:
0: [57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 30, 57, 57, 57, 57, 57, 57, 30, 57, 57, 57, 57, 57, 57, 57, 57, 57, 52, 57, 57, 57, 57, 57, 57, 57, 57, 57, 100, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 30, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57]
1: [39, 93, 3, 76, 3, 66, 81, 93, 95, 83, 10, 84, 84, 93, 39, 96, 94, 83, 39, 39, 83, 84, 39, 39, 39, 83, 3, 39, 92, 100, 2, 19, 39, 86, 88, 93, 96, 39, 39, 2, 39, 39, 3, 39, 86, 2, 3, 94, 96, 39, 98, 3, 93, 39, 93, 66, 83, 96, 93, 39, 3, 39, 39, 38, 3, 84, 39, 67, 39, 3, 39, 84, 3, 39, 67, 93, 93, 67, 39, 2, 39, 92, 2, 94, 94, 96, 83, 93, 81, 84, 3, 84, 93, 81, 8, 3, 39, 93, 94, 90, 96, 93, 22, 39, 39, 93, 96, 3, 39, 11, 39, 3, 2, 22]
2: [56, 70, 58, 70, 70, 70, 70, 70, 43, 91, 58, 70, 58, 70, 70, 98, 70, 70, 70, 70, 98, 98, 58, 58, 99, 70, 58, 70, 99, 58, 70, 58, 58, 58, 58, 70, 70, 70, 91, 99, 70, 70, 70, 70, 58, 70, 70, 58, 98, 58, 98, 70, 58, 58, 58, 58, 91, 58, 58, 70, 98,

In [None]:
print("new_cluster_labels_train shape:", new_cluster_labels_train.shape)

print("new labels_train:")
for idx, label in enumerate(new_cluster_labels_train):
    print(f"{idx}: {label}")

new_cluster_labels_train shape: (4094,)
new labels_train:
0: 30
1: 9
2: 8
3: 8
4: 40
5: 21
6: 17
7: 3
8: 34
9: 20
10: 31
11: 28
12: 37
13: 14
14: 13
15: 12
16: 40
17: 17
18: 0
19: 25
20: 39
21: 34
22: 31
23: 29
24: 9
25: 19
26: 12
27: 9
28: 6
29: 10
30: 34
31: 32
32: 11
33: 21
34: 0
35: 40
36: 11
37: 13
38: 44
39: 12
40: 25
41: 3
42: 1
43: 37
44: 9
45: 15
46: 22
47: 24
48: 34
49: 23
50: 12
51: 24
52: 21
53: 15
54: 40
55: 13
56: 0
57: 43
58: 26
59: 25
60: 39
61: 4
62: 5
63: 45
64: 10
65: 36
66: 3
67: 26
68: 0
69: 8
70: 12
71: 9
72: 39
73: 18
74: 22
75: 12
76: 49
77: 3
78: 10
79: 8
80: 18
81: 42
82: 35
83: 26
84: 9
85: 32
86: 32
87: 34
88: 8
89: 17
90: 22
91: 17
92: 32
93: 3
94: 28
95: 14
96: 15
97: 49
98: 28
99: 21
100: 31
101: 34
102: 7
103: 28
104: 1
105: 25
106: 47
107: 1
108: 16
109: 8
110: 29
111: 31
112: 16
113: 13
114: 28
115: 49
116: 12
117: 12
118: 19
119: 32
120: 36
121: 45
122: 5
123: 3
124: 4
125: 25
126: 4
127: 25
128: 13
129: 30
130: 3
131: 26
132: 2
133: 2
134: 9
135: 24


In [None]:
from collections import Counter
from sklearn.metrics import adjusted_rand_score, confusion_matrix

# Create a dictionary to store cluster accuracies
cluster_accuracies = {}

# Determine unique cluster labels
unique_clusters = set(cluster_labels_train)

# Compute accuracy for each cluster
for cluster in unique_clusters:
    # Get indices of data points in the current cluster
    indices = [i for i, label in enumerate(cluster_labels_train) if label == cluster]

    # Get true labels of data points in this cluster
    true_labels_in_cluster = [y_train[i] for i in indices]

    # Calculate the most frequent true label in this cluster
    most_common_label = Counter(true_labels_in_cluster).most_common(1)[0][0]

    # Calculate cluster accuracy
    accuracy = true_labels_in_cluster.count(most_common_label) / len(indices)

    # Store the accuracy for this cluster
    cluster_accuracies[cluster] = accuracy

# Print cluster accuracies
for cluster, accuracy in cluster_accuracies.items():
    print(f"Cluster {cluster}: Accuracy = {accuracy:.2f}")


# Compute the Rand Index
rand_index = adjusted_rand_score(cluster_labels_train, y_train)
print(f"Rand Index: {rand_index:.2f}")

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_train, cluster_labels_train)
print("Confusion Matrix:")
print(conf_matrix)


Cluster 0: Accuracy = 0.92
Cluster 1: Accuracy = 0.26
Cluster 2: Accuracy = 0.44
Cluster 3: Accuracy = 0.28
Cluster 4: Accuracy = 0.85
Cluster 5: Accuracy = 0.80
Cluster 6: Accuracy = 0.34
Cluster 7: Accuracy = 0.37
Cluster 8: Accuracy = 0.37
Cluster 9: Accuracy = 0.32
Cluster 10: Accuracy = 0.34
Cluster 11: Accuracy = 0.50
Cluster 12: Accuracy = 0.34
Cluster 13: Accuracy = 0.96
Cluster 14: Accuracy = 0.36
Cluster 15: Accuracy = 0.26
Cluster 16: Accuracy = 0.49
Cluster 17: Accuracy = 0.53
Cluster 18: Accuracy = 0.89
Cluster 19: Accuracy = 0.80
Cluster 20: Accuracy = 1.00
Cluster 21: Accuracy = 0.24
Cluster 22: Accuracy = 0.48
Cluster 23: Accuracy = 1.00
Cluster 24: Accuracy = 0.53
Cluster 25: Accuracy = 0.22
Cluster 26: Accuracy = 0.33
Cluster 27: Accuracy = 1.00
Cluster 28: Accuracy = 0.76
Cluster 29: Accuracy = 0.95
Cluster 30: Accuracy = 0.84
Cluster 31: Accuracy = 0.60
Cluster 32: Accuracy = 0.26
Cluster 33: Accuracy = 0.86
Cluster 34: Accuracy = 0.88
Cluster 35: Accuracy = 0.34
Cl

In [None]:
from sklearn.metrics import confusion_matrix


# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train, new_cluster_labels_train)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 6 0 ... 0 0 0]
 ...
 [0 0 3 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Implement KNN

In [None]:
pip install scikit-learn



testing knn for start

In [None]:
# Define the function to apply k-NN on the K nearest clusters
def k_nearest_clusters(test_data, k, cluster_centers, cluster_labels, x_train, y_train):
    # 1. Identify the K Nearest Clusters

    # distances = cdist(test_data, cluster_centers, 'cityblock')
    distances = cdist(test_data, cluster_centers, 'euclidean')
    # distances = pairwise_distances(test_data, cluster_centers,'chebyshev')
    # distances = pairwise_distances(test_data, cluster_centers,'cosine')
    # distances = pairwise_distances(test_data, cluster_centers,'hamming')



    nearest_cluster_indices = distances.argsort()[0][:k]
    nearest_clusters = [cluster_labels[i] for i in nearest_cluster_indices]

    # 2. Aggregate Data from K Nearest Clusters
    aggregated_data = []
    for cluster_id in nearest_clusters:
        indices = [i for i, label in enumerate(cluster_labels) if label == cluster_id]
        aggregated_data.extend([(x_train[i], y_train[i]) for i in indices])

    # Extract features and labels separately for k-NN
    x_aggregated = [data[0] for data in aggregated_data]
    y_aggregated = [data[1] for data in aggregated_data]

    # 3. Apply k-NN on the Aggregated Data
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_aggregated, y_aggregated)
    prediction = knn.predict(test_data)


    # print(nearest_clusters)

    return prediction[0]

# Initialize counter for correct predictions
correct_predictions = 0

# Iterate over all test samples
for idx in range(len(x_test_normalized)):
    test_sample = [x_test_normalized[idx]]

    # Predicted label using k-NN on the K nearest clusters
    predicted_label = k_nearest_clusters(test_sample, 10, kmeans.cluster_centers_, new_cluster_labels_train, x_train_normalized ,y_train)

    # True label of the test data
    true_label = y_test[idx]


    # # Print true and predicted labels for the current test sample
    # print(f"\nFor test data at index {idx}:")
    # print(f"True Label: {true_label}")
    # print(f"Predicted Label: {predicted_label}")

    # Compare the true label with the predicted label
    if true_label == predicted_label:
        correct_predictions += 1

# Calculate and print overall accuracy
accuracy_percentage = (correct_predictions / len(x_test_normalized)) * 100
print(f"\nOverall Accuracy: {accuracy_percentage:.2f}%")




Overall Accuracy: 19.44%


In [None]:
def calculate_distance(x, y, metric='euclidean'):
    if metric == 'euclidean':
        return np.linalg.norm(x - y)
    elif metric == 'cityblock':
        return np.sum(np.abs(x - y))
    elif metric == 'chebyshev':
        return np.max(np.abs(x - y))
    elif metric == 'cosine':
        return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    elif metric == 'hamming':
        # Convert numerical values to binary strings
        x_binary = ''.join(format(int(xi), 'b') for xi in x)
        y_binary = ''.join(format(int(yi), 'b') for yi in y)
        # Calculate Hamming distance
        return sum(xi != yi for xi, yi in zip(x_binary, y_binary))
    else:
        raise ValueError(f"Unknown distance metric: {metric}")

# Function for k-NN classification on clusters
def knn_for_k_nearest_clusters(k1, cluster_centers, cluster_labels, x_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=k1)

    # Calculate distances to cluster centers
    distances = cdist(x_train, cluster_centers, metric=calculate_distance)
    nearest_cluster_indices = distances.argsort()[:, :k1]

    # Gather the data points belonging to the k1 nearest clusters
    aggregated_data = []
    for indices in nearest_cluster_indices:
        aggregated_data.extend([(x_train[i], y_train[i]) for i in indices])

    x_aggregated = [data[0] for data in aggregated_data]
    y_aggregated = [data[1] for data in aggregated_data]

    # Train k-NN model on aggregated data
    knn.fit(x_aggregated, y_aggregated)

    return knn

# Function for k-NN classification on the dataset
def knn_dataset(k2, knn_model, test_data):
    distances_to_k1_cluster = cdist(test_data, knn_model._fit_X, metric=calculate_distance)
    nearest_indices_k1_cluster = distances_to_k1_cluster.argsort()[:, :k2]

    # Extract k2 nearest data points and their corresponding labels
    x_k2_nearest = knn_model._fit_X[nearest_indices_k1_cluster.flatten()]
    y_k2_nearest = knn_model._y[nearest_indices_k1_cluster.flatten()]

    # Use these k2 nearest data points to predict the label for the test data
    model = KNeighborsClassifier(n_neighbors=k2, metric=calculate_distance, algorithm='auto')
    model.fit(x_k2_nearest, y_k2_nearest)

    return model.predict(test_data)

k1_range = range(1, 11)
k2_range = range(1, 11)
best_score = 0

# Iterate over different values of K1 and K2
for K1 in k1_range:
    for K2 in k2_range:
        # Initialize a dictionary to store k1 nearest clusters for each test sample
        k1_nearest_clusters = {}
        correct_predictions_dataset = 0

        # Testing k-NN on clusters
        for idx in range(len(x_test_normalized[:500])):
            test_sample = [x_test_normalized[idx]]

            # Ensure idx is within the valid range
            if idx < len(new_cluster_labels_train):
                # Initialize k1_nearest_clusters for each test sample
                k1_nearest_cluster = new_cluster_labels_train[idx]
                indices_in_cluster = [i for i, label in enumerate(new_cluster_labels_train) if label == k1_nearest_cluster]

                # Store the k1 nearest cluster for this test sample
                k1_nearest_clusters[idx] = k1_nearest_cluster


                knn_model = knn_for_k_nearest_clusters(K1, centroids, cluster_labels_train, x_train_normalized, y_train)
                predicted_label = knn_dataset(K2, knn_model, test_sample)
                true_label = y_test[idx]

                if true_label == predicted_label:
                    correct_predictions_dataset += 1

        # Calculate accuracy for k-NN on datasets
        accuracy_percentage_dataset = (correct_predictions_dataset / len(x_test_normalized)) * 100

        # Calculate penalty
        penalty = -0.2 * K1

        # Calculate the score
        score = accuracy_percentage_dataset + penalty

        # Update best_score if the current score is higher
        if score > best_score:
            best_score = score

        # Print the overall accuracy and score for the current values of K1 and K2
        print(f"K1 = {K1}, K2 = {K2}, Overall Accuracy: {accuracy_percentage_dataset:.2f}%, Score: {score:.2f}")

# Print the best score obtained
print(f"Best Score: {best_score}")

K1 = 1, K2 = 1, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 2, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 3, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 4, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 5, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 6, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 7, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 8, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 9, Overall Accuracy: 0.10%, Score: -0.10
K1 = 1, K2 = 10, Overall Accuracy: 0.10%, Score: -0.10
K1 = 2, K2 = 1, Overall Accuracy: 0.10%, Score: -0.30
K1 = 2, K2 = 2, Overall Accuracy: 0.10%, Score: -0.30
K1 = 2, K2 = 3, Overall Accuracy: 0.10%, Score: -0.30
K1 = 2, K2 = 4, Overall Accuracy: 0.10%, Score: -0.30
