In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Mall_Customers (1).csv')

In [3]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
features = df.drop(columns=['Spending Score (1-100)']).select_dtypes(include=[np.number]).values

In [6]:
#compute the distance matrix
def compute_distance_matrix(data):
    data = np.array(data)
    n = len(data)
    matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(i + 1, n):
            dist = np.linalg.norm(data[i] - data[j])
            matrix[i, j] = matrix[j, i] = dist

    return matrix


In [7]:
#perform agglomerative clustering(Single linkage)
def agglomerative_clustering(data,num_clusters):
  clusters = {i:[i] for i in range(len(data))}
  distance_matrix = compute_distance_matrix(data)

  while len(clusters) > num_clusters:
    min_dist = float("inf")
    closest_pair = NameError

    cluster_keys = list(clusters.keys())
    for i in range(len(cluster_keys)):
      for j in range(i+1,len(cluster_keys)):
        c1,c2 = cluster_keys[i],cluster_keys[j]
        #compute min dist (single linkage)
        dist = min(distance_matrix[p1][p2] for p1 in clusters[c1] for p2 in clusters[c2])

        if dist < min_dist:
          min_dist = dist
          closest_pair = (c1,c2)

    c1,c2 = closest_pair
    clusters[c1].extend(clusters[c2])
    del clusters[c2]

  return clusters

In [10]:
def compute_sse(clusters, data):
    sse = 0

    for cluster in clusters.values():
        if len(cluster) == 0:
            continue

        # Compute centroid
        cluster_points = [data[i] for i in cluster]
        centroid = np.mean(cluster_points, axis=0)

        # Compute SSE for this cluster
        sse += sum(np.linalg.norm(point - centroid) ** 2 for point in cluster_points)

    return sse


In [11]:
clusters = agglomerative_clustering(features,num_clusters=10)

In [12]:
# Compute SSE for the final clusters
sse_value = compute_sse(clusters, features)

# Print the final clusters and SSE value
print("Final Clusters:", clusters)
print("Sum of Squared Errors (SSE):", sse_value)


Final Clusters: {0: [0, 1, 2, 3, 5, 7, 13, 15, 17, 4, 6, 11, 14, 16, 19, 20, 23, 25, 9, 22, 26, 27, 28, 21, 29, 31, 33, 35, 39, 41, 45, 47, 48, 49, 51, 52, 43, 36, 37, 38, 32, 34, 18, 58, 30, 24, 61, 65, 68, 8, 10, 12, 42, 44, 46, 50, 54, 56, 55, 59, 63, 53, 57, 60, 62, 64, 67, 70, 66, 71, 72, 73, 74, 80, 76, 79, 83, 85, 77, 81, 86, 89, 92, 96, 98, 101, 104, 107, 93, 75, 78, 84, 87, 91, 88, 94, 95, 97, 99, 100, 103, 105, 111, 113, 114, 115, 112, 120, 124, 125, 121, 122, 123, 126, 127, 129, 131, 130, 136, 132, 133, 135, 137, 139, 141, 143, 142, 145, 144, 147, 148, 149, 150, 152, 154, 151, 153, 156, 158, 155, 157, 159, 161, 163, 134, 138, 146, 165, 167, 169, 168, 172, 173, 170, 171, 166, 175, 177, 179, 180, 181, 183, 185, 187, 182, 184, 188, 189, 190, 191, 164, 160, 186, 192, 193, 195, 197, 82, 90, 162], 40: [40], 69: [69], 102: [102, 106, 108, 109, 110, 116], 117: [117, 118, 119], 128: [128], 140: [140], 174: [174, 176, 178], 194: [194, 196], 198: [198, 199]}
Sum of Squared Errors (SSE)