In [25]:
import pandas as pd
import numpy as np

data = pd.read_csv("wdbc.csv") # read the dataset from wdbc.csv (relative path) into a DataFrame

In [26]:
# DATA NORMALIZATION

# Build a separate frame without the "ID Number" and "Diagnosis" columns;
# `data` itself is left untouched.
x = data.drop(columns=["ID Number", "Diagnosis"])

# Standardize every remaining column: subtract its mean, then divide by its
# standard deviation.
column_means = x.mean()
column_stds = x.std()
X = (x - column_means) / column_stds

# NumPy copy of the standardized values; this is what the cells below use.
y = np.array(X)


In [28]:
import random

# Cluster counts advertised by the prompt; the Davies-Bouldin cells further
# down only compute and print a result for these values.
ALLOWED_K = (2, 3, 5)

k = int(input("Please specify a value of 'k' {2,3, or 5}"))
if k not in ALLOWED_K:
    # Fail early instead of silently producing no (or a partial) index later.
    raise ValueError(
        "k must be one of %s, got %d" % (", ".join(map(str, ALLOWED_K)), k)
    )

# Randomly pick k distinct row positions of y to act as the initial centroids.
init_centroids = random.sample(range(0, len(y)), k)

# For a repeatable run, replace the sample above with fixed row positions,
# e.g. init_centroids = [339, 113, 42, 96, 487]

print("Initial Centroids:", init_centroids)

Please specify a value of 'k' {2,3, or 5}2
Initial Centroids: [286, 238]


In [29]:
# Fetch the rows of y at the sampled positions and stack them into one array:
# these rows are the first centroids.
centroids = np.array([y[row_index] for row_index in init_centroids])

In [30]:
def calc_distance(X1, X2):
    """Return the Euclidean distance between two points.

    X1 and X2 must support element-wise subtraction and multiplication
    (e.g. NumPy arrays of equal length). The squared component
    differences are summed and the square root of that total is returned.
    """
    difference = X1 - X2
    squared_total = sum(difference ** 2)
    return squared_total ** 0.5

In [31]:
def findClosestCentroids(ic, y):
    """Assign every point to its nearest centroid.

    Parameters
    ----------
    ic : iterable of centroids
        Current centroids, in cluster-label order.
    y : iterable of points
        Data points, one per row.

    Returns
    -------
    list
        For each point in ``y`` (same order), the position in ``ic`` of the
        centroid with the smallest ``calc_distance``; the first one wins on
        ties because ``np.argmin`` returns the first minimum.
    """
    return [
        np.argmin([calc_distance(point, centre) for centre in ic])
        for point in y
    ]

In [32]:
# Label every row of y with the position of its nearest starting centroid.
get_centroids = findClosestCentroids(centroids, y)

In [33]:
# to calculate final centroids
def calc_centroids(clusters, y):
    """Return the mean point of every non-empty cluster.

    Parameters
    ----------
    clusters : sequence of int
        Cluster label of each row of ``y``, in row order.
    y : array-like
        Data points, one row per sample (anything ``pd.DataFrame`` accepts).

    Returns
    -------
    list of pandas.Series
        One Series of per-column means for each distinct label found in
        ``clusters``, ordered by ascending label. The order is made explicit
        with ``sorted`` so that position ``i`` in the result corresponds to
        the i-th smallest label rather than to set iteration order.

    Notes
    -----
    A label that owns no rows produces no entry, so positions match labels
    ``0..k-1`` only while every cluster is non-empty.
    """
    features = pd.DataFrame(y)
    labels = pd.Series(list(clusters))

    new_centroids = []
    for c in sorted(labels.unique()):
        # Positional boolean mask: rows whose label equals c.
        members = features[(labels == c).to_numpy()]
        new_centroids.append(members.mean(axis=0))
    return new_centroids

In [34]:
# centroids will get the final centroids in k clusters
MAX_ITERATIONS = 100  # upper bound on assignment/update rounds

previous_assignment = None
for _ in range(MAX_ITERATIONS):
    # Assignment step: label each row with its nearest current centroid.
    get_centroids = findClosestCentroids(centroids, y)

    # Converged: the labels match the ones `centroids` was just computed
    # from, so another update would reproduce the same centroids. Stopping
    # here leaves the same state the full 100 rounds would.
    if previous_assignment is not None and list(get_centroids) == list(previous_assignment):
        break

    # Update step: move each centroid to the mean of its assigned rows.
    centroids = calc_centroids(get_centroids, y)
    previous_assignment = get_centroids

In [35]:
# Print the centroids held in `centroids` once the iteration cell has run
print(centroids)


[0    -0.486348
1    -0.242569
2    -0.502207
3    -0.480579
4    -0.289452
5    -0.499943
6    -0.564452
7    -0.576955
8    -0.297474
9    -0.113033
10   -0.425520
11   -0.021773
12   -0.426185
13   -0.401195
14   -0.006059
15   -0.341563
16   -0.315407
17   -0.380435
18   -0.068028
19   -0.199100
20   -0.518746
21   -0.254911
22   -0.531308
23   -0.499866
24   -0.294838
25   -0.469762
26   -0.518366
27   -0.568576
28   -0.295513
29   -0.302517
dtype: float64, 0     0.985631
1     0.491589
2     1.017771
3     0.973940
4     0.586601
5     1.013182
6     1.143916
7     1.169254
8     0.602860
9     0.229073
10    0.862358
11    0.044125
12    0.863705
13    0.813061
14    0.012279
15    0.692210
16    0.639203
17    0.770989
18    0.137866
19    0.403495
20    1.051288
21    0.516602
22    1.076747
23    1.013026
24    0.597518
25    0.952017
26    1.050518
27    1.152275
28    0.598886
29    0.613081
dtype: float64]


In [36]:
# Davies-Bouldin index

def radius(centroids, num):
    """Return the squared Euclidean distance between two points.

    Despite the parameter name, ``centroids`` is a single point (one
    centroid) and ``num`` is the point it is compared with. No square root
    is taken here; callers apply ``** 0.5`` themselves where needed.
    """
    offset = centroids - num
    return sum(offset ** 2)

In [37]:
# Row positions of the samples assigned to each of the five supported
# cluster labels (0-4). Labels that never occur keep an empty list.
members_by_label = {0: [], 1: [], 2: [], 3: [], 4: []}

for row_index, label in enumerate(get_centroids):
    if label in members_by_label:
        members_by_label[label].append(row_index)

w, w2, w3, w4, w5 = (members_by_label[label] for label in range(5))

In [38]:
def _cluster_points(member_rows):
    """Stack the rows of ``y`` at the given positions into one array."""
    return np.array([y[row] for row in member_rows])


def _squared_radii(points, label):
    """Squared distance of every point to the centroid at index ``label``.

    ``radius`` returns the sum of squared component differences. For an
    empty ``points`` the result is ``[]`` and ``centroids[label]`` is never
    looked up, so unused cluster slots (label >= k) are safe.
    """
    return [radius(centroids[label], point) for point in points]


# associate each data point to its cluster
num = _cluster_points(w)
num2 = _cluster_points(w2)
num3 = _cluster_points(w3)
num4 = _cluster_points(w4)
num5 = _cluster_points(w5)

# find the numerator of davies-bouldin index:
# per-point squared distances to the owning cluster's centroid
radius_a = _squared_radii(num, 0)
radius_b = _squared_radii(num2, 1)
radius_c = _squared_radii(num3, 2)
radius_d = _squared_radii(num4, 3)
radius_e = _squared_radii(num5, 4)

In [39]:
# find denominator and final radius for k clusters
#
# For every pair of clusters (a, b):
#   denominator_ab : distance between the two centroids (0-based indices)
#   radi_a_b       : sum of the two cluster scatters (1-based indices)
#   radius_ab      : radi_a_b / denominator_ab (1-based indices)
# A cluster's scatter is the square root of the mean of its squared
# point-to-centroid distances.
if k > 1:
    denominator_01 = radius(centroids[0], centroids[1]) ** 0.5
    scatter_1 = (sum(radius_a) / len(num)) ** 0.5
    scatter_2 = (sum(radius_b) / len(num2)) ** 0.5
    radi_1_2 = scatter_1 + scatter_2
    radius_12 = radi_1_2 / denominator_01

    if k > 2:
        denominator_02 = radius(centroids[0], centroids[2]) ** 0.5
        scatter_3 = (sum(radius_c) / len(num3)) ** 0.5
        radi_1_3 = scatter_1 + scatter_3
        radius_13 = radi_1_3 / denominator_02

        denominator_12 = radius(centroids[1], centroids[2]) ** 0.5
        radi_2_3 = scatter_2 + scatter_3
        radius_23 = radi_2_3 / denominator_12

    if k == 5:
        # Pairs involving the fourth and fifth clusters; the first three
        # scatters were already computed above because k == 5 implies k > 2.
        denominator_03 = radius(centroids[0], centroids[3]) ** 0.5
        scatter_4 = (sum(radius_d) / len(num4)) ** 0.5
        radi_1_4 = scatter_1 + scatter_4
        radius_14 = radi_1_4 / denominator_03

        denominator_04 = radius(centroids[0], centroids[4]) ** 0.5
        scatter_5 = (sum(radius_e) / len(num5)) ** 0.5
        radi_1_5 = scatter_1 + scatter_5
        radius_15 = radi_1_5 / denominator_04

        denominator_13 = radius(centroids[1], centroids[3]) ** 0.5
        radi_2_4 = scatter_2 + scatter_4
        radius_24 = radi_2_4 / denominator_13

        denominator_14 = radius(centroids[1], centroids[4]) ** 0.5
        radi_2_5 = scatter_2 + scatter_5
        radius_25 = radi_2_5 / denominator_14

        denominator_23 = radius(centroids[2], centroids[3]) ** 0.5
        radi_3_4 = scatter_3 + scatter_4
        radius_34 = radi_3_4 / denominator_23

        denominator_24 = radius(centroids[2], centroids[4]) ** 0.5
        radi_3_5 = scatter_3 + scatter_5
        radius_35 = radi_3_5 / denominator_24

        denominator_34 = radius(centroids[3], centroids[4]) ** 0.5
        radi_4_5 = scatter_4 + scatter_5
        radius_45 = radi_4_5 / denominator_34


In [40]:
# print the davies-bouldin index
# The index is the average, over clusters, of each cluster's largest
# pairwise ratio. Nothing is printed for values of k other than 2, 3 or 5.
if k == 2:
    # Only one pair exists, so its ratio is the index.
    print("Davies-Bouldin index is:", radius_12)
elif k in (3, 5):
    if k == 3:
        max_1 = max(radius_12, radius_13)
        max_2 = max(radius_23, radius_12)
        max_3 = max(radius_23, radius_13)
        worst_ratios = [max_1, max_2, max_3]
    else:
        max_1 = max(radius_12, radius_13, radius_14, radius_15)
        max_2 = max(radius_23, radius_12, radius_24, radius_25)
        max_3 = max(radius_23, radius_13, radius_34, radius_35)
        max_4 = max(radius_14, radius_24, radius_34, radius_45)
        max_5 = max(radius_15, radius_25, radius_35, radius_45)
        worst_ratios = [max_1, max_2, max_3, max_4, max_5]
    print("Davies-Bouldin index is:", sum(worst_ratios) / k)

Davies-Bouldin index is: 1.4421315776945116
