In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

## the data set for testing
## 19 two-dimensional points stored as a 2 x 19 array:
## row 0 holds the x coordinates, row 1 holds the y coordinates.

df = np.array([ [12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64, 69, 72],
                [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24] ])
print(df.shape)

k = 5 ## the number of clusters used by the cells below
def plotclusters(X, labels, centers,k):
    """Scatter-plot clustered 2-D points together with their cluster centres.

    Parameters
    ----------
    X : array indexable as ``X[mask]`` and ``[:, 0]`` / ``[:, 1]``
        The data points, one row per point.
    labels : array
        Cluster index of every row of ``X``; compared against ``0 .. k-1``.
    centers : array indexable as ``[:, 0]`` / ``[:, 1]``
        Cluster centre coordinates, drawn as black crosses.
    k : int
        Number of clusters to draw. It is also used as the line width of the
        centre markers.
    """
    for cluster_idx in range(k):
        # Matplotlib's colour cycle names C0, C1, ... give each cluster its own colour.
        members = X[labels == cluster_idx]
        plt.scatter(
            members[:, 0],
            members[:, 1],
            color='C{}'.format(cluster_idx),
            label='Cluster {}'.format(cluster_idx+1),
        )
    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        color='black',
        marker='x',
        s=200,
        linewidths=k,
        label='Cluster centers',
    )
    plt.legend()
    plt.show()
# Sanity-check plotclusters on synthetic data: 100 points drawn from a
# standard normal distribution in two dimensions.
X = np.random.randn(100, 2)

# Fit scikit-learn's KMeans with k clusters (k is 5 here, as set above).
# NOTE(review): the name `kmeans` is reused for a function in a later cell.
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)
# Plot the points coloured by fitted label, together with the fitted centres
plotclusters(X, kmeans.labels_, kmeans.cluster_centers_,k)

In [None]:
## Part 1. Initialization.
import random
k = 5

# `df` starts out as a 2 x n array (one row per coordinate). From here on it is
# used as n x 2 (one row per point). The shape guard keeps this cell safe to
# re-run: an unconditional `df = df.T` would flip the data back on the second
# execution. (Assumes the data set holds more than two points.)
if df.shape[0] == 2:
    df = df.T

minX, minY = df.min(axis=0) ## smallest X and Y values in the dataset
maxX, maxY = df.max(axis=0) ## largest X and Y values in the dataset

centres = np.zeros((k,2)) ## empty array for centers coordinates

for i in range(k): ## draw k random positions inside the bounding box of the data
    centres[i] = [random.uniform(minX, maxX), random.uniform(minY, maxY)]

### plot data (red) and the initial centres (black)
plt.scatter(df[:, 0], df[:, 1], c='red')

plt.scatter(centres[:, 0],centres[:, 1],c='black',marker='X')
plt.show()

In [None]:
## Part 2. Function: assignment.
# print(df)
import math
def assignment(df, centres):
    """Assignment step of k-means: label every point with its nearest centre.

    Distances are Euclidean and, as in the rest of this notebook, use only the
    first two coordinates of each point and centre.

    Parameters
    ----------
    df : array-like, shape (n_points, >=2)
        The data points, one row per point.
    centres : array-like, shape (n_centres, >=2)
        Current centre coordinates.

    Returns
    -------
    numpy.ndarray of int, shape (n_points,)
        Index of the closest centre for every point. Ties go to the centre
        with the lowest index.

    Raises
    ------
    ValueError
        If ``centres`` is empty, since no label would be meaningful.
    """
    points = np.asarray(df, dtype=float)
    centre_xy = np.asarray(centres, dtype=float)
    if len(centre_xy) == 0:
        raise ValueError("assignment() needs at least one centre")
    if points.shape[0] == 0:
        return np.zeros(0, dtype=int)

    # Pairwise squared distances, shape (n_points, n_centres). The square root
    # is skipped because it does not change which centre is closest.
    deltas = points[:, np.newaxis, :2] - centre_xy[np.newaxis, :, :2]
    squared = (deltas ** 2).sum(axis=2)

    # A centre with NaN coordinates must never win: argmin would otherwise
    # return the index of the first NaN it meets.
    squared = np.where(np.isnan(squared), np.inf, squared)

    # argmin has no distance cut-off, unlike a fixed "very large" starting
    # value, which silently left the label at 0 for far-away centres.
    return squared.argmin(axis=1).astype(int)

# Label every data point with its nearest initial centre and plot the result
labels = assignment(df,centres)
plotclusters(df,labels,centres,k)

In [None]:
## Part 3. Function: replace.
def replace(df, centres):
    """Update step of k-means: move each centre to the mean of its points.

    The points are first assigned to their nearest centre with ``assignment``.
    Every centre that received at least one point is then overwritten, in
    place, with the mean of the first two coordinates of those points.

    A centre that received no points is left where it was. Dividing its empty
    sum by a count of zero would turn the centre into NaN and corrupt every
    later distance computation.

    Parameters
    ----------
    df : numpy.ndarray, shape (n_points, >=2)
        The data points, one row per point.
    centres : indexable of length n_centres
        Current centres; modified in place.

    Returns
    -------
    The same ``centres`` object that was passed in, after the update.
    """
    labels = assignment(df, centres)

    for index in range(len(centres)):
        members = df[labels == index]
        if len(members) == 0:
            # Empty cluster: keep the previous position.
            continue
        centreX, centreY = members[:, :2].mean(axis=0)
        centres[index] = [centreX, centreY]
    return centres
    
    
# replace() writes the new positions into `centres` itself, so its return
# value is not needed here. The plot shows the moved centres with the labels
# computed in the previous cell.
replace(df,centres)  
plotclusters(df,labels,centres,k)

In [None]:
# Show the cluster index currently assigned to each data point
print(labels)
def calculate_objective_function(X, labels, centres):
    """Return the k-means objective for a given labelling.

    The objective is the sum, over all points, of the squared Euclidean
    distance between the point and the centre it is labelled with. Only the
    first two coordinates of each point and centre are used.

    Parameters
    ----------
    X : sequence of points, each indexable with ``[0]`` and ``[1]``
    labels : sequence giving, for every point, an index into ``centres``
    centres : sequence of centres, each indexable with ``[0]`` and ``[1]``

    Returns
    -------
    The accumulated sum of squared distances (``0`` for an empty ``X``).
    """
    total = 0
    for position, point in enumerate(X):
        centre = centres[labels[position]]
        squared_x = (centre[0] - point[0])**2
        squared_y = (centre[1] - point[1])**2
        total = total + (squared_x + squared_y)
    return total

# Print the value explicitly: this call is not the last expression of its
# cell, so the notebook would otherwise discard the result without showing it.
print("objective function:", calculate_objective_function(df, labels, centres))


def compareCentres(oldCenters,newCenters):
    """Tell whether no centre moved noticeably between two iterations.

    For every index of ``oldCenters`` the squared Euclidean distance (first
    two coordinates only) between the old and the new centre is compared
    against 0.0001.

    Returns
    -------
    bool
        ``False`` as soon as one centre exceeds the threshold, ``True``
        otherwise (including when ``oldCenters`` is empty).
    """
    def squared_shift(i):
        before, after = oldCenters[i], newCenters[i]
        return (before[0]-after[0])**2+(before[1]-after[1])**2

    return not any(squared_shift(i) > 0.0001 for i in range(len(oldCenters)))

def measureDistance(point1,point2):
    """Return the Euclidean distance between two points.

    Only the first two coordinates (index 0 and 1) of each point are used.
    """
    delta_x = point1[0]-point2[0]
    delta_y = point1[1]-point2[1]
    return math.sqrt(delta_x**2 + delta_y**2)


In [None]:

def findNearestCluster(point,centres,clusterNo):
    """Return the index of the centre closest to ``point``, skipping one cluster.

    Parameters
    ----------
    point : indexable with ``[0]`` and ``[1]``
        The point to measure from.
    centres : sequence of centres
        Candidate centres; distances are computed with ``measureDistance``.
    clusterNo : int
        Index of the cluster to exclude (normally the point's own cluster).

    Returns
    -------
    int
        Index of the nearest other centre, or ``-1`` when there is no other
        centre at a finite distance.
    """
    # Start from infinity rather than a fixed "very large" number so that
    # centres farther away than that number are still considered.
    minDistance = float("inf")
    cluster = -1
    for i, centre in enumerate(centres):
        if i == clusterNo:
            continue
        distance = measureDistance(point, centre)
        if distance < minDistance:
            minDistance = distance
            cluster = i
    return cluster
# print(findNearestCluster([1,1],[[8,2],[2,2],[3,4]],0))

def silhouetteCoefficient(X,centres,labels):
    """Cluster-averaged silhouette-style score of a clustering.

    For every point ``i``:

    * ``a_i`` is the mean distance from ``i`` to the other points of its own
      cluster (0 when the cluster holds only that point);
    * ``b_i`` is the mean distance from ``i`` to the points of the neighbouring
      cluster, i.e. the non-empty cluster whose *centre* is nearest to ``i``
      according to ``findNearestCluster``.

    ``a_i`` and ``b_i`` are averaged inside each cluster, the per-cluster
    averages are averaged over all clusters to give ``a`` and ``b``, and the
    result is ``(b - a) / max(a, b)``.

    This aggregate differs from the per-sample silhouette that
    ``sklearn.metrics.silhouette_score`` computes, so the two numbers are not
    expected to match exactly.

    Distances use only the first two coordinates of each point.

    Parameters
    ----------
    X : array-like, shape (n_points, >=2)
    centres : array-like, shape (n_clusters, >=2)
    labels : array-like of int, shape (n_points,)
        Cluster index of every point.

    Returns
    -------
    float
        The score, or ``0.0`` when it is undefined (no points, or both ``a``
        and ``b`` are zero).
    """
    if len(X) == 0:
        return 0.0

    points = np.asarray(X, dtype=float)[:, :2]
    labels = np.asarray(labels, dtype=int)
    n_clusters = len(centres)
    clusterCount = np.bincount(labels, minlength=n_clusters)

    # Push the centres of empty clusters to infinity so findNearestCluster can
    # never pick a neighbour with no members (which would divide by zero).
    candidate_centres = np.array(centres, dtype=float)
    candidate_centres[clusterCount[:n_clusters] == 0] = np.inf

    aList = np.zeros(n_clusters)
    bList = np.zeros(n_clusters)

    for i in range(len(points)):
        own = labels[i]
        own_size = clusterCount[own]

        # Distances from point i to every point, computed in one numpy pass.
        dists = np.sqrt(((points - points[i]) ** 2).sum(axis=1))

        # The distance of the point to itself is 0, so summing over the whole
        # cluster and dividing by (size - 1) gives the mean over the others.
        if own_size > 1:
            meanDist = dists[labels == own].sum() / (own_size - 1)
        else:
            meanDist = 0.0
        aList[own] += meanDist / own_size

        nearestCluster = findNearestCluster(points[i], candidate_centres, own)
        if nearestCluster >= 0:
            meanDisttob = dists[labels == nearestCluster].sum() / clusterCount[nearestCluster]
        else:
            # No other non-empty cluster exists: nothing to compare against.
            meanDisttob = 0.0
        bList[own] += meanDisttob / own_size

    a = np.mean(aList)
    b = np.mean(bList)
    denominator = max(a, b)
    if not denominator > 0:
        return 0.0
    return float((b - a) / denominator)



In [None]:
## Part 4. Script.
from sklearn.datasets import make_blobs
from matplotlib import pyplot
def kmeans(k,X):
    """Run k-means on ``X`` with ``k`` clusters, plotting the progress.

    Steps:

    1. plot the raw data;
    2. pick ``k`` data points as initial centres and plot them with random
       placeholder labels;
    3. alternate ``assignment`` and ``replace`` until ``compareCentres``
       reports that no centre moved noticeably, plotting the first three
       iterations;
    4. plot the final clustering and the objective value per iteration;
    5. print and return the score from ``silhouetteCoefficient``.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : numpy.ndarray, shape (n_points, 2)
        The data points.

    Returns
    -------
    The value returned by ``silhouetteCoefficient`` for the final clustering.

    Side effects
    ------------
    Sets the module-level names ``X2``, ``centres2`` and ``labels2`` to the
    data, the centre array and the final labels, for inspection after the run.
    """
    global X2, centres2, labels2

    df = X
    n = len(X)
    X2 = X

    plt.scatter(df[:, 0], df[:, 1], c='red')
    plt.show()

    # Random placeholder labels, used only to colour the initial plot.
    labels = np.random.randint(0, k, size=n)
    centres = np.zeros((k,2)) ## empty array for centers coordinates
    centres2 = centres

    # Pick k *distinct* data points as starting centres. Drawing the indices
    # independently can select the same point twice, which starts two centres
    # at the same place and leaves one of their clusters empty.
    if k <= n:
        start_indices = random.sample(range(n), k)
    else:
        # More clusters than points: repeats are unavoidable.
        start_indices = [random.randint(0, n - 1) for _ in range(k)]
    for i, point_index in enumerate(start_indices):
        centres[i] = df[point_index]

    plotclusters(df,labels, centres,k)

    objective_function = []
    count = 0
    while True:
        # replace() updates `centres` in place, so keep a copy to compare with.
        oldCentres = centres.copy()

        labels = assignment(df,centres)
        centres = replace(df,centres)
        objective_function.append(calculate_objective_function(X,labels,centres))

        count += 1
        if count < 4:
            print("Iteration ",count,":")
            plotclusters(df,labels, centres,k)

        if compareCentres(oldCentres,centres):
            break

    plotclusters(df,labels, centres,k)
    labels2 = labels

    # Objective value after each iteration
    plt.plot(range(1,len(objective_function)+1),objective_function)
    plt.show()

    s = silhouetteCoefficient(X,centres,labels)
    print("silhouette:",s)
    return s

# Generate 1000 two-dimensional points in k blobs and cluster them with the
# hand-written kmeans() defined above.
k=5
X, y = make_blobs(n_samples=1000, centers=k, n_features=2)
kmeans(k, X)

# Cluster the same data with scikit-learn's KMeans for comparison
kmeans2 = KMeans(n_clusters=k)
kmeans2.fit(X)
# Plot the clusters and centers
plotclusters(X, kmeans2.labels_, kmeans2.cluster_centers_,k)
# Score scikit-learn's clustering with this notebook's own silhouetteCoefficient
print("sci-kit silhouette coefficient:",silhouetteCoefficient(X, kmeans2.cluster_centers_,kmeans2.labels_))

from sklearn.metrics import silhouette_score

# Score the same scikit-learn labels with scikit-learn's silhouette_score,
# as a reference for the hand-written value printed above.

# Calculate the silhouette score
silhouette_avg = silhouette_score(X,  kmeans2.labels_)
print("Silhouette Score:", silhouette_avg)
        
    
# print(labels2)
# print(X2)
# silhouetteCoefficient(X2,centres2,labels2)

In [None]:
def findBestK(X, k_values=range(3, 10)):
    """Run ``kmeans`` for several cluster counts and return the best one.

    Every candidate in ``k_values`` is clustered with ``kmeans`` (which plots
    and prints as it goes). The candidate with the highest returned silhouette
    score wins; each new best score is printed together with its ``k``.

    Parameters
    ----------
    X : numpy.ndarray
        The data points, passed through to ``kmeans``.
    k_values : iterable of int, optional
        Cluster counts to try. Defaults to 3 through 9.

    Returns
    -------
    int
        The best cluster count, or ``-1`` if no candidate scored above -1.
    """
    maxSilhouette = -1
    bestk = -1
    for i in k_values:
        silhouette = kmeans(i,X)
        if maxSilhouette < silhouette:
            maxSilhouette = silhouette
            print(maxSilhouette, i)
            bestk = i
    return bestk
# Fresh data set with 5 generated blobs; findBestK should ideally report 5
X, y = make_blobs(n_samples=1000, centers=5, n_features=2)

print("the best:" ,findBestK(X))


In [None]:
# Generate a non-convex dataset
# Seed NumPy's global generator so the generated points are reproducible
np.random.seed(0)
n_samples = 200

# Create a non-convex shape
def generate_non_convex_dataset(n_samples):
    """Return ``n_samples`` 2-D points forming a ring around the origin.

    The angles are evenly spaced from 0 to 2*pi and every point gets a radius
    drawn uniformly from [2, 3) with NumPy's global random generator.

    Returns
    -------
    numpy.ndarray, shape (n_samples, 2)
        Columns are the x and y coordinates.
    """
    angles = np.linspace(0, 2 * np.pi, n_samples)
    radii = np.random.rand(n_samples) + 2
    return np.column_stack((np.cos(angles) * radii, np.sin(angles) * radii))

dataset = generate_non_convex_dataset(n_samples)

# Plot the non-convex dataset
plt.scatter(dataset[:, 0], dataset[:, 1], c='green', marker='o')
plt.title('Non-Convex Dataset')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.grid(True)
plt.show()


# Cluster the ring with the hand-written kmeans(); k is the notebook-level
# value set in an earlier cell.
kmeans(k, dataset)



In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Set the random seed for reproducibility
np.random.seed(0)

# Define the number of data points and the number of generated blobs
n_samples = 300
n_clusters = 2  # blobs generated below; the clustering call uses the notebook-level k

# Generate two Gaussian blobs of n_samples // 2 points each, the second one
# shifted by 4 along the x axis
cluster1 = 1.0 * np.random.randn(n_samples // 2, 2)
cluster2 = 1.0 * np.random.randn(n_samples // 2, 2) + np.array([4, 0])

# Combine the two clusters
data = np.vstack((cluster1, cluster2))

# Plot the elongated dataset
plt.scatter(data[:, 0], data[:, 1], s=50)
plt.title('Elongated Dataset')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.grid(True)
plt.show()

# Cluster the data built in this cell. Passing `dataset` here would re-cluster
# the ring from the previous cell instead.
kmeans(k, data)


In [None]:
import matplotlib.pyplot as plt

# Generate random data points in a circular pattern
def generate_circular_data(radius, num_points):
    """Return ``num_points`` noisy 2-D points on a circle of the given radius.

    The angles are evenly spaced from 0 to 2*pi. Gaussian noise with mean 0
    and standard deviation 0.1 is added to x and y independently, drawn from
    NumPy's global random generator (x noise first, then y noise).

    Returns
    -------
    numpy.ndarray, shape (num_points, 2)
    """
    angles = np.linspace(0, 2 * np.pi, num_points)
    noise_x = np.random.normal(0, 0.1, num_points)
    noise_y = np.random.normal(0, 0.1, num_points)
    xs = radius * np.cos(angles) + noise_x
    ys = radius * np.sin(angles) + noise_y
    return np.column_stack((xs, ys))

# Generate random data points in a uniform distribution
def generate_uniform_data(num_points):
    """Return ``num_points`` 2-D points drawn uniformly from [-1, 1) x [-1, 1).

    All x values are drawn first, then all y values, from NumPy's global
    random generator.

    Returns
    -------
    numpy.ndarray, shape (num_points, 2)
    """
    low, high = -1, 1
    xs = np.random.uniform(low, high, num_points)
    ys = np.random.uniform(low, high, num_points)
    return np.column_stack((xs, ys))

# Create the dataset: a noisy unit circle stacked on top of uniform points
# covering the square that encloses it
num_points = 200
data1 = generate_circular_data(1, num_points)
data2 = generate_uniform_data(num_points)
data = np.vstack((data1, data2))

# Plot the dataset
plt.scatter(data[:, 0], data[:, 1], s=10)
plt.show()

# Cluster the data built in this cell. Passing `dataset` here would re-cluster
# the ring from an earlier cell instead.
kmeans(k, data)
