<h3>Importing the necessary packages:</h3>

In [83]:
import numpy as np
import pandas as pd

<h3>A function for measuring euclidean distance:</h3>

In [84]:
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

<h3>K-Means Implementation:</h3>

In [85]:
def kmeans(data, k, max_iterations=100):
    n = data.shape[0]
    centroids = data[np.random.choice(n, k, replace=False)]
    for i in range(max_iterations):
        clusters = {i: [] for i in range(k)}
        for point in data:
            distances = [euclidean_distance(point, centroid) for centroid in centroids]
            closest_centroid = np.argmin(distances)
            clusters[closest_centroid].append(point)

        new_centroids = []
        for cluster in clusters.values():
            new_centroids.append(np.mean(cluster, axis=0))
        new_centroids = np.array(new_centroids)

        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids
    return centroids, clusters

<h3>Purity Score Implementation:</h3>

In [86]:
def purity_score(clusters, labels):
    total_points = 0
    correctly_labelled_points = 0
    for cluster in clusters.values():
        cluster_points = np.array(cluster)
        cluster_label_counts = np.zeros(4)
        for i in range(4):
            mask = np.isin(labels, cluster_points)
            cluster_label_counts[i] = np.sum(labels[mask] == i)
        most_frequent_label = np.argmax(cluster_label_counts)
        cluster_mask = np.isin(labels, cluster_points)
        total_points += np.sum(cluster_mask)
        correctly_labelled_points += np.sum(labels[cluster_mask] == most_frequent_label)
    purity = correctly_labelled_points / total_points
    return purity

<h3>Load and preprocess the data:</h3>

In [87]:
df = pd.read_csv('hcvdat0.csv')
df['Sex'] = df['Sex'].replace({'m': 0, 'f': 1})
mean_values = df.mean(numeric_only=True)
df = df.fillna(mean_values)
data = df.iloc[:, :-1].apply(pd.to_numeric, errors='coerce').values
print(data.shape)
labels = df.iloc[:, -1].replace({'0=Blood Donor': 0, '0s=suspect Blood Donor': 1, 
                                 '1=Hepatitis': 2, '2=Fibrosis': 3, '3=Cirrhosis': 4}).values.reshape(-1, 1)

(615, 13)


<h3>Cluster the data using k-means:</h3>

In [88]:
centroids, clusters = kmeans(data, k=10)

<h3>Evaluate algorithm using purity score:</h3>

In [89]:
purity = purity_score(clusters, labels)
print(f"Purity Score: {purity}")

Purity Score: 0.9297568506368198
