EM 538-001: Practical Machine Learning for Engineering Analytics (Spring 2025)  
Instructor: Fred Livingston (fjliving@ncsu.edu) 

### Load and Prepare Datasets

In [1]:
import numpy as np
import pandas as pd 

In [None]:
# Load the sample data set from the CSV stored next to the notebook.
blobs_df = pd.read_csv('kmeans_blobs.csv')

# Columns used as clustering features: everything except the first and the
# last column of the file.
# NOTE(review): presumably the first column is a row identifier and the last
# is the `cluster` label -- confirm against the CSV.
colnames = blobs_df.columns[1:-1].tolist()

blobs_df.head()

In [None]:
import seaborn as sns

# Scatter of the raw observations, coloured by the `cluster` column.
# Column `y` is drawn on the horizontal axis and column `x` on the vertical.
sns.scatterplot(data=blobs_df, x="y", y="x", hue="cluster")

In [4]:
class Kmeans:
    '''
    Building blocks for a step-by-step k-means walkthrough.

    The class provides centroid initialisation, a distance function and the
    point-to-centroid assignment step. It does not iterate on its own: the
    caller recomputes the centroids and calls `centroid_assignation` again.
    '''

    def __init__(self, k):
        '''
        k: number of centroids to sample in `initiate_centroids`
        '''
        self.k = k

    def initiate_centroids(self, dset, random_state=None):
        '''
        Select k data points as centroids
        - dset - pandas dataframe with observations
        - random_state - optional seed (or random generator) forwarded to
          `DataFrame.sample`; pass a fixed value for a reproducible start.
          The default (None) keeps the previous unseeded behaviour.

        Returns a dataframe holding `self.k` rows sampled from `dset`.
        '''
        return dset.sample(self.k, random_state=random_state)

    def rsserr(self, a, b):
        '''
        Calculate the root of sum of squared errors, i.e. the Euclidean
        distance between `a` and `b`.
        a and b are numpy arrays (pandas Series also work; they are aligned
        on their index labels before subtracting)
        '''
        # np.sqrt, not np.square: squaring the sum returned (sum d^2)^2,
        # which contradicts the documented "root" and inflates every error.
        return np.sqrt(np.sum((a - b) ** 2))

    def centroid_assignation(self, dset, centroids):
        '''
        Given a dataframe `dset` and a set of `centroids`, we assign each
        data point in `dset` to a centroid.
        - dset - pandas dataframe with observations
        - centroids - pandas dataframe with centroids

        Returns a tuple `(assignation, assign_errors)` of two lists with one
        entry per row of `dset`: the positional index of the nearest centroid
        and the `rsserr` distance to it. Ties go to the lowest index.

        Raises ValueError if `centroids` has no rows while `dset` does.
        '''
        # Local count only: the number of centroids passed in must not
        # overwrite the k this instance was configured with.
        n_centroids = centroids.shape[0]
        n = dset.shape[0]
        if n_centroids == 0 and n > 0:
            raise ValueError("centroids must contain at least one row")

        # The centroid rows do not change while we scan the observations, so
        # extract them once instead of once per observation.
        centroid_rows = [centroids.iloc[c, :] for c in range(n_centroids)]

        assignation = []
        assign_errors = []
        for obs in range(n):
            point = dset.iloc[obs, :]

            # Distance from this observation to every centroid
            all_errors = np.array(
                [self.rsserr(centroid, point) for centroid in centroid_rows]
            )

            # Get the nearest centroid and the error
            nearest_centroid = int(np.argmin(all_errors))

            # Add values to corresponding lists
            assignation.append(nearest_centroid)
            assign_errors.append(all_errors[nearest_centroid])
        return assignation, assign_errors

### Steps 1 and 2 - Define k and initiate the centroids

In [None]:
# Example usage
# Seed NumPy's global random state before sampling: the starting centroids
# are drawn at random, so without a seed every Restart & Run All would give
# different centroids, assignments and error totals.
np.random.seed(42)

kmeans_cluster = Kmeans(3)
centroids = kmeans_cluster.initiate_centroids(blobs_df)
centroids

### Step 3 - Calculate distance
### Step 4 - Assign centroids

In [None]:
# Measure distances on the feature columns only. Passing the full frames
# would let the columns left out of `colnames` (the first and last columns of
# the CSV) contribute to every distance. Selecting `colnames` also keeps this
# cell re-runnable once the `centroid` and `error` columns exist.
blobs_df['centroid'], blobs_df['error'] = kmeans_cluster.centroid_assignation(
    blobs_df[colnames], centroids[colnames]
)
blobs_df.head()

In [None]:
# First layer: every observation, uncoloured.
sns.scatterplot(data=blobs_df, x="y", y="x")
# Second layer on the same axes: the current centroids, coloured by their
# `cluster` value.
sns.scatterplot(data=centroids, x="y", y="x", hue="cluster")

In [None]:
# Sum of the per-observation distances to their assigned centroid.
total_error = blobs_df['error'].sum()
print("The total error is {0:.2f}".format(total_error))


### Step 5 - Update centroid location

In [None]:
# New centroid = mean of the feature columns over the points assigned to it.
# Select `colnames` before aggregating so that only the features are averaged;
# averaging every column first is wasted work and fails on non-numeric columns.
# reset_index(drop=True) renumbers the rows 0..m-1 in ascending order of the
# old centroid label; a centroid with no assigned points does not appear.
centroids = (
    blobs_df
    .groupby('centroid')[colnames]
    .mean()
    .reset_index(drop=True)
)
centroids

In [None]:
# Re-assign every observation to its nearest updated centroid, using the
# feature columns only. Passing the full frame only works because pandas
# alignment turns the non-feature columns into NaN and the sum skips them;
# selecting `colnames` makes the intent explicit.
blobs_df['centroid'], blobs_df['error'] = kmeans_cluster.centroid_assignation(
    blobs_df[colnames], centroids[colnames]
)
blobs_df.head()