
# Implementation of K-Means Clustering


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Load the tab-separated 'Tetra.lrn' file, skipping its first 3 rows and
# keeping only the first four columns.
df = pd.read_csv(
    'Tetra.lrn',
    sep='\t',
    skiprows=3,
    usecols=[0, 1, 2, 3],
)

In [None]:
# Print a summary of the loaded frame.
df.info()
# Dataset facts recorded by the author (not checked by this cell):
#   Size 400
#   Dimensions 3
#   Classes 4

In [None]:
# Preview the first rows of the raw data as loaded.
df.head()

In [None]:
# Use the '% Key' column as the row index so it is no longer part of the
# data columns, then preview the re-indexed frame.
df = df.set_index(['% Key'])
df.head()

In [None]:
# Normalize the data so that each column lies between 0 and 1.
# Work on the raw NumPy values of the frame.
x= df.values
min_max_scaler = preprocessing.MinMaxScaler()
# Fit the scaler on the data and rescale it in one step.
X_normalized = min_max_scaler.fit_transform(x)
# NOTE: rebuilding the frame from the bare array drops the previous index
# and column labels; default integer labels are used until columns are
# renamed later.
df = pd.DataFrame(X_normalized)

In [None]:
# Preview the normalized data.
df.head()

In [None]:
# Give the three normalized feature columns readable names.
df.columns = ['C1', 'C2', 'C3']
# Extract the underlying NumPy array for the clustering code.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and works on both old and new pandas versions.
df1 = df.values
# Display the first five rows of the array.
df1[0:5]

In [None]:
#Initialize clusters as k distinct randomly selected points from data
def initialize_clusters(data, k):
    """Pick ``k`` distinct rows of ``data`` to serve as starting centroids.

    Rows are sampled *without* replacement, so the same row is never chosen
    twice. Sampling with replacement could yield identical starting
    centroids, which leaves one of the clusters empty during k-means.

    Args:
        data: 2-D NumPy array with one point per row.
        k: Number of centroids to draw.

    Returns:
        A new array of shape ``(k, data.shape[1])`` holding copies of the
        selected rows (``data`` itself is not modified or aliased).

    Raises:
        ValueError: If ``k`` exceeds the number of rows in ``data``.
    """
    n_points = data.shape[0]
    if k > n_points:
        raise ValueError(
            "cannot pick %d distinct centroids from %d points" % (k, n_points)
        )
    # Uses the global NumPy RNG, so np.random.seed() still controls the draw.
    chosen_rows = np.random.choice(n_points, size=k, replace=False)
    # Integer-array indexing returns a copy, not a view.
    return data[chosen_rows]

def euclid_dist(p1, p2):
    """Return the Euclidean (straight-line) distance between two points."""
    offset = p1 - p2
    squared_total = np.sum(np.square(offset))
    return np.sqrt(squared_total)

def obj_function(old_centroids, new_centroids):
    """Return the total absolute change between two sets of centroids.

    The result is the sum, over every coordinate of every centroid, of the
    absolute difference between the new and the old value.
    """
    per_coordinate_shift = new_centroids - old_centroids
    return np.sum(np.absolute(per_coordinate_shift))

In [None]:
def k_Means(data, k, init_centroids=None, tol=1e-10, max_iter=300):
    """Cluster ``data`` into ``k`` groups with the standard k-means iteration.

    Each iteration assigns every point to its nearest centroid (Euclidean
    distance) and then moves every centroid to the mean of its assigned
    points. Iteration stops once the centroids stop moving or ``max_iter``
    is reached.

    Args:
        data: 2-D array-like of shape ``(n_points, n_features)``.
        k: Number of clusters.
        init_centroids: Optional array-like of shape ``(k, n_features)`` with
            the starting centroids. When omitted, the starting centroids are
            drawn by ``initialize_clusters(data, k)``.
        tol: Convergence threshold on the summed absolute change of all
            centroid coordinates between two consecutive iterations.
        max_iter: Upper bound on the number of iterations, so the loop
            always terminates.

    Returns:
        A new float array of shape ``(k, n_features)`` with the final
        centroids. Neither ``data`` nor ``init_centroids`` is modified.

    Raises:
        ValueError: If ``init_centroids`` does not have shape
            ``(k, n_features)``.
    """
    # Work in floating point so centroid means are not truncated for
    # integer-valued input.
    data = np.asarray(data, dtype=float)

    if init_centroids is None:
        centroids = np.array(initialize_clusters(data, k), dtype=float)
    else:
        # np.array copies, so the caller's array is never mutated.
        centroids = np.array(init_centroids, dtype=float)
        if centroids.shape != (k, data.shape[1]):
            raise ValueError(
                "init_centroids must have shape (%d, %d), got %s"
                % (k, data.shape[1], centroids.shape)
            )

    for _ in range(max_iter):
        # Pairwise point-to-centroid distances, shape (n_points, k).
        deltas = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt((deltas ** 2).sum(axis=2))
        nearest = np.argmin(distances, axis=1)

        # Build the updated centroids in a separate array: updating in place
        # would make "old" and "new" the same object and the measured shift
        # would always be zero.
        updated = centroids.copy()
        for c in range(k):
            members = data[nearest == c]
            # An empty cluster keeps its previous centroid rather than
            # becoming NaN (the mean of zero points).
            if len(members):
                updated[c] = members.mean(axis=0)

        shift = np.abs(updated - centroids).sum()
        centroids = updated
        if shift <= tol:
            break

    return centroids

In [None]:
# Run k-means on the normalized array with k=4 and display the returned
# centroids.
k_Means(df1,4)

In [None]:
# Fit k-means with 4 clusters, then label every point with the index of its
# nearest returned centroid.
clusters = k_Means(df1, 4)
k = len(clusters)
distances = [
    euclid_dist(point, centroid)
    for point in df1
    for centroid in clusters
]
# One row per point, one column per centroid.
reshaped_Distances = np.asarray(distances, dtype=float).reshape(df1.shape[0], k)
assigned_Centroids = reshaped_Distances.argmin(axis=1)

In [None]:
# Map each cluster index to a colour and colour every point by its cluster.
group_colors = ['blue', 'red', 'green', 'yellow']
colors = [group_colors[label] for label in assigned_Centroids]

# 3-D scatter plot of the points and the centroids.
fig = plt.figure()
fig.set_size_inches(10, 6)
ax = fig.add_subplot(111, projection='3d')

# Data points, semi-transparent and coloured by assigned cluster.
ax.scatter(df1[:, 0], df1[:, 1], df1[:, 2], color=colors, alpha=0.4)
# Centroids drawn in black on top of the data.
ax.scatter(
    clusters[:, 0],
    clusters[:, 1],
    clusters[:, 2],
    color='black',
    marker='o',
    lw=2,
)

ax.set_xlabel('X')
ax.set_ylabel('Y')
# Trailing semicolon suppresses the notebook's echo of the return value.
ax.set_zlabel('Z');