In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs


In [2]:
# Load the survey responses and replace the original headers with short,
# code-friendly column names (one name per CSV column, in file order).
df = pd.read_csv("MLData.csv")
columns = [
    "Timestamp", "Level", "Class", "Scale", "Gender", "Age", "Residence",
    "RelationshipStatus", "FinanceState", "CopeWithInstitute",
    "RelationWithFamily", "Pressure", "AcademicResult", "LivingPlace",
    "SupportedBy", "SocialMediaIn6", "InferiorityComplex", "MealSatisfaction",
    "Health", "OtherPositiveActivity", "SleepTime",
]
df.columns = columns

# Shuffle the rows. The shuffle is seeded so that Restart & Run All yields the
# same row order, and therefore the same cluster labels, on every run.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Keep the two target columns aside, then remove them (and the timestamp)
# from the feature frame. A non-inplace drop keeps this cell re-runnable.
Class_Status = df["Class"]
Class_Scale = df["Scale"]
df = df.drop(columns=["Class", "Scale", "Timestamp"])

# Categorical features to one-hot encode. "Age" and "SleepTime" are not listed
# and therefore stay as plain numeric columns.
# NOTE(review): those two columns are not scaled, so they may dominate the
# Euclidean distances used by k-means -- confirm this is intended.
columns = [
    "Level", "Gender", "Residence", "RelationshipStatus", "FinanceState",
    "CopeWithInstitute", "RelationWithFamily", "Pressure", "AcademicResult",
    "LivingPlace", "SupportedBy", "SocialMediaIn6", "InferiorityComplex",
    "MealSatisfaction", "Health", "OtherPositiveActivity",
]
# dtype is pinned to uint8 (the historical pandas default) so the indicator
# columns stay numeric 0/1 instead of bool on newer pandas releases.
df_Enc = pd.get_dummies(df, columns=columns, dtype=np.uint8)

## Using Scikit-learn 

In [3]:
from sklearn.cluster import KMeans

# Fit scikit-learn's k-means with 5 clusters on the one-hot encoded features.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the result independent of the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(df_Enc)

In [4]:
# Cluster index assigned by the fitted sklearn model to the first five rows.
kmeans.predict(df_Enc.iloc[:5])

array([3, 0, 3, 2, 2])

In [5]:
# Show the per-row cluster assignment and the fitted centroids (sklearn model).
print(f"Labels :{kmeans.labels_}")
print(f"Centers :{kmeans.cluster_centers_}")

Labels :[3 0 3 2 2 1 3 3 0 3 0 1 3 4 0 2 0 3 3 0 1 2 2 2 1 1 3 0 3 0 3 3 4 2 0 0 2
 1 3 1 0 1 3 3 0 2 4 3 3 1 1 0 4 3 0 1 1 3 2 2 3 0 0 4 3 1 3 1 0 0 4 0 1 0
 0 0 1 0 2 1 3 1 3 2 1 0 2 4 2 2 1 3 2 3 1 1 3 2 3 1 0 3 2 2 4 3 1 1 1 1 3
 3 3 2 3 4 2 2 3 1 2 2 2 3 1 3 2 3 1 3 0 2 0 0 3 3 2 0 4 2 1 0 0 2 0 0 2 2
 0 0 0 0 3 1 3 3 0 2 3 3 2 1 0 2 0 3 2 0 1 3 3 1 1 0 1 4 0 2 0 1 2 3 4 0 2
 2 0 1 1 2 2 0 3 2 2 4 2 4 4 4 0 0 1 3 2 4 3 3 4 3 3 0 0 3 0 0 2 2 0 3 2 0
 1 4 2 1 0 3 2 4 1 2 0 0 1 2 4 0 2 1 0 2 1 1 1 0 0 1 1 1 1 0 2 1 0 0 0 2 0
 1 0 4 3 0 0 4 1 3 2 0 3 0 2 3 1 1 2 1 2 3 2 1 3 2 1 3 1 2 0 1 2 0 3 1 1 3
 0 1 3 2 1 2 1 1 2 0 2 1 2 1 4 2 3 1 2 2 2 3 1 4 1 0 4 3 4 2 2 3 3 0 4 0 0
 3 0 0 2 0 0 0 2 0 3 2 4 1 1 3 3 3 2 2 4 4 3 3 4 0 3 0 2 4 3 0 3 1 3 3 0 2
 2]
Centers :[[ 2.20000000e+01  9.08888889e+00 -9.71445147e-17 -8.32667268e-17
   2.22044605e-16  1.00000000e+00  1.00000000e+00  1.22222222e-01
   8.77777778e-01 -6.93889390e-17 -2.08166817e-17  1.00000000e+00
   5.55111512e-17  1.00000000e+

## Using Own Implementation

In [6]:
from KMeansClustering import KMeans as KMn

In [7]:
# Sanity check of the custom implementation on synthetic data:
# 100 points in 20 dimensions drawn around 5 blob centres.
X, y = make_blobs(n_samples=100, n_features=20, centers=5, random_state=0)
sample = pd.DataFrame(X)

# Fit on the DataFrame view, then label the same points from the raw array.
kmn = KMn(5)
kmn.fit(sample)
blob_labels = kmn.predict(X)
print(blob_labels)
print(f"Number of iteration = {kmn.n_iteration}")

[0 1 3 1 0 0 2 0 3 0 0 3 0 0 2 3 1 1 1 0 0 4 0 0 0 1 0 2 2 3 2 2 0 1 2 1 2
 3 2 3 0 0 0 3 2 1 0 0 0 0 1 0 2 2 0 0 0 3 0 0 3 2 0 0 3 0 0 2 0 1 2 1 3 1
 3 3 3 1 0 1 3 0 0 3 4 2 0 3 2 2 0 0 0 4 3 3 2 4 2 1]
Number of iteration = 2


In [8]:
# Fit the custom k-means (5 clusters) on the encoded survey data and report
# its labels, centroids and the iteration count it exposes.
kmn = KMn(n_clusters=5)
kmn.fit(df_Enc)
print(f"Labels :{kmn.labels}")
print(f"Centers :{kmn.centers}")
print(f"Number of iteration = {kmn.n_iteration}")

Labels :[0 4 3 2 2 3 0 3 4 3 4 2 0 1 4 2 4 3 0 4 3 2 2 2 2 2 3 3 3 4 0 3 1 2 4 4 2
 2 0 2 4 2 3 0 4 2 1 3 3 3 3 4 1 3 4 2 3 3 2 2 3 4 4 1 3 2 3 2 4 4 1 4 3 4
 4 4 3 4 2 2 3 2 3 2 3 4 2 1 2 2 2 3 2 3 2 2 0 2 3 2 4 0 2 2 1 3 2 3 2 2 0
 3 0 2 3 1 2 2 3 3 2 2 2 3 3 3 2 0 3 3 4 2 4 4 0 3 2 4 1 2 3 4 4 2 4 4 2 2
 4 4 4 4 3 2 3 0 4 2 3 3 2 3 4 2 4 3 2 4 3 0 0 2 2 4 2 1 4 2 3 2 2 0 1 4 2
 2 4 3 3 2 2 4 3 2 2 1 2 1 1 1 4 4 3 0 2 1 3 3 1 0 0 4 4 3 4 4 2 2 4 3 2 4
 3 1 2 2 4 3 2 1 1 2 4 4 3 2 1 4 2 2 4 2 3 1 3 4 4 3 2 3 2 4 2 3 4 4 4 2 4
 3 4 1 0 4 4 1 3 0 2 3 0 4 2 0 2 2 2 2 2 3 2 2 3 2 2 0 2 2 4 3 2 4 0 3 2 0
 4 2 0 2 3 2 3 2 2 4 2 3 2 2 1 2 3 2 2 2 2 0 3 1 2 4 1 0 1 2 2 0 3 4 1 4 4
 3 4 4 2 4 4 4 2 4 3 2 1 2 3 0 0 3 2 2 1 1 3 0 1 4 3 4 2 1 3 4 0 2 0 0 4 2
 2]
Centers :[[2.20000000e+01 4.89189189e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 1.00000000e+00 0.00000000e+00
  2.70270270e-02 9.72972973e-01 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e

In [10]:
# Cluster comparison between my implementation and sklearn.
# The adjusted Rand index ignores how clusters are numbered: 1.0 means the two
# partitions are identical, values near 0 mean chance-level agreement.
from sklearn.metrics.cluster import adjusted_rand_score

# A score below 1.0 means the two models settled on different partitions.
# k-means is sensitive to how the centroids are initialised, and sklearn may
# also keep the best of several restarts (n_init).
# NOTE(review): the earlier explanation blamed sklearn's Elkan variant, but
# Elkan is an exact acceleration of Lloyd's algorithm (same result for the
# same starting centroids), so the distance computation is unlikely to be the
# cause -- confirm how KMeansClustering initialises its centroids.
sklearn_labels = kmeans.predict(df_Enc)
own_labels = kmn.predict(df_Enc)
adjusted_rand_score(sklearn_labels, own_labels)

0.6237290597140378