In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('iris.csv')

**Investigating the dataset**

In [4]:
df.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.3+ KB


**replacing catagorical data**

In [6]:
df_copy = df.copy()

In [7]:
replace_cat_data = {'variety':{'Setosa':1,'Versicolor':2,'Virginica':3}}

In [8]:
df_copy.replace(replace_cat_data,inplace=True)

In [9]:
df_copy.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [10]:
data = df_copy.values #creating a np matrix with dataframe

**splitting the data set into two part(training set and test set),split ratio is given by the user(in this example i am using 67% split ratio)**

In [11]:
def loadDataSet(training_data,test_data,split):
    for i in range(len(data)):
        dataset = data[i,:]
        if random.random()<split:
            training_data.append(dataset)
        else:
            test_data.append(dataset)

In [12]:
#testing function
split_ratio=0.67
training_data = []
test_data = []

loadDataSet(training_data,test_data,split_ratio)
training_set = np.asarray(training_data)
test = np.asarray(test_data)
print(training_set.shape)
print(test.shape)

(94, 5)
(56, 5)


In [13]:
def assign_centroid(training_data,k):
    centroid = []
    for i in range(0,k):
        index = random.randint(0,len(training_data)-1)
        point = training_data[[index],:]
        centroid.append(point)
    
    return np.asarray(centroid)

In [14]:
#testing function
assign_centroid(training_set,3)

array([[[5.5, 3.5, 1.3, 0.2, 1. ]],

       [[5.1, 3.8, 1.6, 0.2, 1. ]],

       [[5.1, 3.8, 1.9, 0.4, 1. ]]])

In [15]:
# parameter min_index,i,training_set length,cluster no.
import pdb
def map_cluster(cluster,min_index,point_index):
    #pdb.set_trace()
    for j in range(0,len(cluster)):
        if cluster[j,min_index]==0:
            cluster[j,min_index] = point_index+1
            break
    
    return cluster
    

In [16]:
def avg_cluster(cluster,data,n):
    vector = np.zeros([n,5])
    for i in range(0,n):
        #pdb.set_trace()
        avg = []
        counter = 0
        for j in range(0,len(data)):
            if cluster[j,i]!=0:
                datapoint = int(cluster[j,i]-1)
                value = data[[datapoint],:]
                avg.append(value)
                counter += 1
        #pdb.set_trace()
        avg = np.asarray(avg).reshape(counter,5)
        vector[[i],:] = np.mean(avg,axis=0)
    
    return vector

In [17]:
#testing function
cluster = np.zeros([150,3])
for i in range(1,4):
    cluster = map_cluster(cluster,0,i)

for i in range(4,8):
    cluster = map_cluster(cluster,1,i)

for i in range(8,10):
    cluster = map_cluster(cluster,2,i)    
cluster
x = avg_cluster(cluster,data,3)
x

array([[4.73333333, 3.1       , 1.4       , 0.2       , 1.        ],
       [5.        , 3.575     , 1.5       , 0.275     , 1.        ],
       [4.65      , 3.        , 1.45      , 0.15      , 1.        ]])

In [18]:
def demap_cluster(cluster,data):
    x,y = cluster.shape
    label = np.zeros([x,y])
    for i in range(0,y):
        for j in range(0,x):
            if cluster[j,i]!=0:
                #pdb.set_trace()
                datapoint = int(cluster[j,i])
                value = data[datapoint-1,4]
                label[j,i] = value
    
    return label
                
            

In [19]:
def label_info(label,i):
    class_label = {1:0,2:0,3:0}
    for j in range(len(label)):
        if label[j,i]!=0:
            val = int(label[j,i])
            class_label[val]+=1
    return class_label

In [26]:
def main():
    split_ratio=0.67
    training_data = []
    test_data = []
    k = 3;#no. of 
    loadDataSet(training_data,test_data,split_ratio)
    training_set = np.asarray(training_data)
    test = np.asarray(test_data)
    centroid = assign_centroid(training_set,k)
    centroid_diff = np.ones([k,5])
    cluster = np.zeros([len(training_set),k])
    #pdb.set_trace()
    while np.linalg.norm(centroid_diff)>10**-2:
        cluster = np.zeros([len(training_set),k])
        for i in range(0,len(training_set)):
            distance = []
            for j in range(0,k):
                val = np.linalg.norm(training_set[[i],:]-centroid[[j],:])
                distance.append(val)
            #pdb.set_trace()
            min_index = distance.index(min(distance))
            cluster = map_cluster(cluster,min_index,i)
        #pdb.set_trace()
        new_centroid = avg_cluster(cluster,training_set,k)
        centroid_diff = new_centroid-centroid
        #print(np.linalg.norm(centroid_diff))
        centroid = new_centroid
    #print(cluster)
    label = demap_cluster(cluster,training_set)
    #pdb.set_trace()
    x,y = label.shape
    for i in range(0,y):
        count = label_info(label,i)
        print("Cluster: "+str(i+1)+" has")
        #pdb.set_trace()
        #print(count)
        print('setosa: ' +str(count[1])+' versicolor: '+str(count[2])+' virginica: '+str(count[3])+"\n")
    #return centroid
    
main()    

Cluster: 1 has
setosa: 0 versicolor: 38 virginica: 0

Cluster: 2 has
setosa: 0 versicolor: 1 virginica: 31

Cluster: 3 has
setosa: 31 versicolor: 0 virginica: 0

