In [3]:
import numpy as np
import pandas as pd
from scipy import stats
import random

In [4]:
df = pd.read_csv("agaricus-lepiota.data", sep=',', header=None)

In [5]:
print(df.isna().sum())
df.describe()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [6]:
for colname in df[[11]]:
    print("{} = {}".format(colname, len(df[colname].unique())))

11 = 5


In [7]:
#to deal with '?' we replace it with NaN 
df = df.replace({'?':np.NaN})
print(df.isna().sum())

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11    2480
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
dtype: int64


In [8]:
print(df[11].value_counts())

b    3776
e    1120
c     556
r     192
Name: 11, dtype: int64


In [9]:
#Now to deal with NaN we create a column to keep track of imputed variables and impute the variables with the mode of the values    

#add new column and replace it with binary variables if null then 1 else 0
df["11_imputed"] =   np.where(df[11].isnull(),1,0)

#Take mode in that vairable
Mode = df[11].mode()[0]

#Replace NaN values with mode in actual vairable
df[11].fillna(Mode,inplace=True)

    

In [10]:
print(df.isna().sum())

0             0
1             0
2             0
3             0
4             0
5             0
6             0
7             0
8             0
9             0
10            0
11            0
12            0
13            0
14            0
15            0
16            0
17            0
18            0
19            0
20            0
21            0
22            0
11_imputed    0
dtype: int64


In [11]:
print(df[11].value_counts())

b    6256
e    1120
c     556
r     192
Name: 11, dtype: int64


In [12]:
#lets drop the class label as it should not be used in unsupervised learning algorithms
#lets drop the last column for now as we dont need it for our k-mode clustering
df2 = df.drop(columns=[0,'11_imputed'])
df2


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [15]:
def get_distance(x,c):
    return np.sum(np.array(x) != np.array(c), axis = 0)

In [19]:
def random_clusters(k,n):
    dup = np.array([])
    while 1:
        ranIndex = np.random.randint(low=0, high=n, size=k)
        u, c = np.unique(ranIndex, return_counts=True)
        dup = u[c > 1]
        if dup.size == 0:
            break
    return ranIndex

In [90]:
def kmodes(dataset, NumberOfClusters):
    n = len(dataset)
    d = len(dataset.columns)
    df_temp = dataset.to_numpy()
    addZeros = np.zeros((n, 1))
    df_temp = np.append(df_temp, addZeros, axis=1)
    cluster = df_temp[random_clusters(NumberOfClusters,n)]
    print("\n The initial cluster centers: \n", cluster , "\n\n")
    cluster2 = []   
    for i in range(n):
        minDist = 9999999
        
        for j in range(NumberOfClusters):
            dist = get_distance(cluster[j,0:d],df_temp[i,0:d])
            if(dist < minDist):
                minDist = dist
                clusterNumber = j
                df_temp[i,d] = clusterNumber
                cluster[j,d] = clusterNumber
                
    for j in range(NumberOfClusters):
        result =  np.where(df_temp[:,d] == j)
        mode = stats.mode(df_temp[result])
        cluster[j] = np.reshape(mode[0],(d+1)) 
            
    while(np.any(cluster != cluster2)):
        cluster2 = cluster
        for i in range(n):
            minDist = 9999999
            for j in range(NumberOfClusters):
                dist = get_distance(cluster[j,0:d],df_temp[i,0:d])
                if(dist < minDist):
                    minDist = dist
                    clusterNumber = j
                    df_temp[i,d] = clusterNumber
                    cluster[j,d] = clusterNumber
                    
        for j in range(NumberOfClusters):
            result =  np.where(df_temp[:,d] == j)
            mode = stats.mode(df_temp[result])
            cluster[j] = np.reshape(mode[0],(d+1))
            
        if np.array_equal(cluster,cluster2):
            break
            
    dataset3 = pd.DataFrame(df_temp)
    
    return dataset3

In [97]:
cluster = kmodes(df2,20)


 The initial cluster centers: 
 [['x' 's' 'n' 't' 'n' 'f' 'c' 'b' 'w' 'e' 'b' 's' 's' 'w' 'w' 'p' 'w' 't'
  'p' 'w' 'y' 'p' 0.0]
 ['x' 'y' 'g' 't' 'n' 'f' 'c' 'b' 'n' 't' 'b' 's' 's' 'g' 'w' 'p' 'w' 'o'
  'p' 'n' 'v' 'd' 0.0]
 ['x' 's' 'w' 't' 'p' 'f' 'c' 'n' 'w' 'e' 'e' 's' 's' 'w' 'w' 'p' 'w' 'o'
  'p' 'k' 's' 'g' 0.0]
 ['f' 'f' 'e' 't' 'n' 'f' 'c' 'b' 'n' 't' 'b' 's' 's' 'g' 'p' 'p' 'w' 'o'
  'p' 'n' 'v' 'd' 0.0]
 ['f' 'y' 'n' 't' 'n' 'f' 'c' 'b' 'p' 't' 'b' 's' 's' 'g' 'p' 'p' 'w' 'o'
  'p' 'k' 'y' 'd' 0.0]
 ['x' 'f' 'g' 'f' 'n' 'f' 'w' 'b' 'k' 't' 'e' 's' 'f' 'w' 'w' 'p' 'w' 'o'
  'e' 'n' 'a' 'g' 0.0]
 ['f' 's' 'n' 'f' 'f' 'f' 'c' 'n' 'b' 't' 'b' 'k' 's' 'w' 'p' 'p' 'w' 'o'
  'e' 'w' 'v' 'l' 0.0]
 ['x' 'y' 'n' 'f' 'f' 'f' 'c' 'n' 'b' 't' 'b' 'k' 'k' 'w' 'p' 'p' 'w' 'o'
  'e' 'w' 'v' 'p' 0.0]
 ['b' 's' 'w' 't' 'a' 'f' 'c' 'b' 'n' 'e' 'c' 's' 's' 'w' 'w' 'p' 'w' 'o'
  'p' 'k' 'n' 'g' 0.0]
 ['x' 's' 'g' 'f' 'n' 'f' 'w' 'b' 'w' 'e' 'b' 'k' 's' 'w' 'w' 'p' 'w' 't'
  'p' 'w' 'n' 'g' 0.

  while(np.any(cluster != cluster2)):


In [98]:
cluster = cluster.rename(columns ={22: "Cluster"} )
cluster

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,Cluster
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,2
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,8
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,8
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,2
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,o,o,p,o,o,p,b,c,l,0
8120,x,s,n,f,n,a,c,b,y,e,...,o,o,p,n,o,p,b,v,l,0
8121,f,s,n,f,n,a,c,b,n,e,...,o,o,p,o,o,p,b,c,l,0
8122,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,7


In [99]:
cluster.to_csv("cluster.csv",index=False)