In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# importing the libraries
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [5]:
# Load the iris CSV: skip the file's first row and label the columns ourselves.
# NOTE(review): only five names are supplied, yet the preview shows a separate
# 1-based index, so the file presumably carries a leading id column that
# pandas picks up as the index -- confirm against the CSV.
iris_df = pd.read_csv('datasets/Iris.csv',
                      skiprows=1,
                      names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'])

# Preview the first five rows.
iris_df.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Shuffle every row (frac=1) and rebuild a fresh 0..n-1 index.
# The fixed seed keeps the shuffle -- and anything that depends on row
# order -- repeatable across Restart Kernel -> Run All.
iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,2.5,4.5,1.7,Iris-virginica
1,6.1,2.9,4.7,1.4,Iris-versicolor
2,5.1,3.8,1.5,0.3,Iris-setosa
3,7.7,3.8,6.7,2.2,Iris-virginica
4,5.1,3.7,1.5,0.4,Iris-setosa


In [7]:
from sklearn import preprocessing

# Encoder kept in its own variable so it stays available after this cell.
label_encoding = preprocessing.LabelEncoder()

# Overwrite the 'class' column in place with the encoder's integer codes;
# the values are cast to str before being handed to the encoder.
iris_df['class'] = label_encoding.fit_transform(iris_df['class'].astype(str))

# Preview the frame with the encoded 'class' column.
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.9,2.5,4.5,1.7,2
1,6.1,2.9,4.7,1.4,1
2,5.1,3.8,1.5,0.3,0
3,7.7,3.8,6.7,2.2,2
4,5.1,3.7,1.5,0.4,0


In [8]:
# Feature matrix: every column of iris_df except the 'class' target.
iris_features = iris_df.drop(columns='class')



In [9]:
# Preview the feature-only frame (the 'class' column has been dropped).
iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,4.9,2.5,4.5,1.7
1,6.1,2.9,4.7,1.4
2,5.1,3.8,1.5,0.3
3,7.7,3.8,6.7,2.2
4,5.1,3.7,1.5,0.4


In [10]:
# Target labels: the 'class' column of iris_df, used as ground truth when
# scoring the clusterings below.
iris_labels = iris_df['class']

# Show five randomly picked labels.
iris_labels.sample(5)

71     1
7      2
105    1
90     2
79     2
Name: class, dtype: int32

In [11]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model on ``data`` and print its evaluation scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object
        that exposes a ``labels_`` attribute.
    data : array-like
        Samples handed to ``clustering_model`` and to the silhouette score.
    labels : array-like
        Ground-truth labels the predicted clusters are compared against.

    Prints a tab-separated header, a 50-character rule, and one row holding
    homogeneity, completeness, V-measure, adjusted Rand index, adjusted
    mutual information and silhouette score, each with three decimals.
    The silhouette column shows ``nan`` when the model produced a single
    cluster or one cluster per sample. Returns ``None``.
    """
    model = clustering_model(data)
    predicted = model.labels_

    # Real tab characters so the header lines up with the score row below
    # (the header used to contain a literal '/t').
    print('homo\tcompl\tv_meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    # silhouette_score only accepts 2 .. n_samples - 1 distinct labels and
    # raises otherwise (e.g. DBSCAN marking every point as noise); report
    # nan in that case so the label-based scores are still printed.
    n_found = len(set(predicted))
    if 1 < n_found < len(data):
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float('nan')

    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             silhouette))

# K Means Clustering

In [12]:
def k_means(data, n_clusters = 3, max_iter = 1000, random_state = None):
    """Fit a KMeans estimator on ``data`` and return it.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int, default 3
        Forwarded to ``KMeans(n_clusters=...)``.
    max_iter : int, default 1000
        Forwarded to ``KMeans(max_iter=...)``.
    random_state : optional, default None
        Forwarded to ``KMeans(random_state=...)``; pass a fixed value to make
        the clustering repeatable between runs. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    Whatever ``KMeans(...).fit(data)`` returns (the fitted estimator).
    """
    clusterer = KMeans(n_clusters=n_clusters,
                       max_iter=max_iter,
                       random_state=random_state)
    return clusterer.fit(data)

In [13]:
# Score K-means (default n_clusters=3) against the encoded iris class labels.
build_model(k_means, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


# Agglomerative Clustering


In [14]:
def agglomerative_fn(data, n_clusters=3):
    """Fit AgglomerativeClustering with ``n_clusters`` clusters on ``data``.

    Returns the result of ``.fit(data)`` so callers can read ``labels_``.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    fitted = clusterer.fit(data)
    return fitted

In [15]:
# Score agglomerative clustering (default n_clusters=3) against the encoded iris class labels.
build_model(agglomerative_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


# DBSCAN

In [16]:
def dbscan_fn(data, eps =0.45, min_samples=4):
    """Fit DBSCAN on ``data`` with the given ``eps`` and ``min_samples``.

    Returns the result of ``.fit(data)`` so callers can read ``labels_``.
    """
    settings = {'eps': eps, 'min_samples': min_samples}
    return DBSCAN(**settings).fit(data)

In [17]:
# Score DBSCAN (defaults eps=0.45, min_samples=4) against the encoded iris class labels.
build_model(dbscan_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.577	0.609	0.593	0.508	0.584	0.372


# Mean shift clustering

In [18]:
def mean_shift_fn(data, bandwidth=0.85):
    """Fit MeanShift on ``data`` using the given ``bandwidth``.

    Returns the result of ``.fit(data)`` so callers can read ``labels_``.
    """
    return MeanShift(bandwidth=bandwidth).fit(data)

In [19]:
# Score mean shift (default bandwidth=0.85) against the encoded iris class labels.
build_model(mean_shift_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.763	0.551


# BIRCH

In [20]:
def birch_fn(data, n_clusters=3):
    """Fit Birch with ``n_clusters`` clusters on ``data``.

    Returns the result of ``.fit(data)`` so callers can read ``labels_``.
    """
    birch = Birch(n_clusters=n_clusters)
    return birch.fit(data)

In [21]:
# Score BIRCH (default n_clusters=3) against the encoded iris class labels.
build_model(birch_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.630	0.798	0.704	0.564	0.700	0.534


# Affinity propagation

In [22]:
def affinity_propogation_fn(data, damping=0.6, max_iter = 1000):
    """Fit AffinityPropagation on ``data`` with the given ``damping`` and ``max_iter``.

    Returns the result of ``.fit(data)`` so callers can read ``labels_``.
    """
    propagation = AffinityPropagation(
        damping=damping,
        max_iter=max_iter,
    )
    fitted = propagation.fit(data)
    return fitted

In [23]:
# Score affinity propagation (defaults damping=0.6, max_iter=1000) against the encoded iris class labels.
build_model(affinity_propogation_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.851	0.492	0.623	0.437	0.612	0.349




# Mini batch k means 

In [24]:
def mini_batch_kmeans_fn(data, n_clusters=3, max_iter =1000, batch_size=20, random_state=None):
    """Fit a MiniBatchKMeans estimator on ``data`` and return it.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    n_clusters : int, default 3
        Forwarded to ``MiniBatchKMeans(n_clusters=...)``.
    max_iter : int, default 1000
        Forwarded to ``MiniBatchKMeans(max_iter=...)``.
    batch_size : int, default 20
        Forwarded to ``MiniBatchKMeans(batch_size=...)``; this value used to
        be fixed at 20 inside the function.
    random_state : optional, default None
        Forwarded to ``MiniBatchKMeans(random_state=...)``; pass a fixed
        value to make the clustering repeatable between runs. ``None`` keeps
        the previous unseeded behaviour.

    Returns
    -------
    Whatever ``MiniBatchKMeans(...).fit(data)`` returns (the fitted estimator).
    """
    clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                max_iter=max_iter,
                                batch_size=batch_size,
                                random_state=random_state)
    return clusterer.fit(data)

In [25]:
# Score mini-batch K-means (default n_clusters=3) against the encoded iris class labels.
build_model(mini_batch_kmeans_fn, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


# Spectral

In [26]:
from sklearn.cluster import SpectralClustering

In [27]:
# Self-similarity: the value placed on the diagonal of similarity_mat
# (each point compared with itself).
SS=1000

In [28]:
# Intra-cluster similarity: the value used in similarity_mat between two
# different points that belong to the same group of three.
IS = 10

In [29]:
# Low similarity: the value used in similarity_mat between points that
# belong to different groups.
LS=0.01

In [30]:
# 9x9 similarity matrix for three groups of three consecutive points:
# SS on the diagonal, IS between distinct points of the same group,
# LS between points of different groups.
similarity_mat = [
    [SS if row == col else IS if row // 3 == col // 3 else LS
     for col in range(9)]
    for row in range(9)
]

In [31]:
# affinity='precomputed': similarity_mat is passed to fit() as the
# ready-made affinity matrix rather than as raw samples.
spectral_model = SpectralClustering(
    affinity='precomputed',
    n_clusters=3,
).fit(similarity_mat)

In [32]:
# Cluster label assigned to each of the nine rows of similarity_mat.
spectral_model.labels_

array([0, 0, 0, 1, 1, 1, 2, 2, 2])