In [1]:
## normal imports and setup
%reset -f
%matplotlib notebook
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt

In [2]:
## imports for jupyterthemes
from jupyterthemes import jtplot
jtplot.style()

In [3]:
## imports for sklearn
import sklearn
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [4]:
## misc imports
import warnings
warnings.filterwarnings('ignore')

In [5]:

# load data
iris_df = pd.read_csv('datasets/iris.csv',
                     skiprows=1,
                     names = ['sepal-length',
                             'sepal-width',
                             'petal-length',
                             'petal-width',
                             'class'])
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# randomize rows
iris_df = iris_df.sample(frac=1).reset_index(drop=True)
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.8,2.7,4.1,1.0,Iris-versicolor
1,5.1,3.5,1.4,0.3,Iris-setosa
2,6.5,2.8,4.6,1.5,Iris-versicolor
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.4,3.4,1.5,0.4,Iris-setosa


In [7]:
iris_df.shape

(150, 5)

In [8]:
## label encode the data
from sklearn import preprocessing
label_encoding = preprocessing.LabelEncoder()
iris_df['class'] = label_encoding.fit_transform(iris_df['class'].astype(str))
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.8,2.7,4.1,1.0,1
1,5.1,3.5,1.4,0.3,0
2,6.5,2.8,4.6,1.5,1
3,4.6,3.4,1.4,0.3,0
4,5.4,3.4,1.5,0.4,0


In [9]:
iris_features = iris_df.drop('class',axis=1)
iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,5.8,2.7,4.1,1.0
1,5.1,3.5,1.4,0.3
2,6.5,2.8,4.6,1.5
3,4.6,3.4,1.4,0.3
4,5.4,3.4,1.5,0.4


In [10]:
iris_labels = iris_df['class']
iris_labels.sample(5)

6      2
133    1
36     0
126    0
46     0
Name: class, dtype: int64

In [11]:
## build model and print metrics
def build_model(clustering_model, data, labels):
    model = clustering_model(data)
    print('home\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50*'-')
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
         %(metrics.homogeneity_score(labels,model.labels_),
          metrics.completeness_score(labels,model.labels_),
          metrics.v_measure_score(labels,model.labels_),
          metrics.adjusted_rand_score(labels,model.labels_),
          metrics.adjusted_mutual_info_score(labels,model.labels_),
          metrics.silhouette_score(data,model.labels_)))
    

In [12]:
## kmeans cluster
def k_means(data, n_clusters=3, max_iter=1000):
    model = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit(data)
    return model
build_model(k_means, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.748	0.553


In [13]:
## default linkage is ward
## minimizes the variance of the data points in the two clusters before joining them
def agglomerative_fn(data, n_clusters=3):
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(data)
    return model
build_model(agglomerative_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.758	0.554


In [21]:
##dbscan function
def dbscan_fn(data, eps=.45, min_samples=4):
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    return model
build_model(agglomerative_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.758	0.554


In [22]:
## meanshift clustering
## meanshift uses binning techniques to try and find starting seeds for data
def mean_shift_fn(data, bandwidth=0.85):
    model = MeanShift(bandwidth=bandwidth).fit(data)
    return model
build_model(mean_shift_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.757	0.551


In [23]:
## birch clustering
def birch_fn(data, n_clusters=3):
    model = Birch(n_clusters=n_clusters).fit(data)
    return model
build_model(birch_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.635	0.792	0.705	0.566	0.630	0.534


In [24]:
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    model = AffinityPropagation(damping=damping, max_iter=max_iter).fit(data)
    return model
build_model(birch_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.635	0.792	0.705	0.566	0.630	0.534


In [25]:
## mini batch kmeans
def mini_batch_kmeans_fn(data, n_clusters=3, max_iter=1000):
    model = MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_iter, batch_size=20).fit(data)
    return model
build_model(mini_batch_kmeans_fn, iris_features, iris_labels)

home	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.736	0.747	0.742	0.716	0.733	0.551


In [39]:
from sklearn.cluster import SpectralClustering
SS = 1000  # self similarity, the similarity of a point with itself
IS = 10    # intracluster similarity, the similarity between points in a cluster
LS = 0.01  # low similarity, similarity between points between different clusters
## create sample similarity matrix
similarity_mat = [[SS, IS, IS, LS, LS, LS, LS, LS, LS],
                  [IS, SS, IS, LS, LS, LS, LS, LS, LS],
                  [IS, IS, SS, LS, LS, LS, LS, LS, LS],
                  [LS, LS, LS, SS, IS, IS, LS, LS, LS],
                  [LS, LS, LS, IS, SS, IS, LS, LS, LS],
                  [LS, LS, LS, IS, IS, SS, LS, LS, LS],
                  [LS, LS, LS, LS, LS, LS, SS, IS, IS],
                  [LS, LS, LS, LS, LS, LS, IS, SS, IS],
                  [LS, LS, LS, LS, LS, LS, IS, IS, SS]] 
similarity_mat

[[1000, 10, 10, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [10, 1000, 10, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [10, 10, 1000, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 1000, 10, 10, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 10, 1000, 10, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 10, 10, 1000, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 1000, 10, 10],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 10, 1000, 10],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 10, 10, 1000]]

In [40]:
spectral_model = SpectralClustering(n_clusters=3, affinity='precomputed').fit(similarity_mat)
spectral_model.labels_

array([1, 1, 1, 0, 0, 0, 2, 2, 2], dtype=int32)