In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [5]:
# Load the Iris measurements. The first line of the file is skipped and the
# columns are named explicitly instead.
# NOTE(review): the rendered preview shows an index starting at 1, which
# suggests the file carries one more column than the five names given here
# and pandas is using it as the index -- confirm against the CSV.
iris_df = pd.read_csv(
    'datasets/Iris.csv',
    names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
    skiprows=1,
)

# Preview the first five rows.
iris_df.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Shuffle all rows (frac=1 keeps every row) and rebuild a clean 0..n-1 index.
# A fixed random_state makes the row order -- and therefore every preview
# shown below -- reproducible under Restart Kernel -> Run All. The value 42
# is arbitrary; any integer works.
iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.0,3.3,1.4,0.2,Iris-setosa
1,7.2,3.6,6.1,2.5,Iris-virginica
2,6.5,3.0,5.8,2.2,Iris-virginica
3,6.2,2.8,4.8,1.8,Iris-virginica
4,5.4,3.4,1.7,0.2,Iris-setosa


In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from sklearn import preprocessing

# Encoder used to turn the class names into integer codes.
label_encoding = preprocessing.LabelEncoder()

# Overwrite the 'class' column with its encoded form. The values are cast to
# str before encoding, so the encoder always sees strings.
iris_df['class'] = label_encoding.fit_transform(iris_df['class'].astype(str))

# Preview: 'class' should now hold integers instead of species names.
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.0,3.3,1.4,0.2,0
1,7.2,3.6,6.1,2.5,2
2,6.5,3.0,5.8,2.2,2
3,6.2,2.8,4.8,1.8,2
4,5.4,3.4,1.7,0.2,0


In [8]:
# Feature matrix for clustering: every column of iris_df except the
# 'class' target column. drop() returns a new frame; iris_df is unchanged.
iris_features = iris_df.drop(columns='class')

# Preview the first rows of the feature matrix.
iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,5.0,3.3,1.4,0.2
1,7.2,3.6,6.1,2.5
2,6.5,3.0,5.8,2.2
3,6.2,2.8,4.8,1.8
4,5.4,3.4,1.7,0.2


In [9]:
# Ground-truth labels: the 'class' column of iris_df, kept separately so the
# clustering results can be scored against it.
iris_labels = iris_df['class']

# Show five randomly chosen labels (unseeded, so the rows differ per run).
iris_labels.sample(5)

45     1
42     1
106    0
79     1
74     2
Name: class, dtype: int32

In [10]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``. It must return a fitted object
        that exposes the cluster assignments as ``labels_``.
    data
        Feature matrix. It is passed to ``clustering_model`` and to the
        silhouette score.
    labels
        Ground-truth labels that the cluster assignments are compared with.

    Returns
    -------
    None
        The scores are only printed: a tab-separated header, a 50-dash rule,
        then homogeneity, completeness, V-measure, adjusted Rand index,
        adjusted mutual information and silhouette, each to three decimals.
    """
    model = clustering_model(data)
    predicted = model.labels_

    # The header must use real tab characters ('\t', not '/t') so that its
    # columns line up with the tab-separated score row printed below.
    print('homo\tcompl\tv_meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    scores = (
        # The first five scores compare the assignments with the true labels.
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
        # Silhouette is computed from the features and assignments only.
        metrics.silhouette_score(data, predicted),
    )
    print('\t'.join('%.3f' % score for score in scores))

In [12]:
def k_means(data, n_clusters=3, max_iter=1000, random_state=None):
    """Fit a KMeans model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data
        Feature matrix passed to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap passed to ``KMeans``.
    random_state : int or None, default None
        Seed passed to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to make the clustering (and any
        scores derived from it) reproducible between runs.

    Returns
    -------
    The fitted ``KMeans`` instance.
    """
    model = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    )
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return model.fit(data)

In [14]:
# Cluster the Iris features with k_means (called with its default arguments,
# since build_model passes only the data) and print how well the clusters
# agree with the true labels.
build_model(k_means, iris_features, iris_labels)

homo/tcompl/tv_meas/tARI/tAMI/tsilhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553
