# Cluster Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from scipy.cluster import hierarchy

## Import data

In [None]:
# pull the bundled iris dataset from scikit-learn
iris = datasets.load_iris()

# measurements become the columns; the class code from iris.target rides along as 'species'
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(species=iris.target)

# peek at the first ten rows
iris_df.head(10)

## EDA

In [None]:
# colour for each true species code (keys are the stringified codes)
cmap = {'0': 'r', '1': 'g', '2': 'b'}

# per-row colour column; the lookup raises KeyError if a species code has no colour
iris_df['cspecies'] = [cmap[str(code)] for code in iris_df.species]

# petal length vs. petal width, coloured by the true species.
# One call draws one figure; the trailing ';' hides the Axes repr in the cell output.
iris_df.plot('petal length (cm)', 'petal width (cm)', kind='scatter', c=iris_df.cspecies,
             title='Iris petals coloured by true species');

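Those are the 'real' species labels. But what if we didn't know them? Let's remove the labels and see whether clustering can recover the groups from the measurements alone.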

In [None]:
# columns that carry the answer (species code and its plotting colour)
label_columns = ['species', 'cspecies']

# feature matrix for clustering: everything except the label columns
X = iris_df.drop(columns=label_columns)

X.head(10)

## Clustering

### k-means

In [None]:
# Configure k-means for a chosen k.
# random_state pins the centroid initialisation, so the cluster numbering (and the
# colours mapped from it further down) is the same on every Restart & Run All.
# n_init is spelled out because scikit-learn's default changed from 10 to 'auto' (v1.4).
km = cluster.KMeans(n_clusters=3, n_init=10, random_state=42)

Let's run the clustering

In [None]:
# Fit the k-means estimator configured above to the unlabelled feature frame X.
# Once fitted, the per-row cluster assignments are read from km.labels_.
km.fit(X)

Let's take a look at the result!

In [None]:
# Cluster index that the fitted k-means model assigned to each row of X
km.labels_

## Validation

In [None]:
# start from the unlabelled features and attach each row's k-means cluster index;
# assign() returns a new frame, so X itself is left untouched
validate = X.assign(km_clusters=km.labels_)

validate.head(10)

In [None]:
# colour per cluster index, keyed by the stringified index
# (extend this dict if you ask for more clusters than it has entries)
cclus = {'0': 'r', '1': 'g', '2': 'b', '3': 'y'}

# look up the colour for every row's k-means cluster; an unmapped index raises KeyError
validate['ckm_clusters'] = [cclus[str(label)] for label in validate.km_clusters]

validate.head(10)

In [None]:
# petal length vs. petal width, one colour per k-means cluster.
# One call draws one figure; the trailing ';' hides the Axes repr in the cell output.
validate.plot('petal length (cm)', 'petal width (cm)', kind='scatter', c=validate.ckm_clusters,
              title='k-means clusters');

## Hierarchical clustering

Hierarchical (agglomerative) clustering using Ward's method, which is the default linkage of scikit-learn's `AgglomerativeClustering`.

In [None]:
# build the agglomerative clusterer: four clusters, merged with Ward linkage
# (linkage='ward' is the library default, written out here to match the heading)
hc = cluster.AgglomerativeClustering(n_clusters=4, linkage='ward')

In [None]:
# Fit the agglomerative model to X, the same unlabelled feature frame used for k-means
hc.fit(X)

In [None]:
# Cluster index that the fitted agglomerative model assigned to each row of X
hc.labels_

In [None]:
# append the agglomerative cluster index alongside the k-means columns
validate = validate.assign(hc_clusters=hc.labels_)

validate.head(10)

In [None]:
# colour each row by its hierarchical cluster, reusing the cclus colour dict
# defined in the k-means colour-mapping cell; an unmapped index raises KeyError
validate['chc_clusters'] = [cclus[str(label)] for label in validate.hc_clusters]

validate.head(10)

In [None]:
# petal length vs. petal width, one colour per hierarchical cluster.
# One call draws one figure; the trailing ';' hides the Axes repr in the cell output.
validate.plot('petal length (cm)', 'petal width (cm)', kind='scatter', c=validate.chc_clusters,
              title='Hierarchical (Ward) clusters');

Now it's up to you... try the DBSCAN method (`cluster.DBSCAN`)!