# Clustering Exploration

### Import Dataset

In [15]:
import pandas as pd 
import numpy as np
np.seterr(over='ignore')

# read iris data
df = pd.read_csv('dataset/iris.csv')
iris_features = df.drop('variety', axis = 1)
iris_labels = df.drop(list(iris_features), axis = 1)

# read tennis data
df = pd.read_csv('dataset/tennis.csv')
df = df.drop('day', axis = 1)
tennis_features = df.drop('play', axis = 1)
tennis_labels = df.drop(list(tennis_features), axis = 1)

In [17]:
iris_features.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [18]:
tennis_features.head(5)

Unnamed: 0,outlook,temp,humidity,wind
0,Sunny,Hot,High,Weak
1,Sunny,Hot,High,Strong
2,Overcast,Hot,High,Weak
3,Rain,Mild,High,Weak
4,Rain,Cool,Normal,Weak


### One Hot Encoder 

Konversi data kategorik menjadi sparse

In [19]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

tennis_features = pd.get_dummies(tennis_features)

In [21]:
tennis_features.head()

Unnamed: 0,outlook_Overcast,outlook_Rain,outlook_Sunny,temp_Cool,temp_Hot,temp_Mild,humidity_High,humidity_Normal,wind_Strong,wind_Weak
0,0,0,1,0,1,0,1,0,0,1
1,0,0,1,0,1,0,1,0,1,0
2,1,0,0,0,1,0,1,0,0,1
3,0,1,0,0,0,1,1,0,0,1
4,0,1,0,1,0,0,0,1,0,1


### K-Means

In [23]:
from sklearn.cluster import KMeans

# iris dataset
kmeans = KMeans(n_clusters = 4, random_state = 0)

kmeans.fit(iris_features)
iris_result = kmeans.predict(iris_features)

print(iris_result)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 0 3 0 3 0 3 0 0 0 0 3 0 3 0 0 3 0 3 0 3 3
 3 3 3 3 3 0 0 0 0 3 0 3 3 3 0 0 0 3 0 0 0 0 0 3 0 0 2 3 2 2 2 2 0 2 2 2 3
 3 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 3 2 2 2 3 2 2 2 3 3
 2 3]


In [24]:
# tennis dataset
kmeans = KMeans(n_clusters = 2, random_state = 0)

kmeans.fit(tennis_features)
tennis_result = kmeans.predict(tennis_features)

print(tennis_result)

[1 1 1 1 0 0 0 1 0 0 0 1 0 1]


### Agglomerative Clustering

In [25]:
from sklearn.cluster import AgglomerativeClustering

# iris dataset
clustering = AgglomerativeClustering()
iris_result = clustering.fit_predict(iris_features)

print(iris_result)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [26]:
# tennis dataset
clustering = AgglomerativeClustering()
tennis_result = clustering.fit_predict(tennis_features)

print(tennis_result)

[0 0 0 0 1 1 1 0 1 1 0 0 0 0]


### DBSCAN

In [39]:
from sklearn.cluster import DBSCAN

# iris dataset
clustering = DBSCAN(eps=1, min_samples=4)
iris_result = clustering.fit_predict(iris_features)

print(iris_result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]


In [40]:
# tennis dataset
clustering = DBSCAN(eps=4, min_samples=2)
tennis_result = clustering.fit_predict(tennis_features)

print(tennis_result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0]


### Gaussian Mixture

In [42]:
from sklearn.mixture import GaussianMixture

# iris dataset
model = GaussianMixture()
model.fit(iris_features)
iris_result = model.predict(iris_features)

print(iris_result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [43]:
# tennis dataset
model = GaussianMixture()
model.fit(tennis_features)
tennis_result = model.predict(tennis_features)

print(tennis_result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0]


### K-Medoids

In [44]:
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import distance_metric, type_metric

# iris dataset
features = iris_features
list_features = []
for i in features.index:
    list_features.append(np.array(features.iloc[[i]])[0])
    
metric = distance_metric(type_metric.MINKOWSKI, degree = 4)
# metric = distance_metric(type_metric.CHEBYSHEV)

# set initial medoids
initial_medoids = [1, 6]

# create instance of K-Medoids algorithm
kmedoids_instance = kmedoids(list_features, initial_medoids, metric = metric)

# run cluster analysis and obtain results
kmedoids_instance.process();
clusters = kmedoids_instance.get_clusters()

# show allocated clusters
print(clusters)

[[126, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], [7, 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 98]]


In [45]:
# tennis dataset
features = tennis_features
list_features = []
for i in features.index:
    list_features.append(np.array(features.iloc[[i]])[0])
    
metric = distance_metric(type_metric.MINKOWSKI, degree = 4)
# metric = distance_metric(type_metric.CHEBYSHEV)

# set initial medoids
initial_medoids = [1, 6]

# create instance of K-Medoids algorithm
kmedoids_instance = kmedoids(list_features, initial_medoids, metric = metric)

# run cluster analysis and obtain results
kmedoids_instance.process();
clusters = kmedoids_instance.get_clusters()

# show allocated clusters
print(clusters)

[[7, 0, 1, 2, 3, 10, 11, 13], [4, 5, 6, 8, 9, 12]]


### MST

In [53]:
import matplotlib.pyplot as plt

# iris dataset
from mst_clustering import MSTClustering
model = MSTClustering(cutoff_scale=0.7, approximate=False)
iris_result = model.fit_predict(iris_features)

print(iris_result)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]


In [54]:
# tennis dataset
from mst_clustering import MSTClustering
model = MSTClustering(cutoff_scale=1.414214, approximate=False)
tennis_results = model.fit_predict(tennis_features)

print(tennis_results)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0]


### Grid Clustering

In [66]:
from grid_based_clustering.clusterData import clusterTwoColumns

# print(features[1])

attributes = features.iloc[[1]]
min_den = 10
gridSize = len(attributes)

# Execute clustering strategy.
for i in range(len(attributes)-1):
    for j in range(i+1, len(attributes)-1):
        clusterTwoColumns(attributes[i], attributes[j], features, min_den, gridSize)

# print(features)
# print(attributes)