# **Clustering 01**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load seaborn's bundled iris table and draw a scatter matrix of every
# feature pair, coloured by species.
iris = sns.load_dataset(name="iris")
g = sns.pairplot(data=iris, hue="species")

In [None]:
# Peek at 10 randomly drawn rows; no random_state is passed, so the rows shown can differ between runs.
iris.sample(10)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every species name to an integer code and store it as a new column of `iris`.
iris['species_encoded'] = LabelEncoder().fit(iris['species']).transform(iris['species'])
# C: the integer-coded species as a plain NumPy array.
C = iris['species_encoded'].to_numpy()
iris.sample(10)

In [None]:
# Feature matrix: the four measurement columns as a NumPy array (one row per flower).
X = iris.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].to_numpy()
print(X.shape, type(X), len(C))

In [None]:
# Sepal length against sepal width, one colour per species.
sns.scatterplot(data=iris, x="sepal_length", y="sepal_width", hue="species").set(
    xlabel="Sepal Length(cm)",
    ylabel="Sepal Width(cm)",
)
plt.show()

In [None]:
# Petal length against petal width, one colour per species.
sns.scatterplot(data=iris, x="petal_length", y="petal_width", hue="species").set(
    xlabel="Petal Length(cm)",
    ylabel="Petal Width(cm)",
)
plt.show()

In [None]:
# Create the sample (data) used for clustering.
# The four measurement columns are selected by name rather than by dropping the
# label columns: `iris` gains extra columns later in the notebook (e.g. "labels"),
# and a drop-based selection would silently include them if this cell were re-run.
iris_sample = iris[["sepal_length", "sepal_width", "petal_length", "petal_width"]].values
# True species names, kept aside for evaluating the clusters afterwards.
species = iris["species"].values

iris_sample[:5]

In [None]:
# First five true species names, in the same row order as iris_sample.
species[:5]

In [None]:
from sklearn.cluster import KMeans

# Global seed kept for any later cell that still draws from NumPy's global RNG.
np.random.seed(42)

# Instantiate the KMeans model.
# random_state and n_init are pinned on the estimator itself so the result does not
# depend on global RNG state (cell execution order) or on the n_init default of the
# installed scikit-learn version.
kmean = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model to the sample
kmean.fit(iris_sample)

In [None]:
# Predict the labels: one cluster id per row of iris_sample, from the fitted `kmean` model.
labels = kmean.predict(iris_sample)
# Preview the first five assignments.
labels[:5]

In [None]:
# Pair each predicted cluster id with the true species name, row by row.
df = pd.DataFrame(dict(labels=labels, species=species))
df.head()

In [None]:
# Contingency table: rows are cluster ids, columns are species.
pd.crosstab(index=df["labels"], columns=df["species"])

In [None]:
# Using inertia
# (read from the fitted, unscaled `kmean` model; scikit-learn documents it as the
# sum of squared distances of samples to their closest cluster centre)
kmean.inertia_

In [None]:
# Elbow method: fit k-means for k = 1..5 and compare the inertia of each fit.
clusters = np.arange(1, 6)
inertia = []

for c in clusters:
    # random_state / n_init are pinned so the curve is reproducible regardless of
    # global RNG state and of the installed scikit-learn version's n_init default.
    model = KMeans(n_clusters=c, n_init=10, random_state=42)
    model.fit(iris_sample)
    inertia.append(model.inertia_)

plt.plot(clusters, inertia, marker="o")
plt.xlabel("Number of clusters(k)")
plt.ylabel("Inertia")
plt.title("Elbow curve")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Global seed kept for any later cell that still draws from NumPy's global RNG.
np.random.seed(42)

# Same k-means setup as the unscaled model, seeded on the estimator itself so the
# clustering does not depend on which cells ran before this one.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
sc = StandardScaler()

# Pipeline: standardise the features, then cluster the scaled values.
pipeline = make_pipeline(sc, kmeans)

# Fit the pipeline on the sample
pipeline.fit(iris_sample)

# Predictions (cluster id per row, computed on the standardised features)
labels = pipeline.predict(iris_sample)

In [None]:
# Evaluate the scaled pipeline: tabulate its cluster ids against the true species.
df = pd.DataFrame(dict(labels=labels, species=species))

# Rows are cluster ids, columns are species.
pd.crosstab(index=df["labels"], columns=df["species"])

In [None]:
# Using inertia of the k-means step that was fitted inside the pipeline.
# (`kmean` is the earlier, unscaled model; `kmeans` is the estimator fitted by `pipeline`.)
# The value is measured on standardised features, so it is not on the same scale as
# the inertia of the unscaled model.
kmeans.inertia_

In [None]:
# Add labels to the iris dataset to see if they were really correct
# (this mutates `iris` in place: a new "labels" column holds the current `labels` array)
iris["labels"] = labels
iris.head()

In [None]:
# Petal length against sepal length, coloured by assigned cluster id.
sns.scatterplot(data=iris, x="petal_length", y="sepal_length", hue="labels").set(
    xlabel="Petal Length(cm)",
    ylabel="Sepal Length(cm)",
)
plt.show()

In [None]:
# Petal length against petal width, coloured by assigned cluster id.
sns.scatterplot(data=iris, x="petal_length", y="petal_width", hue="labels").set(
    xlabel="Petal Length(cm)",
    ylabel="Petal Width(cm)",
)
plt.show()

# **Clustering 02**

In [None]:
# Install/upgrade umap-learn and download the helper script Ruri_unsup.py into the
# working directory; both are imported by the next cell (`umap`, `Ruri_unsup`).
# NOTE(review): neither the package version nor the downloaded script is pinned,
# so a later run may pick up different code.
!pip install --upgrade umap-learn
!wget https://raw.githubusercontent.com/masruriyah/DataMining/master/Ruri_unsup.py

In [None]:
# Importing modules for this notebook
# (all warnings are silenced globally by the simplefilter call below)
import warnings; warnings.simplefilter('ignore')
import umap, numpy as np, Ruri_unsup as runsup, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
from sklearn import cluster, datasets
from sklearn.metrics import silhouette_score as siluet
# NOTE: `purity` is only an alias; the function actually imported is homogeneity_score.
from sklearn.metrics.cluster import homogeneity_score as purity
from sklearn.metrics import normalized_mutual_info_score as NMI 

sns.set(style="ticks", color_codes=True)
# Seed passed to the estimators below that accept a random_state argument.
random_state = 99

In [None]:
# We will use 2 datasets: [1] Iris and [2] data for a case study (about Energy) - at the end
# load the iris data
df = sns.load_dataset(name="iris")
# X: the four measurement columns (kept as a DataFrame); C: the true species (kept as a Series).
X = df.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
C = df.loc[:, 'species']
print(X.shape)
df.sample(7)

In [None]:
# Scatter matrix of all feature pairs in `df`, coloured by the true species.
g = sns.pairplot(df, hue="species")

In [None]:
# k-means: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
# Plain k-means with random centroid initialisation and a fixed seed.
k = 3
km = cluster.KMeans(n_clusters=k, init='random', random_state=0).fit(X)
# The clustering result: one cluster id per row of X
C_km = km.predict(X)
# Bar chart of how many rows fell into each cluster
p = sns.countplot(x=C_km)

In [None]:
# The cluster "labels" produced by the k-means run above.
C_km

In [None]:
# Embed X with UMAP (seeded through random_state) for plotting only, then draw the
# first two embedding coordinates coloured by the k-means cluster id.
X2D = umap.UMAP(n_neighbors=5, min_dist=0.3, random_state=random_state).fit_transform(X)
fig, ax = plt.subplots()
ax.scatter(X2D[:,0], X2D[:,1], c=C_km)
plt.show()

In [None]:
# Store the k-means cluster ids on the frame and plot every feature pair coloured by them.
df['k-means'] = C_km
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'k-means']],
    hue="k-means",
    diag_kind="hist",
    palette="tab10",
)

In [None]:
# k-means: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
# Same random-init k-means, this time with max_iter and tol spelled out.
k = 3
km = cluster.KMeans(
    n_clusters=k,
    init='random',
    max_iter=300,
    tol=0.0001,
    random_state=0,
).fit(X)
# The clustering result: one cluster id per row of X
C_km = km.predict(X)
p = sns.countplot(x=C_km)

In [None]:
# k-means++ clustering http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# Identical setup to the random-init run, but with k-means++ seeding of the centroids.
k = 3
kmPP = cluster.KMeans(
    n_clusters=k,
    init='k-means++',
    max_iter=300,
    tol=0.0001,
    random_state=random_state,
).fit(X)
C_kmpp = kmPP.predict(X)

sns.countplot(x=C_kmpp)
C_kmpp[:10]

In [None]:
# Store the k-means++ cluster ids on the frame and plot every feature pair coloured by them.
df['k-means++'] = C_kmpp
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'k-means++']],
    hue="k-means++",
    diag_kind="hist",
    palette="tab10",
)

In [None]:
# MiniBatch k-Means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html
# Random initialisation, mini-batches of 100 rows; `k` is the cluster count set in an earlier cell.
mbkm = cluster.MiniBatchKMeans(
    n_clusters=k,
    init='random',
    max_iter=300,
    tol=0.0001,
    batch_size=100,
    random_state=random_state,
).fit(X)
C_mbkm = mbkm.predict(X)
sns.countplot(x=C_mbkm)
C_mbkm[:10]

In [None]:
# Store the MiniBatch k-means cluster ids on the frame and plot every feature pair coloured by them.
df['mini-k-means'] = C_mbkm
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'mini-k-means']],
    hue="mini-k-means",
    diag_kind="hist",
    palette="tab10",
)

In [None]:
# MiniBatch k-Means++
# Same mini-batch estimator with k-means++ seeding (batch_size is left at its default);
# `k` is the cluster count set in an earlier cell.
mbkmPP = cluster.MiniBatchKMeans(
    n_clusters=k,
    init='k-means++',
    max_iter=300,
    tol=0.0001,
    random_state=random_state,
).fit(X)
C_mbkmPP = mbkmPP.predict(X)
sns.countplot(x=C_mbkmPP)
C_mbkmPP[:10]

In [None]:
# Saving the Clustering results for future use/analysis
# The output folder is created from Python instead of `!mkdir data`: the call is a
# no-op when the folder already exists (so the cell can be re-run cleanly) and it
# does not depend on a particular shell.
import os

os.makedirs("data", exist_ok=True)
df.to_csv("data/df_Module-05.csv", encoding='utf8', index=False)

In [None]:
# Let's redo the earlier k-means clustering
k = 3
km = cluster.KMeans(
    n_clusters=k,
    init='random',
    max_iter=300,
    tol=0.0001,
    random_state=0,
).fit(X)
# The clustering result: one cluster id per row of X
C_km = km.predict(X)
p = sns.countplot(x=C_km)

In [None]:
# Note: inertia is read from the fitted model variable "km", NOT from the label array C_km
km.inertia_

In [None]:
# Elbow curve: inertia of a k-means fit for every cluster count in [k1, kN).
distorsions, k1, kN = [], 2, 10
# The loop variable is deliberately NOT called `k`: `k` is the notebook-level cluster
# count used by other cells, and looping over it would leave it at kN - 1 afterwards.
for n_clusters in range(k1, kN):
    # Seeded so the curve is identical on every run.
    kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
    distorsions.append(kmeans.inertia_)

plt.plot(range(k1, kN), distorsions); plt.grid(True)
plt.title('Elbow curve')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Call the helper from the downloaded Ruri_unsup module; judging by its name it picks
# the number of k-means clusters from silhouette scores.
# NOTE(review): it is called with no arguments and its source is not part of this
# notebook — confirm which data it uses and what it returns/plots.
runsup.sil_based_optimal_km()


In [None]:
# Internal evaluation, e.g. the silhouette coefficient ==> warning: only suitable for k-means (centroid-based clustering)
# Prints one silhouette score per clustering, in the order they appear in the list.
Hasil_Clustering = [C_km, C_kmpp, C_mbkm, C_mbkmPP]
for res in Hasil_Clustering:
    print(siluet(X,res), end=', ')

In [None]:
# What about external evaluation?
# "C" is the gold standard here, e.g. the iris species as determined by biologists
# NOTE: `purity` is the import alias of sklearn's homogeneity_score, so the printed
# numbers are homogeneity scores; C (ground truth) is passed first, the clustering second.
for res in Hasil_Clustering:
    print(purity(C,res), end=', ')

In [None]:
# Centroid coordinates learned by the k-means++ model.
kmPP.cluster_centers_

In [None]:
# Evaluation is actually not that important in unsupervised learning.
# This is what distinguishes "clustering" from "clustering analysis":
# interpretation matters more - but how?
# Example with k-means++: draw each centroid as one line across the four features.

cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# One row per centroid, plus a 'cluster' column carrying the centroid's row index.
dfC = pd.DataFrame(kmPP.cluster_centers_, columns=cols).assign(
    cluster=lambda centers: centers.index
)

pd.plotting.parallel_coordinates(dfC, 'cluster', color=('r', 'g', 'b'))
plt.show()

In [None]:
!mkdir data
!pip install scikit-learn-extra

In [None]:
# Importing modules for this notebook
# (all warnings are silenced globally by the simplefilter call below)
import warnings; warnings.simplefilter('ignore')
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
from sklearn import cluster
from scipy.spatial import distance
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.spatial.distance
from sklearn_extra.cluster import KMedoids
from sklearn.ensemble import IsolationForest

sns.set(style="ticks", color_codes=True)
# Seed for estimators/samplers that accept a random_state argument.
random_state = 99

In [None]:
# Loading the clustering Data from the previous module
file_ = 'data/df_Module-05.csv'
try: # Running Locally, yakinkan "file_" berada di folder "data"
    df = pd.read_csv(file_, error_bad_lines=False, low_memory = False, encoding='utf8')
except: # Running in Google Colab
    !mkdir data
    !wget -P data/ https://raw.githubusercontent.com/taudata-indonesia/ptpjb/master/{file_}
    df = pd.read_csv(file_, error_bad_lines=False, low_memory = False, encoding='utf8')

X = df[['sepal_length','sepal_width','petal_length','petal_width']]
C = df['species']#.values
df.head()


In [None]:
# Toy data ==> this needs further study
# Seven 2-D points given as [x, y] rows.
A = np.array([
    [2, 0], [1, 2], [2, 2],
    [7, 2], [6, 1], [7, 0],
    [10, 10],
])
fig, ax = plt.subplots()
# A.T unpacks into the x-coordinates and the y-coordinates.
ax.scatter(*A.T)
plt.show()

In [None]:
# k-Means solution
# Seeded with the notebook-level random_state so the toy result is the same on every run
# (k-means++ initialisation is randomised).
km = cluster.KMeans(n_clusters=2, init='k-means++', random_state=random_state).fit(A)
C_km = km.predict(A)
centroid_km = km.cluster_centers_

# k-Medoid solution: https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
kmedoids = KMedoids(n_clusters=2, random_state=random_state).fit(A)
C_med = kmedoids.predict(A)
# For k-medoids the "centres" are the selected medoids.
centroid_med = kmedoids.cluster_centers_
"Done"

In [None]:
# Plot the k-Means solution
fig, ax = plt.subplots()
# Data points, coloured by cluster id (s = marker size)
ax.scatter(*A.T, c=C_km, s=100)
# k-Means centroids in red
ax.scatter(*centroid_km.T, c="red", s=30)
plt.show()
C_km

In [None]:
# Plot the k-Medoid solution
fig, ax = plt.subplots()
# Data points, coloured by cluster id
ax.scatter(*A.T, c=C_med, s=100)
# k-Medoid centres in green
ax.scatter(*centroid_med.T, c="green", s=30)
plt.show()
# This example shows why outlier detection matters!

In [None]:
# k-Medoids with three clusters on the iris features.
kmedoids = KMedoids(n_clusters=3)
kmedoids.fit(X)
C_medoid = kmedoids.predict(X)
sns.countplot(x=C_medoid)
C_medoid[:10]

In [None]:
# Attach the k-medoid cluster ids to the frame.
df['k-medoid'] = C_medoid
# Persist the clustering results for future use/analysis.
df.to_csv("data/df_Module-06.csv", encoding='utf8', index=False)
# Plot every feature pair coloured by the k-medoid cluster.
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'k-medoid']],
    hue="k-medoid",
    diag_kind="hist",
    palette="tab10",
)


In [None]:
# Hierarchical http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering
# Average-linkage agglomerative clustering with Euclidean distance.
# The distance argument is left at its default (Euclidean): the `affinity` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4 in favour of `metric`, so
# omitting it keeps the cell working on both old and new releases.
hierarchical = cluster.AgglomerativeClustering(n_clusters=3, linkage='average')
hierarchical.fit(X)  # Author's note: slow ... and uses a lot of memory, O(N^2 log(N))
# `np.int` was removed in NumPy 1.24; the builtin `int` is the documented replacement.
C_h = hierarchical.labels_.astype(int)

sns.countplot(x=C_h)
C_h[:10]

In [None]:
# Attach the hierarchical cluster ids to the frame.
df['Hierarchical'] = C_h
# Persist the clustering results for future use/analysis.
df.to_csv("data/df_Module-06.csv", encoding='utf8', index=False)
# Plot every feature pair coloured by the hierarchical cluster.
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'Hierarchical']],
    hue="Hierarchical",
    diag_kind="hist",
    palette="tab10",
)

In [None]:
# Dendrogram example
# http://seaborn.pydata.org/generated/seaborn.clustermap.html
# Heatmap of X with hierarchical clustering (average linkage, Euclidean metric).
g = sns.clustermap(X, method="average", metric="euclidean")

In [None]:
# Illustration for when the data set is not too large
N = 30  # sample of 30 records
# Seeded so the same 30 flowers (and hence the same dendrogram) appear on every run.
S = X.sample(N, random_state=random_state)
# Display names "Bunga-0" .. "Bunga-29": positional within the sample, not the original row index.
Nama = ["Bunga-"+str(i) for i in range(N)]
# Condensed pairwise Euclidean distances between the sampled rows.
M = scipy.spatial.distance.pdist(S, 'euclidean')
print(S.shape, M.shape)
S.head()

In [None]:
plt.figure(figsize=(12, 16))
# Square (N x N) form of the distances, kept only for inspection.
dists = scipy.spatial.distance.squareform(M)
# linkage() expects the CONDENSED distance vector produced by pdist. A 2-D array is
# interpreted as raw observation vectors, so passing the square matrix `dists` would
# cluster the rows of the distance matrix instead of using the distances themselves.
Z = linkage(M, "average")
dendrogram(Z, labels=Nama, leaf_font_size=12, orientation='right')
plt.title("Clustering Sampel Bunga Iris")
plt.show()

In [None]:
# DBSCAN http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
# Does not need the number of clusters k as input ... very useful for clustering large data
dbscan = cluster.DBSCAN(eps=0.625, min_samples=5, metric='euclidean')
dbscan.fit(X)
# `np.int` was removed in NumPy 1.24; the builtin `int` is the documented replacement.
C_db = dbscan.labels_.astype(int)
sns.countplot(x=C_db)
C_db[:10]
# What does the cluster label -1 mean?

In [None]:
# Number of outliers detected (rows whose DBSCAN label is -1)
int(np.count_nonzero(C_db == -1))

In [None]:
# Attach the DBSCAN cluster ids to the frame.
df['Dbscan'] = C_db
# Persist the clustering results for future use/analysis.
df.to_csv("data/df_Module-06.csv", encoding='utf8', index=False)
# Plot every feature pair coloured by the DBSCAN cluster.
g = sns.pairplot(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'Dbscan']],
    hue="Dbscan",
    diag_kind="hist",
    palette="tab10",
)