# Clustering Extras
## Minh Nguyen

### I'm gonna use the ___wine_quality___ dataset from HW1

In [58]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

# Sets parameters for the entire notebook for Seaborn/Matplotlib plots
rcParams['figure.figsize'] = 10, 7
rcParams.update({'font.size': 12})
sns.set_style('darkgrid')

In [59]:
wine_data = pd.read_csv('wine_quality.csv')
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.182611
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,4.874886
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,4.755772
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,5.783057
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.199293


- I will do with the original data and with scaled data.

### __Original__

In [60]:
X_unscaled = wine_data.drop('quality', axis=1)
y_unscaled = wine_data['quality']

### KMeans clustering

In [61]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_unscaled)
kmeans_silhouette = silhouette_score(X_unscaled, kmeans.labels_)
print("KMeans Silhouette Score:", kmeans_silhouette)

KMeans Silhouette Score: 0.5197446557017006


### Hierarchial clustering

In [62]:
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical.fit(X_unscaled)
hierarchical_silhouette = silhouette_score(X_unscaled, hierarchical.labels_)
print("Hierarchical Silhouette Score:", hierarchical_silhouette)

Hierarchical Silhouette Score: 0.4739848424880398


### DBSCAN

In [63]:
dbscan = DBSCAN(eps=0.5, min_samples=3)
dbscan.fit(X_unscaled)
dbscan_silhouette = silhouette_score(X_unscaled, dbscan.labels_)
print("DBSCAN Silhouette Score:", dbscan_silhouette)

DBSCAN Silhouette Score: -0.7075315227381067


- With the original data, we can see that KMeans clustering returns the best Silhouette Score among the other clusters, with a score of 0.5197

### __MinMaxScaler__

In [64]:
scaler_mm = MinMaxScaler()
X_mm = pd.DataFrame(scaler_mm.fit_transform(X_unscaled), columns=X_unscaled.columns)

### KMeans clustering

In [65]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_mm)
kmeans_silhouette = silhouette_score(X_mm, kmeans.labels_)
print("MM Scaler - KMeans Silhouette Score:", kmeans_silhouette)

MM Scaler - KMeans Silhouette Score: 0.21085037217573813


### Hierarchial clustering

In [66]:
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical.fit(X_mm)
hierarchical_silhouette = silhouette_score(X_mm, hierarchical.labels_)
print("MM Scaler - Hierarchical Silhouette Score:", hierarchical_silhouette)

MM Scaler - Hierarchical Silhouette Score: 0.17689084601186422


### DBSCAN

In [67]:
dbscan = DBSCAN(eps=0.5, min_samples=3)
dbscan.fit(X_mm)
dbscan_silhouette = silhouette_score(X_mm, dbscan.labels_)
print("MM Scaler - DBSCAN Silhouette Score:", dbscan_silhouette)

MM Scaler - DBSCAN Silhouette Score: 0.5096701783755767


- After using three different clusters, it turns out that DBSCAN is the best clustering algorithm with this dataset, which has the Silhouette Score of 0.5302