In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import seaborn as sns
import os

In [5]:
os.chdir("D:\\meridianthe4\\PML\\Datasets")

In [9]:
milk = pd.read_csv("milk.csv", index_col=0)
scaler = StandardScaler().set_output(transform="pandas")
df_scaled = scaler.fit_transform(milk)

In [14]:
clust = DBSCAN(eps=0.5, min_samples=3)
clust.fit(df_scaled)
df_scaled_copy = df_scaled.copy()
df_scaled_copy['Cluster'] = clust.labels_
df_scaled_copy = df_scaled_copy[df_scaled_copy['Cluster'] != -1]
if len(df_scaled_copy['Cluster'].unique()) >= 2:
    print(silhouette_score(df_scaled_copy.iloc[:,:-1], df_scaled_copy['Cluster']))
else:
    print("Not enough clusters for silhouette score")

0.5344431042454363


In [18]:
eps_values = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
min_samples_values = [2, 3, 4, 5, 7]
scores = []
for eps in eps_values:
    for min_samples in min_samples_values:
        clust = DBSCAN(eps=eps, min_samples=min_samples)
        clust.fit(df_scaled)
        df_scaled_copy = df_scaled.copy()
        df_scaled_copy['Cluster'] = clust.labels_
        df_scaled_copy = df_scaled_copy[df_scaled_copy['Cluster'] != -1]
        if len(df_scaled_copy['Cluster'].unique()) >= 2:
            score = silhouette_score(df_scaled_copy.iloc[:,:-1], df_scaled_copy['Cluster'])
            # print(f"EPS: {eps}, Min Samples: {min_samples}, Silhouette Score: {score}")
            scores.append([eps, min_samples, score])
        else:
            # print(f"EPS: {eps}, Min Samples: {min_samples}, Not enough clusters for silhouette score")
            scores.append([eps, min_samples, None])
df_scores = pd.DataFrame(scores, columns=["EPS", "Min Samples", "Score"])
df_scores.sort_values(by="Score", ascending=False)

Unnamed: 0,EPS,Min Samples,Score
26,0.9,3,0.657551
15,0.5,2,0.593446
16,0.5,3,0.534443
20,0.7,2,0.488035
25,0.9,2,0.44228
29,0.9,7,0.4274
21,0.7,3,0.418695
0,0.1,2,
1,0.1,3,
2,0.1,4,


In [19]:
clust = DBSCAN(eps=0.9, min_samples=3)
clust.fit(df_scaled)
df_scaled_copy = df_scaled.copy()
df_scaled_copy['Cluster'] = clust.labels_
df_scaled_copy = df_scaled_copy[df_scaled_copy['Cluster'] != -1]
if len(df_scaled_copy['Cluster'].unique()) >= 2:
    print(silhouette_score(df_scaled_copy.iloc[:,:-1], df_scaled_copy['Cluster']))
else:
    print("Not enough clusters for silhouette score")

0.6575507189237809


In [20]:
clust = DBSCAN(eps=0.9, min_samples=3)
clust.fit(df_scaled)
df_cluster = milk.copy()
df_cluster['Cluster'] = clust.labels_
df_cluster.groupby('Cluster').mean()

Unnamed: 0_level_0,water,protein,fat,lactose,ash
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,69.2625,8.175,17.5625,2.9375,0.97
0,86.071429,4.121429,4.0,5.221429,0.665714
1,65.166667,10.733333,20.4,2.233333,1.5


### Nutrients

In [21]:
nut = pd.read_csv("nutrient.csv", index_col=0)
scaler = StandardScaler().set_output(transform="pandas")
df_scaled = scaler.fit_transform(nut)

In [22]:
eps_values = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
min_samples_values = [2, 3, 4, 5, 7]
scores = []
for eps in eps_values:
    for min_samples in min_samples_values:
        clust = DBSCAN(eps=eps, min_samples=min_samples)
        clust.fit(df_scaled)
        df_scaled_copy = df_scaled.copy()
        df_scaled_copy['Cluster'] = clust.labels_
        df_scaled_copy = df_scaled_copy[df_scaled_copy['Cluster'] != -1]
        if len(df_scaled_copy['Cluster'].unique()) >= 2:
            score = silhouette_score(df_scaled_copy.iloc[:,:-1], df_scaled_copy['Cluster'])
            # print(f"EPS: {eps}, Min Samples: {min_samples}, Silhouette Score: {score}")
            scores.append([eps, min_samples, score])
        else:
            # print(f"EPS: {eps}, Min Samples: {min_samples}, Not enough clusters for silhouette score")
            scores.append([eps, min_samples, None])
df_scores = pd.DataFrame(scores, columns=["EPS", "Min Samples", "Score"])
df_scores.sort_values(by="Score", ascending=False)

Unnamed: 0,EPS,Min Samples,Score
10,0.3,2,0.92214
15,0.5,2,0.775244
20,0.7,2,0.686821
5,0.2,2,0.571084
25,0.9,2,0.436648
0,0.1,2,
1,0.1,3,
2,0.1,4,
3,0.1,5,
4,0.1,7,


In [24]:
clust = DBSCAN(eps=0.3, min_samples=2)
clust.fit(df_scaled)
df_scaled_copy = df_scaled.copy()
df_scaled_copy['Cluster'] = clust.labels_
df_scaled_copy = df_scaled_copy[df_scaled_copy['Cluster'] != -1]
if len(df_scaled_copy['Cluster'].unique()) >= 2:
    print(silhouette_score(df_scaled_copy.iloc[:,:-1], df_scaled_copy['Cluster']))
else:
    print("Not enough clusters for silhouette score")

0.9221398180495596


In [26]:
clust = DBSCAN(eps=0.9, min_samples=3)
clust.fit(df_scaled)
df_cluster = nut.copy()
df_cluster['Cluster'] = clust.labels_
df_cluster.groupby('Cluster').mean()

Unnamed: 0_level_0,energy,protein,fat,calcium,iron
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,157.5,18.833333,7.833333,61.555556,2.311111
0,307.222222,19.333333,24.777778,8.777778,2.522222
