In [84]:
import re

import pandas as pd

In [85]:
# Read the semicolon-separated digit data; every field is kept as text (dtype=str)
df = pd.read_csv("../data/digit.dat", sep=";", dtype=str)

In [86]:
# Remove the "...2" columns, which the analysis treats as duplicates
duplicate_columns = ["A2", "B2", "C2", "D2", "E2", "F2", "G2", "H2"]
df.drop(columns=duplicate_columns, inplace=True)

In [87]:
# Convert the spelled-out values to integers.
#
# The previous `df.replace(..., regex=True, inplace=True)` calls depended on
# pandas silently downcasting the resulting object columns to integers, a
# behaviour that recent pandas versions deprecate (FutureWarning). The mapping
# is now applied explicitly, cell by cell, so the integer dtype no longer
# depends on that implicit downcast.
#
# Patterns are tried in this order (upper-case words first, then lower-case),
# mirroring the two original replace calls.
WORD_TO_DIGIT = {
    'ZERO': 0,
    'ONE': 1,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'zero': 0,
}
_WORD_PATTERNS = [(re.compile(word), digit) for word, digit in WORD_TO_DIGIT.items()]


def _word_to_digit(cell):
    """Return the digit for the first word pattern found in ``cell``.

    Non-string cells (e.g. missing values) and strings that match no pattern
    are returned unchanged. A match anywhere in the string replaces the whole
    cell, which is how a regex replace with a non-string replacement behaves.
    """
    if not isinstance(cell, str):
        return cell
    for pattern, digit in _WORD_PATTERNS:
        if pattern.search(cell):
            return digit
    return cell


# Series.map infers the result dtype, so fully converted columns become integers
df = df.apply(lambda column: column.map(_word_to_digit))

  df.replace({'ZERO': 0, 'ONE': 1}, regex=True, inplace=True)
  df.replace(


In [88]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H
0,7,1,0,1,0,0,1,0
1,1,0,0,1,0,0,1,0
2,4,0,1,1,1,0,1,0
3,2,1,1,1,1,1,0,0
4,8,0,1,1,1,1,1,1


In [89]:
# Feature frame: every column from "B" through "H" (label slices include both ends)
feature_span = slice("B", "H")
features = df.loc[:, feature_span]

In [90]:
# Target kept as a one-column DataFrame (not a Series) holding column "A"
target = df.loc[:, ["A"]]

In [91]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import pandas as pd

# Hyper-parameter grid for the DBSCAN sweep.
# NOTE(review): assuming `features` holds 7 binary columns, Hamming distances
# are multiples of 1/7, so every eps below ~0.143 should give the same fit —
# confirm whether the small eps values are worth keeping.
eps_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.15, 0.2, 0.25]
min_samples_values = [3, 5, 10, 15, 20, 25]

results = []

# Visit every (eps, min_samples) pair, with eps varying slowest
for e, m in ((eps, ms) for eps in eps_values for ms in min_samples_values):
    dbscan = DBSCAN(eps=e, min_samples=m, metric='hamming', algorithm='auto')
    dbscan.fit(features)
    labels = dbscan.labels_

    # Label -1 is counted as noise rather than as a cluster
    n_noise = int((labels == -1).sum())
    n_clusters = len(set(labels) - {-1})

    # Silhouette is only evaluated when more than one cluster was found;
    # otherwise -1 is recorded as a placeholder.
    # NOTE(review): noise points are passed to silhouette_score unfiltered,
    # so they are scored as if -1 were a cluster of its own.
    s_mean = silhouette_score(features, labels, metric='hamming') if n_clusters > 1 else -1

    results.append((e, m, n_clusters, n_noise, s_mean))

# Tabulate the sweep, best silhouette first
result_columns = ["eps", "min_samples", "n_clusters", "n_noise", "silhouette"]
eps_analysis = pd.DataFrame(results, columns=result_columns).sort_values("silhouette", ascending=False)
eps_analysis

Unnamed: 0,eps,min_samples,n_clusters,n_noise,silhouette
0,0.01,3,48,36,0.877221
6,0.03,3,48,36,0.877221
12,0.05,3,48,36,0.877221
18,0.07,3,48,36,0.877221
24,0.1,3,48,36,0.877221
1,0.01,5,25,110,0.633419
13,0.05,5,25,110,0.633419
7,0.03,5,25,110,0.633419
19,0.07,5,25,110,0.633419
25,0.1,5,25,110,0.633419


In [92]:
# Fit the DBSCAN configuration chosen for the rest of the analysis
dbscan_params = dict(eps=0.10, min_samples=10, metric='hamming', algorithm='auto')
model = DBSCAN(**dbscan_params)
model.fit(features)

In [93]:
# Summarise the fitted model: cluster count, noise count and silhouette score
labels = model.labels_

# Label -1 is counted as noise rather than as a cluster
n_noise = int((labels == -1).sum())
n_clusters = len(set(labels) - {-1})

# Same guard and fallback as the parameter sweep: silhouette_score raises a
# ValueError when the labelling contains a single label, so it is only
# evaluated when more than one cluster was found.
if n_clusters > 1:
    s_mean = silhouette_score(features, labels, metric='hamming')
else:
    s_mean = -1

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print(f"Silhouette Score: {s_mean}")

Number of clusters: 12
Number of noise points: 189
Silhouette Score: 0.37348504869854776


In [95]:
# Attach each row's cluster label, then show the per-cluster column means
df["cluster"] = labels
cluster_profile = df.groupby(by="cluster").mean()
cluster_profile

Unnamed: 0_level_0,A,B,C,D,E,F,G,H
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,4.269841,0.571429,0.502646,0.613757,0.62963,0.507937,0.68254,0.592593
0,6.75,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.666667,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,3.884615,0.0,1.0,1.0,1.0,0.0,1.0,0.0
3,5.4,1.0,1.0,0.0,1.0,0.0,1.0,1.0
4,2.208333,1.0,0.0,1.0,1.0,1.0,0.0,1.0
5,7.5,1.0,1.0,1.0,1.0,0.0,1.0,1.0
6,7.444444,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,5.931034,1.0,1.0,0.0,1.0,1.0,1.0,1.0
8,6.636364,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [96]:
# Show how often each value of column A occurs within every cluster.
# Iterating over the groups visits exactly the labels present in df['cluster']
# (in ascending order), so the cell no longer depends on `n_clusters` from an
# earlier cell and does not print an empty "Cluster -1" section when the fit
# produced no noise points.
for cluster_id, digits in df.groupby('cluster')['A']:
    print(f"Cluster {cluster_id}:")
    print(digits.value_counts())
    print()

Cluster -1:
A
1    25
6    23
4    23
2    20
3    20
7    19
8    17
0    15
5    15
9    12
Name: count, dtype: int64

Cluster 0:
A
7    23
1     1
Name: count, dtype: int64

Cluster 1:
A
1    16
7     2
Name: count, dtype: int64

Cluster 2:
A
4    25
1     1
Name: count, dtype: int64

Cluster 3:
A
5    30
9     3
6     2
Name: count, dtype: int64

Cluster 4:
A
2    22
9     1
0     1
Name: count, dtype: int64

Cluster 5:
A
9    27
5     5
3     5
8     4
0     1
Name: count, dtype: int64

Cluster 6:
A
8    31
0     2
6     1
5     1
9     1
Name: count, dtype: int64

Cluster 7:
A
6    24
5     2
8     2
2     1
Name: count, dtype: int64

Cluster 8:
A
4    4
9    4
8    2
5    1
Name: count, dtype: int64

Cluster 9:
A
0    23
3     1
8     1
Name: count, dtype: int64

Cluster 10:
A
3    23
9     7
8     1
Name: count, dtype: int64

Cluster 11:
A
3    6
7    4
Name: count, dtype: int64

