In [47]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generating sample dataset from gaussian distributions

The dataset was constructed using two-dimensional data, where each dimension is sampled from Gaussian distributions.


* Cluster 1: $X\sim N(0, 0.8), Y\sim N(0,0.8)$
* Cluster 2: $X\sim N(-6, 1), Y\sim N(-3,1)$
* Cluster 3: $X\sim N(2, 1.2), Y\sim N(-5,1.2)$
* Cluster 4: $X\sim N(9, 1.6), Y\sim N(3,1.6)$

In [48]:
import pandas as pd
from sklearn.datasets import make_blobs
from pandas import DataFrame
import numpy as np
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)


X, y = make_blobs(n_samples=1000, centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)
df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
df.label = df.label.astype(str)

fig = px.scatter(df, x='x', y='y', color='label', width=600, height=600, range_x=[-10, 20], range_y=[-12,10])
fig.update_traces(marker=dict(size=12, line=dict(width=1,
                                        color='DarkSlateGrey')), selector=dict(mode='markers'))
fig.show()

## Added gaussian noise to each sample

Gaussian noise (or white noise) is a type of interference that arises often in real-world environments.
The probability density function of such noise is equal to that of the normal distribution. In our experiments, in order to simulate noise, we use a random number generator that follows such distribution to generate noise values, which are added to each individual data point of the dataset.

In [49]:
from util import add_gaussian_noise


X, y = make_blobs(n_samples=1000, centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)
df = DataFrame(dict(x=X[:,0], y=X[:,1], noise_std=0.0, label=y))

n = np.array([0.5, 1, 2])
for std in n:
    noise_df = add_gaussian_noise(df, std)
    noise_df.noise_std=std
    df = df.append(noise_df)


fig = px.scatter(
    df.loc[df.noise_std != 0.0, ],
    x="x",
    y="y",
    facet_col="noise_std",
    range_y=[-13, 10], range_x=[-10, 20])
fig.update_traces(marker=dict(size=9, line=dict(width=1,
                                        color='DarkSlateGrey')), selector=dict(mode='markers'))
fig.show()

## Clustering with different noise std

In this experiment, we add Gaussian noise to each data point, and evaluate the performance of the algorithms. We alter the standard deviation of the noise distribution and observe the effects on the clustering process.

It is shown that K-Means, GMM and Spectral Clustering perform relatively close, in the case of added gaussian noise. DBSCAN has the worst performance, since the clusters are becoming more and more sparse, as the standard deviation of the noise increases. This leads to many points being labeled as noise.

In [50]:
from pandas import DataFrame
from util import add_gaussian_noise
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture

X, y_true = make_blobs(n_samples=1000, centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)

total = DataFrame()
for std in np.arange(0, 2, 0.1):
    df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y_true, std=std))
    df = add_gaussian_noise(df, std)

    y_pred_kmeans = KMeans(n_clusters=4).fit_predict(list(zip(df.x, df.y)))
    y_pred_dbscan = DBSCAN(min_samples=7).fit_predict(list(zip(df.x, df.y)))
    y_pred_spectral = SpectralClustering(n_clusters=4).fit_predict(list(zip(df.x, df.y)))
    y_pred_gmm = GaussianMixture(n_components=4).fit_predict(list(zip(df.x, df.y)))

    df = df.join(DataFrame(dict(pred_kmeans=y_pred_kmeans,
                                pred_dbscan=y_pred_dbscan,
                                pred_spectral=y_pred_spectral,
                                pred_gmm=y_pred_gmm)))

    total = total.append(df)

df = total.loc[total['std'] == 1, ]
df.label = df.label.astype(str)
df.pred_kmeans = df.pred_kmeans.astype(str)
df.pred_dbscan = df.pred_dbscan.astype(str)
df.pred_spectral = df.pred_spectral.astype(str)
df.pred_gmm = df.pred_gmm.astype(str)

df = df.rename(columns={
    'pred_kmeans': 'K-Means',
    'pred_dbscan': 'DBSCAN',
    'pred_spectral': 'Spectral Clustering',
    'pred_gmm': 'Gaussian Mixture Models'
})

df = df.melt(id_vars=['x', 'y', 'label', 'std'], var_name='algorithm', value_name='cluster label')

fig = px.scatter(
    df,
    x='x', y='y',
    color='cluster label',
    range_x=[-10, 20],
    range_y=[-10,8],
    facet_col='algorithm', facet_col_wrap=2
)
fig.show()

Clustering with gaussian noise. Std: 0.0
Clustering with gaussian noise. Std: 0.1
Clustering with gaussian noise. Std: 0.2
Clustering with gaussian noise. Std: 0.30000000000000004
Clustering with gaussian noise. Std: 0.4
Clustering with gaussian noise. Std: 0.5
Clustering with gaussian noise. Std: 0.6000000000000001
Clustering with gaussian noise. Std: 0.7000000000000001
Clustering with gaussian noise. Std: 0.8
Clustering with gaussian noise. Std: 0.9
Clustering with gaussian noise. Std: 1.0
Clustering with gaussian noise. Std: 1.1
Clustering with gaussian noise. Std: 1.2000000000000002
Clustering with gaussian noise. Std: 1.3
Clustering with gaussian noise. Std: 1.4000000000000001
Clustering with gaussian noise. Std: 1.5
Clustering with gaussian noise. Std: 1.6
Clustering with gaussian noise. Std: 1.7000000000000002
Clustering with gaussian noise. Std: 1.8
Clustering with gaussian noise. Std: 1.9000000000000001




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



NMI Score

Since we know the original cluster labels (ground truth), we can use this information to evaluate the performance of the algorithms. The metric used is the Normalized Mutual Information (NMI)

In [51]:
from pandas import factorize, DataFrame
_ , std = factorize(total.loc[:, 'std'])

d = {
    'Standard Deviation of Noise': [],
    'K-Means': [],
    'DBSCAN': [],
    'Spectral Clustering': [],
    'Gaussian Mixture Models': []
}
for i in std.values:
    df = total.loc[total.loc[:, 'std'] == i, ]
    d['Standard Deviation of Noise'].append(i)
    d['K-Means'].append(adjusted_mutual_info_score(df.label, df.pred_kmeans))
    d['DBSCAN'].append(adjusted_mutual_info_score(df.loc[df.pred_dbscan != -1,].label, df.loc[df.pred_dbscan != -1,].pred_dbscan))
    d['Spectral Clustering'].append(adjusted_mutual_info_score(df.label, df.pred_spectral))
    d['Gaussian Mixture Models'].append(adjusted_mutual_info_score(df.label, df.pred_gmm))

res = DataFrame(d).melt(id_vars=['Standard Deviation of Noise'], var_name='Algorithm', value_name='NMI score')

fig = px.line(res, x='Standard Deviation of Noise', y='NMI score', color='Algorithm', symbol="Algorithm")
fig.update_traces(line=dict(width=2.5))
fig.show()

## Unnormalized Data / Incorrect Scaling

As K-Means is based on the distance between the points and the cluster's centroid, measuring each dimension at different
scale has a negative impact on the algorithm's performance. To illustrate this, we multiply one dimension by a factor of 5,
and apply different clustering algorithms. It is shown that K-Means was affected the most, since the formed clusters do not
resemble the actual ones.


In [52]:
sample_size=1000
bias_factor=0.02
X, y_true = make_blobs(n_samples=[sample_size, sample_size, sample_size, sample_size], centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)
n_clusters = 4
df = DataFrame(dict(x=X[:,0], y=5*X[:,1], label=y_true))
df.label = df.label.astype(str)
y_pred_kmeans = KMeans(n_clusters=n_clusters).fit_predict(list(zip(df.x, df.y)))
y_pred_dbscan = DBSCAN(min_samples=5).fit_predict(list(zip(df.x, df.y)))
y_pred_spectral = SpectralClustering(n_clusters=n_clusters).fit_predict(list(zip(df.x, df.y)))
y_pred_gmm = GaussianMixture(n_components=n_clusters).fit_predict(list(zip(df.x, df.y)))

df = df.join(DataFrame(dict(pred_kmeans=y_pred_kmeans,
                            pred_dbscan=y_pred_dbscan,
                            pred_spectral=y_pred_spectral,
                            pred_gmm=y_pred_gmm)))


temp = df.copy()
temp.label = temp.label.astype(str)
temp.pred_kmeans = temp.pred_kmeans.astype(str)
temp.pred_dbscan = temp.pred_dbscan.astype(str)
temp.pred_spectral = temp.pred_spectral.astype(str)
temp.pred_gmm = temp.pred_gmm.astype(str)

temp = temp.rename(columns={
    'pred_kmeans': 'K-Means',
    'pred_dbscan': 'DBSCAN',
    'pred_spectral': 'Spectral Clustering',
    'pred_gmm': 'Gaussian Mixture Models'
})

temp = temp.melt(id_vars=['x', 'y', 'label'], var_name='algorithm', value_name='cluster label')

fig = px.scatter(
    temp,
    x='x', y='y',
    color='cluster label',
    width=600,
    height=1200,
    range_x=[-50, 40],
    range_y=[-50,40],
    facet_col='algorithm', facet_col_wrap=2,
)
fig.update_layout(showlegend=False)
fig.show()

## Dataset containing uniformly random noise

This experiment considers the case where the dataset contains random noise data, uniformly distributed over the feature
space. These points should be filtered out by the clustering algorithm or the data preprocessing step.


In [56]:
import numpy as np
from pandas import DataFrame
from util import add_uniform_noise_samples
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture

X, y_true = make_blobs(n_samples=1000, centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)

total = DataFrame()

for noise_samples in np.arange(0,1000, 50):
    print("Clustering with random uniform noise samples, size: {}".format(noise_samples))
    df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y_true, noise_samples=noise_samples))
    df = add_uniform_noise_samples(df, noise_samples, -10, 20, -10, 10)

    y_pred_kmeans = KMeans(n_clusters=4).fit_predict(list(zip(df.x, df.y)))
    y_pred_dbscan = DBSCAN(min_samples=7).fit_predict(list(zip(df.x, df.y)))
    y_pred_spectral = SpectralClustering(n_clusters=4).fit_predict(list(zip(df.x, df.y)))
    y_pred_gmm = GaussianMixture(n_components=4).fit_predict(list(zip(df.x, df.y)))

    df = df.join(DataFrame(dict(pred_kmeans=y_pred_kmeans,
                                pred_dbscan=y_pred_dbscan,
                                pred_spectral=y_pred_spectral,
                                pred_gmm=y_pred_gmm)))

    total = total.append(df)

from plotly.subplots import make_subplots
import plotly.graph_objects as go

df = total.copy()
df.label = df.label.astype(str)
df.pred_kmeans = df.pred_kmeans.astype(str)
df.pred_dbscan = df.pred_dbscan.astype(str)
df.pred_spectral = df.pred_spectral.astype(str)
df.pred_gmm = df.pred_gmm.astype(str)

df = df.rename(columns={
    'pred_kmeans': 'K-Means',
    'pred_dbscan': 'DBSCAN',
    'pred_spectral': 'Spectral Clustering',
    'pred_gmm': 'Gaussian Mixture Models'
})

df = df.melt(id_vars=['x', 'y', 'label', 'noise_samples'], var_name='algorithm', value_name='Cluster label')


iterations=[
    {
        'id': 0,
        'alg': 'K-Means',
        'sampl': 100,
    },
    {
        'id': 1,
        'alg': 'DBSCAN',
        'sampl': 100,
    },
    {
        'id': 2,
        'alg': 'Spectral Clustering',
        'sampl': 100,
    },
    {
        'id': 3,
        'alg': 'Gaussian Mixture Models',
        'sampl': 100,
    },
        {
        'id': 4,
        'alg': 'K-Means',
        'sampl': 400,
    },
    {
        'id': 5,
        'alg': 'DBSCAN',
        'sampl': 400,
    },
    {
        'id': 6,
        'alg': 'Spectral Clustering',
        'sampl': 400,
    },
    {
        'id': 7,
        'alg': 'Gaussian Mixture Models',
        'sampl': 400,
    },
        {
        'id': 8,
        'alg': 'K-Means',
        'sampl': 800,
    },
    {
        'id': 9,
        'alg': 'DBSCAN',
        'sampl': 800,
    },
    {
        'id': 10,
        'alg': 'Spectral Clustering',
        'sampl': 800,
    },
    {
        'id': 11,
        'alg': 'Gaussian Mixture Models',
        'sampl': 800,
    }
]

fig = make_subplots(rows=6, cols=2,
                    subplot_titles=(["{} <br> noise percentage: {:.0f}%".format(it['alg'], float(it['sampl']/(it['sampl'] + 1000)*100)) for it in iterations]),
                    vertical_spacing=0.05)

for it in iterations:
    df2 = df.loc[(df.loc[:, ['algorithm', 'noise_samples']] == [it['alg'], it['sampl']]).all(axis=1), :]
    fig.append_trace(go.Scatter(
        x=df2.x,
        y=df2.y,
        mode='markers',
        marker=dict(color=df2.loc[: , 'Cluster label'].astype(int)+1),
        showlegend=False
    ), row=int(it['id'] / 2)+1, col=it['id'] % 2+1)

fig.update_layout(height=1800, width=700)
# fig.update_annotations(font_size=12)
fig.show()

Clustering with random uniform noise samples, size: 0
Clustering with random uniform noise samples, size: 50
Clustering with random uniform noise samples, size: 100
Clustering with random uniform noise samples, size: 150
Clustering with random uniform noise samples, size: 200
Clustering with random uniform noise samples, size: 250
Clustering with random uniform noise samples, size: 300
Clustering with random uniform noise samples, size: 350
Clustering with random uniform noise samples, size: 400
Clustering with random uniform noise samples, size: 450
Clustering with random uniform noise samples, size: 500
Clustering with random uniform noise samples, size: 550
Clustering with random uniform noise samples, size: 600
Clustering with random uniform noise samples, size: 650
Clustering with random uniform noise samples, size: 700
Clustering with random uniform noise samples, size: 750
Clustering with random uniform noise samples, size: 800
Clustering with random uniform noise samples, size:

In the presence of such noise, deciding the appropriate number of clusters for K-Means is challenging. A usual rule of
thumb is to use the Silhouette method. However, as the count of the noise points increases, it becomes difficult to determine the value that maximizes the silhouette score.

In [57]:
from util import add_gaussian_noise
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

X, y_true = make_blobs(n_samples=1000, centers=[[0,0], [-6, -3], [3,-5], [14,3]], cluster_std=[0.8, 1, 1.2, 1.6], random_state=9)
df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y_true))

silhouette_scores = list()
idx=0
for noise_samples in [0, 200, 800]:
    noise_df = add_uniform_noise_samples(df, noise_samples, -10, 20, -10, 10)
    for n in range(2, 10):
        y_pred = KMeans(n_clusters=n).fit_predict(list(zip(noise_df.x, noise_df.y)))
        silhouette_scores.append([noise_samples, n, silhouette_score(list(zip(noise_df.x, noise_df.y)), y_pred)])

sil_df = DataFrame(silhouette_scores, columns=['noise_samples', 'n_clusters', 'score'])
sil_df = sil_df.rename(columns={'n_clusters': 'Number of clusters', 'noise_samples': 'Noise samples', 'score': 'Avg. silhouette score'})
fig = px.line(sil_df, x="Number of clusters", y="Avg. silhouette score", color="Noise samples", symbol="Noise samples")
fig.update_traces(line=dict(width=2.5))
fig.show()