In [None]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import warnings
from collections import deque

# Silence the KMeans "performing only one init" warning.
# NOTE(review): KMeans is not used in any cell visible here; this filter looks
# carried over from a sibling notebook -- confirm whether it is still needed.
warnings.filterwarnings('ignore', '.*Explicit initial center position passed: performing only one init in KMeans instead of n_init.*', )

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.animation import FuncAnimation
from IPython.display import display, HTML

In [None]:
# Load whichever dataset you like and do the preprocessing here.
# Constraint: exactly two features must remain afterwards, because the
# clustering is visualized as a 2-D scatter plot.
from sklearn import datasets
dataset = datasets.load_iris()
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
# target = pd.DataFrame(dataset['target'])
display(df.head())

# keep only the two features that get plotted
selected_features = ['sepal length (cm)', 'sepal width (cm)']
df = df[selected_features]
df.head()

In [None]:
# if you want to normalize, uncomment this:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# df.describe()

In [None]:
# DBSCAN settings; the same values feed the step-by-step visualizer and the
# scikit-learn reference run at the end of the notebook.
eps = 0.3  # neighbourhood radius
min_samples = 5  # neighbours (point itself included) needed for a core point

In [None]:
def start_algo(df, eps, min_samples):
    """
    Run DBSCAN step by step, yielding one frame every time a point is labelled.

    df: input dataset. It must contain only two features if the frames are
        going to be plotted (every column is used for the distances).
    eps:
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.
    min_samples:
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    Yields tuples ``(core_samples, labels, current, state)``:
        core_samples: uint8 array, 1 where the point is a core point.
        labels: snapshot of the cluster labels so far; -1 means not assigned
            (at the end: noise).
        current: index of the core point currently being expanded, or ``[]``
            when no point is being expanded.
        state: 0 = nothing labelled yet, 1 = clustering in progress,
            2 = clustering finished.
    """
    X = np.asarray(df)
    nn = NearestNeighbors(radius=eps, metric="euclidean")
    nn.fit(X)

    # for each datapoint find all its neighbors
    neighborhoods = nn.radius_neighbors(X, return_distance=False)
    n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])

    labels = np.full(X.shape[0], -1, dtype=np.intp)
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)

    # Every frame carries a *copy* of ``labels``: the array keeps being mutated
    # below, so a consumer that buffers frames (``list(...)``, an animation
    # frame cache) would otherwise see only the final labelling in every frame.
    #
    # The initial frame is emitted twice on purpose -- presumably because the
    # animation consumes one frame for its initial draw; TODO confirm.
    yield core_samples, labels.copy(), [], 0
    yield core_samples, labels.copy(), [], 0

    k = -1
    for i, core in enumerate(core_samples):
        # a new cluster can only be seeded by a core point nobody claimed yet
        if not core or labels[i] != -1:
            continue
        k += 1
        q = deque([i])
        # breadth-first expansion; only core points are ever enqueued
        while q:
            curr = q.popleft()
            labels[curr] = k
            for j in neighborhoods[curr]:
                if labels[j] != -1:
                    continue
                labels[j] = k
                if core_samples[j]:
                    # core neighbours keep growing the cluster; border points
                    # are labelled but not expanded
                    q.append(j)
                yield core_samples, labels.copy(), curr, 1

    yield core_samples, labels.copy(), [], 2

In [None]:
# Create the frame generator that drives the animation.
# A generator can be consumed only once: re-run this cell before rebuilding
# the animation, otherwise there are no frames left.
# TODO (original note, meaning unclear): "you can also not hard set init.
# In this first step (this is just an example)" -- NOTE(review): nothing named
# `init` exists in the visible cells; confirm what this refers to.
generator = start_algo(df, eps, min_samples)

In [None]:
# Figure and axes the animation draws into, with the raw (unclustered)
# points plotted once; `rects` keeps the resulting scatter artist.
fig, ax = plt.subplots()
rects = ax.scatter(df.iloc[:, 0], df.iloc[:, 1])


In [None]:
def animate(A, rects):
    """
    Draw a single animation frame on the module-level ``ax``.

    A: frame tuple ``(core_samples, labels, current, state)`` as yielded by
        ``start_algo``. ``state`` is 0 before any point is labelled, 1 while
        clustering is in progress and 2 once it has finished.
    rects: unused; accepted only because the ``FuncAnimation`` call forwards
        it through ``fargs``.

    Reads the module-level ``df`` for the point coordinates (first column on
    the x axis, second column on the y axis).
    """
    labels, current, state = A[1], A[2], A[3]
    xs = df[df.columns[0]]
    ys = df[df.columns[1]]

    ax.clear()
    if state == 0:
        ax.scatter(xs, ys)
    elif state == 1:
        ax.scatter(xs, ys, c=labels)
        # `current` is a row position, not an index label, so it has to be
        # resolved with .iloc to stay correct when df has a non-default index.
        ax.scatter(xs.iloc[current], ys.iloc[current], c='red')
        ax.text(0.01, 0.95, "Red point: current core point for density connect",
                transform=ax.transAxes)
    elif state == 2:
        ax.scatter(xs, ys, c=labels)

In [None]:
# Build the animation: every tuple pulled from `generator` is handed to
# `animate` as its first argument, with `rects` appended via `fargs`.
# interval=20 is the delay between frames in milliseconds, repeat=False plays
# the sequence once, and save_count=10000 presumably just acts as a generous
# upper bound on the number of frames taken when rendering -- TODO confirm.
anim = FuncAnimation(fig, func=animate, fargs=(rects,), frames=generator, interval=20,
                     repeat=False, save_count=10000)

In [None]:
# Render the animation to an interactive HTML/JavaScript player and wrap it
# for display; nothing is shown by this cell itself because the result is
# only assigned to `a`.
a = HTML(anim.to_jshtml())

In [None]:
# Display the rendered animation as this cell's output.
a

In [None]:
# Test if visualizer did right: cluster the same data with scikit-learn's
# DBSCAN using identical parameters.
model = DBSCAN(eps=eps, min_samples=min_samples)
l = model.fit_predict(df)

# Actually compare instead of only eyeballing the plot: the last frame
# produced by the step-by-step generator carries the finished labelling.
final_labels = deque(start_algo(df, eps, min_samples), maxlen=1).pop()[1]
assert np.array_equal(final_labels, l), "visualizer labels differ from sklearn DBSCAN"

plt.scatter(df[df.columns[0]], df[df.columns[1]], c=l)

In [None]:
# with open("examples/dbscan.html", "w") as f:
#     print(anim.to_jshtml(), file=f)