In [1]:
import os
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; `make_blobs` is exposed from the public `sklearn.datasets`
# namespace in both old and new releases.
from sklearn.datasets import make_blobs

In [2]:
# Three explicit 2-D cluster centres. When `centers` is given as an array,
# make_blobs takes the number of features from the centres themselves, so no
# `n_features` argument is passed (a value there would be ignored).
centers = [(-2, -2), (0, 0), (4.2, 5)]
X, y = make_blobs(n_samples=40000, centers=centers, random_state=0)

# Guard against silent changes to the synthetic data set.
assert X.shape == (40000, len(centers[0])), X.shape

In [3]:
# Clustering settings shared by both pipelines below:
#   radius      -> passed as `eps` to the DBSCAN estimators
#   min_samples -> passed as `min_samples` to the DBSCAN estimators
radius, min_samples = 0.1, 10

# Number of decimals the standardised features are rounded to before clustering.
round_decimals = 2

## DBSCAN without subsampling

In [4]:
from sklearn.preprocessing import FunctionTransformer

In [5]:
# Baseline pipeline: standardise the features, round them to
# `round_decimals` decimals, then run scikit-learn's DBSCAN on the result.
rounding_step = FunctionTransformer(
    np.round,
    validate=False,
    kw_args={"decimals": round_decimals},
)
dbscan_step = DBSCAN(eps=radius, min_samples=min_samples)

pipeline_noClip = make_pipeline(StandardScaler(), rounding_step, dbscan_step)

# One cluster label per row of X.
labels_noClip = pipeline_noClip.fit_predict(X)

## DBSCAN with subsampling

In [6]:
from gridrep import cluster

In [7]:
# Same settings as the baseline, but clustering is delegated to gridrep's
# ClippedDBSCAN, which receives `round_decimals` directly.
# NOTE(review): presumably it performs the rounding internally - confirm
# against the gridrep implementation.
clipped_dbscan = cluster.ClippedDBSCAN(
    eps=radius,
    min_samples=min_samples,
    round_decimals=round_decimals,
)

pipeline_clip = make_pipeline(StandardScaler(), clipped_dbscan)

# One cluster label per row of X.
labels_clip = pipeline_clip.fit_predict(X)

### Compare labels

In [8]:
# Compare the two labelings by their cluster-size profile: for each result,
# count the members of every distinct label and sort the counts in
# descending order. Label ids themselves are not compared, only group sizes.
counts_all = []
for _labels_ in (labels_noClip, labels_clip):
    _, counts = np.unique(_labels_, return_counts=True)
    counts_sorted = np.sort(counts)[::-1]
    counts_all.append(counts_sorted)

# np.array_equal also handles the case where the two runs produce a different
# number of labels: it returns False, whereas `all(a == b)` fails on arrays of
# mismatched shape.
np.array_equal(counts_all[0], counts_all[1])

True

In [9]:
# Number of distinct label values assigned by the clipped pipeline.
np.unique(labels_clip).size

4

### Time

In [10]:
%%timeit
# Time a complete fit + predict of the baseline (non-clipped) pipeline on X.
pipeline_noClip.fit_predict(X)

946 ms ± 4.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
# Time a complete fit + predict of the ClippedDBSCAN pipeline on X.
pipeline_clip.fit_predict(X)

586 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
