In [None]:
!pip install sklearn

In [7]:
import k3d
import numpy as np

In [None]:
import sklearn
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [308]:
import warnings
warnings.filterwarnings('ignore')

# Define number of points in [generated] dataset

In [309]:
# Point count passed as n_samples to each dataset generator below
nSamples = 1_000

# Available scikit-learn datasets
Feel free to comment out unused datasets to reduce memory overhead (especially for large `nSamples`).

In [466]:
# Swiss roll, noise-free and seeded; only the point coordinates (element 0) are kept.
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_swiss_roll.html
swissRollX = datasets.make_swiss_roll( n_samples=nSamples, noise=0.0, random_state=0)[0]

# Four tight Gaussian blobs with three features each; blobsY holds the generating blob index.
blobsX, blobsY = datasets.make_blobs(n_samples=nSamples, centers=4, n_features=3, cluster_std=0.4, random_state=0)

# Two interleaving half-moons with slight jitter. Seeded like the generators above so that
# Restart & Run All reproduces the same points (previously drawn from the global RNG).
noisyMoonsX = datasets.make_moons(n_samples=nSamples, noise=.005, random_state=0)[0]

# Visualize Dataset(s)

In [500]:
def viz_dataset( dataset ):
    """Show a point set in an interactive k3d plot.

    dataset must expose `shape` with exactly three columns (shape[1] == 3);
    an AssertionError is raised otherwise. Points are drawn in magenta with
    the flat shader. Nothing is returned.
    """
    nColumns = dataset.shape[1]
    assert nColumns == 3

    plot = k3d.plot()
    pointCloud = k3d.points(dataset, color=0xFF00FF, point_size=.3, shader='flat')
    plot += pointCloud
    plot.display()
    
def make_3dimensional(dataset):
    """Pad a two-column array with a zero-filled third column.

    Arrays whose column count is anything other than 2 are returned as-is
    (the very same object). For two-column input a notice is printed and a
    new array with an extra all-zero column is returned.
    """
    if dataset.shape[1] != 2:
        return dataset

    print('        > add empty third dimension to 2d dataset')
    zeroColumn = np.zeros((dataset.shape[0], 1))
    return np.hstack([dataset, zeroColumn])


In [501]:
# Moons: pad to three columns, then render
dataset = make_3dimensional(noisyMoonsX)
viz_dataset(dataset)

        > add empty third dimension to 2d dataset


Output()

In [502]:
# Blobs: already three columns, so make_3dimensional passes them through
dataset = make_3dimensional(blobsX)
viz_dataset(dataset)


Output()

In [503]:
# Swiss roll: run through the same pad-then-render steps as the other datasets
dataset = make_3dimensional(swissRollX)
viz_dataset(dataset)

Output()


# Plot labels [ i.e. model cluster predictions ] as color overlays onto dataset

In [471]:
import matplotlib.pyplot as plt

In [492]:
def plot_cluster_predictions ( dataset, labels ):
    """Overlay cluster labels onto a dataset as per-cluster point colors.

    Parameters
    ----------
    dataset : array indexable as dataset[rows, :]
        Point coordinates, one row per sample, in the same order as `labels`.
    labels : array-like
        One cluster label per sample. The label -1 is treated as noise
        (scikit-learn's convention for DBSCAN): it is not counted as a
        cluster and is drawn in neutral grey.

    Prints the set of labels, the number of clusters, and the size of each
    cluster (keyed by its actual label), then displays a k3d plot.
    Returns nothing.
    """
    NOISE_LABEL = -1
    NOISE_COLOR = 0x808080

    # Accept plain lists too; elementwise `==` below needs an ndarray
    labels = np.asarray(labels)

    labelSet = set(labels.tolist())
    print(labelSet)

    # Sorted once so ids and colors are assigned in a stable order
    clusterIDs = sorted(labelSet - {NOISE_LABEL})
    nClusters = len(clusterIDs)
    # Sample count comes from the labels themselves, not the notebook-level nSamples
    print( 'predicted nClusters = ' + str(nClusters) + ' [ from ' + str(len(labels)) + ' samples ]')

    # Private generator: same color sequence as seeding the global RNG with 1,
    # without resetting global random state for the rest of the notebook
    rng = np.random.RandomState(1)
    colors = rng.randint(0, 0xFFFFFF, nClusters)
    plot = k3d.plot()

    for clusterID, color in zip(clusterIDs, colors):
        clusterInds = np.where(labels == clusterID)[0]
        print(str(clusterID) + ' : ' + str(len(clusterInds)))
        plot += k3d.points ( dataset[clusterInds, :], color=int(color), point_size = .1, shader = 'flat' )

    noiseInds = np.where(labels == NOISE_LABEL)[0]
    if len(noiseInds) > 0:
        print('noise : ' + str(len(noiseInds)))
        plot += k3d.points ( dataset[noiseInds, :], color=NOISE_COLOR, point_size = .1, shader = 'flat' )

    plot.display()

# DBScan @ Blobs

In [493]:
# Ensure three columns, then standardize each feature before clustering
dataset = StandardScaler().fit_transform(make_3dimensional(blobsX))

# DBSCAN with its default parameters
dbScanModel = DBSCAN().fit(dataset)
labels = dbScanModel.labels_

plot_cluster_predictions(dataset, labels)


{0, 1, 2, 3}
predicted nClusters = 4 [ from 1000 samples ]
0 : 250
1 : 250
2 : 250
3 : 250


Output()

# DBScan @ Swiss Roll

In [494]:
# Ensure three columns, then standardize each feature before clustering
dataset = StandardScaler().fit_transform(make_3dimensional(swissRollX))

# DBSCAN with its default parameters; a tuned alternative is kept below for reference
#dbScanModel = DBSCAN(eps = .7, min_samples = 10, algorithm= 'ball_tree').fit(dataset)
dbScanModel = DBSCAN().fit(dataset)
labels = dbScanModel.labels_

plot_cluster_predictions(dataset, labels)


{0}
predicted nClusters = 1 [ from 1000 samples ]
0 : 1000


Output()

# DBScan @ Noisy Moons

In [497]:
# Pad the 2D moons with a zero column, then normalize the dataset
dataset = StandardScaler().fit_transform(make_3dimensional(noisyMoonsX))

# DBSCAN with its default parameters
dbScanModel = DBSCAN().fit(dataset)
labels = dbScanModel.labels_

plot_cluster_predictions(dataset, labels)


! NOTE visualization requires 3d datasets !
   > attempting to add empty third dimension to 2d dataset
{0, 1}
predicted nClusters = 2 [ from 1000 samples ]
0 : 500
1 : 500


Output()

In [459]:
# IPython help lookup: shows DBSCAN's signature and docstring (interactive aid only)
?DBSCAN

In [None]:
'''
Init signature: DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)

Docstring:     
Perform DBSCAN clustering from vector array or distance matrix.

DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
Finds core samples of high density and expands clusters from them.
Good for data which contains clusters of similar density.

Parameters
----------
eps : float, optional
    The maximum distance between two samples for them to be considered
    as in the same neighborhood.

min_samples : int, optional
    The number of samples (or total weight) in a neighborhood for a point
    to be considered as a core point. This includes the point itself.

metric : string, or callable
    The metric to use when calculating distance between instances in a
    feature array. If metric is a string or callable, it must be one of
    the options allowed by :func:`sklearn.metrics.pairwise_distances` for
    its metric parameter.
    If metric is "precomputed", X is assumed to be a distance matrix and
    must be square. X may be a sparse matrix, in which case only "nonzero"
    elements may be considered neighbors for DBSCAN.

    .. versionadded:: 0.17
       metric *precomputed* to accept precomputed sparse matrix.

metric_params : dict, optional
    Additional keyword arguments for the metric function.

    .. versionadded:: 0.19

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
    The algorithm to be used by the NearestNeighbors module
    to compute pointwise distances and find nearest neighbors.
    See NearestNeighbors module documentation for details.

leaf_size : int, optional (default = 30)
    Leaf size passed to BallTree or cKDTree. This can affect the speed
    of the construction and query, as well as the memory required
    to store the tree. The optimal value depends
    on the nature of the problem.

p : float, optional
    The power of the Minkowski metric to be used to calculate distance
    between points.

n_jobs : int or None, optional (default=None)
    The number of parallel jobs to run.
    ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
    
'''