# **Density-based spatial clustering of applications with noise (DBSCAN)**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

In [2]:
# Read the mall-customer records from the CSV in the working directory
# and preview the first three rows.
dataset = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
dataset.head(n=3)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6


In [3]:
# Use annual income and spending score as the two clustering features.
# The columns are selected by name rather than by position, so the selection
# cannot silently pick different features if the CSV's column order changes.
X = dataset[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Fitting the model

In [4]:
# Fit DBSCAN on the selected features. No cluster count is chosen up front
# (so no elbow method applies here); the result is governed by eps, the
# neighbourhood radius, and min_samples, the number of points required in
# that neighbourhood for a point to qualify as a core point.
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=3,min_samples=4)
# fit() returns the fitted estimator, so `model` and `dbscan` are the same object.
model = dbscan.fit(X)

In [5]:
# Cluster label assigned to each sample by the fitted model (-1 marks noise).
labels=model.labels_

In [6]:
# Number of clusters = distinct labels, ignoring the noise label (-1) if it occurs.

n_clusters = len(set(labels) - {-1})

In [7]:
# Display the number of clusters found (noise excluded).
n_clusters

9

So DBSCAN forms 9 clusters with these settings; points labelled as noise (`-1`) are not counted as a cluster.

In [8]:
# Boolean mask over the samples: True exactly at the positions listed in
# the fitted estimator's core_sample_indices_, False everywhere else.
sample_cores = np.isin(np.arange(labels.size), dbscan.core_sample_indices_)

In [9]:
# Display the full core-sample mask (one boolean per sample).
sample_cores

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False,  True,
       False,  True, False,  True,  True, False,  True, False, False,
        True, False,  True,  True,  True,  True,  True, False,  True,
        True, False,  True, False,  True, False,  True, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False,

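In the array above:
* `True` marks a core point: a point with at least `min_samples` points (itself included) within distance `eps`.
* `False` marks a non-core point, which is either a border point (assigned to a cluster, but not dense enough to be a core point itself) or a noise point/outlier (label `-1`).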

In [10]:
# Mean silhouette coefficient for the DBSCAN labelling of X.
# The labels are passed unfiltered, so any -1 (noise) label is scored as if
# it were a cluster of its own.
from sklearn import metrics
print(metrics.silhouette_score(X=X, labels=labels))

-0.1908319132560097
