#### DBSCAN and Use-case for exploring anomalic instances:

In [1]:
# tabular manipulation:
import numpy as np
import pandas as pd
# visualization:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib qt
import seaborn as sns
# sklearn for scaling and clustering:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
# environment:
from env import host, user, password

def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("grocery_db")

sql = """
select *
from grocery_customers
"""

df = pd.read_sql(sql, url, index_col="customer_id")
df.head()


Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


#### We will need to proceed through several operational steps to achieve utility from DBSCAN:

- Select what variables/features that we wish to examine
- Scale these features (DBSCAN is going to be useful for continuous variables)
- Ensure that our features are in a numpy array for fitting DBSCAN
- Select our epsilon and min_neighbors to fit our clusters
- Use our clusters to label outliers
- Explore our clusters

In [2]:
# select what features we wish to examine

In [3]:
selected_feats = ['Fresh', 'Milk', 'Grocery']

In [4]:
# note: choosing continuous variables is going to be significantly more valuable for distance based clustering as points
# in space will not inherently snap to any given set values and density will mean more as a result

In [5]:
# note: we will treat df as if it has already been split into train, validate test and we are examining train

In [6]:
# We will create a scaler:
# Make it
# Fit it
# Use it

minmax = MinMaxScaler()

In [7]:
scaled_features = minmax.fit_transform(df[selected_feats])

In [8]:
# examine our numpy array
scaled_features

array([[0.11294004, 0.13072723, 0.08146416],
       [0.06289903, 0.13282409, 0.10309667],
       [0.05662161, 0.11918086, 0.08278992],
       ...,
       [0.1295431 , 0.21013575, 0.32594285],
       [0.091727  , 0.02622442, 0.02402535],
       [0.02482434, 0.02237109, 0.02702178]])

In [9]:
# note: the output of the caker is already a numpy array
# this is suitable for feeding into our DBSCAN model
type(scaled_features)

numpy.ndarray

In [10]:
# glue the scaled information back into df:
scaled_cols = [col + '_scaled' for col in selected_feats]

In [11]:
scaled_df = pd.DataFrame(scaled_features, index = df.index, columns = scaled_cols)

In [12]:
scaled_df

Unnamed: 0_level_0,Fresh_scaled,Milk_scaled,Grocery_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.112940,0.130727,0.081464
1,0.062899,0.132824,0.103097
2,0.056622,0.119181,0.082790
3,0.118254,0.015536,0.045464
4,0.201626,0.072914,0.077552
...,...,...,...
435,0.264829,0.163338,0.172715
436,0.349761,0.018736,0.008202
437,0.129543,0.210136,0.325943
438,0.091727,0.026224,0.024025


In [13]:
df = df.merge(scaled_df, on=df.index)

In [14]:
df.head(3)

Unnamed: 0,key_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Fresh_scaled,Milk_scaled,Grocery_scaled
0,0,2,3,12669,9656,7561,214,2674,1338,0.11294,0.130727,0.081464
1,1,2,3,7057,9810,9568,1762,3293,1776,0.062899,0.132824,0.103097
2,2,2,3,6353,8808,7684,2405,3516,7844,0.056622,0.119181,0.08279


In [15]:
# Create our DBSCAN model:

In [16]:
# make it, fit it, use it

In [17]:
df.shape

(440, 12)

In [18]:
# make the object
dbsc = DBSCAN(eps = 0.1, min_samples = 20)

In [19]:
# fit it
dbsc.fit(scaled_features)

DBSCAN(eps=0.1, min_samples=20)

In [20]:
# use it
dbsc.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0, -1,  0, -1,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       -1, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [21]:
df['labels'] = dbsc.labels_

In [22]:
df[selected_feats + ['labels']].head(3)

Unnamed: 0,Fresh,Milk,Grocery,labels
0,12669,9656,7561,0
1,7057,9810,9568,0
2,6353,8808,7684,0


In [23]:
df.labels.value_counts()

 0    409
-1     31
Name: labels, dtype: int64

In [24]:
# We have our cluster labels now:
# Clusters: 1 (label: 0)
# OUtliers: (Label: -1)

In [25]:
# What now?

In [26]:
# Explore!

In [28]:
sns.scatterplot(x = 'Fresh', y = 'Milk', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Milk'>

In [29]:
sns.scatterplot(x = 'Fresh', y = 'Grocery', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Grocery'>

In [30]:
# lets examine it on a 3D scale

In [32]:
fig = plt.figure(1, figsize=(10, 10))
ax = Axes3D(fig)

# plot the points
ax.scatter(df.Fresh,
          df.Milk,
          df.Grocery,
          c=df.labels,
          edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

ax.set_xlabel('Fresh')
ax.set_ylabel('Milk')
ax.set_zlabel('Grocery')

Text(0.5, 0, 'Grocery')

In [33]:
# Initial Takeaways:
# - We have at least ~10 points that are definite outliers
# - We have some data points that may or may not belong to the main cluster
# - Out of the dimensions we observed, there does not appear to be need for more than one cluster
# - We may want to be less strict. about our hyperparemeters to catch the data points that arent 
#   as extreme as the furthest outliers in the data set