# AD using Clusters

Distance Based algorithm
- computationally intensive
- scaling is neccessary to compare values of different units

In [1]:
# tabular manipulation:
import numpy as np
import pandas as pd
# visualization:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib qt
import seaborn as sns
# sklearn for scaling and clustering:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
# environment:
from env import host, user, password

In [2]:
def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

## Acquire

In [3]:
url = get_db_url("grocery_db")

sql = """
select *
from grocery_customers
"""

df = pd.read_sql(sql, url, index_col="customer_id")
df.head()

Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


### Steps for DBSCAN:
- Step 1: Select what variables/features that we wish to examine
    - df = df[["col1", "col2"]]
    
    <br>
    
- Step 2: Convert to array and dtype float
    -     - np_array = df.values.astype("float32", copy = False)
    
    <br>

- Step 3: Scale these features (DBSCAN is going to be useful for continuous variables)
    - scaler = StandardScaler().fit(np_array) np_array = scaler.transform(np_array)
    
    <br>

- Step 4: Ensure that our features are in a numpy array for fitting DBSCAN
    - dbsc = DBSCAN(eps = .75, min_samples = 15).fit(np_array)
    
<br>

- Step 5: Select our epsilon and min_neighbors to fit our clusters


- Step 6: Use our clusters to label outliers
- Step 7: Explore our clusters
    - df.labels.value_counts() sns.scatterplot(df.col1, df.col2, hue=df.labels)

In [4]:
#select features we wish to examine
selected_feats = ['Fresh','Milk', 'Grocery']

**note: choosing continuous variables is going to be significcantly more valuale for dustance based clustering as points in space will not inherently snap to any given set values and denisty will mean more as a result**

**we will treat df as if it has already been split into train validate test and we are examining train

<hr style="border:2px solid black"> </hr>

In [5]:
#create a scaler
#make it
minmax = MinMaxScaler()

#fit it/ #use it
scaled_features = minmax.fit_transform(df[selected_feats])

In [6]:
scaled_features
#shows the array

array([[0.11294004, 0.13072723, 0.08146416],
       [0.06289903, 0.13282409, 0.10309667],
       [0.05662161, 0.11918086, 0.08278992],
       ...,
       [0.1295431 , 0.21013575, 0.32594285],
       [0.091727  , 0.02622442, 0.02402535],
       [0.02482434, 0.02237109, 0.02702178]])

In [7]:
#shows that the data is already a np array
#this works to feed into our DBSCAN model
type(scaled_features)

numpy.ndarray

In [8]:
#create variable for scaled data
scaled_cols = [col +'_scaled' for col in selected_feats]

In [9]:
#create the dataframe in a scaled version
scaled_df = pd.DataFrame(scaled_features, index=df.index, columns=scaled_cols)

In [10]:
#concat info back into the original df
df = df.merge(scaled_df, on= df.index)

In [11]:
#take a look at the merged df
df.head()

Unnamed: 0,key_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Fresh_scaled,Milk_scaled,Grocery_scaled
0,0,2,3,12669,9656,7561,214,2674,1338,0.11294,0.130727,0.081464
1,1,2,3,7057,9810,9568,1762,3293,1776,0.062899,0.132824,0.103097
2,2,2,3,6353,8808,7684,2405,3516,7844,0.056622,0.119181,0.08279
3,3,1,3,13265,1196,4221,6404,507,1788,0.118254,0.015536,0.045464
4,4,2,3,22615,5410,7198,3915,1777,5185,0.201626,0.072914,0.077552


### Takeaways:
- decided on features
- scaled the data
- now we can use to DBSCAN model

<hr style="border:1px solid black"> </hr>

### Create the DBSCAN model

In [12]:
#make it
dbsc = DBSCAN(eps= 0.1, min_samples=20)

#fit it
dbsc.fit(scaled_features)

DBSCAN(eps=0.1, min_samples=20)

**with minmaxscaler**:
- eps can only be betweeen 0 and 1
    - lower the eps is... the more strict it is
- min_samples has to be less than # of entries


In [13]:
#use it
dbsc.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0, -1,  0, -1,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       -1, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0

### Takeaways:
- ^ this just shows there is ONE cluster
- '-1' is everything that is NOT in a cluster
- '0' is the cluster
<br>

- with more clusters:
    - '0','1','2',etc will be a cluster 
    - '-1' is everything NOT in a cluster

In [14]:
#assign a column to the labels
df['labels'] = dbsc.labels_

In [15]:
df[selected_feats + ['labels']].head()

Unnamed: 0,Fresh,Milk,Grocery,labels
0,12669,9656,7561,0
1,7057,9810,9568,0
2,6353,8808,7684,0
3,13265,1196,4221,0
4,22615,5410,7198,0


In [16]:
#take a look at the cluster counts
df.labels.value_counts()

#this shows that there is one cluster
#out of 440 entries, 31 of them are outliers

 0    409
-1     31
Name: labels, dtype: int64

<hr style="border:2px solid black"> </hr>

## Explore

In [17]:
#visualize the clusters using milk and fresh
sns.scatterplot(x='Fresh', y='Milk', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Milk'>

In [18]:
#visualize the clusters using grocery and fresh
sns.scatterplot(x='Fresh', y='Grocery', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Milk'>

In [19]:
#Examine this on a 3D scale
#matplotlib figure
fig = plt.figure(1, figsize=(10,10))
#wrap that figure in 3D figure
ax = Axes3D(fig)
                 
#ax.scatter(x,y,z- c=color/hue)
ax.scatter(df.Fresh, 
           df.Milk, 
           df.Grocery,
           c=df.labels,
          edgecolor='k')
                 
#remove anything within axis labels
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

#label your axis                 
ax.set_xlabel('Fresh')
ax.set_ylabel('Milk')
ax.set_zlabel('Grocery')

Text(0.5, 0, 'Grocery')

### Takeaways:
- we have at least 10 points that are definate outliers
- we have some data points that may or may not belong to main cluster
- out of the dimensions we obseved, there does not appear to be need for more then one cluster
- hyperparameters can be changed