# Exercise
**file name**: `clustering_anomaly_detection.py` or `clustering_anomaly_detection.ipynb`

# Experiment with the DBSCAN properties
- Read up on the `eps` (epsilon) and `min_samples` arguments to DBSCAN at https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

In [None]:
# from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


#used for DBclustering and scaling
from sklearn.cluster import DBSCAN
#using MinMaxScaler because StandardScaler can produce values less than 0; MinMax keeps every feature in [0, 1], so epsilon is tuned on a 0-1 scale
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

import wrangle as w

In [None]:
from env import host, user, password

def get_db_url(database, host=host, user=user, password=password):
    """Build a SQLAlchemy connection URL for a MySQL database via PyMySQL.

    Parameters
    ----------
    database : str
        Name of the database (schema) to connect to.
    host, user, password : str
        Connection details; default to the values imported from ``env``.

    Returns
    -------
    str
        ``mysql+pymysql://<user>:<password>@<host>/<database>`` with the user
        name and password percent-encoded.
    """
    # Imported locally so this cell does not depend on a separate import cell.
    from urllib.parse import quote_plus

    # Characters such as "@", ":" or "/" inside the credentials would
    # otherwise be read as URL delimiters and corrupt the connection string.
    safe_user = quote_plus(str(user))
    safe_password = quote_plus(str(password))
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{database}'

In [None]:
url = get_db_url("grocery_db")

sql = """
select *
from grocery_customers
"""

df = pd.read_sql(sql, url, index_col="customer_id")

In [None]:
df.head(2)

In [None]:
# cluster on subset of features
grocery_milk_fresh = df[["Grocery", "Milk","Fresh"]]

In [None]:
grocery_milk_fresh.head(2)

In [None]:
grocery_milk_fresh.columns

In [None]:
scaler = MinMaxScaler().fit(grocery_milk_fresh)
grocery_milk_fresh = scaler.transform(grocery_milk_fresh)

- Experiment with altering the epsilon values (the `eps` argument holding the threshold parameter). Run the models and visualize the results. What has changed? Why do you think that is?

In [None]:
dbsc = DBSCAN(eps = .50, min_samples = 20).fit(grocery_milk_fresh)

In [None]:
gmf_columns = ['Grocery', 'Milk', 'Fresh']

In [None]:
# Now, let's add the scaled value columns back onto the dataframe
columns = list(df.columns)

scaled_columns = ["Scaled_" + column for column in gmf_columns]

In [None]:
# Save a copy of the original dataframe
original_df = df.copy()

# Create a dataframe containing the scaled values. It is given df's own index
# so the two frames line up row-for-row by label.
scaled_df = pd.DataFrame(grocery_milk_fresh, columns=scaled_columns, index=df.index)

# Attach the scaled columns by joining on the index. This keeps df's existing
# index (merging with on=df.index replaced it with a fresh RangeIndex and left
# a helper 'key_0' column that had to be dropped). Scaled columns left over
# from a previous run of this cell are dropped first, so re-running it does
# not produce *_x / *_y duplicates.
df = df.drop(columns=scaled_columns, errors="ignore").join(scaled_df)

In [None]:
labels = dbsc.labels_
labels[0:10]

In [None]:
df['labels'] = labels
df.labels.value_counts()

In [None]:
df[df.labels==-1].head()

In [None]:
# Grocery vs Fresh, coloured by DBSCAN label.
# x / y are passed by keyword: positional x/y is deprecated in seaborn 0.11
# and rejected by later releases.
sns.scatterplot(x=df.Grocery, y=df.Fresh, hue=df.labels)
plt.show()

# Milk vs Fresh, coloured by the same labels.
sns.scatterplot(x=df.Milk, y=df.Fresh, hue=df.labels)
plt.show()

In [None]:

%matplotlib qt

fig = plt.figure(1, figsize=(8, 8))
# Create the 3-D axes through the figure so they are actually attached to it;
# instantiating Axes3D(fig, ...) directly no longer adds the axes to the
# figure in recent matplotlib releases, which leaves the plot blank.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

# plot the points, coloured by DBSCAN label
ax.scatter(df.Fresh, df.Milk, df.Grocery,
           c=df.labels, edgecolor='k')

# Hide the tick labels. xaxis / yaxis / zaxis are the supported names; the
# w_xaxis / w_yaxis / w_zaxis aliases were deprecated and later removed.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

ax.set_xlabel('Fresh')
ax.set_ylabel('Milk')
ax.set_zlabel('Grocery')

In [None]:
%matplotlib inline
# Sweep eps over 1, 1/2, ..., 1/10 and draw one panel per value so the effect
# of shrinking the neighbourhood radius can be compared side by side.
fig, axes = plt.subplots(2, 5, figsize=(16, 10))
fig.subplots_adjust(hspace=.7, wspace=.2)
for divisor, ax in enumerate(axes.flat, start=1):
    eps = 1 / divisor
    db = DBSCAN(eps=eps, min_samples=25).fit(grocery_milk_fresh)
    labels = db.labels_
    print(f'𝛆 = {eps}')
    # ax=ax draws into this panel of the grid; x / y are passed by keyword
    # because positional x/y is rejected by current seaborn releases.
    sns.scatterplot(
        x=grocery_milk_fresh[:, 0],
        y=grocery_milk_fresh[:, 1],
        hue=[f"cluster-{label}" for label in labels],
        ax=ax,
    )
    ax.set_title(f"eps = {eps:.3f}")

# Show once, after every panel is drawn. Calling plt.show() inside the loop
# renders the figure after the first panel and scatters the remaining plots
# across separate default-sized figures.
plt.show()

- Double the `min_samples` parameter. Run your model and visualize the results. Consider what changed and why.

In [None]:
# Compare min_samples = 20, 40, 60, 80 at a fixed eps. The multiplier starts
# at 1: starting at 0 asks DBSCAN for min_samples=0, which is not a valid
# neighbourhood size.
base_min_samples = 20
multipliers = range(1, 5)
fig, axes = plt.subplots(1, len(multipliers), figsize=(16, 5))
fig.subplots_adjust(hspace=.7, wspace=.2)
for multiplier, ax in zip(multipliers, axes):
    minsample = base_min_samples * multiplier
    db = DBSCAN(eps=0.1, min_samples=minsample).fit(grocery_milk_fresh)
    labels = db.labels_
    print(f'min_sample = {minsample}')
    # ax=ax draws into this panel; x / y are passed by keyword because
    # positional x/y is rejected by current seaborn releases.
    sns.scatterplot(
        x=grocery_milk_fresh[:, 0],
        y=grocery_milk_fresh[:, 1],
        hue=[f"cluster-{label}" for label in labels],
        ax=ax,
    )
    ax.set_title(f"min_samples = {minsample}")

# Show once, after all panels are drawn, so they stay in a single figure.
plt.show()

***
## Clustering - DBSCAN
Use DBSCAN to detect anomalies in other products from the customers dataset.

In [None]:
# cluster on subset of features
grocery_frozen_deli = df[["Grocery", "Frozen","Delicassen"]]

In [None]:
grocery_frozen_deli.head(2)

In [None]:
grocery_frozen_deli.columns

In [None]:
gfd_columns = ['Grocery', 'Frozen', 'Delicassen']

In [None]:
# scale
scaler = MinMaxScaler().fit(grocery_frozen_deli)
grocery_frozen_deli = scaler.transform(grocery_frozen_deli)

In [None]:
dbsc = DBSCAN(eps = .10, min_samples = 20).fit(grocery_frozen_deli)

In [None]:
# Now, let's add the scaled value columns back onto the dataframe
columns = list(df.columns)

scaled_columns = ["Scaled_" + column for column in gfd_columns]

In [None]:
# Save a copy of the original dataframe
original_df = df.copy()

# Create a dataframe containing the scaled values. It is given df's own index
# so the two frames line up row-for-row by label.
scaled_df = pd.DataFrame(grocery_frozen_deli, columns=scaled_columns, index=df.index)

# Attach the scaled columns by joining on the index. df already carries a
# Scaled_Grocery column from the Grocery/Milk/Fresh section, so any existing
# column with one of these names is dropped first; otherwise the merge
# silently renames the clash to Scaled_Grocery_x / Scaled_Grocery_y.
# Joining on the index also avoids the helper 'key_0' column that
# merge(on=df.index) creates.
df = df.drop(columns=scaled_columns, errors="ignore").join(scaled_df)

In [None]:
%matplotlib inline
# Sweep eps over 1, 1/2, ..., 1/10 for the Grocery/Frozen/Delicassen model and
# draw one panel per value.
fig, axes = plt.subplots(2, 5, figsize=(16, 10))
fig.subplots_adjust(hspace=.7, wspace=.2)
for divisor, ax in enumerate(axes.flat, start=1):
    eps = 1 / divisor
    db = DBSCAN(eps=eps, min_samples=20).fit(grocery_frozen_deli)
    labels = db.labels_
    print(f'𝛆 = {eps}')
    # Plot the data the model was fitted on (scaled Grocery vs scaled Frozen).
    # The labels come from grocery_frozen_deli, so plotting the
    # grocery_milk_fresh coordinates would pair them with the wrong points.
    sns.scatterplot(
        x=grocery_frozen_deli[:, 0],
        y=grocery_frozen_deli[:, 1],
        hue=[f"cluster-{label}" for label in labels],
        ax=ax,
    )
    ax.set_title(f"eps = {eps:.3f}")

# Show once, after every panel is drawn, so the 2x5 grid stays in one figure.
plt.show()

In [None]:
%matplotlib qt

fig = plt.figure(1, figsize=(8, 8))
# Create the 3-D axes through the figure so they are actually attached to it;
# instantiating Axes3D(fig, ...) directly no longer adds the axes to the
# figure in recent matplotlib releases, which leaves the plot blank.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

# plot the points, coloured by the labels of the model fitted on
# Grocery/Frozen/Delicassen (dbsc). df.labels still holds the labels from the
# earlier Grocery/Milk/Fresh model, so it would colour these points wrongly.
ax.scatter(df.Delicassen, df.Frozen, df.Grocery,
           c=dbsc.labels_, edgecolor='k')

# Hide the tick labels. xaxis / yaxis / zaxis are the supported names; the
# w_xaxis / w_yaxis / w_zaxis aliases were deprecated and later removed.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

ax.set_xlabel('Delicassen')
ax.set_ylabel('Frozen')
ax.set_zlabel('Grocery')

___
# Use DBSCAN to detect anomalies in: 
- number of bedrooms and 
- finished square feet of property 

for the filtered dataset you used in the `clustering project` (single unit properties with a logerror).

In [None]:
zlo = w.zillow17()

In [None]:
zlo.shape

In [None]:
zlo.head(2)

In [None]:
zlo = w.clean_zillow(zlo)

In [None]:
zlo.shape

In [None]:
zlo.head(2)

In [None]:
# cluster on subset of features
bdrmsqft = zlo[["bedrooms", "sqft"]]

In [None]:
bdrmsqft.shape

In [None]:
bdrmsqft.head(2)

In [None]:
#create a list of column names from the clustered subset of features (bdrmsqft)
columns = bdrmsqft.columns.to_list()

In [None]:
columns

In [None]:
#scale and transform the DF
scaler = MinMaxScaler().fit(bdrmsqft)
bdrmsqft = scaler.transform(bdrmsqft)

In [None]:
bdrmsqft.shape

In [None]:
#Construct a DBSCAN object 
dbsc = DBSCAN(eps = .10, min_samples = 20).fit(bdrmsqft)

In [None]:
#add the scaled value columns back onto the dataframe
scaled_columns = ["Scaled_" + column for column in columns]

In [None]:
scaled_columns

In [None]:
# Save a copy of the original dataframe
original_zlo = zlo.copy()

In [None]:
original_zlo.shape

In [None]:
# Create a dataframe containing the scaled values
scaled_df = pd.DataFrame(bdrmsqft, columns=scaled_columns)

In [None]:
scaled_df.head(2)

In [None]:
# Merge the scaled and non-scaled values into one dataframe
zlo = zlo.merge(scaled_df, on=zlo.index)

In [None]:
zlo.shape

In [None]:
zlo = zlo.drop(columns=['key_0'])

In [None]:
zlo.tail(2)

In [None]:
zlo.shape

In [None]:
#extract cluster labels and outliers 
labels = dbsc.labels_
zlo['labels'] = labels

In [None]:
# Plot the two features the model was fitted on (bedrooms and sqft), coloured
# by the labels stored on zlo itself. df.labels belongs to the grocery
# dataframe and has nothing to do with these rows.
sns.scatterplot(x=zlo.bedrooms, y=zlo.sqft, hue=zlo.labels)
plt.show()

In [None]:
def densitybasedcluster(df, nu_df, eps=.10, min_samples=20):
    """
    Label the rows of ``df`` with DBSCAN clusters computed on a min-max
    scaled subset of its features.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataframe. It is not modified; a labelled copy is returned.
    nu_df : pandas.DataFrame
        Unscaled subset of features to cluster on, with one row per row of
        ``df`` in the same order. Must be a DataFrame (not the array returned
        by ``scaler.transform``) because its column names are used.
    eps : float, default 0.10
        DBSCAN neighbourhood radius, applied to the min-max scaled features.
    min_samples : int, default 20
        DBSCAN ``min_samples`` argument.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one ``Scaled_<column>`` column per column of
        ``nu_df`` and a ``labels`` column holding the DBSCAN labels.

    Raises
    ------
    ValueError
        If ``df`` and ``nu_df`` do not have the same number of rows.
    """
    if len(df) != len(nu_df):
        raise ValueError(
            f"df has {len(df)} rows but nu_df has {len(nu_df)}; "
            "they must describe the same rows"
        )

    #create a list of column names from clustered subset of features: nu_df
    columns = nu_df.columns.to_list()

    #scale the features to the 0-1 range
    scaled_values = MinMaxScaler().fit_transform(nu_df)

    #Construct a DBSCAN object and fit it on the scaled features
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_values)

    #names for the scaled value columns
    scaled_columns = ["Scaled_" + column for column in columns]

    # Work on a copy so the caller's dataframe is left untouched.
    clustered = df.copy()

    # Attach the scaled values by position. This keeps df's index and avoids
    # the helper 'key_0' column and *_x / *_y duplicates that merging with
    # on=df.index produces.
    for position, name in enumerate(scaled_columns):
        clustered[name] = scaled_values[:, position]

    #extract cluster labels and outliers
    clustered['labels'] = dbsc.labels_

    # Return the labelled frame; without this the result was built and then
    # thrown away when the function ended.
    return clustered

In [None]:
# bdrmsqft was overwritten with the NumPy array returned by scaler.transform,
# so it no longer has a .columns attribute; pass the unscaled feature columns
# as a DataFrame instead. original_zlo is used because zlo already carries the
# Scaled_* and labels columns added above.
zlo_clustered = densitybasedcluster(original_zlo, original_zlo[["bedrooms", "sqft"]])

In [None]:
bdrmsqft.shape, zlo.shape

In [None]:
len(bdrmsqft.index), len(zlo.index)