In [142]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN, KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [48]:
permits = pd.read_csv('data/final_permits.csv')

In [49]:
permits.columns

Index(['bin_no', 'house_no', 'street_name', 'job_no', 'zip_code',
       'job_start_date', 'latitude', 'longitude', 'subway_count_tenth_mi',
       'subway_count_half_mi', 'subway_count_one_mi',
       'subway_count_two_five_mi', 'hist_dist_name', 'park_count_tenth_mi',
       'park_count_half_mi', 'park_count_one_mi', 'park_count_two_five_mi',
       'closest_park', 'name_closest_park', 'complaints2014_15',
       'complaints2015_16', 'complaints2016_17', 'complaints2017_18',
       'complaints2018_19', 'complaints2019_20', 'total_complaints',
       'borough_BROOKLYN', 'borough_MANHATTAN', 'borough_QUEENS',
       'borough_STATEN ISLAND', 'owners_business_type_CORPORATION',
       'owners_business_type_HPD', 'owners_business_type_INDIVIDUAL',
       'owners_business_type_NYC AGENCY', 'owners_business_type_NYCHA/HHC',
       'owners_business_type_OTHER', 'owners_business_type_OTHER GOV'T AGENCY',
       'owners_business_type_PARTNERSHIP', 'non-profit_Y', 'in_hist_dist_1'],
      dtype

In [50]:
permits.head()

Unnamed: 0,bin_no,house_no,street_name,job_no,zip_code,job_start_date,latitude,longitude,subway_count_tenth_mi,subway_count_half_mi,...,owners_business_type_CORPORATION,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1
0,1056547,2686,BROADWAY,121207354,10025,2022-05-11,40.798817,-73.96874,5,15,...,0,0,0,0,0,0,0,1,0,0
1,1812187,140,HILLSIDE AVENUE,121189524,10040,2022-05-11,40.860296,-73.926125,2,15,...,1,0,0,0,0,0,0,0,0,0
2,2823631,368,EAST 152 STREET,220586168,10455,2022-05-11,40.818565,-73.918118,0,9,...,0,0,1,0,0,0,0,0,0,0
3,3429007,3410,FARRAGUT ROAD,321588215,11210,2022-05-18,40.636513,-73.943944,0,11,...,0,0,1,0,0,0,0,0,0,0
4,3121674,1457,FLATBUSH AVENUE,321827163,11210,2022-05-11,40.634773,-73.949721,0,11,...,1,0,0,0,0,0,0,0,0,0


In [51]:
# Index the permits by job number. verify_integrity=True makes pandas raise if any
# job_no is duplicated. Reassigning (instead of inplace=True) together with the guard
# keeps this cell re-runnable: once 'job_no' is the index it is no longer a column,
# so an unguarded second set_index('job_no') would raise KeyError.
if permits.index.name != 'job_no':
    permits = permits.set_index('job_no', verify_integrity=True)

In [52]:
permits.head()

Unnamed: 0_level_0,bin_no,house_no,street_name,zip_code,job_start_date,latitude,longitude,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,...,owners_business_type_CORPORATION,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1
job_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
121207354,1056547,2686,BROADWAY,10025,2022-05-11,40.798817,-73.96874,5,15,42,...,0,0,0,0,0,0,0,1,0,0
121189524,1812187,140,HILLSIDE AVENUE,10040,2022-05-11,40.860296,-73.926125,2,15,34,...,1,0,0,0,0,0,0,0,0,0
220586168,2823631,368,EAST 152 STREET,10455,2022-05-11,40.818565,-73.918118,0,9,60,...,0,0,1,0,0,0,0,0,0,0
321588215,3429007,3410,FARRAGUT ROAD,11210,2022-05-18,40.636513,-73.943944,0,11,13,...,0,0,1,0,0,0,0,0,0,0
321827163,3121674,1457,FLATBUSH AVENUE,11210,2022-05-11,40.634773,-73.949721,0,11,20,...,1,0,0,0,0,0,0,0,0,0


In [86]:
# Columns fed to the clustering model, assembled group by group from their shared prefixes.
features = [
    # subway_count_* columns
    *[f'subway_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    # park_count_* columns
    *[f'park_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    'closest_park',
    'total_complaints',
    # borough dummy columns
    *[f'borough_{name}' for name in ('BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND')],
    # owner business type dummy columns
    *[
        f'owners_business_type_{kind}'
        for kind in (
            'CORPORATION',
            'HPD',
            'INDIVIDUAL',
            'NYC AGENCY',
            'NYCHA/HHC',
            'OTHER',
            "OTHER GOV'T AGENCY",
            'PARTNERSHIP',
        )
    ],
    'non-profit_Y',
    'in_hist_dist_1',
]

In [87]:
X = permits[features]

In [55]:
X.head()

Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,...,owners_business_type_CORPORATION,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1
job_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
121207354,5,15,42,195,2,17,33,214,0.0,2255,...,0,0,0,0,0,0,0,1,0,0
121189524,2,15,34,153,1,10,42,280,0.049425,701,...,1,0,0,0,0,0,0,0,0,0
220586168,0,9,60,215,1,32,93,418,0.086285,1578,...,0,0,1,0,0,0,0,0,0,0
321588215,0,11,13,95,0,3,15,153,0.141644,293,...,0,0,1,0,0,0,0,0,0,0
321827163,0,11,20,87,0,3,14,127,0.199042,642,...,1,0,0,0,0,0,0,0,0,0


In [56]:
X.dtypes

subway_count_tenth_mi                        int64
subway_count_half_mi                         int64
subway_count_one_mi                          int64
subway_count_two_five_mi                     int64
park_count_tenth_mi                          int64
park_count_half_mi                           int64
park_count_one_mi                            int64
park_count_two_five_mi                       int64
closest_park                               float64
total_complaints                             int64
borough_BROOKLYN                             int64
borough_MANHATTAN                            int64
borough_QUEENS                               int64
borough_STATEN ISLAND                        int64
owners_business_type_CORPORATION             int64
owners_business_type_HPD                     int64
owners_business_type_INDIVIDUAL              int64
owners_business_type_NYC AGENCY              int64
owners_business_type_NYCHA/HHC               int64
owners_business_type_OTHER     

In [88]:
# Standardize every feature column; the scaler is fit on X and then applied to the same X.
ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)

In [58]:
# Baseline run: DBSCAN with all default parameters on the scaled features.
# fit() returns the estimator itself, so the chained form binds the same fitted object.
dbscan = DBSCAN().fit(X_scaled)
dbscan

DBSCAN()

In [59]:
dbscan.labels_

array([-1, -1, -1, ..., 13, 13,  8])

In [60]:
pd.Series(dbscan.labels_).unique()

array([ -1,   0,   1,   2,   3,   4,   5,  66,   6,   7,   8,   9,  10,
       104, 115, 114,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,
        21,  22,  23,  24,  30,  25,  26,  27,  28,  29,  31,  32,  33,
        34,  35,  36,  37,  38, 123,  39,  40,  41,  42,  50,  43,  44,
        45,  53,  46,  76, 131,  47,  80,  48,  65,  49,  51,  52,  98,
        54,  55,  56,  57,  58,  59,  60,  61, 113,  62,  63,  64,  74,
        67,  68, 112,  69,  70,  71,  72,  84,  73,  75,  77,  78,  79,
        82,  95,  81, 125, 126,  83,  85,  86,  87,  88,  89, 108, 132,
        90,  91,  92,  93,  94, 101,  96,  97,  99, 100, 102, 103, 105,
       106, 107, 109, 110, 120, 111, 116, 117, 118, 119, 121, 122, 124,
       127, 128, 129, 130, 133])

## The Silhouette Score
Not great.

In [61]:
# Mean silhouette coefficient over all points for the current DBSCAN labelling.
# NOTE(review): as far as I can tell silhouette_score treats DBSCAN's noise label (-1)
# as one more cluster rather than excluding it — confirm, and consider masking the
# noise points out before scoring.
silhouette_score(X_scaled, dbscan.labels_)

0.03942418684596389

# Pairplot
Doesn't work. See error below.

In [63]:
# Attach each permit's DBSCAN label as a 'cluster' column for inspection.
# `permits_X = X` only aliased X, so the column assignment that followed also added
# 'cluster' to the feature matrix itself and triggered SettingWithCopyWarning
# (X is a column selection of `permits`). .assign() returns a new DataFrame instead,
# leaving X untouched for any later scaling / fitting.
permits_X = X.assign(cluster=dbscan.labels_)
permits_X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  permits_X['cluster']=dbscan.labels_


Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,...,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1,cluster
job_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
121207354,5,15,42,195,2,17,33,214,0.0,2255,...,0,0,0,0,0,0,1,0,0,-1
121189524,2,15,34,153,1,10,42,280,0.049425,701,...,0,0,0,0,0,0,0,0,0,-1
220586168,0,9,60,215,1,32,93,418,0.086285,1578,...,0,1,0,0,0,0,0,0,0,-1
321588215,0,11,13,95,0,3,15,153,0.141644,293,...,0,1,0,0,0,0,0,0,0,0
321827163,0,11,20,87,0,3,14,127,0.199042,642,...,0,0,0,0,0,0,0,0,0,-1


In [None]:
# Pairplot of every column in permits_X, coloured by cluster label. Left disabled because
# the call fails with the error recorded below.
#sns.pairplot(permits_X, corner = True, hue = 'cluster', palette = 'bright');
#LinAlgError: singular matrix
# NOTE(review): presumably raised by the per-cluster density estimate on the diagonal when
# a cluster holds a single constant value for some column (e.g. a 0/1 dummy) — passing
# diag_kind='hist' may avoid it; confirm before re-enabling.

# CHANGING EPSILON


In [124]:
# Refit on the same scaled features with eps=1.
dbscan = DBSCAN(eps=1).fit(X_scaled)
dbscan

DBSCAN(eps=1)

In [125]:
pd.Series(dbscan.labels_).unique()

array([-1,  0,  1,  2,  3,  4,  5,  6, 21,  8, 64,  7,  9, 10, 11, 12, 46,
       13, 14, 15, 41, 16, 17, 18, 19, 20, 61, 22, 23, 56, 57, 24, 25, 26,
       62, 40, 27, 28, 63, 29, 30, 51, 31, 34, 32, 33, 35, 47, 36, 37, 38,
       39, 59, 48, 42, 54, 43, 44, 45, 55, 49, 50, 52, 53, 58, 60, 65, 66])

In [126]:
silhouette_score(X_scaled, dbscan.labels_)

0.053407599630612465

In [127]:
# Refit on the same scaled features with eps=5.
dbscan = DBSCAN(eps=5).fit(X_scaled)
dbscan

DBSCAN(eps=5)

In [128]:
pd.Series(dbscan.labels_).unique()

array([ 0,  1,  2, -1,  3,  4,  5,  6,  7])

In [129]:
silhouette_score(X_scaled, dbscan.labels_)

0.41644131036113946

In [151]:
# Refit on the same scaled features with eps=10.
dbscan = DBSCAN(eps=10).fit(X_scaled)
dbscan

DBSCAN(eps=10)

In [152]:
pd.Series(dbscan.labels_).unique()

array([ 0,  1,  2,  3, -1])

In [153]:
silhouette_score(X_scaled, dbscan.labels_)

0.6502845646748202

I found this approach to counting the points under each label in [this Stack Overflow answer](https://stackoverflow.com/a/28663910/5394724).

In [154]:
# Tally how many points received each DBSCAN label (-1 included).
unique, counts = np.unique(dbscan.labels_, return_counts=True)
count = {label: n_points for label, n_points in zip(unique, counts)}
count

{-1: 12, 0: 7092, 1: 42, 2: 21, 3: 9}

In [179]:
# Refit on the same scaled features with eps=15.
dbscan = DBSCAN(eps=15).fit(X_scaled)
dbscan

DBSCAN(eps=15)

In [180]:
pd.Series(dbscan.labels_).unique()

array([ 0,  1,  2, -1])

In [181]:
silhouette_score(X_scaled, dbscan.labels_)

0.7163018602039024

In [182]:
type(dbscan.labels_)

numpy.ndarray

I found this approach to counting the points under each label in [this Stack Overflow answer](https://stackoverflow.com/a/28663910/5394724).

In [183]:
# Tally how many points received each DBSCAN label (-1 included).
unique, counts = np.unique(dbscan.labels_, return_counts=True)
count = {label: n_points for label, n_points in zip(unique, counts)}
count

{-1: 4, 0: 7142, 1: 21, 2: 9}

-1 is noise ([DBSCAN documentation](https://scikit-learn.org/stable/modules/clustering.html#dbscan))

In [184]:
# Attach the eps=15 DBSCAN labels as a 'cluster' column for the per-cluster summary.
# `permits_X = X` only aliased X, so the column assignment that followed also wrote
# 'cluster' into the feature matrix and triggered SettingWithCopyWarning
# (X is a column selection of `permits`). .assign() builds a new DataFrame and
# leaves X untouched.
permits_X = X.assign(cluster=dbscan.labels_)
permits_X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  permits_X['cluster']=dbscan.labels_


Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,...,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1,cluster
job_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
121207354,5,15,42,195,2,17,33,214,0.0,2255,...,0,0,0,0,0,0,1,0,0,0
121189524,2,15,34,153,1,10,42,280,0.049425,701,...,0,0,0,0,0,0,0,0,0,0
220586168,0,9,60,215,1,32,93,418,0.086285,1578,...,0,1,0,0,0,0,0,0,0,0
321588215,0,11,13,95,0,3,15,153,0.141644,293,...,0,1,0,0,0,0,0,0,0,0
321827163,0,11,20,87,0,3,14,127,0.199042,642,...,0,0,0,0,0,0,0,0,0,0


In [189]:
pd.set_option('display.max_columns', None)

In [190]:
permits_X.groupby(['cluster']).mean()

Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,borough_BROOKLYN,borough_MANHATTAN,borough_QUEENS,borough_STATEN ISLAND,owners_business_type_CORPORATION,owners_business_type_HPD,owners_business_type_INDIVIDUAL,owners_business_type_NYC AGENCY,owners_business_type_NYCHA/HHC,owners_business_type_OTHER,owners_business_type_OTHER GOV'T AGENCY,owners_business_type_PARTNERSHIP,non-profit_Y,in_hist_dist_1
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
-1,0.0,6.0,40.5,231.75,1.5,31.5,85.0,344.75,0.069479,788.0,0.75,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
0,0.221647,5.205545,17.977737,93.854243,0.720106,9.715626,30.652898,144.943153,0.223591,346.403948,0.305657,0.038505,0.290255,0.265472,0.422711,0.0,0.343881,0.018762,0.0,0.0,0.0,0.213526,0.026883,0.005881
1,0.095238,5.904762,25.809524,156.857143,1.190476,15.904762,49.0,239.714286,0.055637,682.190476,0.095238,0.095238,0.190476,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.333333,13.777778,50.777778,317.0,0.777778,19.222222,62.444444,303.555556,0.087729,1014.777778,0.555556,0.333333,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.111111,0.0


# Building Model for Testing
Some of the more interesting features (who owns the building, whether the owner is a non-profit) aren't available for my test points, so this model leaves them out.

In [191]:
# Reduced column set for the test model: the owner business type and non-profit
# columns are left out; everything else matches the full feature list.
features_test = [
    # subway_count_* columns
    *[f'subway_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    # park_count_* columns
    *[f'park_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    'closest_park',
    'total_complaints',
    # borough dummy columns
    *[f'borough_{name}' for name in ('BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND')],
    'in_hist_dist_1',
]

In [194]:
X = permits[features_test]

In [195]:
# Standardize the reduced feature set; the scaler is fit on X and then applied to the same X.
ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)

In [197]:
# Fit DBSCAN with eps=10 on the scaled reduced feature set.
dbscan = DBSCAN(eps=10).fit(X_scaled)
dbscan

DBSCAN(eps=10)

In [199]:
silhouette_score(X_scaled, dbscan.labels_)

0.7092676513248021

In [200]:
pd.Series(dbscan.labels_).unique()

array([ 0,  1,  2, -1])

In [201]:
# Tally how many points received each DBSCAN label (-1 included).
unique, counts = np.unique(dbscan.labels_, return_counts=True)
count = {label: n_points for label, n_points in zip(unique, counts)}
count

{-1: 3, 0: 7126, 1: 42, 2: 5}

In [202]:
# Attach the DBSCAN labels from the reduced-feature model as a 'cluster' column.
# `permits_X = X` only aliased X, so the column assignment that followed also wrote
# 'cluster' into the feature matrix and triggered SettingWithCopyWarning
# (X is a column selection of `permits`). .assign() builds a new DataFrame and
# leaves X untouched.
permits_X = X.assign(cluster=dbscan.labels_)
permits_X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  permits_X['cluster']=dbscan.labels_


Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,borough_BROOKLYN,borough_MANHATTAN,borough_QUEENS,borough_STATEN ISLAND,in_hist_dist_1,cluster
job_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
121207354,5,15,42,195,2,17,33,214,0.0,2255,0,1,0,0,0,0
121189524,2,15,34,153,1,10,42,280,0.049425,701,0,1,0,0,0,0
220586168,0,9,60,215,1,32,93,418,0.086285,1578,0,0,0,0,0,0
321588215,0,11,13,95,0,3,15,153,0.141644,293,1,0,0,0,0,0
321827163,0,11,20,87,0,3,14,127,0.199042,642,1,0,0,0,0,0


In [203]:
permits_X.groupby(['cluster']).mean()

Unnamed: 0_level_0,subway_count_tenth_mi,subway_count_half_mi,subway_count_one_mi,subway_count_two_five_mi,park_count_tenth_mi,park_count_half_mi,park_count_one_mi,park_count_two_five_mi,closest_park,total_complaints,borough_BROOKLYN,borough_MANHATTAN,borough_QUEENS,borough_STATEN ISLAND,in_hist_dist_1
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.372291,0.0,0.0,0.0,0.0,1.0,0.0
0,0.221302,5.110862,17.747965,93.383525,0.721302,9.720039,30.625737,144.904996,0.154927,347.166152,0.30494,0.037188,0.290065,0.265647,0.0
1,0.47619,24.52381,73.47619,284.047619,0.97619,18.02381,62.261905,279.428571,0.086968,636.238095,0.47619,0.357143,0.142857,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.668434,0.0,0.0,0.0,1.0,0.0,0.0


# Notes on The Clusters
Cluster 1 consists entirely of the permits in historic districts. Cluster 2 is very far from parks and is entirely in Queens. The noise points (label -1) are all in Staten Island and are also far from parks.
## I think I have some outliers I need to remove. Based on map, yesterday. Look at Tableau.
I could do a check to make sure the point falls within the boundaries of the borough, too.

# Include Longitude and Latitude

In [114]:
# Same columns as the full feature list, with the raw coordinates placed in front.
features2 = [
    'latitude',
    'longitude',
    # subway_count_* columns
    *[f'subway_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    # park_count_* columns
    *[f'park_count_{suffix}' for suffix in ('tenth_mi', 'half_mi', 'one_mi', 'two_five_mi')],
    'closest_park',
    'total_complaints',
    # borough dummy columns
    *[f'borough_{name}' for name in ('BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND')],
    # owner business type dummy columns
    *[
        f'owners_business_type_{kind}'
        for kind in (
            'CORPORATION',
            'HPD',
            'INDIVIDUAL',
            'NYC AGENCY',
            'NYCHA/HHC',
            'OTHER',
            "OTHER GOV'T AGENCY",
            'PARTNERSHIP',
        )
    ],
    'non-profit_Y',
    'in_hist_dist_1',
]

In [158]:
X2 = permits[features2]

In [166]:
# Standardize the coordinate-augmented feature set; fit on X2, then applied to the same X2.
ss = StandardScaler().fit(X2)
X2_scaled = ss.transform(X2)

In [167]:
# Fit DBSCAN with eps=1 on the scaled features that include latitude/longitude.
dbscan2 = DBSCAN(eps=1).fit(X2_scaled)
dbscan2

DBSCAN(eps=1)

In [168]:
pd.Series(dbscan2.labels_).unique()

array([-1,  0,  1,  2,  3,  4,  5,  6, 21,  8,  7,  9, 10, 11, 12, 42, 13,
       14, 15, 16, 17, 18, 19, 20, 60, 22, 23, 24, 53, 54, 25, 26, 58, 61,
       39, 27, 28, 62, 29, 59, 49, 30, 33, 31, 32, 34, 43, 35, 36, 37, 38,
       65, 45, 51, 40, 41, 44, 52, 46, 47, 48, 50, 55, 56, 57, 63, 64])

In [169]:
silhouette_score(X2_scaled, dbscan2.labels_)

0.03985773625180413

In [170]:
# Refit on the coordinate-augmented features with eps=10.
dbscan2 = DBSCAN(eps=10).fit(X2_scaled)
dbscan2

DBSCAN(eps=10)

In [171]:
pd.Series(dbscan2.labels_).unique()

array([ 0,  1,  2,  3, -1])

In [172]:
silhouette_score(X2_scaled, dbscan2.labels_)

0.6295941401638957

In [173]:
# Refit on the coordinate-augmented features with eps=15.
dbscan2 = DBSCAN(eps=15).fit(X2_scaled)
dbscan2

DBSCAN(eps=15)

In [174]:
pd.Series(dbscan2.labels_).unique()

array([ 0,  1,  2, -1])

In [175]:
silhouette_score(X2_scaled, dbscan2.labels_)

0.7005983949467619

The fit is slightly better without longitude and latitude (silhouette score of 0.716 vs. 0.701 at eps = 15).