### DBSCAN

In [1]:
import numpy as np, pandas as pd,os,sys
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import colorConverter
import gmplot
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.spatial.distance import cdist,pdist
# %matplotlib inline

In [4]:
# Show the kernel's working directory; the relative "../data/..." paths used
# below are resolved against it. Uses os.getcwd() instead of the bare `pwd`
# automagic, which is not valid Python and only works in a single-line cell
# with automagic enabled.
# NOTE(review): the rendered output exposes an absolute local path -- clear it
# before sharing the notebook.
os.getcwd()

u'/Users/edwingarcia/Dropbox/_Lewis-University/CPSC-59000-Data Science Project/github/Lewis-Thesis/notebook'

#### FILES, DATAFRAMES, MODELS AND PCA

In [24]:
OUTPUT_FOLDER = "../data/output"
SOURCE_FOLDER = "../data/src"

## CREATE THE OUTPUT AND SOURCE FOLDERS IF THEY DO NOT EXIST YET.
## Note: a freshly created (empty) SOURCE_FOLDER still makes the reads below
## fail -- the CSV files have to be placed there first.
for folder in (OUTPUT_FOLDER, SOURCE_FOLDER):
    if not os.path.exists(folder):
        os.makedirs(folder)

## Alternative training sets that can be swapped in:
## july-oct-raw2-train.csv
# csv_file = "july-oct-batch2-train.csv" ## raw_input("Enter csv file to load:")
# csv_file = "batch2-train.csv" ## raw_input("Enter csv file to load:")
csv_file = "july-oct-train.csv" ## raw_input("Enter csv file to load:")

## RAW DATASET, USED TO CONVERT BACK FROM DECOMPOSED VARIABLES TO GEOCOORDS
raw_file = os.path.join(SOURCE_FOLDER, "july-oct-raw.csv")
raw_map_df = pd.read_csv(raw_file)

## TRAINING DATASET
file_str = os.path.join(SOURCE_FOLDER, csv_file)
a = pd.read_csv(file_str)
# Rename all ten CSV columns; the first one ('index1') is discarded right away.
# presumably it is a row index saved with the file -- TODO confirm
a.columns = ['index1', 'address','city', 'day','hour','type','latitude','longitude','parent_incident','state']
a = a.drop('index1', axis=1)

# Work on an independent copy: columns written to X further down (pca1/pca2)
# must not leak into `a`, and columns added to `a` must not leak into X.
X = a.copy()


In [25]:
# Sanity check: list the column names of X after 'index1' was dropped.
X.columns

Index([u'address', u'city', u'day', u'hour', u'type', u'latitude',
       u'longitude', u'parent_incident', u'state'],
      dtype='object')

#### RUN DBSCAN

In [29]:
## REDUCE DATA USING PCA
# Fit on the input features only. Columns that this notebook itself adds later
# (pca1/pca2 below, 'label' in the centroid cell) are excluded so that
# re-running this cell does not feed earlier results back into the PCA.
derived_cols = ('pca1', 'pca2', 'label')
feature_cols = [c for c in X.columns if c not in derived_cols]
features = X[feature_cols]
pca = PCA(n_components=2).fit(features)
pca_2d = pd.DataFrame(pca.transform(features))

# Add PCA cols to DF
X.loc[:, 'pca1'] = pca_2d[0]
X.loc[:, 'pca2'] = pca_2d[1]

min_samples_list = np.arange(10, 11, 2)
epsilon = 0.037 ## 0.01905
for i in min_samples_list:
    # A single fit is enough: fit() already stores the cluster assignment in
    # labels_, which is exactly what fit_predict() would return.
    db = DBSCAN(eps=epsilon, min_samples=i).fit(pca_2d)
    labels = db.labels_
    db_pred = labels

    # Split the samples into the three DBSCAN point types.
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    noise_mask = (labels == -1)                       # outliers: assigned to no cluster
    border_mask = ~core_samples_mask & ~noise_mask    # in a cluster, but not a core sample

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # Outliers are the noise points only; border points belong to a cluster.
    total_outliers = int(noise_mask.sum())

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: noise and border points are each drawn once, in fixed colours.
    xy = X[noise_mask]
    outliers = ax1.scatter(xy['pca1'], xy['pca2'], marker='^', c='c', s=15, alpha=0.3)
    xy = X[border_mask]
    border_points = ax1.scatter(xy['pca1'], xy['pca2'], marker='s', c='m', s=3, alpha=0.4)

    # Core points, one colour per cluster (left: with the rest, right: alone).
    cluster_ids = [k for k in np.unique(labels) if k != -1]
    colors = plt.cm.Spectral(np.linspace(0, 1, len(cluster_ids)))
    clusters1 = clusters2 = None
    for k, col in zip(cluster_ids, colors):
        xy = X[(labels == k) & core_samples_mask]
        # color= (not c=) so an RGBA row is never mistaken for 4 scalar values
        # when a cluster happens to contain exactly four core points.
        clusters1 = ax1.scatter(xy['pca1'], xy['pca2'], color=col, marker='o', s=15, alpha=0.5)
        clusters2 = ax2.scatter(xy['pca1'], xy['pca2'], color=col, marker='o', s=10, alpha=0.7)

    # Titles, legends and axis labels do not depend on the cluster: set them once.
    title1 ="DBSCAN With Outliers and Core Points\nEpsilon=%s, Min. Samples=%s,Clusters=%s, Outliers=%s" %(
        epsilon, i, n_clusters_,total_outliers)
    title2 ="DBSCAN With Clusters Only\nEpsilon=%s, Min. Samples=%s, Clusters=%s" %(epsilon, i, n_clusters_)
    legend_handles = [outliers, border_points]
    legend_names = ['Outliers', 'Border Points']
    if clusters1 is not None:
        legend_handles.append(clusters1)
        legend_names.append('Core Points')
        ax2.legend([clusters2], ['Clusters'])
    ax1.legend(legend_handles, legend_names)
    ax1.set_title(title1)
    ax2.set_title(title2)
    # pca1 is plotted on the x axis and pca2 on the y axis.
    ax1.set_xlabel("PCA 1")
    ax2.set_xlabel("PCA 1")
    ax1.set_ylabel("PCA 2")
    ax2.set_ylabel("PCA 2")

    print('Estimated number of clusters: %d' % n_clusters_)
    # NOTE(review): this compares the labelling with itself, so it is always
    # 1.000; a meaningful ARI needs ground-truth labels.
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, db_pred))
#     print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(pca_2d, db_pred))
plt.show()




Estimated number of clusters: 114
Adjusted Rand Index: 1.000


#### REVERT BACK TO ORIGINAL COORDINATES

In [32]:
# Re-fit DBSCAN with the chosen parameters on the 2-D PCA projection.
epsilon = 0.037 # 0.01905 #
min_samples = 10
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(pca_2d)
# fit() already stores the assignment in labels_; calling fit_predict() as
# well would only run the whole clustering a second time.
labels = db.labels_
db_pred = labels
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# Positional row indices of the core samples. DBSCAN does not compute
# centroids; these core samples are what the following cells map back to the
# raw coordinates.
clust_centers = db.core_sample_indices_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

#### GET CENTROIDS AND MATCH TO RAW DATASETS

In [33]:
# Attach the cluster label to a copy of the training frame, so `a` itself is
# left untouched and this cell can be re-run safely.
a_labeled = a.copy()
a_labeled['label'] = labels

# clust_centers holds positional row numbers, so select all rows in one
# .iloc call instead of appending them one by one (quadratic, and both
# DataFrame.append and .ix are gone from current pandas).
a_centroids = a_labeled.iloc[clust_centers]
# assumes raw_map_df has the same rows in the same order as `a` -- TODO confirm
gmap_centroids = raw_map_df.iloc[clust_centers].copy()

## Match the labels for the gmap coords
# Both frames were selected with the same positions, so the label values line
# up row by row.
gmap_centroids['label'] = a_centroids['label'].values

In [34]:
# Backdrop: every row of `a` as a small, faint black dot.
backdrop_style = dict(marker='.', c='k', s=5, alpha=0.4)
plt.scatter(a['longitude'], a['latitude'], **backdrop_style)

# Overlay: the selected core samples, coloured by their cluster label.
overlay_style = dict(s=20, marker='o', edgecolors='w', alpha=0.7)
plt.scatter(a_centroids['longitude'], a_centroids['latitude'],
            c=a_centroids['label'], **overlay_style)

# Annotate the figure with the parameters of the current DBSCAN fit.
title_values = (epsilon, min_samples, n_clusters_)
title_str = "DBSCAN With Centroids Mapped into Raw Dataset\nEpsilon=%s, Min. Samples=%s, Clusters=%s" % title_values
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title(title_str)
plt.show()

#### GENERATE GOOGLE MAP

In [35]:
## WRITE THE GOOGLE MAP TO A STAND-ALONE HTML FILE

# Initial view of the map: centre latitude/longitude and zoom level.
map_center_lat, map_center_lng, map_zoom = 29.8, -95.4, 9.0
gmap = gmplot.GoogleMapPlotter(map_center_lat, map_center_lng, map_zoom)

# Layer 1: every raw point as a translucent dark dot.
gmap.scatter(
    raw_map_df['latitude'],
    raw_map_df['longitude'],
    '#3B0B39',
    alpha=0.4,
    size=60,
    marker=False,
)
# Layer 2: heatmap of the selected core samples.
gmap.heatmap(gmap_centroids['latitude'], gmap_centroids['longitude'], radius=20)

## PATH IS OUTPUT FOLDER/DBSCAN-EPS-<epsilon>-MIN-SAMPLES-<min_samples>.html
name_parts = (OUTPUT_FOLDER, "DBSCAN", epsilon, min_samples)
map_file = "%s/%s-EPS-%s-MIN-SAMPLES-%s.html" % name_parts
gmap.draw(map_file)

#### GENERATE KML/KMZ FILE FOR MAP LAYER

In [37]:
# raw_input() only exists on Python 2; fall back to input() on Python 3 so
# the cell runs on either kernel.
try:
    read_line = raw_input
except NameError:
    read_line = input

# The typed name becomes part of the output file name.
# NOTE(review): interactive input makes Run All non-reproducible; consider a
# constant in the configuration cell instead.
kml_name = read_line("Enter name of .kml file you wish to create: ")
# Export coordinates and cluster label of the selected core samples as CSV.
kml = gmap_centroids[['latitude','longitude','label']]
kml_output_str = "%s/dbscan-%s-%s.kml.csv" %(OUTPUT_FOLDER,epsilon, kml_name)
kml.to_csv(kml_output_str)

Enter name of .kml file you wish to create: test1


### END OF DBSCAN CODE

##### REFERENCES:
Part of the source code comes from the scikit-learn website:
http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN