### Import Packages

In [1]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
!pip install sklearn
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering librar

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Download Raw Data
Parse raw  New York data by downloading the file from the provided CDN (credit: Coursera Skills Labs)

In [19]:
# from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import base64, json, re, requests
# r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
# soup = BeautifulSoup(r, 'html.parser')

!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json
print('Data downloaded!')
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
neighborhoods_data = newyork_data['features']


Data downloaded!


### Construct Dataframe
The dataframe will have four columns: 'Borough', 'Neighborhood', 'Latitude', 'Longitude' which will be populated from the neighborhoods_data file just read in the above step

In [20]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


### Isolate Data Relevant to Staten Island

Filter the neighborhood data on the criteria 'Borough' == 'Staten Island' and save as a data frame

In [21]:
staten_data = neighborhoods[neighborhoods['Borough'] == 'Staten Island'].reset_index(drop=True)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
staten_data = staten_data.sort_values(by=['Neighborhood']).reset_index()
staten_data.drop(labels = 26, axis = 0, inplace = True)
staten_data



Unnamed: 0,index,Borough,Neighborhood,Latitude,Longitude
0,18,Staten Island,Annadale,40.538114,-74.178549
1,41,Staten Island,Arden Heights,40.549286,-74.185887
2,27,Staten Island,Arlington,40.635325,-74.165104
3,28,Staten Island,Arrochar,40.596313,-74.067124
4,35,Staten Island,Bay Terrace,40.553988,-74.139166
5,45,Staten Island,Bloomfield,40.605779,-74.187256
6,46,Staten Island,Bulls Head,40.609592,-74.159409
7,38,Staten Island,Butler Manor,40.506082,-74.229504
8,11,Staten Island,Castleton Corners,40.613336,-74.119181
9,39,Staten Island,Charleston,40.530531,-74.232158


In [22]:
address = 'staten, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of staten are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of staten are 40.5834557, -74.1496048.


### Get Venues From Foursquare
Define necessary variables such as Foursquare credentials

In [23]:
# CLIENT_ID = 'W5F5JZQIGM2CMZA1PXFH4EKDYILDGHTNTLY5URHLPH0BG2Y5' # your Foursquare ID
CLIENT_ID = '3QUM202FZUHVWTG51AKPTDY3HTKOFMVRLPST3UUFP2U5FFTV'
# CLIENT_SECRET = 'Z4UOG4S0UPHRLJLHIMWYLOWDVLYNLTJNGE33JWVYKLO5JNIS' # your Foursquare Secret
CLIENT_SECRET = '4X0YTCJYKDZNU11WQREVZIDKG043MG0EDSS22OZCC2UD22DN'
# ACCESS_TOKEN = 'OCM2RIC4VX0BCASWNXLXI5LQBV5MQOPAYP1FFDEF5BPJJVXV' # your FourSquare Access Token
ACCESS_TOKEN = 'VS0FYTFF1X0QIHW1P30VLNJII15XS0JZGLC0OK12WUPT42V3'
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3QUM202FZUHVWTG51AKPTDY3HTKOFMVRLPST3UUFP2U5FFTV
CLIENT_SECRET:4X0YTCJYKDZNU11WQREVZIDKG043MG0EDSS22OZCC2UD22DN


### Define a "getNearbyVenues" function
Define a function that accepts names, latitudes, longitudes, and search radius as inputs. The function will make an API call to Foursquare using the above credentials and output a list of venues that are within the defined radius of the coordinates entered.

This function will then be called on the list of Staten Island neighborhoods in the data frame to produce a list of all venues in each neighborhood.

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [25]:
staten_venues = getNearbyVenues(names=staten_data['Neighborhood'],
                                   latitudes=staten_data['Latitude'],
                                   longitudes=staten_data['Longitude'])


Annadale
Arden Heights
Arlington
Arrochar
Bay Terrace
Bloomfield
Bulls Head
Butler Manor
Castleton Corners
Charleston
Chelsea
Clifton
Concord
Dongan Hills
Egbertville
Elm Park
Eltingville
Emerson Hill
Fox Hills
Graniteville
Grant City
Grasmere
Great Kills
Greenridge
Grymes Hill
Heartland Village
Huguenot
Lighthouse Hill
Manor Heights
Mariner's Harbor
Midland Beach
New Brighton
New Dorp
New Dorp Beach
New Springville
Oakwood
Old Town
Park Hill
Pleasant Plains
Port Ivory
Port Richmond
Prince's Bay
Randall Manor
Richmond Town
Richmond Valley
Rosebank
Rossville
Sandy Ground
Shore Acres
Silver Lake
South Beach
St. George
Stapleton
Sunnyside
Todt Hill
Tompkinsville
Tottenville
Travis
West Brighton
Westerleigh
Willowbrook
Woodrow


In [26]:

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
staten_venues



Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Annadale,40.538114,-74.178549,Annadale Diner,40.542079,-74.177325,Diner
1,Annadale,40.538114,-74.178549,Il Sogno,40.541286,-74.178489,Restaurant
2,Annadale,40.538114,-74.178549,MTA SIR - Annadale,40.540482,-74.178185,Train Station
3,Annadale,40.538114,-74.178549,Pure Juicery,40.539094,-74.177634,Juice Bar
4,Annadale,40.538114,-74.178549,Sweet Sweet Sue's,40.539763,-74.176963,Bakery
5,Annadale,40.538114,-74.178549,Crown Palace,40.540334,-74.176147,Food
6,Annadale,40.538114,-74.178549,Ocean Sushi,40.541136,-74.178429,Sushi Restaurant
7,Annadale,40.538114,-74.178549,M&M Bagels and Deli,40.541013,-74.177106,Deli / Bodega
8,Annadale,40.538114,-74.178549,Miss Caroline's Dance Elite,40.541297,-74.178489,Dance Studio
9,Annadale,40.538114,-74.178549,Trio pizzeria,40.541478,-74.178428,Pizza Place


### Group Venues by Neighborhood and Examine

Use the groupby function to view the amount of venues per neighborhood. We note that Bulls Head by far has the largest amount of venues.

In [27]:
grouped_venues = staten_venues.groupby('Neighborhood').count()
# grouped_venues.loc[grouped_venues["Venue"] == grouped_venues["Venue"].max()]

### Cluster Venues Based on One-Hot Encoding

Create a dataframe filled with venue categories from staten_venues. We are interested in clustering by concentrations of Italian restaurants and pizza places, and unclassified restaurants, so we can isolate those three categories.

Add the neighborhood column from staten_venues to the dataframe and move to the first column.

For each neighborhood, average the results of each "category" column to obtain a frequency of how many venues in that neighborhood fall into a specific category. (If "Pizza Place" = .33 for Westerleigh, that means a third of all venues there are classified as "Pizza Place")

In [28]:
# one hot encoding
staten_onehot = pd.get_dummies(staten_venues[['Venue Category']], prefix="", prefix_sep="")
staten_onehot = staten_onehot[["Italian Restaurant", "Pizza Place", "Restaurant"]]


# add neighborhood column back to dataframe
staten_onehot['Neighborhood'] = staten_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [staten_onehot.columns[-1]] + list(staten_onehot.columns[:-1])
staten_onehot = staten_onehot[fixed_columns]

staten_onehot.head()
staten_grouped = staten_onehot.groupby('Neighborhood').mean().reset_index()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
print(staten_grouped)
print(staten_data)

         Neighborhood  Italian Restaurant  Pizza Place  Restaurant
0   Annadale           0.000000            0.153846     0.076923  
1   Arden Heights      0.000000            0.250000     0.000000  
2   Arlington          0.000000            0.000000     0.000000  
3   Arrochar           0.105263            0.052632     0.000000  
4   Bay Terrace        0.090909            0.000000     0.000000  
5   Bloomfield         0.000000            0.000000     0.000000  
6   Bulls Head         0.021739            0.086957     0.000000  
7   Butler Manor       0.000000            0.000000     0.000000  
8   Castleton Corners  0.000000            0.157895     0.000000  
9   Charleston         0.000000            0.037037     0.000000  
10  Chelsea            0.000000            0.000000     0.000000  
11  Clifton            0.000000            0.095238     0.000000  
12  Concord            0.111111            0.000000     0.000000  
13  Dongan Hills       0.120000            0.160000     0.0000



### Cluster Neighborhoods

Set the number of clusters to 4 and run a k-means cluster on the neighborhoods. The objective of this is to group neighborhoods with other similar neighborhoods in the vicinity. If opening a pizza store, we will need to consider more than the frequency of pizza parlors, Italian restaurants, and other potential competitors in any individual neighborhood--to have the greatest chance of attracting business, we will need to examine if there are pizza places in any nearing neighborhoods that could still compete within the same cluster.

In [29]:
# set number of clusters
kclusters = 4

staten_grouped_clustering = staten_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(staten_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]




array([2, 2, 0, 1, 1, 0, 3, 0, 2, 0], dtype=int32)

In [32]:
# add clustering labels
# staten_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
staten_grouped[['Cluster Labels']].astype('int32').dtypes
staten_merged = staten_data

# # merge staten_grouped with staten_data to add latitude/longitude for each neighborhood
staten_merged = staten_merged.join(staten_grouped.set_index('Neighborhood'), on='Neighborhood')
staten_merged.dropna(how='any').reset_index(drop=True)
staten_merged
# staten_merged[['Cluster Labels']].astype('int32').dtypes
# staten_merged.head() # check the last columns!

Unnamed: 0,index,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Italian Restaurant,Pizza Place,Restaurant
0,18,Staten Island,Annadale,40.538114,-74.178549,2,0.0,0.153846,0.076923
1,41,Staten Island,Arden Heights,40.549286,-74.185887,2,0.0,0.25,0.0
2,27,Staten Island,Arlington,40.635325,-74.165104,0,0.0,0.0,0.0
3,28,Staten Island,Arrochar,40.596313,-74.067124,1,0.105263,0.052632,0.0
4,35,Staten Island,Bay Terrace,40.553988,-74.139166,1,0.090909,0.0,0.0
5,45,Staten Island,Bloomfield,40.605779,-74.187256,0,0.0,0.0,0.0
6,46,Staten Island,Bulls Head,40.609592,-74.159409,3,0.021739,0.086957,0.0
7,38,Staten Island,Butler Manor,40.506082,-74.229504,0,0.0,0.0,0.0
8,11,Staten Island,Castleton Corners,40.613336,-74.119181,2,0.0,0.157895,0.0
9,39,Staten Island,Charleston,40.530531,-74.232158,0,0.0,0.037037,0.0


In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
staten_merged.sort_values(by=['Neighborhood']).reset_index()

### Visualize Clusters

Using Folium, specify a color scheme for the six clusters and add markers to the map for each neighborhood, color-coded according to the cluster assigned.

We can observe that the largest number of neighborhoods belong to the red and green colored clusters (clusters 0 and 4)

In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(staten_merged['Latitude'], staten_merged['Longitude'], staten_merged['Neighborhood'], staten_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Each Cluster

We can more concretely look at the proportion of Italian restaurants and pizza places, and general restaurants per cluster. From staten_merged, isolate any data with a specified cluster label and pull the neighborhood along with its Italian, pizza place, and other restaurant frequency. This will allow us to observe why clusters are grouped in a specific way.

Cluster 0 appears to be a strong candidate for opening a restaurant, with a particularly low frequency for all three categories of competition for our pizza parlor.


In [34]:
staten_aggregate = staten_grouped
staten_aggregate.head()
# staten_merged_sorted = staten_merged.sort_values('Neighborhood')
# staten_aggregate["Cluster"] = staten_merged_sorted["Cluster Labels"]
# fixed_cols = [staten_aggregate.columns[-1]] + list(staten_aggregate.columns[:-1])
# staten_aggregate = staten_aggregate[fixed_cols]
staten_aggregate.groupby("Cluster Labels").mean().head()


Unnamed: 0_level_0,Italian Restaurant,Pizza Place,Restaurant
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.008103,0.007819,0.007126
1,0.158589,0.019544,0.012061
2,0.0,0.205634,0.020032
3,0.033453,0.098554,0.007688


### Explore Individual Clusters

Cluster 0 seems to contain few Italian restaurants, pizza places, and miscellaneous restaurants in generally. This would be a good option to further explore, as there doesn't appear to be much competition for a pizza parlor.

In [35]:
staten_merged.loc[staten_merged['Cluster Labels'] == 0, staten_merged.columns[[1] + list(range(5, staten_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,Italian Restaurant,Pizza Place,Restaurant
2,Staten Island,0,0.0,0.0,0.0
5,Staten Island,0,0.0,0.0,0.0
7,Staten Island,0,0.0,0.0,0.0
9,Staten Island,0,0.0,0.037037,0.0
10,Staten Island,0,0.0,0.0,0.0
17,Staten Island,0,0.0,0.0,0.0
18,Staten Island,0,0.0,0.0,0.0
19,Staten Island,0,0.0,0.0,0.0
20,Staten Island,0,0.0,0.047619,0.0
21,Staten Island,0,0.041667,0.041667,0.041667


Cluster 1 seems to contain a high concentration of Italian restaurants and a lower distribution of pizza places and other restaurants--the presence of so many Italian restaurants will likely provide a competition for us were we to consider this option.

In [36]:
staten_merged.loc[staten_merged['Cluster Labels'] == 1, staten_merged.columns[[1] + list(range(5, staten_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Italian Restaurant,Pizza Place,Restaurant
3,Staten Island,1,0.105263,0.052632,0.0
4,Staten Island,1,0.090909,0.0,0.0
12,Staten Island,1,0.111111,0.0,0.0
14,Staten Island,1,0.2,0.0,0.0
27,Staten Island,1,0.111111,0.0,0.0
28,Staten Island,1,0.166667,0.0,0.0
30,Staten Island,1,0.2,0.0,0.0
33,Staten Island,1,0.178571,0.107143,0.0
34,Staten Island,1,0.1875,0.0,0.0625
37,Staten Island,1,0.210526,0.052632,0.052632


Cluster 2 consists of locations with a high proportion of pizzerias. Opening a restaurant in this cluster would present the highest amount of competition to us.

In [37]:
staten_merged.loc[staten_merged['Cluster Labels'] == 2, staten_merged.columns[[1] + list(range(5, staten_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Italian Restaurant,Pizza Place,Restaurant
0,Staten Island,2,0.0,0.153846,0.076923
1,Staten Island,2,0.0,0.25,0.0
8,Staten Island,2,0.0,0.157895,0.0
25,Staten Island,2,0.0,0.166667,0.083333
41,Staten Island,2,0.0,0.2,0.0
42,Staten Island,2,0.0,0.25,0.0
43,Staten Island,2,0.0,0.2,0.0
47,Staten Island,2,0.0,0.266667,0.0


Cluster 3 consists of a fair concentration of pizza places and a low-moderate concentration of Italian restaurants. This would present us with a moderate level of competition

In [38]:
staten_merged.loc[staten_merged['Cluster Labels'] == 3, staten_merged.columns[[1] + list(range(5, staten_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,Italian Restaurant,Pizza Place,Restaurant
6,Staten Island,3,0.021739,0.086957,0.0
11,Staten Island,3,0.0,0.095238,0.0
13,Staten Island,3,0.12,0.16,0.0
15,Staten Island,3,0.111111,0.111111,0.0
16,Staten Island,3,0.03125,0.09375,0.03125
22,Staten Island,3,0.095238,0.095238,0.0
23,Staten Island,3,0.0,0.125,0.0
29,Staten Island,3,0.0,0.066667,0.0
35,Staten Island,3,0.0,0.083333,0.041667
39,Staten Island,3,0.0,0.136364,0.0
