In [5]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
print('Libraries imported.')

Libraries imported.


In [6]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [7]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [8]:
#1. Download and Explore Dataset
!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json
print('Data downloaded!')

Data downloaded!


In [9]:
#Next, let's load the data
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [10]:
newyork_data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

In [11]:
#Notice how all the relevant data is in the features key, which is basically a
#list of the neighborhoods. So, let's define a new variable that includes this data.
neighborhoods_data = newyork_data['features']

In [12]:
#Let's take a look at the first item in this list.
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [13]:
#Tranform the data into a pandas dataframe
#The next task is essentially transforming this data of nested Python dictionaries 
#into a pandas dataframe. So let's start by creating an empty dataframe.

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [14]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [15]:
#Then let's loop through the data and fill the dataframe one row at a time.
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [16]:
#Quickly examine the resulting dataframe.
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [22]:
#And make sure that the dataset has all 5 boroughs and 306 neighborhoods.

print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [23]:
#Use geopy library to get the latitude and longitude values of New York City
#In order to define an instance of the geocoder, we need to define a user_agent. 
#We will name our agent ny_explorer, as shown below.

address = 'New York City, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [24]:
#Create a map of New York with neighborhoods superimposed on top.
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork


In [26]:
#Folium is a great visualization library. Feel free to zoom into the above
#map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

#However, for illustration purposes, let's simplify the above map and 
#segment and cluster only the neighborhoods in Manhattan. So let's slice
#the original dataframe and create a new dataframe of the Manhattan data.

manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [27]:
#Let's get the geographical coordinates of Manhattan.
address = 'Manhattan, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


In [28]:
#As we did with all of New York City, let's visualizat Manhattan the neighborhoods in it.
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

In [60]:
#Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.
#Define Foursquare Credentials and Version

CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
CLIENT_ID = 'your-client-ID' # your Foursquare ID
CLIENT_SECRET = 'your-client-secret' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:
Your credentails:
CLIENT_ID: your-client-ID
CLIENT_SECRET:your-client-secret


In [30]:
#Let's explore the first neighborhood in our dataframe
#Get the neighborhood's name.

manhattan_data['Neighborhood']

0             Marble Hill
1               Chinatown
2      Washington Heights
3                  Inwood
4        Hamilton Heights
5          Manhattanville
6          Central Harlem
7             East Harlem
8         Upper East Side
9               Yorkville
10             Lenox Hill
11       Roosevelt Island
12        Upper West Side
13         Lincoln Square
14                Clinton
15                Midtown
16            Murray Hill
17                Chelsea
18      Greenwich Village
19           East Village
20        Lower East Side
21                Tribeca
22           Little Italy
23                   Soho
24           West Village
25       Manhattan Valley
26    Morningside Heights
27               Gramercy
28      Battery Park City
29     Financial District
30          Carnegie Hill
31                   Noho
32           Civic Center
33          Midtown South
34           Sutton Place
35             Turtle Bay
36             Tudor City
37        Stuyvesant Town
38          

In [31]:
#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = manhattan_data['Latitude'] # neighborhood latitude value
neighborhood_longitude = manhattan_data['Longitude'] # neighborhood longitude value

neighborhood_name = manhattan_data['Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))


Latitude and longitude values of 0             Marble Hill
1               Chinatown
2      Washington Heights
3                  Inwood
4        Hamilton Heights
5          Manhattanville
6          Central Harlem
7             East Harlem
8         Upper East Side
9               Yorkville
10             Lenox Hill
11       Roosevelt Island
12        Upper West Side
13         Lincoln Square
14                Clinton
15                Midtown
16            Murray Hill
17                Chelsea
18      Greenwich Village
19           East Village
20        Lower East Side
21                Tribeca
22           Little Italy
23                   Soho
24           West Village
25       Manhattan Valley
26    Morningside Heights
27               Gramercy
28      Battery Park City
29     Financial District
30          Carnegie Hill
31                   Noho
32           Civic Center
33          Midtown South
34           Sutton Place
35             Turtle Bay
36             Tudor City
37   

In [32]:
#Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.
#First, let's create the GET request URL. Name your URL url.



LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
 # create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id=&client_secret=&v=20180605&ll=40.876551,-73.910660&radius=500&limit=100'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=2BGZBCUPEOMMEMSCIJD3FRFSG4HY5BAKCKLRGOBNOZLFOMSO&client_secret=KPU500WGZS0IWVBADUNW3V3GPL44VREMQ5FHCXT3ODPU5Y4M&v=20180605&ll=40.876551,-73.910660&radius=500&limit=100'

In [33]:
#Send the GET request and examine the resutls
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fa58be906eac44a5ff8dddb'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Marble Hill',
  'headerFullLocation': 'Marble Hill, New York',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 22,
  'suggestedBounds': {'ne': {'lat': 40.8810510045, 'lng': -73.90471968052839},
   'sw': {'lat': 40.8720509955, 'lng': -73.9166003194716}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b4429abf964a52037f225e3',
       'name': "Arturo's",
       'location': {'address': '5198 Broadway',
        'crossStreet': 'at 225th St.',
        'lat': 40.87441177110231,
        'lng': -73.91027100981574,
        'labeledLatLngs': [{'label': 'display',


In [34]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [35]:
#Now we are ready to clean the json and structure it into a pandas dataframe.

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns =[col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


  """


Unnamed: 0,name,categories,lat,lng
0,Arturo's,Pizza Place,40.874412,-73.910271
1,Bikram Yoga,Yoga Studio,40.876844,-73.906204
2,Tibbett Diner,Diner,40.880404,-73.908937
3,Starbucks,Coffee Shop,40.877531,-73.905582
4,Dunkin',Donut Shop,40.877136,-73.906666


In [36]:
#And how many venues were returned by Foursquare?
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

22 venues were returned by Foursquare.


In [38]:
#2. Explore Neighborhoods in Manhattan
#Let's create a function to repeat the same process to all the neighborhoods in Manhattan

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id=&client_secret=&v=20180605&ll=40.876551,-73.910660&radius=500&limit=100'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [40]:
#Now write the code to run the above function on each neighborhood and 
#create a new dataframe called manhattan_venues


manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude'])


Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [41]:
#Let's check the size of the resulting dataframe
print(manhattan_venues.shape)
manhattan_venues.head()

(880, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop


In [42]:
#Let's check how many venues were returned for each neighborhood
manhattan_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,22,22,22,22,22,22
Carnegie Hill,22,22,22,22,22,22
Central Harlem,22,22,22,22,22,22
Chelsea,22,22,22,22,22,22
Chinatown,22,22,22,22,22,22
Civic Center,22,22,22,22,22,22
Clinton,22,22,22,22,22,22
East Harlem,22,22,22,22,22,22
East Village,22,22,22,22,22,22
Financial District,22,22,22,22,22,22


In [43]:
#Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))

There are 18 uniques categories.


In [44]:
#3. Analyze Each Neighborhood
# one hot encoding

manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

Unnamed: 0,Neighborhood,Coffee Shop,Deli / Bodega,Department Store,Diner,Discount Store,Donut Shop,Gym,Ice Cream Shop,Kids Store,Pharmacy,Pizza Place,Sandwich Place,Seafood Restaurant,Steakhouse,Supplement Shop,Tennis Stadium,Video Game Store,Yoga Studio
0,Marble Hill,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Marble Hill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
#And let's examine the new dataframe size.
manhattan_onehot.shape

(880, 19)

In [46]:
#Next, let's group rows by neighborhood and by taking the mean of the frequency
#of occurrence of each category

manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()
manhattan_grouped

Unnamed: 0,Neighborhood,Coffee Shop,Deli / Bodega,Department Store,Diner,Discount Store,Donut Shop,Gym,Ice Cream Shop,Kids Store,Pharmacy,Pizza Place,Sandwich Place,Seafood Restaurant,Steakhouse,Supplement Shop,Tennis Stadium,Video Game Store,Yoga Studio
0,Battery Park City,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
1,Carnegie Hill,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
2,Central Harlem,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
3,Chelsea,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
4,Chinatown,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
5,Civic Center,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
6,Clinton,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
7,East Harlem,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
8,East Village,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455
9,Financial District,0.090909,0.045455,0.045455,0.045455,0.090909,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.090909,0.045455,0.045455,0.045455,0.045455,0.045455,0.045455


In [47]:
#Let's confirm the new size
manhattan_grouped.shape


(40, 19)

In [48]:
#Let's print each neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in manhattan_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Battery Park City----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


----Carnegie Hill----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


----Central Harlem----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


----Chelsea----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


----Chinatown----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


----Civic Center----
            venue  freq
0     Coffee Shop  0.09
1  Discount Store  0.09
2             Gym  0.09
3  Sandwich Place  0.09
4     Pizza Place  0.05


In [50]:
#Let's put that into a pandas dataframe
#First, let's write a function to sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [51]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
1,Carnegie Hill,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
2,Central Harlem,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
3,Chelsea,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
4,Chinatown,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio


In [52]:
#4. Cluster Neighborhoods
#Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [53]:
#Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head() # check the last columns!


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Manhattan,Marble Hill,40.876551,-73.91066,0,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
1,Manhattan,Chinatown,40.715618,-73.994279,0,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
2,Manhattan,Washington Heights,40.851903,-73.9369,0,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
3,Manhattan,Inwood,40.867684,-73.92121,0,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
4,Manhattan,Hamilton Heights,40.823604,-73.949688,0,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio


In [54]:
#Finally, let's visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [55]:

#5. Examine Clusters
#Now, you can examine each cluster and determine the discriminating venue
#categories that distinguish each cluster. Based on the defining categories, you can then assign
#a name to each cluster. I will leave this exercise to you.

#Cluster 1
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]



Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
1,Chinatown,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
2,Washington Heights,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
3,Inwood,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
4,Hamilton Heights,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
5,Manhattanville,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
6,Central Harlem,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
7,East Harlem,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
8,Upper East Side,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio
9,Yorkville,Coffee Shop,Discount Store,Sandwich Place,Gym,Ice Cream Shop,Deli / Bodega,Department Store,Diner,Donut Shop,Yoga Studio


In [56]:
#Cluster 2
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [57]:
#Cluster 3
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [58]:
#Cluster 4
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [59]:
#Cluster 5
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
