In [None]:
#Import libraries
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

In [None]:
# Download the New York dataset and store it locally as newyork_data.json
# (-q: no progress output, -O: name of the output file).
!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json

In [None]:
# Read the downloaded file and parse its JSON content.
with open('newyork_data.json') as source_file:
    raw_text = source_file.read()
    newyork_data = json.loads(raw_text)

In [None]:
# Value stored under the top-level 'features' key; the loop below reads one
# entry per neighborhood from it.
NY_neighborhoods= newyork_data['features']

In [None]:
# Column names of the neighborhoods table, and an empty DataFrame that uses them
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)
# Display the (still empty) frame
neighborhoods

In [None]:
# Collect one record per neighborhood and build the DataFrame in a single step.
# DataFrame.append was removed in pandas 2.0; it also copied the whole frame on
# every iteration and duplicated all rows whenever this cell was re-run.
neighborhood_records = []
for data in NY_neighborhoods:
    properties = data['properties']
    # Coordinates are stored longitude first (index 0), latitude second (index 1)
    neighborhood_latlon = data['geometry']['coordinates']

    neighborhood_records.append({'Borough': properties['borough'],
                                 'Neighborhood': properties['name'],
                                 'Latitude': neighborhood_latlon[1],
                                 'Longitude': neighborhood_latlon[0]})

neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [None]:
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and head() is left as the final expression; both are now shown.
print(neighborhoods.shape)
neighborhoods.head()

In [None]:
# Keep the rows whose borough is Brooklyn and renumber them from 0.
in_brooklyn = neighborhoods['Borough'].eq('Brooklyn')
Brooklyn_data = neighborhoods.loc[in_brooklyn].reset_index(drop=True)

In [None]:
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and head() is left as the final expression; both are now shown.
print(Brooklyn_data.shape)
Brooklyn_data.head()

In [None]:
# Geocode the borough name to get the centre point used for the maps below.
address = 'Brooklyn'
geolocator = Nominatim(user_agent="brooklyn_restaurants")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('Brooklyn coordinates: {}, {}.'.format(latitude, longitude))

In [None]:
# Create a map centred on the geocoded Brooklyn coordinates
map_brooklyn = folium.Map(location=[latitude, longitude], zoom_start=13)

# Add one circle marker per neighborhood, using the neighborhood name as popup text
for lat, lng, name in zip(Brooklyn_data['Latitude'], Brooklyn_data['Longitude'], Brooklyn_data['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='slateblue',
        fill=True,
        # Named colours take no leading '#'; '#steelblue' is not a valid colour value
        fill_color='steelblue',
        fill_opacity=0.7).add_to(map_brooklyn)

map_brooklyn

In [None]:
# Foursquare credentials and request defaults.
# The credentials are read from the environment so that secrets are not stored
# in the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. NOTE(review): the key pair that used to be hardcoded here
# has been exposed and should be revoked.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
                  'Foursquare requests will fail.')
VERSION = '20210513'  # value sent as the API "v" parameter
LIMIT = 100           # value sent as the API "limit" parameter
radius = 500          # value sent as the API "radius" parameter

In [None]:
# Name of the neighborhood stored in row 0 of Brooklyn_data
Brooklyn_data.loc[0, 'Neighborhood']

In [None]:
# Pick the neighborhood in row 0 of Brooklyn_data as the centre of the first venue search
neighborhood_latitude = Brooklyn_data.at[0, 'Latitude']
neighborhood_longitude = Brooklyn_data.at[0, 'Longitude']
neighborhood_name = Brooklyn_data.at[0, 'Neighborhood']

coordinates_message = 'Latitude and longitude of {}: {}, {}.'
print(coordinates_message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

In [None]:
# Request settings for the single-neighborhood search (these overwrite the
# values of LIMIT and radius assigned earlier in the notebook).
LIMIT = 50

radius = 500

# URL of the Foursquare "explore" request around the selected neighborhood
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# The URL embeds CLIENT_ID and CLIENT_SECRET, so it is deliberately not echoed
# as cell output (saved outputs would leak the credentials).

In [None]:
# Send the request. A timeout keeps the cell from hanging indefinitely, and
# raise_for_status() reports HTTP errors here instead of as a confusing
# KeyError when the response is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [None]:
# Define the function that extracts the venue category name
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the category list under the key 'categories' or,
        if that key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate
        # instead of being hidden by a bare except.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
# Venue items of the first group in the response
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into columns such as 'venue.name'.
# pd.json_normalize replaces the deprecated pandas.io.json.json_normalize alias.
nearby_venues = pd.json_normalize(venues)

# Keep only the columns of interest; .copy() makes the selection an independent
# frame so the column assignment below cannot trigger SettingWithCopyWarning.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# Replace each category list by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Strip the dotted prefixes, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [None]:
# Summary of the single-neighborhood result: row count, number of distinct
# categories and the four most frequent categories.
category_counts = nearby_venues['categories'].value_counts()

print('{} venues were returned using Foursquare.'.format(len(nearby_venues)))
print('{} unique categories in Bay Ridge.'.format(len(category_counts)))
print(category_counts.head(4))

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=3000, LIMIT=1000):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel; each triple identifies one search centre.
    radius : int, default 3000
        Sent as the ``radius`` request parameter.
    LIMIT : int, default 1000
        Sent as the ``limit`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        Venues without any category are skipped. The frame is empty (but has
        the same columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)  # progress output: one line per search centre

        # Let requests build and encode the query string
        response = requests.get(
            endpoint,
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Report API errors here rather than as a KeyError on 'groups' below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # Keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without categories used to raise IndexError; it cannot
                # be classified by the category-based analysis, so it is skipped.
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the columns to the constructor also works for an empty result,
    # where assigning .columns afterwards raised a length-mismatch error.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [None]:
# Collect venues around every Brooklyn neighborhood, using the function's
# default radius and limit.
Brooklyn_venues = getNearbyVenues(
    Brooklyn_data['Neighborhood'],
    Brooklyn_data['Latitude'],
    Brooklyn_data['Longitude'],
)

In [None]:
# (rows, columns) of the collected venues table
Brooklyn_venues.shape

In [None]:
# First rows of the collected venues table
Brooklyn_venues.head()

In [None]:
# Remove, in place, the venues whose category is exactly 'Restaurant'
generic_restaurant_index = Brooklyn_venues.index[Brooklyn_venues['Venue Category'] == 'Restaurant']
Brooklyn_venues.drop(generic_restaurant_index, inplace=True)

In [None]:
# (rows, columns) after the rows with category 'Restaurant' were dropped
Brooklyn_venues.shape

In [None]:
# Venues whose category contains the text 'Restaurant', numbered from 1
restaurant_mask = Brooklyn_venues['Venue Category'].str.contains('Restaurant')
Brooklyn_venues_restaurant = Brooklyn_venues.loc[restaurant_mask].reset_index(drop=True)
restaurant_total = len(Brooklyn_venues_restaurant)
Brooklyn_venues_restaurant.index = np.arange(1, restaurant_total + 1)

In [None]:
# Number of venues per restaurant category, most frequent first
print (Brooklyn_venues_restaurant['Venue Category'].value_counts())

In [None]:
# (rows, columns) of the restaurant-only table
Brooklyn_venues_restaurant.shape

In [None]:
# First rows of the restaurant-only table
Brooklyn_venues_restaurant.head()

In [None]:
# Ten most frequent restaurant categories as a two-column frame.
# The frame is built explicitly from the counts: the previous
# reset_index()/rename({"index": ...}) approach relied on the index column being
# called "index", which is not the case in pandas 2.0 (the column takes the
# name 'Venue Category'), so the 'Venue_Category' column was never created.
top10_counts = Brooklyn_venues_restaurant['Venue Category'].value_counts().head(10)
Brooklyn_venues_restaurant_Top10 = pd.DataFrame({
    'Venue_Category': top10_counts.index,
    'Frequency': top10_counts.values,
})
Brooklyn_venues_restaurant_Top10

In [None]:
# Bar chart of the ten most frequent restaurant categories

import seaborn as sns
from matplotlib import pyplot as plt

# Create the figure first so that figsize applies to the chart itself.
# Previously plt.figure(figsize=...) was called after plotting and saving,
# which only produced an additional empty figure.
fig, ax = plt.subplots(figsize=(25, 8))
sns.barplot(x="Venue_Category", y="Frequency", data=Brooklyn_venues_restaurant_Top10, color='slateblue', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

ax.set_title('10 Most Frequently Occuring Venues in Brooklyn', fontsize=30)
ax.set_xlabel("Venue Category", fontsize=20)
ax.set_ylabel("Frequency", fontsize=20)
# bbox_inches='tight' keeps the rotated tick labels inside the saved image
fig.savefig("Most_Freq_Venues1.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so the
# shape is printed explicitly.
print(Brooklyn_venues_restaurant.shape)
print('There are {} unique categories.'.format(len(Brooklyn_venues_restaurant['Venue Category'].unique())))

In [None]:
# Neighborhood analysis: number of restaurant venues per neighborhood
def _count_restaurant_entries(categories):
    """Count the entries of `categories` whose text contains 'Restaurant'."""
    contains_restaurant = categories.str.contains('Restaurant')
    return categories[contains_restaurant].count()

Brooklyn_restaurant = (
    Brooklyn_venues_restaurant
    .groupby(['Neighborhood'])['Venue Category']
    .apply(_count_restaurant_entries)
)
Brooklyn_restaurant

In [None]:
# Turn the per-neighborhood counts into a two-column frame numbered from 1,
# and keep both columns as plain Python lists.
Brooklyn_restaurant_df = Brooklyn_restaurant.reset_index()
Brooklyn_restaurant_df.columns = ['Neighborhood', 'Number of Restaurant']
neighborhood_total = len(Brooklyn_restaurant_df)
Brooklyn_restaurant_df.index = np.arange(1, neighborhood_total + 1)
list_rest_no = Brooklyn_restaurant_df['Number of Restaurant'].tolist()
list_dist = Brooklyn_restaurant_df['Neighborhood'].tolist()

In [None]:
# One-hot encode the venue categories: one indicator column per category
category_frame = Brooklyn_venues_restaurant[['Venue Category']]
Brooklyn_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Re-attach the neighborhood of every venue
Brooklyn_onehot['Neighborhood'] = Brooklyn_venues_restaurant['Neighborhood']

# Rotate the last column to the front
onehot_columns = list(Brooklyn_onehot.columns)
Brooklyn_onehot = Brooklyn_onehot[onehot_columns[-1:] + onehot_columns[:-1]]
Brooklyn_onehot.head(10)

In [None]:
# (rows, columns) of the one-hot encoded table
Brooklyn_onehot.shape

In [None]:
# Per neighborhood, the mean of every indicator column, i.e. the share of the
# neighborhood's restaurant venues that fall into each category
category_means = Brooklyn_onehot.groupby('Neighborhood').mean()
Brooklyn_restaurant_grouped = category_means.reset_index()
Brooklyn_restaurant_grouped.head(10)

In [None]:
# (rows, columns) of the per-neighborhood frequency table
Brooklyn_restaurant_grouped.shape

In [None]:
num_top_venues = 10

# Print, for every neighborhood, its most frequent restaurant categories
for hood_name in Brooklyn_restaurant_grouped['Neighborhood']:
    print("----" + hood_name + "----")
    hood_rows = Brooklyn_restaurant_grouped[Brooklyn_restaurant_grouped['Neighborhood'] == hood_name]
    # Transpose so that every category becomes a row; the first row holds the
    # neighborhood name and is removed.
    freq_table = hood_rows.T.reset_index()
    freq_table.columns = ['venue', 'freq']
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first entry is skipped; the remaining entries are ranked by value.
    num_top_venues : int
        Number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the highest-valued entries, in descending order of value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create the column names according to the number of top venues:
# '1st Most Common Venue', '2nd ...', '3rd ...', then '4th ...' and so on.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # An explicit bounds test replaces the former bare `except:`, which would
    # also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Create a new frame with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Brooklyn_restaurant_grouped['Neighborhood']

# Fill every row with that neighborhood's top categories
for ind in np.arange(Brooklyn_restaurant_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Brooklyn_restaurant_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

In [None]:
# Neighborhood clustering with k-means
# Set number of clusters
kclusters = 5

# Feature matrix: every column except the neighborhood name.
# The column is named by keyword; the positional axis argument
# (`drop('Neighborhood', 1)`) was removed in pandas 2.0.
Brooklyn_restaurant_grouped_clustering = Brooklyn_restaurant_grouped.drop(columns='Neighborhood')

# Run k-means clustering. n_init is pinned to 10 (the long-standing default) so
# the result does not depend on the default of the installed scikit-learn.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Brooklyn_restaurant_grouped_clustering)

# Check cluster labels generated for the first rows of the frame
kmeans.labels_[0:10]

In [None]:
# Add the clustering labels as first column. A label column left over from an
# earlier run is dropped first, because DataFrame.insert raises when the column
# already exists (the cell could not be re-run).
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Merge to add latitude + longitude for each neighborhood
Brooklyn_merged = Brooklyn_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Neighborhoods that are missing from neighborhoods_venues_sorted get NaN from
# the join, which also turns the label column into floats. Such rows belong to
# no cluster, so they are removed and the labels are restored to integers
# (they are used as list indices when the map is drawn).
Brooklyn_merged = Brooklyn_merged.dropna(subset=['Cluster Labels'])
Brooklyn_merged['Cluster Labels'] = Brooklyn_merged['Cluster Labels'].astype(int)

Brooklyn_merged.head() # check the last columns

In [None]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# One colour per cluster, spread evenly over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per neighborhood, coloured by its cluster
for lat, lon, poi, cluster in zip(Brooklyn_merged['Latitude'], Brooklyn_merged['Longitude'], Brooklyn_merged['Neighborhood'], Brooklyn_merged['Cluster Labels']):
    # A neighborhood without a cluster label (NaN after the join) cannot be
    # coloured; a float label cannot index the colour list. Skip the former
    # and convert the latter.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
#Cluster 1
Brooklyn_merged.loc[Brooklyn_merged['Cluster Labels'] == 0, Brooklyn_merged.columns[[1] + list(range(5, Brooklyn_merged.shape[1]))]]

In [None]:
#Cluster 2
Brooklyn_merged.loc[Brooklyn_merged['Cluster Labels'] == 1, Brooklyn_merged.columns[[1] + list(range(5, Brooklyn_merged.shape[1]))]]

In [None]:
#Cluster 3
Brooklyn_merged.loc[Brooklyn_merged['Cluster Labels'] == 2, Brooklyn_merged.columns[[1] + list(range(5, Brooklyn_merged.shape[1]))]]

In [None]:
#Cluster 4
Brooklyn_merged.loc[Brooklyn_merged['Cluster Labels'] == 3, Brooklyn_merged.columns[[1] + list(range(5, Brooklyn_merged.shape[1]))]]

In [None]:
#Cluster 5
Brooklyn_merged.loc[Brooklyn_merged['Cluster Labels'] == 4, Brooklyn_merged.columns[[1] + list(range(5, Brooklyn_merged.shape[1]))]]