# Coursera Capstone Project

This is my Jupyter notebook for the projects of the Applied Data Science Capstone course.

In [1]:
# Smoke-test cell: confirms the kernel is running.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Import libraries

In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
%matplotlib inline 
import matplotlib.pyplot as plt
from scipy import stats
import folium

## Import data

#### Get client restaurant data in Toronto

In [3]:
# Load the client's list of neighborhoods from the local Data folder.
client_csv_path = './Data/Client.csv'
client_restaurant = pd.read_csv(client_csv_path)

# Show the dimensions, then the full (small) table.
print(client_restaurant.shape)
client_restaurant

(7, 1)


Unnamed: 0,Neighborhood
0,Moore Park
1,Forest Hill West
2,Regent Park
3,Rosedale
4,Island airport
5,Chinatown
6,Central Bay Street


#### Get venues data

In [4]:
# Load the NYC venues and discard the 'Unnamed: 0' column present in the CSV.
# The drop is chained on the read so the cell yields the cleaned frame in one step.
nyc_venues = (
    pd.read_csv('./Data/nyc_venues.csv')
    .drop(columns='Unnamed: 0')
)

print(nyc_venues.shape)
nyc_venues.head()

(10242, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop
4,Wakefield,40.894705,-73.847201,SUBWAY,40.890656,-73.849192,Sandwich Place


In [5]:
# Load the Toronto venues and discard the 'Unnamed: 0' column present in the CSV.
# The drop is chained on the read so the cell yields the cleaned frame in one step.
toronto_venues = (
    pd.read_csv('./Data/toronto_venues.csv')
    .drop(columns='Unnamed: 0')
)

print(toronto_venues.shape)
toronto_venues.head()

(1707, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,The Beaches,43.676357,-79.293031,Beaches Fitness,43.680319,-79.290991,Gym / Fitness Center
4,The Beaches,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop


## Data pre-processing

### New York dataset

#### Get the number of restaurants (potential competitors) in each neighborhood

In [6]:
# Count, per neighborhood, the venues whose category contains the word 'Restaurant'.
nyc_is_restaurant = nyc_venues['Venue Category'].str.contains('Restaurant')
nyc_competitors = (
    nyc_venues[nyc_is_restaurant]
    .groupby('Neighborhood')
    .size()
    .reset_index(name='Competitors')
)

print(nyc_competitors.shape)
nyc_competitors.head()

(262, 2)


Unnamed: 0,Neighborhood,Competitors
0,Allerton,5
1,Annadale,1
2,Arlington,1
3,Arrochar,4
4,Arverne,1


#### Get the top 10 venue types for each neighborhood

In [7]:
# one hot encoding
# One indicator column per venue category (no prefix, so columns carry the category name).
nyc_onehot = pd.get_dummies(nyc_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of each venue so the indicators can be grouped.
# NOTE(review): a category literally named 'Neighborhood' would be overwritten here — confirm.
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood']

# Mean of the indicators per neighborhood = share of venues in each category.
nyc_category_share = nyc_onehot.groupby('Neighborhood').mean()

# Append the restaurant count; neighborhoods without any restaurant get 0.
nyc_competitor_counts = nyc_competitors.set_index('Neighborhood')
nyc_grouped = (
    nyc_category_share
    .join(nyc_competitor_counts, how='left')
    .fillna(0)
    .reset_index()
)

print(nyc_grouped.shape)
nyc_grouped.head()

(301, 432)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,Airport Tram,American Restaurant,Animal Shelter,Antique Shop,...,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Competitors
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [8]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Parameters
    ----------
    row : pandas.Series
        A row whose first entry is skipped (callers pass the 'Neighborhood'
        name there) and whose remaining entries are comparable values.
    num_top_venues : int
        How many labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the remaining entries, ordered by descending value and
        truncated to ``num_top_venues``.
    """
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`; positions past the
    # listed indicators use the generic 'th' suffix, exactly as before.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

In [9]:
# create a new dataframe
nyc_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
nyc_neighborhoods_venues_sorted['Neighborhood'] = nyc_grouped['Neighborhood']

for ind in np.arange(nyc_grouped.drop(columns=['Competitors']).shape[0]):
    nyc_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyc_grouped.drop(columns=['Competitors']).iloc[ind, :], num_top_venues)
    
#Add Competitors
nyc_neighborhoods_venues_sorted = nyc_neighborhoods_venues_sorted.set_index('Neighborhood').join(nyc_competitors.set_index('Neighborhood'), how='left').fillna(0).reset_index()

print(nyc_neighborhoods_venues_sorted.shape)
nyc_neighborhoods_venues_sorted.head()

(301, 12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,Allerton,Pizza Place,Spa,Supermarket,Chinese Restaurant,Deli / Bodega,Food,Fast Food Restaurant,Bakery,Electronics Store,Pharmacy,5.0
1,Annadale,Pub,Cosmetics Shop,Diner,Train Station,Liquor Store,Sports Bar,Pizza Place,Restaurant,Pet Store,Exhibit,1.0
2,Arden Heights,Pharmacy,Deli / Bodega,Pizza Place,Coffee Shop,Home Service,Filipino Restaurant,Event Space,Exhibit,Eye Doctor,Factory,0.0
3,Arlington,Bus Stop,Deli / Bodega,Intersection,American Restaurant,Food Service,Boat or Ferry,Coffee Shop,Yoga Studio,Fish & Chips Shop,Filipino Restaurant,1.0
4,Arrochar,Deli / Bodega,Bus Stop,Italian Restaurant,Liquor Store,Middle Eastern Restaurant,Taco Place,Sandwich Place,Food Truck,Pizza Place,Cosmetics Shop,4.0


### Toronto and Client dataset

#### Get the number of restaurants (potential competitors) in each neighborhood

In [10]:
# Count, per neighborhood, the venues whose category contains the word 'Restaurant'.
toronto_is_restaurant = toronto_venues['Venue Category'].str.contains('Restaurant')
toronto_competitors = (
    toronto_venues[toronto_is_restaurant]
    .groupby('Neighborhood')
    .size()
    .reset_index(name='Competitors')
)

print(toronto_competitors.shape)
toronto_competitors.head()

(32, 2)


Unnamed: 0,Neighborhood,Competitors
0,"Adelaide,King,Richmond",27
1,Berczy Park,12
2,"Brockton,Exhibition Place,Parkdale Village",2
3,Business Reply Mail Processing Centre 969 Eastern,2
4,"Cabbagetown,St. James Town",12


#### Get the top 10 venue types for each neighborhood

In [11]:
# one hot encoding
# One indicator column per venue category (no prefix, so columns carry the category name).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of each venue so the indicators can be grouped.
# NOTE: the Toronto data contains a venue category literally named 'Neighborhood';
# this assignment replaces that indicator column with the neighborhood names.
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# Mean of the indicators per neighborhood = share of venues in each category.
toronto_category_share = toronto_onehot.groupby('Neighborhood').mean()

# Append the restaurant count; neighborhoods without any restaurant get 0.
toronto_competitor_counts = toronto_competitors.set_index('Neighborhood')
toronto_grouped = (
    toronto_category_share
    .join(toronto_competitor_counts, how='left')
    .fillna(0)
    .reset_index()
)

print(toronto_grouped.shape)
toronto_grouped.head()

(38, 239)


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Competitors
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,27.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,2.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# create a new dataframe
toronto_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
toronto_neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.drop(columns=['Competitors']).shape[0]):
    toronto_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.drop(columns=['Competitors']).iloc[ind, :], num_top_venues)
    
#Add Competitors
toronto_neighborhoods_venues_sorted = toronto_neighborhoods_venues_sorted.set_index('Neighborhood').join(toronto_competitors.set_index('Neighborhood'), how='left').fillna(0).reset_index()

print(toronto_neighborhoods_venues_sorted.shape)
toronto_neighborhoods_venues_sorted.head()

(38, 12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,American Restaurant,Clothing Store,Gym,Hotel,Bakery,Bar,27.0
1,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Café,Farmers Market,Pub,Seafood Restaurant,Cheese Shop,Beer Bar,Italian Restaurant,12.0
2,"Brockton,Exhibition Place,Parkdale Village",Breakfast Spot,Coffee Shop,Café,Burrito Place,Stadium,Bar,Caribbean Restaurant,Furniture / Home Store,Climbing Gym,Italian Restaurant,2.0
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Garden,Pizza Place,Park,Recording Studio,Restaurant,Burrito Place,Brewery,Skate Park,Smoke Shop,2.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Sculpture Garden,Plane,Airport,Airport Food Court,Airport Gate,Harbor / Marina,0.0


#### Merge with client dataset

In [13]:
#Change the neighborhood name in client dataset
for index, item in enumerate(toronto_neighborhoods_venues_sorted['Neighborhood']):
    for index_, item_ in enumerate(client_restaurant['Neighborhood']):
        if item_ in item:
            client_restaurant.at[index_,'Neighborhood'] = item
            break

print(client_restaurant.shape)          
client_restaurant.head()

(7, 1)


Unnamed: 0,Neighborhood
0,"Moore Park,Summerhill East"
1,"Forest Hill North,Forest Hill West"
2,"Harbourfront,Regent Park"
3,Rosedale
4,"CN Tower,Bathurst Quay,Island airport,Harbourf..."


In [14]:
# merged the grouped dataset
client_grouped = client_restaurant.set_index('Neighborhood').join(toronto_grouped.set_index('Neighborhood'), how='inner').reset_index()

print(client_grouped.shape)
client_grouped

(7, 239)


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Competitors
0,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Forest Hill North,Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,"Harbourfront,Regent Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
3,Rosedale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.04,0.01,0.0,0.0,0.0,33.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,...,0.0,0.0,0.012195,0.0,0.0,0.012195,0.0,0.0,0.012195,25.0


In [15]:
# merged the venues sorted dataset
client_neighborhoods_venues_sorted = client_restaurant.set_index('Neighborhood').join(toronto_neighborhoods_venues_sorted.set_index('Neighborhood'), how='inner').reset_index()

print(client_neighborhoods_venues_sorted.shape)
client_neighborhoods_venues_sorted

(7, 12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,"Moore Park,Summerhill East",Playground,Park,Tennis Court,Gym,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,0.0
1,"Forest Hill North,Forest Hill West",Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Donut Shop,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,1.0
2,"Harbourfront,Regent Park",Coffee Shop,Bakery,Pub,Park,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Theater,Bank,7.0
3,Rosedale,Park,Playground,Trail,Dog Run,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Sculpture Garden,Plane,Airport,Airport Food Court,Airport Gate,Harbor / Marina,0.0
5,"Chinatown,Grange Park,Kensington Market",Bar,Café,Vegetarian / Vegan Restaurant,Bakery,Vietnamese Restaurant,Coffee Shop,Dumpling Restaurant,Chinese Restaurant,Mexican Restaurant,Dim Sum Restaurant,33.0
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Bar,Burger Joint,Thai Restaurant,Sandwich Place,Salad Place,Indian Restaurant,Ice Cream Shop,25.0


### Merge client dataset with NYC dataset

In [16]:
# merge the grouped dataset
merged_grouped = pd.concat([client_grouped.set_index('Neighborhood'), nyc_grouped.set_index('Neighborhood')]).fillna(0).reset_index()

print(merged_grouped.shape)
merged_grouped.head()

(308, 451)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Waste Facility,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Forest Hill North,Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Harbourfront,Regent Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Rosedale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# merge the venues sorted datasets 
merged_venues_sorted = pd.concat([client_neighborhoods_venues_sorted.set_index('Neighborhood'), nyc_neighborhoods_venues_sorted.set_index('Neighborhood')]).reset_index()

print(merged_venues_sorted.shape)
merged_venues_sorted.head()

(308, 12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,"Moore Park,Summerhill East",Playground,Park,Tennis Court,Gym,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,0.0
1,"Forest Hill North,Forest Hill West",Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Donut Shop,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,1.0
2,"Harbourfront,Regent Park",Coffee Shop,Bakery,Pub,Park,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Theater,Bank,7.0
3,Rosedale,Park,Playground,Trail,Dog Run,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Sculpture Garden,Plane,Airport,Airport Food Court,Airport Gate,Harbor / Marina,0.0


## Apply clustering algorithm

#### Clustering

In [18]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 10

# Feature matrix: every column except the neighborhood name.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which newer pandas versions no longer accept.
# NOTE(review): 'Competitors' is a raw count while the other columns are shares
# in [0, 1], so it may dominate the distance — confirm this weighting is intended.
merged_grouped_clustering = merged_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is set explicitly because its default differs between scikit-learn
# versions; a fixed value plus random_state keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(merged_grouped_clustering)

#### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood and number of competitors.

In [19]:
# One row per distinct (neighborhood, latitude, longitude) combination.
# The group sizes are only a vehicle to get the distinct keys and are dropped again.
location_cols = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
nyc_data = nyc_venues.groupby(location_cols).size().reset_index(name='Counts').drop(columns=['Counts'])
toronto_data = toronto_venues.groupby(location_cols).size().reset_index(name='Counts').drop(columns=['Counts'])

# Coordinates of the client neighborhoods, looked up in the Toronto locations.
client_by_name = client_restaurant.set_index('Neighborhood')
client_data = client_by_name.join(toronto_data.set_index('Neighborhood'), how='inner').reset_index()

# Client rows first, then NYC rows; rows with any missing value are removed.
merged_data = pd.concat([client_data, nyc_data]).dropna()

In [20]:
# add clustering labels
merged_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge merged_grouped with client_data to add latitude/longitude for each neighborhood
merged = merged_data.join(merged_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print(merged.shape)
merged.head() # check the last columns!

(314, 15)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,"Moore Park,Summerhill East",43.689574,-79.38316,5,Playground,Park,Tennis Court,Gym,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,0.0
1,"Forest Hill North,Forest Hill West",43.696948,-79.411307,5,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Donut Shop,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,1.0
2,"Harbourfront,Regent Park",43.65426,-79.360636,2,Coffee Shop,Bakery,Pub,Park,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Theater,Bank,7.0
3,Rosedale,43.679563,-79.377529,5,Park,Playground,Trail,Dog Run,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,0.0
3,Rosedale,43.679563,-79.377529,5,Bus Station,Fried Chicken Joint,Accessories Store,Baseball Field,Deli / Bodega,Sandwich Place,Pharmacy,Caribbean Restaurant,Supermarket,Jewelry Store,1.0


In [21]:
# Get cluster label of client restaurants
client_labels = client_restaurant.set_index('Neighborhood').join(merged.set_index('Neighborhood'))['Cluster Labels'].unique()
print(client_labels)

#Split a dataset which contains NYC neighborhood that have the same cluster label as client restaurants
merged['is_in'] = merged['Cluster Labels'].isin(client_labels)
merged = merged[merged['is_in'] == True].drop(columns=['is_in'])

print(merged.shape)
merged.head()

[5 1 8 2]
(158, 15)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,"Moore Park,Summerhill East",43.689574,-79.38316,5,Playground,Park,Tennis Court,Gym,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,0.0
1,"Forest Hill North,Forest Hill West",43.696948,-79.411307,5,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Donut Shop,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,1.0
2,"Harbourfront,Regent Park",43.65426,-79.360636,2,Coffee Shop,Bakery,Pub,Park,Café,Breakfast Spot,Restaurant,Mexican Restaurant,Theater,Bank,7.0
3,Rosedale,43.679563,-79.377529,5,Park,Playground,Trail,Dog Run,Fish & Chips Shop,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,0.0
3,Rosedale,43.679563,-79.377529,5,Bus Station,Fried Chicken Joint,Accessories Store,Baseball Field,Deli / Bodega,Sandwich Place,Pharmacy,Caribbean Restaurant,Supermarket,Jewelry Store,1.0


#### Visualize the resulting clusters

In [22]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

# Get NYC location data
address = 'New York City, US'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fall back to
    # approximate New York City coordinates so the map can still be drawn.
    latitude, longitude = 40.7128, -74.0060
else:
    latitude, longitude = location.latitude, location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(merged['Neighborhood Latitude'], merged['Neighborhood Longitude'], merged['Neighborhood'], merged['Cluster Labels']):
    # Cast to int so the label can index the colour list whatever numeric dtype it has.
    # The "- 1" offset is kept from the original colour assignment: label 0 wraps
    # around to the last colour, so every cluster still gets a distinct colour.
    marker_color = rainbow[int(cluster) - 1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters