# COURSERA CAPSTONE FINAL PROJECT, WEEKS 4 & 5

## Find the best location to open a fitness nutrition shop in a shopping mall in Toronto

## Import required libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from pprint import pprint # data pretty printer

import requests # library to handle requests
from bs4 import BeautifulSoup  # library to handle web scraping

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors # Matplotlib and associated plotting modules
from matplotlib import pyplot as plt

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from collections import Counter # count occurrences 

from sklearn.cluster import KMeans # import k-means from clustering stage

### Import and explore Toronto postalcode areas

In [2]:
# import data frame
df=pd.read_csv('Toronto_neighborhoods_WithGeospacial.csv')
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [3]:
print('The dataframe has {} boroughs and {} Neighborhood.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 Neighborhood.


#### Use geopy library to get the latitude and longitude values of Toronto.

In [4]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with Neighborhoods superimposed on top.

In [5]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'],\
                                           df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

## Define Foursquare Credentials and Version

In [6]:
# Note that these credentials are saved in Credentials.csv which is not shared on github
Cred_data=pd.read_csv('Credentials.csv')
CLIENT_ID='5LJOJSOHCGXQDQPSJGY3GRZIHB2EJWJEL2ANTCZOUV0FIL5L'#Cred_data['CLIENT_ID'][0]
CLIENT_SECRET='TDF2G1JALYUMVXS0KRN0E4GTOV2WGPW041IBZVDKYJSGVQ43'#Cred_data['CLIENT_SECRET'][0]
VERSION=Cred_data['VERSION'][0]
LIMIT=Cred_data['LIMIT'][0]

**Fetch Foursquare Venue Category Hierarchy**

In [33]:
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION)
category_results = requests.get(url).json()

{'meta': {'code': 429,
  'errorType': 'quota_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5ebeda1b0f5968001cdf3232'},
 'response': {}}

In [31]:
with open('category_results.json', 'w') as f:
    json.dump(category_results, f)

In [32]:
for key, value in category_results['response']['categories'][0].items():
    print(key, len(str(value)))

KeyError: 'categories'

In [None]:
category_list = category_results['response']['categories']
len(category_list)

In [None]:
for data in category_list:
    print(data['id'], data['name'])

In [None]:
Sport_ID='4d4b7105d754a06377d81259'
Shop_ID='4d4b7105d754a06378d81259'

**_Outdoors & Recreation_ & It's Sub-Categories**

Let's create a function to return a dictionary with Ids' & Names of _Outdoors & Recreation_ & It's Sub-Categories.

In [None]:
# function to flatten a 'parent_id' category, returns all categories if checkParentID = False
def flatten_Hierarchy(category_list, checkParentID, category_dict, parent_id = ''):
    for data in category_list:
        
        if checkParentID == True and data['id'] == parent_id:
            category_dict[data['id']] = data['name']
            flatten_Hierarchy(category_list = data['categories'], checkParentID = False, category_dict = category_dict)
        
        elif checkParentID == False:
            category_dict[data['id']] = data['name']
            if len(data['categories']) != 0:
                flatten_Hierarchy(category_list = data['categories'], checkParentID = False, category_dict = category_dict)
    
    return category_dict

In [None]:
# this function takes the parent ID and returns 'name' and 'id' of all the sub-categories
category_dict = flatten_Hierarchy(category_list, checkParentID=True, category_dict = {}, parent_id = Sport_ID)

In [None]:
category_dict

Now, we have all the categories in _Outdooes and recreation_ with their id's.

#### Explore the first Neighborhood to understand the results of GET Request

Get the Neighborhood's name.

In [None]:
df.loc[0, 'Neighborhood']

Get the Neighborhood's latitude and longitude values.

In [None]:
neighborhood_latitude = df.loc[0, 'Latitude'] # Neighborhood area latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # Neighborhood area longitude value

neighborhood_name = df.loc[0, 'Postalcode'] # postalcode name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

#### Now, let's get the _Sport & Recreation_ that is in this neighborhood within a radius of 500 meters.

First, let's create the GET request URL to search for Venue with requested _Category ID_

In [None]:
LIMIT = 10 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
categoryId = Sport_ID # category ID for "sport and recreation"

# create URL

url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius,
    categoryId,
    LIMIT)
url # display URL

Send the GET request and examine the resutls

In [None]:
results = requests.get(url).json()

In [None]:
results['response']['venues'][0]

The category name of the venue **'Brookbanks Park'** is **'Parks'**.

As, our aim is to segment the neighborhoods of Toronto with respect to the _Sport & Recreation_ in its vicinity. We need to proceed further to fetch this data from all the 103 neighborhoods' venues.

#### Let's create a function to repeat the following process to all the postalcode areas in Toronto:
* Loop through postalcodes
    * Create the API request URL with radius=500, LIMIT=100
    * Make the GET request
    * For each postalcode, return only relevant information for each nearby venue
    * Append all nearby venues to a list
* Unfold the list & append it to dataframe being returned

In [None]:
def getNearbyVenues(ID, names, latitudes, longitudes, radius=1500, LIMIT=500):
    not_found = 0
    print('***Start ', end='')
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(' .', end='')
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius,
            ID, # category id
            LIMIT)
            
        try:
            # make the GET request
            results = requests.get(url).json()['response']['venues']
            
            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['name'], 
                v['location']['lat'], 
                v['location']['lng'],  
                v['categories'][0]['name']) for v in results])
        except:
            not_found += 1


    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    print("\nDone*** with {} venues with incompelete information.".format(not_found))
    return(nearby_venues)

**Let's use pickle library to serialize the information retrieved from GET requests. This step will counter any redundant requests to the Foursquare API.**

In [None]:
import pickle # to serialize and deserialize a Python object structure

# get all sport venues
try:
    with open('Toronto_venues_sport.pkl', 'rb') as f:
        Toronto_venues_sport = pickle.load(f)
    print("---Dataframe Existed and Deserialized---")
except:
    Toronto_venues_sport = getNearbyVenues(ID=Sport_ID, names=df['Neighborhood'],
                                        latitudes=df['Latitude'],
                                        longitudes=df['Longitude']
                                       )
    with open('Toronto_venues_sport.pkl', 'wb') as f:
        pickle.dump(Toronto_venues_sport, f)
    print("---Dataframe Created and Serialized---")

In [None]:
# get all shop venues
try:
    with open('Toronto_venues_shop.pkl', 'rb') as f:
        Toronto_venues_shop = pickle.load(f)
    print("---Dataframe Existed and Deserialized---")
except:
    Toronto_venues_shop = getNearbyVenues(ID=Shop_ID, names=df['Neighborhood'],
                                        latitudes=df['Latitude'],
                                        longitudes=df['Longitude']
                                       )
    with open('Toronto_venues_shop.pkl', 'wb') as f:
        pickle.dump(Toronto_venues_shop, f)
    print("---Dataframe Created and Serialized---")

## 3. Analysis & Machine Learning

#### Let's find out how many unique categories can be curated from all the returned sport venues

In [None]:
print('There are {} uniques categories.'.format(len(Toronto_venues_sport['Venue Category'].unique())))
Toronto_venues_sport.groupby('Venue Category')['Venue Category'].count().sort_values(ascending=False)

In [None]:
# manually create a list of desired sport related categories (> 20 venues)
sport_categories = ['Gym','Gym / Fitness Center','Athletics & Sports','Pool','Skating Rink','Baseball Field',
                    'Tennis Court','Martial Arts Dojo','Yoga Studio','Golf Course','Soccer Field',
                    'Pilates Studio','Skate Park','Recreation Center','Outdoors & Recreation','Basketball Court',
                    'Gym Pool','Sports Club','Hockey Arena','Boxing Gym','Ski Area','Badminton Court',
                    'College Stadium','Climbing Gym','Hockey Rink','Curling Ice','Hockey Field','Stadium',
                    'Rock Climbing Spot','Volleyball Court','Paintball Field','College Gym','College Rec Center','Gymnastics Gym']

Revise the results to keep only venues in the list of sport categories

In [None]:
Toronto_venues_sport = Toronto_venues_sport[Toronto_venues_sport['Venue Category'].isin(sport_categories)].reset_index()
Toronto_venues_sport.head(5)

#### Let's find out how many unique categories can be curated from all the returned shop venues

In [None]:
print('There are {} uniques categories.'.format(len(Toronto_venues_shop['Venue Category'].unique())))
Toronto_venues_shop.groupby('Venue Category')['Venue Category'].count().sort_values(ascending=False)

In [None]:
# manually create a list of desired shop related categories
shop_categories = ['Shopping Mall','Shopping Plaza','Sporting Goods Shop']

Revise the results to keep only venues in the list of sport categories

In [None]:
Toronto_venues_shop = Toronto_venues_shop[Toronto_venues_shop['Venue Category'].isin(shop_categories)].reset_index()
Toronto_venues_shop.head(5)

### Analyze Each neighborhood

In [None]:
# one hot encoding the sport venues
Toronto_onehot_sport = pd.get_dummies(Toronto_venues_sport[['Venue Category']], prefix="", prefix_sep="")
Toronto_onehot_sport.head()

In [None]:
# one hot encoding the shopping venues
Toronto_onehot_shop = pd.get_dummies(Toronto_venues_shop[['Venue Category']], prefix="", prefix_sep="")
Toronto_onehot_shop.head()

In [None]:
print(Toronto_onehot_sport.shape)
print(Toronto_onehot_shop.shape)

In [None]:
# add Neighborhood column back to dataframe
Toronto_onehot_sport['Neighborhood'] = Toronto_venues_sport['Neighborhood'] 
Neighborhood = Toronto_onehot_sport['Neighborhood']
Toronto_onehot_sport.drop(labels=['Neighborhood'], axis=1,inplace = True)
Toronto_onehot_sport.insert(0, 'Neighborhood', Neighborhood)
Toronto_onehot_shop['Neighborhood'] = Toronto_venues_shop['Neighborhood'] 
Neighborhood = Toronto_onehot_shop['Neighborhood']
Toronto_onehot_shop.drop(labels=['Neighborhood'], axis=1,inplace = True)
Toronto_onehot_shop.insert(0, 'Neighborhood', Neighborhood)

Let's count venues of each category in each neighborhood

In [10]:
venue_counts_sport= Toronto_onehot_sport.groupby('Neighborhood').sum()
venue_counts_sport.head()

NameError: name 'Toronto_onehot_sport' is not defined

Let's find out the top 10 sport categories in Toronto

In [None]:
venue_counts_described_sport = venue_counts_sport.describe().transpose()

In [11]:
venue_top10_sport = venue_counts_described_sport.sort_values('max', ascending=False)[0:10]
venue_top10_sport

NameError: name 'venue_counts_described_sport' is not defined

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, axes =plt.subplots(10, 1, figsize=(15,30), sharex=True)
axes = axes.flatten()

for ax, category in zip(axes, venue_top10_sport.index.values.tolist()):
    data = venue_counts_sport[[category]].sort_values([category], ascending=False)[0:10]
    pal = sns.color_palette("Blues", len(data))
    sns.barplot(x=category, y=data.index, data=data, ax=ax, palette=np.array(pal[::-1]))

plt.tight_layout()
plt.show();

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [12]:
Toronto_grouped_sport = Toronto_onehot_sport.groupby('Neighborhood').mean().reset_index()
Toronto_grouped_sport.head()

NameError: name 'Toronto_onehot_sport' is not defined

#### Let's write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 5 venues for each neighborhood.

In [None]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))  
        
columns

In [13]:
# create a new dataframe
neighborhoods_venues_sorted_sport = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted_sport['Neighborhood'] = Toronto_grouped_sport['Neighborhood']
for ind in np.arange(Toronto_grouped_sport.shape[0]):
    neighborhoods_venues_sorted_sport.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped_sport.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted_sport.head()

NameError: name 'columns' is not defined

### Cluster Neighborhoods

Create a data frame with the total count of sport venues for each neighborhood

In [14]:
Toronto_grouped_sport_count= Toronto_onehot_sport.groupby('Neighborhood').sum().reset_index()
# create a new column which has the sum of all columns for each row
Toronto_grouped_sport_count['#sport venues']=Toronto_grouped_sport_count.sum(axis=1)
# drop all other columns
Toronto_grouped_sport_count.drop(sport_categories,axis=1,inplace=True)
Toronto_grouped_sport_count.head()

NameError: name 'Toronto_onehot_sport' is not defined

Create a data frame with the total count of shopping venues for each neighborhood

In [15]:
Toronto_grouped_shop_count= Toronto_onehot_shop[['Neighborhood','Shopping Mall', 'Shopping Plaza']].groupby('Neighborhood').sum().reset_index()
# create a new column which has the sum of all columns for each row
Toronto_grouped_shop_count['#shopping venues']=Toronto_grouped_shop_count.sum(axis=1)
# drop all other columns
Toronto_grouped_shop_count.drop(['Shopping Mall', 'Shopping Plaza'],axis=1,inplace=True)
Toronto_grouped_shop_count.head()

NameError: name 'Toronto_onehot_shop' is not defined

In [16]:
# create a new data frame with only Sporting Goods Shop column and groupby Neighborhood
Toronto_grouped_SportGoodsShop_count= Toronto_onehot_shop[['Neighborhood','Sporting Goods Shop']].groupby('Neighborhood').sum().reset_index()
# create a new column which has the sum of all columns for each row
Toronto_grouped_SportGoodsShop_count.head()

NameError: name 'Toronto_onehot_shop' is not defined

Combine the previous data frames

In [17]:
#Toronto_grouped_sport.merge(Toronto_grouped_shop, on='Neighborhood')
Toronto_merged_sport_shop=Toronto_grouped_sport_count.merge(Toronto_grouped_shop_count, on='Neighborhood').merge(Toronto_grouped_SportGoodsShop_count, on='Neighborhood')
Toronto_merged_sport_shop.head()

NameError: name 'Toronto_grouped_sport_count' is not defined

Remove the neighborhoods with one or more sporting goods shop from the dataframe

In [18]:
Toronto_merged_sport_shop.drop(Toronto_merged_sport_shop[Toronto_merged_sport_shop['Sporting Goods Shop']!=0].index,inplace=True)
# drop the 'Sporting Goods Shop' column
Toronto_merged_sport_shop.drop('Sporting Goods Shop',axis=1,inplace=True)
Toronto_merged_sport_shop.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
# visualize the data with a scatter plot
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.xlabel("#sport venues")
plt.ylabel("#shopping venues")
plt.show()

Run *k*-means to count Neighborhoods for each cluster label for variable cluster size

In [None]:
Toronto_grouped_clustering = Toronto_merged_sport_shop.drop('Neighborhood', 1)

#### Determine the optimal number of clusters for k-means clustering

**The Elbow Method** - calculate the sum of squared distances of samples to their closest cluster center for different values of k. The value of k after which there is no siginificant decrease in sum of squared distances is choosen.

In [None]:
sum_of_squared_distances = []
K = range(2,20)
for k in K:
    print(k, end=' ')
    kmeans = KMeans(n_clusters=k).fit(Toronto_grouped_clustering)
    sum_of_squared_distances.append(kmeans.inertia_)

In [19]:
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('sum_of_squared_distances')
plt.title('Elbow Method For Optimal k');

NameError: name 'K' is not defined

Elbow method does not seem to help us to determine the optimal number of clusters. Let's use another method.

**The Silhouette Method** - The silhouette value measures how similar a point is to its own cluster (cohesion) compared to other clusters (separation). Source: Wikipedia

In [None]:
from sklearn.metrics import silhouette_score

sil = []
K_sil = range(2,20)
# minimum 2 clusters required, to define dissimilarity
for k in K_sil:
    print(k, end=' ')
    kmeans = KMeans(n_clusters = k).fit(Toronto_grouped_clustering)
    labels = kmeans.labels_
    sil.append(silhouette_score(Toronto_grouped_clustering, labels, metric = 'euclidean'))

In [None]:
plt.plot(K_sil, sil, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette_score')
plt.title('Silhouette Method For Optimal k')
plt.show()

In [20]:
pd.DataFrame({'Sil':sil,'ksil':K_sil}).head(15)

NameError: name 'sil' is not defined

### Let's set number of clusters = 6

In [None]:
# set number of clusters
kclusters = 7

# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=50).fit(Toronto_grouped_clustering)

print(Counter(kmeans.labels_))

Let's create a new dataframe that includes the cluster as well as the neighborhood information

In [None]:
# add clustering labels
try:
    Toronto_merged_sport_shop.drop('Cluster Labels', axis=1)
except:
    Toronto_merged_sport_shop.insert(0, 'Cluster Labels', kmeans.labels_)

In [21]:
Toronto_merged_sport_shop.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [22]:
# merge neighborhoods_venues_sorted with Toronto_data to add latitude/longitude for each neighborhood
Toronto_merged_sport_shop = Toronto_merged_sport_shop.join(df.set_index('Neighborhood'), on='Neighborhood')
Toronto_merged_sport_shop.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged_sport_shop['Latitude'], Toronto_merged_sport_shop['Longitude'], Toronto_merged_sport_shop['Neighborhood'], Toronto_merged_sport_shop['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Results

In [None]:
required_column_indices = [2,3,5]
required_column = [list(Toronto_merged_sport_shop.columns.values)[i] for i in required_column_indices]
#required_column_indices = [2,3,6]
required_column

#### Cluster 1

In [23]:
cluster_1 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 0, Toronto_merged_sport_shop.columns[1:12]]
cluster_1.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_1['#sport venues'],y=cluster_1['#shopping venues'])
plt.show()

#### Cluster 2

In [24]:
cluster_2 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 1, Toronto_merged_sport_shop.columns[1:12]]
cluster_2

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_2['#sport venues'],y=cluster_2['#shopping venues'])
plt.show()

#### Cluster 3

In [25]:
cluster_3 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 2, Toronto_merged_sport_shop.columns[1:12]]
cluster_3.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_3['#sport venues'],y=cluster_3['#shopping venues'])
plt.show()

#### Cluster 4

In [26]:
cluster_4 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 3, Toronto_merged_sport_shop.columns[1:12]]
cluster_4.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_4['#sport venues'],y=cluster_4['#shopping venues'])
plt.show()

#### Cluster 5

In [27]:
cluster_5 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 4, Toronto_merged_sport_shop.columns[1:12]]
cluster_5.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_5['#sport venues'],y=cluster_5['#shopping venues'])
plt.show()

#### Cluster 6

In [28]:
cluster_6 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 5, Toronto_merged_sport_shop.columns[1:12]]
cluster_6.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_6['#sport venues'],y=cluster_6['#shopping venues'])
plt.show()

#### Cluster 7

In [29]:
cluster_7 = Toronto_merged_sport_shop.loc[Toronto_merged_sport_shop['Cluster Labels'] == 6, Toronto_merged_sport_shop.columns[1:12]]
cluster_7.head()

NameError: name 'Toronto_merged_sport_shop' is not defined

In [None]:
plt.scatter(x=Toronto_merged_sport_shop['#sport venues'],y=Toronto_merged_sport_shop['#shopping venues'])
plt.scatter(x=cluster_7['#sport venues'],y=cluster_7['#shopping venues'])
plt.show()

In [None]:
colors = ['b', 'c', 'y', 'm', 'r','k','g']
a=plt.scatter(x=cluster_1['#sport venues'],y=cluster_1['#shopping venues'],c=colors[0])
b=plt.scatter(x=cluster_2['#sport venues'],y=cluster_2['#shopping venues'],c=colors[1])
c=plt.scatter(x=cluster_3['#sport venues'],y=cluster_3['#shopping venues'],c=colors[2])
d=plt.scatter(x=cluster_4['#sport venues'],y=cluster_4['#shopping venues'],c=colors[3])
e=plt.scatter(x=cluster_5['#sport venues'],y=cluster_5['#shopping venues'],c=colors[4])
f=plt.scatter(x=cluster_6['#sport venues'],y=cluster_6['#shopping venues'],c=colors[5])
g=plt.scatter(x=cluster_7['#sport venues'],y=cluster_7['#shopping venues'],c=colors[6])

plt.xlabel("#sport venues")
plt.ylabel("#shopping venues")
plt.legend((a, b, c, d, e, f, g),
           ('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6', 'Cluster 7'),
           scatterpoints=1,
           loc='upper right',
           ncol=1,
           fontsize=8,
           bbox_to_anchor=(1.25, 1))