In [1]:
#import all libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
#download data
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

In [3]:
table_can = soup.find('table')
cells = table_can.find_all('td')

postcode = []
borough = []
neighbourhood = []

for i in range(0, len(cells), 3):
    postcode.append(cells[i].text.strip())
    borough.append(cells[i+1].text.strip())
    neighbourhood.append(cells[i+2].text.strip())

In [4]:
#transpose the data into dataframe
df_pc = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df_pc.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_pc.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [5]:
#remove the boroughs with empty data
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df_pc.drop(df_pc[df_pc['Borough'] == 'Not assigned'].index, inplace=True)
df_pc.loc[df_pc.Neighbourhood == "Not assigned", "Neighbourhood"] = df_pc.Borough

In [6]:
#combine neighborhoods with same postcodes
df_pc = df_pc.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_pc.columns = ['Postcode', 'Borough', 'Neighbourhood']

In [7]:
df_pc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
#import geographical data csv file
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share your notebook.
client_8c6994d0c2af4c0e899abe707846c03e = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='Q_MZD0YEV8OjvU36XNXXef4twtJX3Kf8ANbD4VWhNZj9',
    ibm_auth_endpoint="https://iam.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_8c6994d0c2af4c0e899abe707846c03e.get_object(Bucket='finalproject-donotdelete-pr-sovuftm1o0rpex',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#comebine two datasets together to get a new dataset contains both postcode and geographical information
Combined_data = df_pc.join(df_data_1)
Combined_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [10]:
import json # library to handle JSON files
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import xml

In [11]:
#account information for Foursqaure
CLIENT_ID = 'WWK0OPNLKLO4NYAXIJTHSVUZM2Y5MODZP43EZLKC5M2DHA2V' # your Foursquare ID
CLIENT_SECRET = 'HTAFAKDEZXZNNZ4KWS13C1OD5JBZYXLM5A1TLPXNG0GHXDBX' # your Foursquare Secret
VERSION = '20190511' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WWK0OPNLKLO4NYAXIJTHSVUZM2Y5MODZP43EZLKC5M2DHA2V
CLIENT_SECRET:HTAFAKDEZXZNNZ4KWS13C1OD5JBZYXLM5A1TLPXNG0GHXDBX


In [12]:
#create a new dataset contains all neighborhoods in North York
df_ny = Combined_data[Combined_data['Borough'].str.contains('North York')]

df_ny.reset_index(drop=True, inplace = True)
df_ny.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M2H,North York,Hillcrest Village,M2H,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",M2J,43.778517,-79.346556
2,M2K,North York,Bayview Village,M2K,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",M2L,43.75749,-79.374714
4,M2M,North York,"Newtonbrook, Willowdale",M2M,43.789053,-79.408493
5,M2N,North York,Willowdale South,M2N,43.77012,-79.408493
6,M2P,North York,York Mills West,M2P,43.752758,-79.400049
7,M2R,North York,Willowdale West,M2R,43.782736,-79.442259
8,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
9,M3B,North York,Don Mills North,M3B,43.745906,-79.352188


In [13]:
#import folium
!pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/43/77/0287320dc4fd86ae8847bab6c34b5ec370e836a79c7b0c16680a3d9fd770/folium-0.8.3-py2.py3-none-any.whl (87kB)
[K    100% |████████████████████████████████| 92kB 7.0MB/s eta 0:00:01
[?25hRequirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: jinja2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: numpy in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Requirement not upgraded a

In [14]:
import folium

In [15]:
#use geolocater to add markers and create the map
address = 'North York, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [16]:
# create map of Toronto using latitude and longitude values
map_neighbour = folium.Map(location=[latitude, longitude], zoom_start=11)

In [17]:
# add markers to map
for lat, lng, borough, neighbourhood in zip(df_ny['Latitude'], df_ny['Longitude'], df_ny['Borough'], df_ny['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_neighbour)  
    
map_neighbour

In [18]:
#create a function to get venue information according to geographical information
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [19]:
#create a dataset contain all venue information in each neighborhoods in North York
print('Crawling different neighborhoods inside "North York"')
NY_foursquare_dataset = foursquare_crawler(list(df_ny['Postcode']),
                                                   list(df_ny['Neighbourhood']),
                                                   list(df_ny['Latitude']),
                                                   list(df_ny['Longitude']),)

Crawling different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M2H (and Neighborhoods Hillcrest Village) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M2J (and Neighborhoods Fairview, Henry Farm, Oriole) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M2K (and Neighborhoods Bayview Village) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M2L (and Neighborhoods Silver Hills, York Mills) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M2M (and Neighborhoods Newtonbrook, Willowdale) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M2N (and Neighborhoods Willowdale South) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M2P (and Neighborhoods York Mills West) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M2R (and Neighborhoods Willowdale West) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M3A (and Neighborhoods Parkwoods) SUCCESSFULLY.
10.
Data is Obtained, for the Postal Code M3B (and Neighborhoods 

In [20]:
#define a function to get important venue information based on Foursqaure database

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
    
    return(result_df)

In [21]:
ny_venues = get_venue_dataset(NY_foursquare_dataset)

Number of Venuse in Coordination "M2H" Posal Code and "Hillcrest Village" Negihborhood(s) is:
22
Number of Venuse in Coordination "M2J" Posal Code and "Fairview, Henry Farm, Oriole" Negihborhood(s) is:
44
Number of Venuse in Coordination "M2K" Posal Code and "Bayview Village" Negihborhood(s) is:
13
Number of Venuse in Coordination "M2L" Posal Code and "Silver Hills, York Mills" Negihborhood(s) is:
4
Number of Venuse in Coordination "M2M" Posal Code and "Newtonbrook, Willowdale" Negihborhood(s) is:
30
Number of Venuse in Coordination "M2N" Posal Code and "Willowdale South" Negihborhood(s) is:
100
Number of Venuse in Coordination "M2P" Posal Code and "York Mills West" Negihborhood(s) is:
15
Number of Venuse in Coordination "M2R" Posal Code and "Willowdale West" Negihborhood(s) is:
10
Number of Venuse in Coordination "M3A" Posal Code and "Parkwoods" Negihborhood(s) is:
28
Number of Venuse in Coordination "M3B" Posal Code and "Don Mills North" Negihborhood(s) is:
29
Number of Venuse in Coo

In [22]:
print(ny_venues.shape)
ny_venues.head()

(616, 8)


Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,Korean Restaurant,754
1,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,Bakery,692
2,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,Grocery Store,815
3,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,Park,776
4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,Coffee Shop,731


In [23]:
#to get all unique data in venue category
ny_venues['Venue Category'].unique()

array(['Korean Restaurant', 'Bakery', 'Grocery Store', 'Park',
       'Coffee Shop', 'Pizza Place', 'Bank', 'Sandwich Place',
       'Fast Food Restaurant', 'Pharmacy', 'Housing Development',
       'Chinese Restaurant', 'Ice Cream Shop', 'Shopping Mall',
       'Recreation Center', 'Pool',
       'Residential Building (Apartment / Condo)', 'Diner',
       'Convenience Store', 'Toy / Game Store', 'Burger Joint',
       'Movie Theater', 'Electronics Store', 'American Restaurant',
       'Tea Room', 'Salon / Barbershop', 'Department Store', 'Candy Store',
       'Smoothie Shop', 'Clothing Store', 'Japanese Restaurant',
       'Juice Bar', 'Caribbean Restaurant', 'Food Court', 'Theater',
       'Restaurant', 'Sporting Goods Shop', 'Video Game Store',
       'Cosmetics Shop', 'Supermarket', 'Beer Store',
       'Fried Chicken Joint', 'Café', 'Skating Rink', 'Skate Park',
       'Intersection', 'Asian Restaurant', 'Hookah Bar', 'Dessert Shop',
       'Middle Eastern Restaurant', 'Hot Dog Jo

In [24]:
#define all venue categories relate to restaurants
ny_re=ny_venues.loc[ny_venues['Venue Category'].isin(['Korean Restaurant', 
       'Pizza Place', 'Sandwich Place', 'Fast Food Restaurant', 'Chinese Restaurant', 
       'Burger Joint', 'American Restaurant','Tea Room', 
       'Smoothie Shop', 'Japanese Restaurant',
       'Caribbean Restaurant', 'Food Court', 'Restaurant', 
       'Fried Chicken Joint', 'Café', 'Asian Restaurant', 'Hookah Bar', 
       'Middle Eastern Restaurant', 'Hot Dog Joint', 'Indian Restaurant',
       'Ramen Restaurant', 'Steakhouse',
       'Seafood Restaurant', 'Indonesian Restaurant', 'Creperie',
       'Sushi Restaurant', 'Burrito Place','Sports Bar',
       'Comfort Food Restaurant', 'Vietnamese Restaurant', 'Bar', 'Pub', 
       'Italian Restaurant', 'French Restaurant', 'Eastern European Restaurant', 
       'Fish & Chips Shop', 'Food & Drink Shop', 'Salad Place', 'Thai Restaurant',
       'Breakfast Spot', 'Greek Restaurant', 'Cafeteria', 'Dim Sum Restaurant', 
       'Mediterranean Restaurant', 'Ski Chalet', 'Video Store','Falafel Restaurant', 
       'Turkish Restaurant', 'Latin American Restaurant', 'Snack Place', 
       'Portuguese Restaurant', 'Wings Joint', 'Cocktail Bar', 'Empanada Restaurant'])]

In [25]:
ny_re.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,Korean Restaurant,754
5,M2H,Hillcrest Village,43.803762,-79.363452,Pizza Pizza,This spot is popular,Pizza Place,709
7,M2H,Hillcrest Village,43.803762,-79.363452,Subway,This spot is popular,Sandwich Place,711
8,M2H,Hillcrest Village,43.803762,-79.363452,New York Fries,This spot is popular,Fast Food Restaurant,38
11,M2H,Hillcrest Village,43.803762,-79.363452,New Greattime Corp.,This spot is popular,Chinese Restaurant,676


In [26]:
# one hot encoding
ny_onehot = pd.get_dummies(ny_re[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_re['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

ny_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Burger Joint,Burrito Place,Cafeteria,Café,Caribbean Restaurant,...,Snack Place,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint
0,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()
ny_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Burger Joint,Burrito Place,Cafeteria,Café,Caribbean Restaurant,...,Snack Place,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint
0,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.0,0.05,0.0,0.05,0.0,0.05,0.0,0.05
3,"CFB Toronto, Downsview East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.111111,0.0
4,Don Mills North,0.0,0.058824,0.058824,0.058824,0.117647,0.0,0.058824,0.058824,0.058824,...,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0
5,Downsview Central,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0
6,Downsview Northwest,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,...,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0
7,Downsview West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
8,"Downsview, North Park, Upwood Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Fairview, Henry Farm, Oriole",0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0


In [28]:
ny_onehot = pd.get_dummies(data = ny_re, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
ny_onehot.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,American Restaurant,Asian Restaurant,Bar,...,Snack Place,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint
0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,754,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M2H,Hillcrest Village,43.803762,-79.363452,Pizza Pizza,This spot is popular,709,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,M2H,Hillcrest Village,43.803762,-79.363452,Subway,This spot is popular,711,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M2H,Hillcrest Village,43.803762,-79.363452,New York Fries,This spot is popular,38,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,M2H,Hillcrest Village,43.803762,-79.363452,New Greattime Corp.,This spot is popular,676,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
#to sum the number of each kind of restaurants in each neighborhood
ny_onehot = ny_onehot.groupby(
    'Neighborhood').sum()

ny_onehot.head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Burger Joint,Burrito Place,Cafeteria,Café,...,Snack Place,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Bathurst Manor, Downsview North, Wilson Heights",437.543283,-794.422593,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
Bayview Village,218.934737,-396.929875,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",874.66565,-1588.394994,1,0,0,1,0,0,0,1,...,0,0,0,1,0,1,0,1,0,1
"CFB Toronto, Downsview East",393.637259,-715.18287,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,2,0,1,0
Don Mills North,743.680399,-1348.987196,0,1,1,1,2,0,1,1,...,0,0,0,0,0,1,0,0,0,0


In [30]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(ny_onehot)

In [31]:
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = ny_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,American Restaurant,Asian Restaurant,Bar,Breakfast Spot,Burger Joint,Burrito Place,Cafeteria,Café,...,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Video Store,Vietnamese Restaurant,Wings Joint,Total Sum
G1,170.648526,-309.730131,5.5511150000000004e-17,-2.775558e-17,-2.775558e-17,-1.387779e-17,2.775558e-17,-6.938894e-18,-6.938894e-18,0.3,...,-1.387779e-17,-6.938894e-18,-5.5511150000000004e-17,-6.938894e-18,-1.387779e-17,-1.387779e-17,-1.387779e-17,0.3,-6.938894e-18,-135.181605
G4,470.20876,-854.299582,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.25,0.0,0.0,0.5,0.25,0.5,0.0,-373.340821
G3,700.038168,-1270.569307,0.25,0.25,0.5,0.25,1.0,0.0,0.25,0.25,...,0.25,0.0,0.25,0.25,0.25,0.0,0.0,0.5,0.0,-554.531139
G5,845.816584,-1534.877965,1.0,1.0,0.3333333,0.3333333,0.3333333,6.938894e-18,6.938894e-18,1.666667,...,1.387779e-17,6.938894e-18,0.6666667,6.938894e-18,0.3333333,1.387779e-17,0.3333333,0.0,0.3333333,-669.728048
G2,2538.666954,-4605.692582,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,...,1.0,1.0,4.0,0.0,0.0,0.0,0.0,2.0,0.0,-2009.025628


In [32]:
#connect the cluster group with the neighborhoods
neigh_summary = pd.DataFrame([ny_onehot.index, 1 + kmeans.labels_]).T
neigh_summary.columns = ['Neighborhood', 'Group']
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Downsview North, Wilson Heights",4
1,Bayview Village,1
2,"Bedford Park, Lawrence Manor East",5
3,"CFB Toronto, Downsview East",4
4,Don Mills North,3
5,Downsview Central,1
6,Downsview Northwest,4
7,Downsview West,1
8,"Downsview, North Park, Upwood Park",1
9,"Fairview, Henry Farm, Oriole",3


In [33]:
#explore top 10 vnues 
num_top_venues = 10

for hood in ny_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ny_grouped[ny_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Downsview North, Wilson Heights----
                       venue  freq
0             Sandwich Place   0.1
1                 Restaurant   0.1
2                Video Store   0.1
3   Mediterranean Restaurant   0.1
4  Middle Eastern Restaurant   0.1
5           Sushi Restaurant   0.1
6        Fried Chicken Joint   0.1
7                Pizza Place   0.1
8                 Ski Chalet   0.1
9       Fast Food Restaurant   0.1


----Bayview Village----
                       venue  freq
0        Japanese Restaurant   0.4
1                       Café   0.2
2         Chinese Restaurant   0.2
3       Fast Food Restaurant   0.2
4        American Restaurant   0.0
5             Sandwich Place   0.0
6  Latin American Restaurant   0.0
7   Mediterranean Restaurant   0.0
8  Middle Eastern Restaurant   0.0
9                Pizza Place   0.0


----Bedford Park, Lawrence Manor East----
                  venue  freq
0  Fast Food Restaurant  0.15
1    Italian Restaurant  0.15
2   American R

In [34]:
#define a function to find the top 10 venues(restaurants) in each neighborhood
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Restaurant,Fast Food Restaurant,Middle Eastern Restaurant,Fried Chicken Joint,Mediterranean Restaurant,Sandwich Place,Ski Chalet,Pizza Place,Sushi Restaurant,Video Store
1,Bayview Village,Japanese Restaurant,Café,Chinese Restaurant,Fast Food Restaurant,Wings Joint,Eastern European Restaurant,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court
2,"Bedford Park, Lawrence Manor East",Fast Food Restaurant,Italian Restaurant,Wings Joint,Sandwich Place,Breakfast Spot,Café,Comfort Food Restaurant,Greek Restaurant,Pizza Place,Pub
3,"CFB Toronto, Downsview East",Turkish Restaurant,Vietnamese Restaurant,Italian Restaurant,Café,Latin American Restaurant,Middle Eastern Restaurant,Sandwich Place,Pizza Place,Wings Joint,Eastern European Restaurant
4,Don Mills North,Japanese Restaurant,Pizza Place,Burger Joint,Cafeteria,Restaurant,Greek Restaurant,Café,Salad Place,Breakfast Spot,Thai Restaurant


In [47]:
df_nyre = df_ny.loc[df_ny['Neighbourhood'].isin(['Bayview Village', 'Downsview Central', 'Downsview West',
                                      'Downsview, North Park, Upwood Park', 'Hillcrest Village',
                                      'Humber Summit', 'Parkwoods', 'Victoria Village', 'Willowdale West',
                                      'York Mills West'])]

In [48]:
df_nyre.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M2H,North York,Hillcrest Village,M2H,43.803762,-79.363452
2,M2K,North York,Bayview Village,M2K,43.786947,-79.385975
6,M2P,North York,York Mills West,M2P,43.752758,-79.400049
7,M2R,North York,Willowdale West,M2R,43.782736,-79.442259
8,M3A,North York,Parkwoods,M3A,43.753259,-79.329656


In [49]:
# create map of Toronto using latitude and longitude values
map_neighbour1 = folium.Map(location=[latitude, longitude], zoom_start=11)

In [51]:
# add markers to map
for lat, lng, borough, neighbourhood in zip(df_nyre['Latitude'], df_nyre['Longitude'], 
                                            df_nyre['Borough'], df_nyre['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_neighbour1)  
    
map_neighbour1

In [36]:
#explore the neighborhoods in cluster group 1
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Bayview Village']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Bayview Village,Japanese Restaurant,Café,Chinese Restaurant,Fast Food Restaurant,Wings Joint,Eastern European Restaurant,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court


In [37]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Downsview Central']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downsview Central,Vietnamese Restaurant,Restaurant,Hot Dog Joint,Hookah Bar,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop,Fish & Chips Shop


In [38]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Downsview West']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downsview West,Vietnamese Restaurant,Pizza Place,Hot Dog Joint,Hookah Bar,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop,Fish & Chips Shop


In [39]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Downsview, North Park, Upwood Park']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,"Downsview, North Park, Upwood Park",Dim Sum Restaurant,Mediterranean Restaurant,Pizza Place,Chinese Restaurant,Sandwich Place,Wings Joint,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop


In [40]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Hillcrest Village']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Hillcrest Village,Korean Restaurant,Fast Food Restaurant,Pizza Place,Chinese Restaurant,Sandwich Place,Wings Joint,Dim Sum Restaurant,Fried Chicken Joint,French Restaurant,Food Court


In [41]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Humber Summit']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Humber Summit,Pizza Place,Italian Restaurant,Empanada Restaurant,Wings Joint,Hot Dog Joint,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop


In [42]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Parkwoods']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Parkwoods,Food & Drink Shop,Fish & Chips Shop,Café,Caribbean Restaurant,Chinese Restaurant,Pizza Place,Fast Food Restaurant,Wings Joint,Eastern European Restaurant,Greek Restaurant


In [43]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Victoria Village']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Victoria Village,Café,Pizza Place,Portuguese Restaurant,Wings Joint,Eastern European Restaurant,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop


In [44]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='Willowdale West']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,Willowdale West,Pizza Place,Eastern European Restaurant,Wings Joint,Dim Sum Restaurant,Greek Restaurant,Fried Chicken Joint,French Restaurant,Food Court,Food & Drink Shop,Fish & Chips Shop


In [45]:
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood']=='York Mills West']

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,York Mills West,Restaurant,French Restaurant,Wings Joint,Dim Sum Restaurant,Greek Restaurant,Fried Chicken Joint,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant
