In [1]:
# importing libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from bs4 import BeautifulSoup
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries are imported.')

Libraries are imported.


In [2]:
# Loading the dataset which is about postal codes in Toronto
# This dataset was created in week 3. 
df_toronto = pd.read_csv('toronto_data.csv')
df_toronto.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [3]:
df_toronto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    103 non-null    int64  
 1   Postal Code   103 non-null    object 
 2   Borough       103 non-null    object 
 3   Neighborhood  103 non-null    object 
 4   Latitude      103 non-null    float64
 5   Longitude     103 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.0+ KB


In [4]:
# for the city Toronto, latitude and longtitude are manually extracted via google search
toronto_latitude = 43.6932; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

In [5]:
# df_toronto['Borough'] == 'North York'

# selecting only neighborhoods regarding to "Scarborough" borough.
north_data = df_toronto[df_toronto['Borough'] == 'North York']
north_data = north_data.reset_index(drop=True).drop(columns = 'Unnamed: 0')
north_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073


In [6]:
address_scar = 'North York, Toronto'
latitude_scar = 43.773077
longitude_scar = -79.257774
print('The geograpical coordinate of "North York" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_north = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# add markers to map
for lat, lng, label in zip(north_data['Latitude'], north_data['Longitude'], north_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 10,
        popup = label,
        color ='blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_north)  
    
map_north

The geograpical coordinate of "North York" are: 43.773077, -79.257774.


In [7]:
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [8]:
CLIENT_ID = 'LWK55BGX5WORO1ICN4TIU042KXWZFO3QRDN1TKTNAG2ISUIQ' # your Foursquare ID
CLIENT_SECRET = 'SXKPUMLWIQ4JCXGLW0GVAUHEEPKHZUDW3PPDCX5JWIY2UKOQ' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LWK55BGX5WORO1ICN4TIU042KXWZFO3QRDN1TKTNAG2ISUIQ
CLIENT_SECRET:SXKPUMLWIQ4JCXGLW0GVAUHEEPKHZUDW3PPDCX5JWIY2UKOQ


In [9]:
print('Crawling different neighborhoods inside "North York"')
north_foursquare_dataset = foursquare_crawler(list(north_data['Postal Code']),
                                                   list(north_data['Neighborhood']),
                                                   list(north_data['Latitude']),
                                                   list(north_data['Longitude']),)

Crawling different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M3A (and Neighborhoods Parkwoods) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M4A (and Neighborhoods Victoria Village) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M6A (and Neighborhoods Lawrence Manor, Lawrence Heights) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M3B (and Neighborhoods Don Mills) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M6B (and Neighborhoods Glencairn) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M3C (and Neighborhoods Don Mills) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M2H (and Neighborhoods Hillcrest Village) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M3H (and Neighborhoods Bathurst Manor, Wilson Heights, Downsview North) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M2J (and Neighborhoods Fairview, Henry Farm, Oriole) SUCCESSFULLY.
10.
Data is Obtained, for the Postal Code M3J (and 

In [10]:


import pickle
with open("north_foursquare_dataset.txt", "wb") as fp:   #Pickling
    pickle.dump(north_foursquare_dataset, fp)
print('Received Data from Internet is Saved to Computer.')



Received Data from Internet is Saved to Computer.


In [11]:
with open("north_foursquare_dataset.txt", "rb") as fp:   # Unpickling
    north_foursquare_dataset = pickle.load(fp)
# print(type(north_foursquare_dataset))
# north_foursquare_dataset

In [12]:
# This function is created to connect to the saved list which is the received database. It will extract each venue 
# for every neighborhood inside the database

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            # print({'Postal Code': postal_code, 'Neighborhood': neigh, 
            #                   'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
            #                   'Venue': name, 'Venue Summary': summary, 
            #                   'Venue Category': cat, 'Distance': dist})
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
            # print(result_df)
    
    return(result_df)

In [13]:
north_venues = get_venue_dataset(north_foursquare_dataset)

Number of Venuse in Coordination "M3A" Posal Code and "Parkwoods" Negihborhood(s) is:
28
Number of Venuse in Coordination "M4A" Posal Code and "Victoria Village" Negihborhood(s) is:
12
Number of Venuse in Coordination "M6A" Posal Code and "Lawrence Manor, Lawrence Heights" Negihborhood(s) is:
46
Number of Venuse in Coordination "M3B" Posal Code and "Don Mills" Negihborhood(s) is:
30
Number of Venuse in Coordination "M6B" Posal Code and "Glencairn" Negihborhood(s) is:
33
Number of Venuse in Coordination "M3C" Posal Code and "Don Mills" Negihborhood(s) is:
44
Number of Venuse in Coordination "M2H" Posal Code and "Hillcrest Village" Negihborhood(s) is:
20
Number of Venuse in Coordination "M3H" Posal Code and "Bathurst Manor, Wilson Heights, Downsview North" Negihborhood(s) is:
31
Number of Venuse in Coordination "M2J" Posal Code and "Fairview, Henry Farm, Oriole" Negihborhood(s) is:
44
Number of Venuse in Coordination "M3J" Posal Code and "Northwood Park, York University" Negihborhood(s) 

In [14]:
north_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M3A,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,This spot is popular,Caribbean Restaurant,833
1,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,This spot is popular,Park,245
2,M3A,Parkwoods,43.753259,-79.329656,Tim Hortons,This spot is popular,Café,866
3,M3A,Parkwoods,43.753259,-79.329656,A&W,This spot is popular,Fast Food Restaurant,852
4,M3A,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,This spot is popular,Grocery Store,889


In [15]:
north_venues.to_csv('north_venues.csv')

In [16]:
north_venues = pd.read_csv('north_venues.csv')

In [17]:
neigh_list = list(north_venues['Neighborhood'].unique())
print('Number of Neighborhoods inside Scarborough:')
print(len(neigh_list))
print('List of Neighborhoods inside Scarborough:')
neigh_list

Number of Neighborhoods inside Scarborough:
20
List of Neighborhoods inside Scarborough:


['Parkwoods',
 'Victoria Village',
 'Lawrence Manor, Lawrence Heights',
 'Don Mills',
 'Glencairn',
 'Hillcrest Village',
 'Bathurst Manor, Wilson Heights, Downsview North',
 'Fairview, Henry Farm, Oriole',
 'Northwood Park, York University',
 'Bayview Village',
 'Downsview',
 'York Mills, Silver Hills',
 'North Park, Maple Leaf Park, Upwood Park',
 'Humber Summit',
 'Willowdale, Newtonbrook',
 'Bedford Park, Lawrence Manor East',
 'Humberlea, Emery',
 'Willowdale, Willowdale East',
 'York Mills West',
 'Willowdale, Willowdale West']

In [18]:


neigh_venue_summary = north_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()



Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Bathurst Manor, Wilson Heights, Downsview North",31,31,31,31,31,31,31
Bayview Village,15,15,15,15,15,15,15
"Bedford Park, Lawrence Manor East",43,43,43,43,43,43,43
Don Mills,74,74,74,74,74,74,74
Downsview,68,68,68,68,68,68,68


In [19]:
print('There are {} uniques categories.'.format(len(north_venues['Venue Category'].unique())))

print('Here is the list of different categories:')
list(north_venues['Venue Category'].unique())

There are 145 uniques categories.
Here is the list of different categories:


['Caribbean Restaurant',
 'Park',
 'Café',
 'Fast Food Restaurant',
 'Grocery Store',
 'Pharmacy',
 'Supermarket',
 'Fish & Chips Shop',
 'Pizza Place',
 'Food & Drink Shop',
 'Road',
 'Bus Stop',
 'Train Station',
 'Discount Store',
 'Laundry Service',
 'Chinese Restaurant',
 'Coffee Shop',
 'Convenience Store',
 'Shopping Mall',
 'Tennis Court',
 'Cosmetics Shop',
 'Shop & Service',
 'Hockey Arena',
 'Portuguese Restaurant',
 'Golf Course',
 'French Restaurant',
 'Intersection',
 'Playground',
 "Men's Store",
 'Lounge',
 'Gym / Fitness Center',
 'Boutique',
 'Furniture / Home Store',
 'Vietnamese Restaurant',
 'Athletics & Sports',
 'Sushi Restaurant',
 'Greek Restaurant',
 'Dessert Shop',
 'Fried Chicken Joint',
 'Restaurant',
 'Bowling Alley',
 'Pet Store',
 'Clothing Store',
 'Bank',
 'Seafood Restaurant',
 'Accessories Store',
 'Miscellaneous Shop',
 'Event Space',
 'Cheese Shop',
 'Sandwich Place',
 'Sporting Goods Shop',
 'Hobby Shop',
 'Mediterranean Restaurant',
 'Paper / Off

In [20]:
# one hot encoding
north_onehot = pd.get_dummies(data = north_venues, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
north_onehot.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Boutique,Bowling Alley,Breakfast Spot,Bridal Shop,Bubble Tea Shop,Burger Joint,Bus Line,Bus Stop,Business Service,Butcher,Cafeteria,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Creperie,Deli / Bodega,Dentist's Office,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Event Space,Falafel Restaurant,Fast Food Restaurant,Fireworks Store,Fish & Chips Shop,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gas Station,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hobby Shop,Hockey Arena,Hookah Bar,Hot Dog Joint,Hotel,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Laundry Service,Liquor Store,Lounge,Massage Studio,Mediterranean Restaurant,Men's Store,Metro Station,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Movie Theater,Moving Target,New American Restaurant,Other Repair Shop,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Recreation Center,Residential Building (Apartment / Condo),Restaurant,Road,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shop & Service,Shopping Mall,Skating Rink,Ski Area,Ski Chalet,Snack Place,Soccer Field,Sporting Goods Shop,Sports Bar,Sports Club,Steakhouse,Storage Facility,Supermarket,Sushi Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0,M3A,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,This spot is popular,833,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,This spot is popular,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,M3A,Parkwoods,43.753259,-79.329656,Tim Hortons,This spot is popular,866,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,M3A,Parkwoods,43.753259,-79.329656,A&W,This spot is popular,852,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,M3A,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,This spot is popular,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# This list is created manually 
important_list_of_features = [ 
 'Neighborhood',
 'Neighborhood Latitude',
 'Neighborhood Longitude',
 'Park',
 'Tennis Court',
 'Hockey Arena',
 'Golf Course',
 'Playground',
 'Gym / Fitness Center',
 'Athletics & Sports',
 'Gym',
 'Sports Bar',
 'Soccer Field',
 'Pool',
 'Sports Club',
 'Yoga Studio']

In [22]:
north_onehot = north_onehot[important_list_of_features].drop(
    columns = ['Neighborhood Latitude', 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()


north_onehot.head()

Unnamed: 0_level_0,Park,Tennis Court,Hockey Arena,Golf Course,Playground,Gym / Fitness Center,Athletics & Sports,Gym,Sports Bar,Soccer Field,Pool,Sports Club,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Bathurst Manor, Wilson Heights, Downsview North",2,0,0,0,0,0,0,0,0,0,0,0,0
Bayview Village,1,0,0,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",1,0,0,0,0,0,0,0,0,0,0,1,0
Don Mills,1,0,1,0,0,0,0,4,0,0,0,0,0
Downsview,3,0,0,0,0,1,2,1,0,1,0,0,0


In [23]:
north_onehot

Unnamed: 0_level_0,Park,Tennis Court,Hockey Arena,Golf Course,Playground,Gym / Fitness Center,Athletics & Sports,Gym,Sports Bar,Soccer Field,Pool,Sports Club,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Bathurst Manor, Wilson Heights, Downsview North",2,0,0,0,0,0,0,0,0,0,0,0,0
Bayview Village,1,0,0,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",1,0,0,0,0,0,0,0,0,0,0,1,0
Don Mills,1,0,1,0,0,0,0,4,0,0,0,0,0
Downsview,3,0,0,0,0,1,2,1,0,1,0,0,0
"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,0,0,0,0,0,0
Glencairn,1,1,0,0,1,0,0,1,0,0,0,0,0
Hillcrest Village,2,0,0,0,0,0,0,0,0,0,0,0,0
Humber Summit,1,0,0,0,0,0,0,0,0,0,0,0,0
"Humberlea, Emery",1,0,0,1,0,0,0,0,0,0,0,0,0


In [24]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(north_onehot)

In [25]:
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = north_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Park,Tennis Court,Hockey Arena,Golf Course,Playground,Gym / Fitness Center,Athletics & Sports,Gym,Sports Bar,Soccer Field,Pool,Sports Club,Yoga Studio,Total Sum
G4,3.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,8.0
G2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,6.0
G5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,5.0
G3,2.5,0.166667,1.387779e-17,1.387779e-17,0.166667,0.0,2.775558e-17,5.5511150000000004e-17,1.387779e-17,6.938894e-18,0.1666667,6.938894e-18,6.938894e-18,3.0
G1,0.727273,0.090909,0.09090909,0.1818182,0.272727,0.181818,0.1818182,0.09090909,0.09090909,6.938894e-18,1.387779e-17,0.09090909,6.938894e-18,2.0


In [26]:
neigh_summary = pd.DataFrame([north_onehot.index, 1 + kmeans.labels_]).T
neigh_summary.columns = ['Neighborhood', 'Group']
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Wilson Heights, Downsview North",3
1,Bayview Village,1
2,"Bedford Park, Lawrence Manor East",1
3,Don Mills,2
4,Downsview,4
5,"Fairview, Henry Farm, Oriole",1
6,Glencairn,1
7,Hillcrest Village,3
8,Humber Summit,1
9,"Humberlea, Emery",1


In [27]:
neigh_summary[neigh_summary['Group'] == 4]

Unnamed: 0,Neighborhood,Group
4,Downsview,4


In [28]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 4]['Neighborhood'])[0]
north_venues[north_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M3K',
 'Neighborhood': 'Downsview',
 'Neighborhood Latitude': 43.737473200000004,
 'Neighborhood Longitude': -79.46476329999999}

In [29]:
neigh_summary[neigh_summary['Group'] == 2]

Unnamed: 0,Neighborhood,Group
3,Don Mills,2


In [30]:
neigh_summary[neigh_summary['Group'] == 5]

Unnamed: 0,Neighborhood,Group
16,"Willowdale, Willowdale East",5


In [31]:
neigh_summary[neigh_summary['Group'] == 3]

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Wilson Heights, Downsview North",3
7,Hillcrest Village,3
13,Parkwoods,3
15,"Willowdale, Newtonbrook",3
18,York Mills West,3
19,"York Mills, Silver Hills",3


In [32]:
neigh_summary[neigh_summary['Group'] == 1]

Unnamed: 0,Neighborhood,Group
1,Bayview Village,1
2,"Bedford Park, Lawrence Manor East",1
5,"Fairview, Henry Farm, Oriole",1
6,Glencairn,1
8,Humber Summit,1
9,"Humberlea, Emery",1
10,"Lawrence Manor, Lawrence Heights",1
11,"North Park, Maple Leaf Park, Upwood Park",1
12,"Northwood Park, York University",1
14,Victoria Village,1
