In [9]:
import pandas as pd

# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of tables; only tables whose class matches
# 'sortable' are requested and the first row is used as the header.
df = pd.read_html(url, attrs={'class': 'sortable'}, header=0)

# Columns used below: Postcode, Borough and Neighbourhood.
# Keep only the rows whose borough has actually been assigned.
has_borough = df[0]['Borough'] != 'Not assigned'

# A postal code may cover several neighbourhoods: collapse them into one
# comma-separated string per (Postcode, Borough) pair.
myWorkingDF = (
    df[0][has_borough]
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)

# A row with a borough but no assigned neighbourhood reuses the borough name.
unnamed = myWorkingDF['Neighbourhood'] == 'Not assigned'
myWorkingDF.loc[unnamed, 'Neighbourhood'] = myWorkingDF['Borough']
myWorkingDF.shape

(103, 3)

In [19]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Load the latitude/longitude lookup and rename its key column so that it
# matches the 'Postcode' column of myWorkingDF.
geodata = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={"Postal Code": "Postcode"})
)

# Attach coordinates to every postal code (pd.merge defaults to an inner
# join, so postal codes missing from either frame are dropped).
myNewDF = pd.merge(myWorkingDF, geodata, on='Postcode')
myNewDF

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [47]:
address = 'North York'

geolocator = Nominatim(user_agent="capstone-week3-mfx")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail here with a
# readable message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The label is taken from `address` so the message always names the place
# that was actually looked up.
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


In [66]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Create the base map centred on the latitude/longitude produced by the
# geocoding cell. NOTE: the variable name `map_newyork` is a leftover from a
# New York template; this map shows the postal-code areas held in myNewDF.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal-code area, labelled
# "<neighbourhoods>, <borough>".
for lat, lng, borough, neighborhood in zip(myNewDF['Latitude'], myNewDF['Longitude'], myNewDF['Borough'], myNewDF['Neighbourhood']):
    # parse_html belongs on the Popup (treat the label as text, not markup).
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_newyork)

map_newyork

In [32]:
# Distinct borough names present after the merge, in order of first appearance.
unique = myNewDF['Borough'].unique()
unique

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [46]:
# Keep only the four boroughs whose name contains "Toronto" and renumber rows.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_toronto = myNewDF['Borough'].isin(toronto_boroughs)
onlyToronto_df = myNewDF[in_toronto].reset_index(drop=True)
onlyToronto_df.shape

(38, 5)

In [53]:
address = 'East Toronto'

geolocator = Nominatim(user_agent="capstone-week3-mfx")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail here with a
# readable message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# Report the address that was actually geocoded (the message previously
# hard-coded "North York" although this cell looks up a different place).
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of North York are 43.653963, -79.387207.


In [65]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Create the base map centred on the latitude/longitude produced by the
# geocoding cell. NOTE: the variable name `map_newyork` is a leftover from a
# New York template; this map shows the Toronto boroughs held in onlyToronto_df.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal-code area, labelled
# "<neighbourhoods>, <borough>".
for lat, lng, borough, neighborhood in zip(onlyToronto_df['Latitude'], onlyToronto_df['Longitude'], onlyToronto_df['Borough'], onlyToronto_df['Neighbourhood']):
    # parse_html belongs on the Popup (treat the label as text, not markup).
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_newyork)

map_newyork

In [89]:
import os
import warnings

# Foursquare credentials are read from the environment so that secrets are
# never stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be committed here should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'requests made with empty credentials are expected to fail.'
    )
VERSION = '20180604'  # sent as the `v` query parameter of every request
LIMIT = 30  # sent as the `limit` query parameter of every request

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Collect Foursquare "explore" venues around each given location.

    For every (name, latitude, longitude) triple one GET request is sent to
    the ``venues/explore`` endpoint, using the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are
        consumed together with ``zip`` so extra items in a longer one are
        ignored.
    radius : int, optional
        Value forwarded as the API's ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError, IndexError
        If a successful response does not have the expected layout.
    """
    venue_columns = ['Neighbourhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces HTTP errors at the point of failure.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # Keep only the relevant fields of each nearby venue.
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of failing.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor also works when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=venue_columns)

    return nearby_venues

In [90]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Query venues around every postal-code area of the Toronto boroughs; the
# three columns are passed in the order (names, latitudes, longitudes).
toronto_df = getNearbyVenues(
    onlyToronto_df['Neighbourhood'],
    onlyToronto_df['Latitude'],
    onlyToronto_df['Longitude'],
)
toronto_df



Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
1,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
2,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
3,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
4,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.672630,-79.287993,Ice Cream Shop
5,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
6,The Beaches,43.676357,-79.293031,Bagels On Fire,43.672864,-79.286784,Bagel Shop
7,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
8,The Beaches,43.676357,-79.293031,Mastermind Toys,43.671453,-79.293971,Toy / Game Store
9,The Beaches,43.676357,-79.293031,The Remarkable Bean,43.672801,-79.287038,Coffee Shop


In [91]:
# Distinct neighbourhood labels for which at least one venue was returned.
toronto_df['Neighbourhood'].unique()

array(['The Beaches', 'The Danforth West,Riverdale',
       'The Beaches West,India Bazaar', 'Studio District',
       'Lawrence Park', 'Davisville North', 'North Toronto West',
       'Davisville', 'Moore Park,Summerhill East',
       'Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West',
       'Rosedale', 'Cabbagetown,St. James Town', 'Church and Wellesley',
       'Harbourfront,Regent Park', 'Ryerson,Garden District',
       'St. James Town', 'Berczy Park', 'Central Bay Street',
       'Adelaide,King,Richmond',
       'Harbourfront East,Toronto Islands,Union Station',
       'Design Exchange,Toronto Dominion Centre',
       'Commerce Court,Victoria Hotel', 'Roselawn',
       'Forest Hill North,Forest Hill West',
       'The Annex,North Midtown,Yorkville',
       'Harbord,University of Toronto',
       'Chinatown,Grange Park,Kensington Market',
       'CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara',
       'Stn A PO Bo

In [92]:
# Non-null count of every column per neighbourhood, i.e. how many venue rows
# each neighbourhood label has in toronto_df.
toronto_df.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"Brockton,Exhibition Place,Parkdale Village",30,30,30,30,30,30
Business Reply Mail Processing Centre 969 Eastern,30,30,30,30,30,30
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15
"Cabbagetown,St. James Town",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
"Chinatown,Grange Park,Kensington Market",30,30,30,30,30,30
Christie,30,30,30,30,30,30
Church and Wellesley,30,30,30,30,30,30


In [94]:
# One indicator column per venue category, one row per venue; the empty
# prefix keeps the bare category name as the column label.
manhattan_onehot = pd.get_dummies(toronto_df[['Venue Category']], prefix="", prefix_sep="")

# Tag every row with its neighbourhood (the new column is appended last).
manhattan_onehot['Neighbourhood'] = toronto_df['Neighbourhood']

# Rotate that last column to the front so the neighbourhood leads each row.
column_order = list(manhattan_onehot.columns)
fixed_columns = column_order[-1:] + column_order[:-1]
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Airport Lounge,American Restaurant,Amphitheater,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
# (rows, columns) of the one-hot frame: rows are venues, columns are the
# 'Neighbourhood' label plus one indicator per venue category.
manhattan_onehot.shape

(1096, 202)

In [96]:
# Averaging the indicator columns per neighbourhood gives the share of that
# neighbourhood's venues falling into each category.
category_share = manhattan_onehot.groupby('Neighbourhood').mean()

# Turn the group key back into an ordinary first column.
manhattan_grouped = category_share.reset_index()
manhattan_grouped

Unnamed: 0,Neighbourhood,Airport,Airport Lounge,American Restaurant,Amphitheater,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.033333
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.033333,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0
9,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0


In [97]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the neighbourhood name); the remaining entries are ranked in
    descending order of value.

    Parameters
    ----------
    row : pandas.Series
        Row whose index labels are returned.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top entries, best first.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [116]:
import numpy as np # library to handle data in a vectorized manner

num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank built here uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of catching every exception with a bare
    # `except:`, which would also hide unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = manhattan_grouped['Neighbourhood']

# fill the ranked venue categories row by row
for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Steakhouse,Coffee Shop,Hotel,Bar,Café,Seafood Restaurant,Burger Joint,Neighborhood,Smoke Shop,Plaza
1,Berczy Park,Café,Seafood Restaurant,Farmers Market,Cocktail Bar,Creperie,French Restaurant,Liquor Store,Fish Market,Pub,Jazz Club
2,"Brockton,Exhibition Place,Parkdale Village",Café,Hotel,Furniture / Home Store,Bakery,Restaurant,Coffee Shop,Neighborhood,Caribbean Restaurant,Sandwich Place,Cocktail Bar
3,Business Reply Mail Processing Centre 969 Eastern,Park,Brewery,Pizza Place,Italian Restaurant,Sushi Restaurant,Gym,Burrito Place,Steakhouse,French Restaurant,Snack Place
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Harbor / Marina,Café,Coffee Shop,Garden,Dog Run,Dance Studio,Park,Scenic Lookout,Sculpture Garden,Airport


In [122]:
from sklearn.cluster import KMeans

kclusters = 5

# Cluster on the category frequencies only. The axis is named through the
# `columns=` keyword because a positional axis argument to drop() is no longer
# accepted by current pandas releases.
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned so the result does not depend on
# the library's version-specific default number of initialisations
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 3, 3, 1, 1, 1, 1, 2, 2, 0], dtype=int32)

In [124]:
# Add clustering labels as the first column. insert() raises if the column is
# already present, so overwrite it in place when this cell is re-run.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach each neighbourhood's cluster label and ranked venue categories to
# every venue row of toronto_df (join() returns a new frame; toronto_df is
# left untouched).
manhattan_merged = toronto_df.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

manhattan_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,Cluster aLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant,1,1,Japanese Restaurant,Breakfast Spot,Park,Beach,Pub,Bagel Shop,Coffee Shop,Cupcake Shop,Indie Movie Theater,French Restaurant
1,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail,1,1,Japanese Restaurant,Breakfast Spot,Park,Beach,Pub,Bagel Shop,Coffee Shop,Cupcake Shop,Indie Movie Theater,French Restaurant
2,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater,1,1,Japanese Restaurant,Breakfast Spot,Park,Beach,Pub,Bagel Shop,Coffee Shop,Cupcake Shop,Indie Movie Theater,French Restaurant
3,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub,1,1,Japanese Restaurant,Breakfast Spot,Park,Beach,Pub,Bagel Shop,Coffee Shop,Cupcake Shop,Indie Movie Theater,French Restaurant
4,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.67263,-79.287993,Ice Cream Shop,1,1,Japanese Restaurant,Breakfast Spot,Park,Beach,Pub,Bagel Shop,Coffee Shop,Cupcake Shop,Indie Movie Theater,French Restaurant


In [125]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map centred on the most recently geocoded latitude/longitude
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# manhattan_merged repeats the neighbourhood coordinates on every venue row;
# keep one row per neighbourhood so each location is drawn exactly once.
neighbourhood_points = manhattan_merged.drop_duplicates(
    subset=['Neighbourhood', 'Neighborhood Latitude', 'Neighborhood Longitude'])

# add markers to the map
for lat, lon, poi, cluster in zip(neighbourhood_points['Neighborhood Latitude'], neighbourhood_points['Neighborhood Longitude'], neighbourhood_points['Neighbourhood'], neighbourhood_points['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly
    # (`cluster - 1` made cluster 0 wrap around to the last colour).
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters