# IBM Data Science Capstone Project!

## Part 1

In [36]:
# Install required libraries (uncomment the lines below if an import in the next cell fails)
# NOTE: the print at the end runs unconditionally; it does not check that anything was installed.

#!conda install -c conda-forge beautifulsoup4 --yes
#!conda install -c conda-forge geopy --yes
#!conda install -c conda-forge folium=0.5.0 --yes
print('Libraries installed!')

Libraries installed!


In [37]:
# Import required packages

import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [38]:
# Download the Wikipedia list of "M" postal codes and parse it with Beautiful Soup.
# The timeout and raise_for_status() make a failed download stop here with a
# clear HTTP error instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# Scrape the first table of the page: one row per (postal code, borough, neighborhood)
postalCodeList = []
boroughList = []
neighborhoodList = []
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the postal code page')
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. the header row) and malformed rows with
    # fewer than the three cells indexed below are skipped.
    if len(cells) < 3:
        continue
    # Strip every cell, not only the last one, so that values such as
    # 'Not assigned' compare equal in the cleaning steps that follow.
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

In [39]:
# Assemble the three scraped lists into a dataframe, one column per list
toronto_neighorhood = list(zip(
    ('PostalCode', 'Borough', 'Neighborhood'),
    (postalCodeList, boroughList, neighborhoodList),
))
toronto_df = pd.DataFrame(dict(toronto_neighorhood))
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [40]:
# Keep only the rows whose borough is something other than 'Not assigned'
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df_dropna = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [41]:
# One row per (postal code, borough); the neighborhoods sharing that pair are
# concatenated into a single comma-separated string
grouping_keys = ['PostalCode', 'Borough']
toronto_df_grouped = (
    toronto_df_dropna
    .groupby(grouping_keys, as_index=False)
    .agg(','.join)
)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [42]:
# A neighborhood still marked 'Not assigned' takes the name of its borough
na_neigh_rows = toronto_df_grouped['Neighborhood'].eq('Not assigned')
toronto_df_grouped['Neighborhood'] = toronto_df_grouped['Neighborhood'].mask(
    na_neigh_rows, toronto_df_grouped['Borough'])
toronto_df_grouped.loc[na_neigh_rows]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [43]:
# Cleaned dataframe.
# Take an independent copy instead of a second name for the same object, so a
# later change to either frame cannot silently alter the other.
toronto_df_cleaned = toronto_df_grouped.copy()
toronto_df_cleaned.shape

(103, 3)

## Part 2

In [44]:
# Download the postal code coordinates with requests rather than shelling out
# to wget (which is not available on every platform). raise_for_status() stops
# the cell on a failed download, so the success message below is only printed
# when the file has really been fetched.
coordinates_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
coordinates_response.raise_for_status()
with open('toronto_coordinates.csv', 'wb') as coordinates_file:
    coordinates_file.write(coordinates_response.content)
print('Downloaded Coordinates!')
coors = pd.read_csv('toronto_coordinates.csv')
print(coors.shape)
coors.head()

Downloaded Coordinates!
(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [45]:
# Combine neighborhoods and coordinates: index both frames by postal code and
# keep only the postal codes present in both (inner join on the index)
toronto_df_temp, coors_temp = (
    toronto_df_cleaned.set_index('PostalCode'),
    coors.set_index('Postal Code'),
)
toronto_df_coors = pd.concat(
    objs=[toronto_df_temp, coors_temp],
    join='inner',
    axis=1,
)

In [46]:
# Turn the postal code index back into a regular 'PostalCode' column.
# The guard makes the cell safe to re-run: once the column exists, resetting
# the index again would fail because 'PostalCode' cannot be inserted twice.
if 'PostalCode' not in toronto_df_coors.columns:
    toronto_df_coors = toronto_df_coors.rename_axis('PostalCode').reset_index()

In [47]:
# Sanity check: print the dimensions of the merged dataframe and display its first rows
print(toronto_df_coors.shape)
toronto_df_coors.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3

In [48]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [49]:
# Plot every postal code area of toronto_df_coors as a circle marker on a Folium map
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_fields = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, long, post, borough, neigh in zip(*(toronto_df_coors[field] for field in marker_fields)):
    # Popup text: "<borough> (<postal code>): <neighborhoods>"
    label = "{} ({}): {}".format(borough, post, neigh)
    marker = folium.CircleMarker(
        location=[lat, long],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [50]:
# Narrow the analysis down to the four boroughs listed below
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_selected_borough = toronto_df_coors['Borough'].isin(toronto_boroughs)
toronto_central_df = toronto_df_coors.loc[in_selected_borough].reset_index(drop=True)
print(toronto_central_df.shape)
toronto_central_df.head()

(38, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [51]:
# Marker map restricted to the areas in toronto_central_df, at a closer zoom level
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

for area in toronto_central_df.itertuples(index=False):
    # Popup text: "<borough> (<postal code>): <neighborhoods>"
    label = "{} ({}): {}".format(area.Borough, area.PostalCode, area.Neighborhood)
    folium.CircleMarker(
        location=[area.Latitude, area.Longitude],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

map_toronto

In [52]:
# Foursquare API credentials.
# Secrets must not live in the notebook: they are read from the environment
# variables FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET. Any key that was
# previously committed in this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare Secret
VERSION = '20180604'  # value sent as the `v` query parameter of each request

if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [53]:
# Ask the Foursquare "explore" endpoint for the venues around each area and
# collect one tuple per venue in `venues`.
radius = 500  # search radius sent to the API (metres, per the Foursquare docs)
LIMIT = 100   # maximum number of venues requested per area

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'], toronto_central_df['PostalCode'], toronto_central_df['Borough'], toronto_central_df['Neighborhood']):
    # Let requests build and encode the query string; the timeout and
    # raise_for_status() turn a hung or rejected request into a clear error.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()

    # An area without recommendations may come back without any group;
    # treat that as "no venues" instead of failing on groups[0].
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []

    for item in results:
        venue = item['venue']
        # A venue can have an empty category list; record None rather than
        # raising IndexError on categories[0].
        categories = venue.get('categories') or []
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [54]:
# Build the venues dataframe. The column names are given at construction time:
# with an empty `venues` list this yields an empty frame with the expected
# columns, whereas assigning nine names to a zero-column frame afterwards
# raises a length-mismatch error.
venues_df = pd.DataFrame(
    venues,
    columns=['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)
print(venues_df.shape)
venues_df.head()

(1711, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [55]:
# Number of venues returned for each postal code area
area_keys = ['PostalCode', 'Borough', 'Neighborhood']
venues_df.groupby(area_keys)['VenueName'].count()

PostalCode  Borough           Neighborhood                                                                                        
M4E         East Toronto      The Beaches                                                                                               4
M4K         East Toronto      The Danforth West,Riverdale                                                                              42
M4L         East Toronto      The Beaches West,India Bazaar                                                                            20
M4M         East Toronto      Studio District                                                                                          38
M4N         Central Toronto   Lawrence Park                                                                                             4
M4P         Central Toronto   Davisville North                                                                                          8
M4R         Central Toronto   North Toron

In [56]:
# Number of distinct venue categories (missing values, if any, count as one
# category, exactly as with len(unique()))
venues_df['VenueCategory'].nunique(dropna=False)

234

In [57]:
# Venues for each area

# One-hot encode the venue categories, then append the postal code, borough and
# neighborhood of each venue as the last three columns. The neighborhood column
# is stored as 'Neighborhoods' (plural), as in the rest of the notebook.
toronto_central_onehot = pd.get_dummies(
    venues_df[['VenueCategory']], prefix="", prefix_sep=""
).assign(
    PostalCode=venues_df['PostalCode'],
    Borough=venues_df['Borough'],
    Neighborhoods=venues_df['Neighborhood'],
)

# Rotate those three trailing columns to the front of the frame
n_area_columns = 3
fixed_columns = [
    *toronto_central_onehot.columns[-n_area_columns:],
    *toronto_central_onehot.columns[:-n_area_columns],
]
toronto_central_onehot = toronto_central_onehot[fixed_columns]

print(toronto_central_onehot.shape)
toronto_central_onehot.head()

(1711, 237)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,East Toronto,"The Danforth West,Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Frequency of occurrence of each venue category within an area: the mean of
# the one-hot columns over all venues of that area
area_keys = ['PostalCode', 'Borough', 'Neighborhoods']
toronto_central_venues_freq = (
    toronto_central_onehot
    .groupby(area_keys)
    .mean()
    .reset_index()
)
print(toronto_central_venues_freq.shape)
toronto_central_venues_freq.head()

(38, 237)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# The 10 most frequent venue categories in each area
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return `rank` followed by its English ordinal suffix (1 -> '1st', 4 -> '4th', 11 -> '11th')."""
    # 11th, 12th and 13th are irregular; the whole 10-20 range takes 'th'.
    if 10 <= rank % 100 <= 20:
        return '{}th'.format(rank)
    last_digit = rank % 10
    suffix = indicators[last_digit - 1] if 1 <= last_digit <= len(indicators) else 'th'
    return '{}{}'.format(rank, suffix)


# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = ['{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)]
columns = areaColumns + freqColumns

# Build one record per area (area identifiers + its top categories) and create
# the dataframe once, instead of filling an empty frame cell by cell.
top_venue_rows = []
for ind in range(toronto_central_venues_freq.shape[0]):
    area_row = toronto_central_venues_freq.iloc[ind, :]
    # Everything after the three identifier columns is a category frequency
    row_categories_sorted = area_row.iloc[3:].sort_values(ascending=False)
    top_venue_rows.append(
        list(area_row[areaColumns]) + list(row_categories_sorted.index.values[0:num_top_venues])
    )

neighborhoods_venues_sorted = (
    pd.DataFrame(top_venue_rows, columns=columns)
    .sort_values(freqColumns)
)
neighborhoods_venues_sorted

Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Boutique,Bar
32,M6J,West Toronto,"Little Portugal,Trinity",Bar,Coffee Shop,Asian Restaurant,Men's Store,Café,New American Restaurant,Bakery,Cocktail Bar,Pizza Place,Restaurant
26,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Café,Chinese Restaurant,Mexican Restaurant,Bakery,Coffee Shop,Dumpling Restaurant,Cocktail Bar
25,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Bookstore,Bar,Italian Restaurant,Japanese Restaurant,Restaurant,Bakery,Chinese Restaurant,Poutine Place,Pub
3,M4M,East Toronto,Studio District,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Seafood Restaurant,Bar,Stationery Store,Fish Market,Coworking Space
34,M6P,West Toronto,"High Park,The Junction South",Café,Mexican Restaurant,Bar,Thai Restaurant,Grocery Store,Fried Chicken Joint,Music Venue,Diner,Discount Store,Cajun / Creole Restaurant
14,M5B,Downtown Toronto,"Ryerson,Garden District",Clothing Store,Coffee Shop,Cosmetics Shop,Café,Bakery,Japanese Restaurant,Restaurant,Ramen Restaurant,Bookstore,Sporting Goods Shop
19,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",Coffee Shop,Aquarium,Hotel,Café,Scenic Lookout,Brewery,Fried Chicken Joint,Restaurant,Bar,Italian Restaurant
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Breakfast Spot,Café,Climbing Gym,Stadium,Burrito Place,Sandwich Place,Caribbean Restaurant,Restaurant,Yoga Studio
18,M5H,Downtown Toronto,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Thai Restaurant,Sushi Restaurant,American Restaurant,Restaurant,Bakery,Steakhouse,Asian Restaurant


In [60]:
# Cluster the areas with k-means on their venue-category frequency profiles
kclusters = 3
# Keyword form of drop(): the positional `axis` argument is no longer accepted
# by recent pandas releases.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(
    columns=['PostalCode', 'Borough', 'Neighborhoods'])
# n_init is pinned explicitly because its default value differs between
# scikit-learn releases; 10 is the long-standing default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_central_venues_freq_clustering)

# kmeans.labels_[i] belongs to row i of toronto_central_venues_freq. Attach the
# labels through the postal code instead of by row position, so they stay
# correct even if toronto_central_df is ordered differently or contains an
# area for which no venue was returned.
cluster_by_postal_code = pd.DataFrame({
    'PostalCode': toronto_central_venues_freq['PostalCode'].values,
    'Cluster': kmeans.labels_,
})
top_venues_by_postal_code = (
    neighborhoods_venues_sorted
    .drop(columns=['Borough', 'Neighborhoods'])
    .set_index('PostalCode')
)

# merge() returns a new frame, so toronto_central_df itself is left unchanged
# and this cell can be re-run safely.
toronto_central_clustered_df = (
    toronto_central_df
    .merge(cluster_by_postal_code, on='PostalCode', how='inner')
    .join(top_venues_by_postal_code, on='PostalCode')
    .sort_values(['Cluster'] + freqColumns)
)
toronto_central_clustered_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,0,Gym,Park,Trail,Playground,Concert Hall,Comic Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0,Park,Playground,Trail,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442,1,Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Boutique,Bar
32,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,1,Bar,Coffee Shop,Asian Restaurant,Men's Store,Café,New American Restaurant,Bakery,Cocktail Bar,Pizza Place,Restaurant
26,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049,1,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Café,Chinese Restaurant,Mexican Restaurant,Bakery,Coffee Shop,Dumpling Restaurant,Cocktail Bar
25,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049,1,Café,Bookstore,Bar,Italian Restaurant,Japanese Restaurant,Restaurant,Bakery,Chinese Restaurant,Poutine Place,Pub
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Seafood Restaurant,Bar,Stationery Store,Fish Market,Coworking Space
34,M6P,West Toronto,"High Park,The Junction South",43.661608,-79.464763,1,Café,Mexican Restaurant,Bar,Thai Restaurant,Grocery Store,Fried Chicken Joint,Music Venue,Diner,Discount Store,Cajun / Creole Restaurant
14,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,1,Clothing Store,Coffee Shop,Cosmetics Shop,Café,Bakery,Japanese Restaurant,Restaurant,Ramen Restaurant,Bookstore,Sporting Goods Shop
19,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752,1,Coffee Shop,Aquarium,Hotel,Café,Scenic Lookout,Brewery,Fried Chicken Joint,Restaurant,Bar,Italian Restaurant


In [61]:
# Show the clusters on a map, one colour per cluster

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: kclusters evenly spaced rainbow colours
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, post, bor, poi, cluster in zip(toronto_central_clustered_df['Latitude'], toronto_central_clustered_df['Longitude'], toronto_central_clustered_df['PostalCode'], toronto_central_clustered_df['Borough'], toronto_central_clustered_df['Neighborhood'], toronto_central_clustered_df['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous `cluster-1` only worked for cluster 0 through negative-index
    # wraparound to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Upon observing the result, we can name the clusters as follows:  

Cluster 0: Living area (mostly parks, trails and schools, with some small businesses)  
Cluster 1: Business area (with lots of business venues)  
Cluster 2: Roselawn - Central Toronto (nothing here except a garden)