### Install packages needed to scrape webpage and parse the data

In [1]:
##!conda install -c anaconda beautifulsoup4 -y
##!conda install -c anaconda lxml -y
##!conda install -c anaconda requests -y
##!conda config --add channels conda-forge
##!conda install -c conda-forge geopy --yes
##!conda install -c conda-forge folium=0.5.0 --yes

In [2]:
##  imports and functions
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML     
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
import folium # plotting library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

###### Get the data and Loop Thru table data and create CSV file

In [3]:
myDoc = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(myDoc,'lxml')
f = open('table_data.csv','w')

f.write('POSTCODE,BOROUGH,NEIGBOURHOOD'+'\n')
table = soup.find("table")
##table.prettify()
csv_line = ""
for tr in table.find_all("tr"):
    for td in tr.find_all("td"):
        csv_line = csv_line.strip() + td.text.strip() + ","
    f.write(csv_line[0:-1]+'\n')
    csv_line = ""
f.close()

###### Read CSV into data frame and display data

In [4]:
df = pd.read_csv('table_data.csv')
df2 = df[df['BOROUGH'] != 'Not assigned'].groupby(["POSTCODE", "BOROUGH"], as_index=False).agg(lambda x: ", ".join(x))
df2.head(10)


Unnamed: 0,POSTCODE,BOROUGH,NEIGBOURHOOD
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


###### for Neighborhood="Not assigned", make the value the same as Borough

In [5]:
for index, row in df2.iterrows():
    if row["NEIGBOURHOOD"] == "Not assigned":
        row["NEIGBOURHOOD"] = row["BOROUGH"]

In [6]:
df2

Unnamed: 0,POSTCODE,BOROUGH,NEIGBOURHOOD
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
df2.shape

(103, 3)

###### Q2


In [8]:
##read in lat long data from csv file
df_latlng = pd.read_csv('https://cocl.us/Geospatial_data')
df_latlng.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
##merge dataframes
df3 = df2.merge(df_latlng,left_on='POSTCODE', right_on='Postal Code' )

In [10]:
##Drop the extra column
df3.drop('Postal Code', axis=1, inplace=True)
df3.head()

Unnamed: 0,POSTCODE,BOROUGH,NEIGBOURHOOD,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


###### Q3

In [11]:
{
    "tags": [
        "hide_input",
    ]
}
CLIENT_ID = 'CK44XVF0XHD341ERVYWMTKQFUZGM2METCHMV3WNQL1AOCARP' # your Foursquare ID
CLIENT_SECRET = 'M4FVY0XDZLVOBBUZRMSESEE54Z0NH43I1IT5NSAA5ZVDYHB1' # your Foursquare Secret


In [12]:
##HIDDEN CELL w/Credentials

In [13]:
##Use geopy library to get the latitude and longitude values of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df3['Latitude'], df3['Longitude'], df3['BOROUGH'], df3['NEIGBOURHOOD']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [15]:

# filter borough names that contain the word Toronto
borough_names = list(df3.BOROUGH.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
print(borough_with_toronto)

# create a new DataFrame with only boroughs that contain the word Toronto
toronto_df_new = df3[df3['BOROUGH'].isin(borough_with_toronto)].reset_index(drop=True)
print(toronto_df_new.shape)
toronto_df_new

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
(39, 5)


Unnamed: 0,POSTCODE,BOROUGH,NEIGBOURHOOD,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [16]:

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['BOROUGH'], toronto_df_new['NEIGBOURHOOD']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [17]:
## Foursquare API required inputs
VERSION = '20180604'
LIMIT = 100
radius = 500
venue_list = []

##loop thru and get info from 4square
for lat, long, post_cd, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['POSTCODE'], toronto_df_new['BOROUGH'], toronto_df_new['NEIGBOURHOOD']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()

    items = results['response']['groups'][0]['items']

    for venue in items:
        venue_list.append(( 
            post_cd,
            borough,
            neighborhood, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [18]:
# 
# dconvert the venue list into a new DataFrame
venues_df = pd.DataFrame(venue_list)
#define the column names
venues_df.columns = ['PostalCode','Borough', 'NEIGBOURHOOD', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()


(1685, 7)


Unnamed: 0,PostalCode,Borough,NEIGBOURHOOD,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,M4E,East Toronto,The Beaches,Upper Beaches,43.680563,-79.292869,Neighborhood


In [19]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))
print('There are {} uniques Neighborhoods.'.format(len(venues_df['NEIGBOURHOOD'].unique())))

There are 232 uniques categories.
There are 38 uniques Neighborhoods.


In [20]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add borough and neighborhood column back to dataframe
 
toronto_onehot['NEIGBOURHOOD'] = venues_df['NEIGBOURHOOD'] 

#  borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-2:]) + list(toronto_onehot.columns[:-2])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

toronto_grouped = toronto_onehot.groupby([ 'NEIGBOURHOOD']).mean().reset_index()

print(toronto_grouped.shape)
toronto_grouped.head()

(1685, 233)
(38, 233)


Unnamed: 0,NEIGBOURHOOD,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Analyze the Data and put it all together

In [21]:
##create  new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['NEIGBOURHOOD']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted['NEIGBOURHOOD'] = toronto_grouped['NEIGBOURHOOD']

for ind in np.arange(toronto_grouped.shape[0]):
    postcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postcode_venues_sorted.head()

Unnamed: 0,NEIGBOURHOOD,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,Bakery,Restaurant,Burger Joint,Salad Place,Asian Restaurant,Sushi Restaurant
1,Berczy Park,Coffee Shop,Farmers Market,Seafood Restaurant,Steakhouse,Café,Cheese Shop,Beer Bar,Cocktail Bar,Bakery,Irish Pub
2,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Coffee Shop,Café,Yoga Studio,Pet Store,Restaurant,Italian Restaurant,Burrito Place,Intersection,Bar
3,Business Reply Mail Processing Centre 969 Eastern,Skate Park,Burrito Place,Recording Studio,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa,Pizza Place,Restaurant,Smoke Shop
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service,Sculpture Garden,Rental Car Location,Boat or Ferry,Harbor / Marina,Airport Gate,Airport Food Court,Airport


In [22]:
##Run k-means to cluster the Toronto areas into 6 clusters.

# number of clusters
kclusters = 6
toronto_grouped_clustering = toronto_grouped.drop('NEIGBOURHOOD', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [23]:
postcode_venues_sorted['Cluster label'] = kmeans.labels_
toronto_merged = pd.merge(postcode_venues_sorted, toronto_df_new, how='left',
        on='NEIGBOURHOOD', validate="1:1")
toronto_merged.head()

Unnamed: 0,NEIGBOURHOOD,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster label,POSTCODE,BOROUGH,Latitude,Longitude
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,Bakery,Restaurant,Burger Joint,Salad Place,Asian Restaurant,Sushi Restaurant,1,M5H,Downtown Toronto,43.650571,-79.384568
1,Berczy Park,Coffee Shop,Farmers Market,Seafood Restaurant,Steakhouse,Café,Cheese Shop,Beer Bar,Cocktail Bar,Bakery,Irish Pub,1,M5E,Downtown Toronto,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Coffee Shop,Café,Yoga Studio,Pet Store,Restaurant,Italian Restaurant,Burrito Place,Intersection,Bar,1,M6K,West Toronto,43.636847,-79.428191
3,Business Reply Mail Processing Centre 969 Eastern,Skate Park,Burrito Place,Recording Studio,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa,Pizza Place,Restaurant,Smoke Shop,1,M7Y,East Toronto,43.662744,-79.321558
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport Service,Sculpture Garden,Rental Car Location,Boat or Ferry,Harbor / Marina,Airport Gate,Airport Food Court,Airport,1,M5V,Downtown Toronto,43.628947,-79.39442


In [24]:

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neigh, pc, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['NEIGBOURHOOD'], toronto_merged['POSTCODE'], toronto_merged['Cluster label']):
    label = folium.Popup(str(neigh) + '(' + str(pc) + '): Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters