In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
#Get the website html information from wikipedia
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
website_url = requests.get(website_url).text
soup = BeautifulSoup(website_url, 'lxml')

In [3]:
#Collect information from each column of the wikipedia table 
postCodes = []
borough = []
neighborhood = []

table_col = [postCodes, borough, neighborhood]

index = 0

for info in soup.table.find_all('td'):
    if(index == 0):
        postCodes.append(info.text)
    if(index == 1):
        borough.append(info.text)
    if(index == 2):
        neighborhood.append(info.text)
    index += 1
    if(index == 3):
        index = 0

In [4]:
#Build a Pandas Dataframe with the lists from above
import pandas as pd
dictA = {'PostalCode': postCodes, 'Borough': borough, 'Neighborhood': neighborhood}
df = pd.DataFrame(dictA)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [5]:
#Drop all rows with boroughs not assigned
index = []

for i in range(len(df['Borough'])):
    if(df['Borough'][i] == 'Not assigned'):
        index.append(int(i))

df.drop(index, inplace = True)

#Get rid of \n in Neighborhood column
df['Neighborhood'] = df.Neighborhood.str.replace('\n', '')
df.reset_index(drop = True, inplace = True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [6]:
#Replace all Not assigned in Neighborhood Column with its respective Borough name
for i in range(len(df['Neighborhood'])):
    if(df['Neighborhood'][i] == 'Not assigned'):
        df['Neighborhood'].replace('Not assigned', df['Borough'][i], inplace = True)

In [7]:
#Combine repeated PostalCode rows
df = df.groupby(['PostalCode', 'Borough']).agg(', '.join)
df.reset_index(inplace = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
#Print the number of rows in the Dataframe
print('The number of rows in the dataframe:', df.shape[0])

The number of rows in the dataframe: 103


In [9]:
#Get data on latitudes and longitudes for each Postal Code
df2 = pd.read_csv('Geospatial_Coordinates.csv')
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
#Rename the column of Postal Code in df2
df2.rename(columns = {'Postal Code': 'PostalCode'}, inplace = True)

In [11]:
#Combine the first dataframe with the second one
df_final = pd.merge(df, df2, on = 'PostalCode')
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
import folium
from geopy.geocoders import Nominatim

In [1]:
#Latitude and Longitude of Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

NameError: name 'Nominatim' is not defined

In [14]:
#Add points to Toronto Map and display it
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the Conrad Hotel

for lat, lng, label in zip(df_final['Latitude'],df_final['Longitude'], df_final['Borough']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
    
venues_map

In [15]:
#Define Foursquare Credentials and Version
CLIENT_ID = 'G0Z44BXYHPTIMJ3KM0S40VH5QULQBK2EYJ0FER2CZWQEG2Q2' # your Foursquare ID
CLIENT_SECRET = '1WOYHFDY05OQZGTQ23OVFWPXYABQV2SSX4UV4SMHT2TC15XK' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: G0Z44BXYHPTIMJ3KM0S40VH5QULQBK2EYJ0FER2CZWQEG2Q2
CLIENT_SECRET:1WOYHFDY05OQZGTQ23OVFWPXYABQV2SSX4UV4SMHT2TC15XK


In [16]:
#Observed Neighborhoods

limit = 100
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venues_list in venues_list for item in venues_list])
    
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [17]:
neighborhoods = getNearbyVenues(df_final['Borough'], df_final['Latitude'], df_final['Longitude'])

Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
East York
East York
East Toronto
East York
East York
East York
East Toronto
East Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
North York
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
North York
North York
York
York
Downtown Toronto
Wes

In [18]:
neighborhoods.head()

Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Scarborough,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Scarborough,43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,Scarborough,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,Scarborough,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,Scarborough,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [19]:
unique_categories = len(neighborhoods['Venue Category'].unique())
print('There are {} unique venue categories'.format(unique_categories))

There are 279 unique venue categories


In [20]:
count_venues = pd.get_dummies(neighborhoods[['Venue Category']], prefix="", prefix_sep = "")
count_venues['Borough'] = neighborhoods['Borough']

fixed_columns = [count_venues.columns[-1]] + list(count_venues.columns[:-1])
count_venues = count_venues[fixed_columns]

count_venues.head()

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
count_venues = count_venues.groupby('Borough').mean().reset_index()
count_venues.head()

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017857,...,0.0,0.008929,0.0,0.0,0.008929,0.0,0.0,0.0,0.0,0.008929
1,Downtown Toronto,0.0,0.000779,0.000779,0.000779,0.000779,0.001559,0.002338,0.001559,0.015588,...,0.002338,0.01325,0.002338,0.0,0.004677,0.0,0.006235,0.000779,0.0,0.001559
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013889,0.0,0.013889,0.0,0.0,0.0,0.013889
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0


In [22]:
ranking = pd.DataFrame()

for i in count_venues['Borough']:
    freq = {}
    print('----' + i + '----')
    temp = count_venues[count_venues['Borough'] == i]
    temp.set_index('Borough', inplace = True)
    
    for col in temp.columns:
        freq[col] = [temp.loc[i, col]]
            
    temp_df = pd.DataFrame(freq).transpose().reset_index()
    temp_df.columns = ['Venue Category', 'Frequency']
    temp_df.sort_values('Frequency', ascending = False, inplace = True)
    temp_df.reset_index(drop = True, inplace = True)
    
    ranking[i] = temp_df.iloc[0:5, 0]
    
    print(temp_df.head())
    print("\n")
    


----Central Toronto----
   Venue Category  Frequency
0     Coffee Shop   0.080357
1  Sandwich Place   0.062500
2     Pizza Place   0.053571
3            Park   0.053571
4            Café   0.044643


----Downtown Toronto----
  Venue Category  Frequency
0    Coffee Shop   0.091193
1           Café   0.053780
2     Restaurant   0.031956
3          Hotel   0.029618
4         Bakery   0.026500


----East Toronto----
       Venue Category  Frequency
0    Greek Restaurant      0.072
1         Coffee Shop      0.064
2  Italian Restaurant      0.048
3      Ice Cream Shop      0.040
4                Café      0.032


----East York----
  Venue Category  Frequency
0    Coffee Shop   0.083333
1           Park   0.055556
2    Pizza Place   0.055556
3   Burger Joint   0.041667
4       Pharmacy   0.041667


----Etobicoke----
         Venue Category  Frequency
0           Pizza Place   0.111111
1        Sandwich Place   0.061728
2           Coffee Shop   0.049383
3              Pharmacy   0.049383
4  

In [23]:
ranking = ranking.transpose()

ranking.reset_index(drop = False, inplace = True)

ranking.columns = ['Borough', '1st Common Venue', '2nd Common Venue', '3rd Common Venue', '4th Common Venue', '5th Common Venue']



ranking.head()

Unnamed: 0,Borough,1st Common Venue,2nd Common Venue,3rd Common Venue,4th Common Venue,5th Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Pizza Place,Park,Café
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Café
3,East York,Coffee Shop,Park,Pizza Place,Burger Joint,Pharmacy
4,Etobicoke,Pizza Place,Sandwich Place,Coffee Shop,Pharmacy,Fast Food Restaurant


In [24]:
from sklearn.cluster import KMeans

In [25]:
first = list(set([item for item in ranking['1st Common Venue']]))
first

['Convenience Store',
 'Pizza Place',
 'Fast Food Restaurant',
 'Coffee Shop',
 'Greek Restaurant',
 'Bar']

In [26]:
#clusters_table = pd.DataFrame()
#clusters_table.index = [i for i in first]
#clusters_table.columns = [i for i in ranking['Borough']]

cluster_table = count_venues.transpose()
cluster_table.columns = [i for i in ranking['Borough']]
cluster_table.drop(index = 'Borough', axis = 1, inplace = True)

cluster_table.reset_index(drop = True, inplace = True)
cluster_table

Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,0,0,0,0,0,0,0.00416667,0,0,0,0
1,0,0.000779423,0,0,0,0,0,0,0,0,0
2,0,0.000779423,0,0,0,0,0.00416667,0,0,0,0
3,0,0.000779423,0,0,0,0,0,0,0,0,0
4,0,0.000779423,0,0,0,0,0,0,0,0,0
5,0,0.00155885,0,0,0,0,0,0,0,0,0
6,0,0.00233827,0,0,0,0,0,0,0,0,0
7,0,0.00155885,0,0,0,0,0,0,0,0,0
8,0.0178571,0.0155885,0.024,0,0.0123457,0.0909091,0.00833333,0,0.0113636,0,0
9,0,0.00155885,0,0,0,0,0,0,0,0,0


In [27]:
kmeans = KMeans(n_clusters = 6, random_state=0).fit(cluster_table)
labels = kmeans.labels_
labels

array([1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 3, 4, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 4, 1, 0, 1, 1, 1, 0, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 4, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       4, 1, 1, 1, 3, 1, 1, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 4, 4, 4, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 4, 1, 1, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 0])

In [28]:
df_final.drop(['PostalCode', 'Borough'], axis = 1)
df_final['Cluster Labels'] = pd.Series(kmeans.labels_)
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1


In [29]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(6)
ys = [i + x + (i*x)**2 for i in range(6)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_final['Latitude'], df_final['Longitude'], df_final['Neighborhood'], df_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters