In [3]:
from bs4 import BeautifulSoup

In [4]:
html_doc = open('List_of_postal_codes_of_Canada__M.html',encoding='utf-8',errors='ignore')

In [5]:
soup=BeautifulSoup(html_doc,'html.parser')

In [6]:
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [7]:
import pandas as pd

In [8]:
dfs = pd.read_html(str(soup.table))

In [9]:
df=dfs[0]

## The input data consist of three columns: PostalCode, Borough, and Neighborhood as a result of web scraping

In [10]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## In next cell we drop rows where Borough is Not assigned

In [11]:
drop_list=[]

for i in df.index:
    if df.loc[i,'Borough'] == 'Not assigned':
        drop_list.append(i)
        
df.drop(drop_list,axis=0,inplace=True)

In [12]:
df.shape

(210, 3)

## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [13]:
for i in df.index:
    if df.loc[i,'Neighbourhood'] == 'Not assigned':
        print('Found')
        df.loc[i,'Neighbourhood'] = df.loc[i,'Borough']

Found


In [14]:
df['Postcode'].unique()

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

In [15]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## More than one neighborhood can exist in one postal code area. In the next cell we club them.

In [16]:
df_new = pd.DataFrame()
df_new['Postcode'] = df['Postcode'].unique()
df_new['Borough'] = pd.np.nan
df_new['Neighbourhood'] = pd.np.nan

for index,val in enumerate(df_new['Postcode']):
    temp_list=[]
    for j in df.index:
        if df.loc[j,'Postcode']==val:
            temp_list.append(df.loc[j,'Neighbourhood'])
            boro_j = j
    df_new.loc[index,'Neighbourhood'] = ', '.join(temp_list)
    df_new.loc[index,'Borough'] = df.loc[boro_j,'Borough']

# First Part of Assignment is completed and output is seen in next cell

In [17]:
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Shape of the dataFrame is shown in next cell

In [18]:
df_new.shape

(103, 3)

## Read Location information from csv file

In [19]:
df2=pd.read_csv('Geospatial_Coordinates.csv')

In [20]:
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
df2.rename(columns={'Postal Code':'Postcode'},inplace=True)

In [22]:
df2.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [23]:
df3 = pd.merge(df_new,df2,on='Postcode')

## Second Part of Assignment is complete. Result in next cell.

In [24]:
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Filtering Neighborhoods using the column Borough with word 'Toronto' in it

In [25]:
neighborhood_names = ['Central Toronto','Downtown Toronto','West Toronto','East Toronto']
neighborhoods=pd.DataFrame(columns=df3.columns)
for index,j in enumerate(df3['Borough']):
    if j in neighborhood_names:
        neighborhoods.loc[index] = df3.loc[index]

In [26]:
neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [27]:
import folium
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=12)
map_toronto

# Third Part of Assignment

In [28]:
# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Lets Study neighbourhood of Toronto

In [29]:
CLIENT_ID = 'HLA3MIYDXWRD1MTILQ0M0J3XMH5TNFAOPTPBZRHMMU53YXL4' # your Foursquare ID
CLIENT_SECRET = 'CILVKEPKXAUDTC3FEA0KU2F5VNZM33XASCFYI3JZQIAEYEJX' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HLA3MIYDXWRD1MTILQ0M0J3XMH5TNFAOPTPBZRHMMU53YXL4
CLIENT_SECRET:CILVKEPKXAUDTC3FEA0KU2F5VNZM33XASCFYI3JZQIAEYEJX


In [31]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
address = 'Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

43.653963 -79.387207


In [32]:
search_query = 'Italian'
radius = 500
print(search_query + ' .... OK!')

Italian .... OK!


In [33]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=HLA3MIYDXWRD1MTILQ0M0J3XMH5TNFAOPTPBZRHMMU53YXL4&client_secret=CILVKEPKXAUDTC3FEA0KU2F5VNZM33XASCFYI3JZQIAEYEJX&ll=43.653963,-79.387207&v=20180604&query=Italian&radius=500&limit=30'

In [34]:
import requests
results = requests.get(url).json()
results.keys()

dict_keys(['meta', 'response'])

In [35]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
from pandas.io.json import json_normalize
dataframe = json_normalize(venues)
dataframe.head()

Unnamed: 0,categories,hasPerk,hierarchy,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.postalCode,location.state,name,referralId,venuePage.id
0,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",False,"[{'name': 'Atrium On Bay', 'lang': 'en', 'id':...",573df789498e03dd8e54b166,595 Bay St,CA,Toronto,Canada,Dundas St,405,"[595 Bay St (Dundas St), Toronto ON M5G 2C2, C...","[{'label': 'display', 'lat': 43.65616, 'lng': ...",43.65616,-79.38319,M5G 2C2,ON,Mustachio Italian Eatery,v-1574266785,
1,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",False,,51bf3866498e55ee55df8db0,,CA,Toronto,Canada,,127,"[Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65499143746528...",43.654991,-79.387897,,ON,The Fresh Italian,v-1574266785,
2,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",False,"[{'name': 'Village by the Grange', 'lang': 'en...",526fe29411d2aeb3803013b0,"109 McCaul Street, Unit #42",CA,Toronto,Canada,Dundas Street West,288,"[109 McCaul Street, Unit #42 (Dundas Street We...","[{'label': 'display', 'lat': 43.653889, 'lng':...",43.653889,-79.390785,M5T 3K5,ON,The Fresh Italian Eatery,v-1574266785,
3,"[{'id': '4bf58dd8d48988d12c951735', 'name': 'E...",False,,4bfc0289c3ba9521c00f9653,136 Beverley St,CA,Toronto,Canada,Dundas Street,555,"[136 Beverley St (Dundas Street), Toronto ON, ...","[{'label': 'display', 'lat': 43.65402694219784...",43.654027,-79.394104,,ON,Italian Consulate Toronto,v-1574266785,
4,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",False,,4f88cf84e4b002b90ab3b9b9,,CA,,Canada,,434,[Canada],"[{'label': 'display', 'lat': 43.65053979517576...",43.65054,-79.384603,,,LA's Italian + Bar,v-1574266785,


In [36]:
dataframe.columns

Index(['categories', 'hasPerk', 'hierarchy', 'id', 'location.address',
       'location.cc', 'location.city', 'location.country',
       'location.crossStreet', 'location.distance',
       'location.formattedAddress', 'location.labeledLatLngs', 'location.lat',
       'location.lng', 'location.postalCode', 'location.state', 'name',
       'referralId', 'venuePage.id'],
      dtype='object')

In [37]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,postalCode,state,id
0,Mustachio Italian Eatery,Italian Restaurant,595 Bay St,CA,Toronto,Canada,Dundas St,405,"[595 Bay St (Dundas St), Toronto ON M5G 2C2, C...","[{'label': 'display', 'lat': 43.65616, 'lng': ...",43.65616,-79.38319,M5G 2C2,ON,573df789498e03dd8e54b166
1,The Fresh Italian,Italian Restaurant,,CA,Toronto,Canada,,127,"[Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65499143746528...",43.654991,-79.387897,,ON,51bf3866498e55ee55df8db0
2,The Fresh Italian Eatery,Italian Restaurant,"109 McCaul Street, Unit #42",CA,Toronto,Canada,Dundas Street West,288,"[109 McCaul Street, Unit #42 (Dundas Street We...","[{'label': 'display', 'lat': 43.653889, 'lng':...",43.653889,-79.390785,M5T 3K5,ON,526fe29411d2aeb3803013b0
3,Italian Consulate Toronto,Embassy / Consulate,136 Beverley St,CA,Toronto,Canada,Dundas Street,555,"[136 Beverley St (Dundas Street), Toronto ON, ...","[{'label': 'display', 'lat': 43.65402694219784...",43.654027,-79.394104,,ON,4bfc0289c3ba9521c00f9653
4,LA's Italian + Bar,Italian Restaurant,,CA,,Canada,,434,[Canada],"[{'label': 'display', 'lat': 43.65053979517576...",43.65054,-79.384603,,,4f88cf84e4b002b90ab3b9b9
5,Classic italian style pizza food truck,Food Truck,CNE Midway,CA,Toronto,Canada,,530,"[CNE Midway, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.652144, 'lng':...",43.652144,-79.381118,,ON,4c787c9181bca0936180fa14
6,john's italian cafe,Italian Restaurant,27 Baldwin Street,CA,Toronto,Canada,,546,"[27 Baldwin Street, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65612672798775...",43.656127,-79.393301,,ON,53daae5b498e9c9597c19b23
7,Sbarro,Pizza Place,220 Yonge Street,CA,Toronto,Canada,in Toronto Eaton Centre,533,"[220 Yonge Street (in Toronto Eaton Centre), T...","[{'label': 'display', 'lat': 43.655413, 'lng':...",43.655413,-79.380896,M5B 2H1,ON,4b4a2d09f964a520687d26e3
8,Little Anthony's,Italian Restaurant,121 Richmond St. W,CA,Toronto,Canada,at York St.,462,"[121 Richmond St. W (at York St.), Toronto ON ...","[{'label': 'display', 'lat': 43.65029624519052...",43.650296,-79.384513,M5H 2K1,ON,4b846dd4f964a520dc3431e3


In [38]:
dataframe_filtered.name

0                  Mustachio Italian Eatery
1                         The Fresh Italian
2                  The Fresh Italian Eatery
3                 Italian Consulate Toronto
4                        LA's Italian + Bar
5    Classic italian style pizza food truck
6                       john's italian cafe
7                                    Sbarro
8                          Little Anthony's
Name: name, dtype: object

In [39]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the Conrad Hotel

# add a red circle marker to represent the Conrad Hotel
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Conrad Hotel',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the Italian restaurants as blue circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

# Cluster Data

In [40]:
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [43]:
from sklearn.preprocessing import StandardScaler

X = neighborhoods.values[:,3:]
X = pd.np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[-0.55411276,  0.78095369],
       [-0.43044373,  0.29228045],
       [-0.67199019,  0.38624909],
       [ 0.3876069 ,  2.58611446],
       [-0.95850566,  0.44262974],
       [-0.39675107,  0.0667712 ],
       [ 0.09715793, -0.87262416],
       [-0.7113125 ,  0.14193917],
       [ 0.07427709, -1.39853026],
       [-1.12705845,  0.21710981],
       [-0.82401202, -0.79748556],
       [ 0.52396714,  1.00652703],
       [-0.85597019,  0.22180664],
       [-1.29618231, -1.02289335],
       [ 0.07399582,  1.98424734],
       [-0.81242884,  0.26878829],
       [-0.32971095,  1.30732174],
       [ 2.58931104,  0.02918855],
       [ 1.8935657 , -0.72234429],
       [ 1.93858158, -0.00839143],
       [ 1.26509061, -0.57205642],
       [-0.24094915, -1.99942544],
       [ 2.05076118, -0.42175786],
       [ 0.23215444, -0.42175786],
       [-0.77998911, -1.77410844],
       [ 1.57946453,  0.02918855],
       [-0.19461216, -0.27145129],
       [-0.6687215 , -2.52509122],
       [ 0.95086585,

In [52]:
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
num_clusters = 4

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

neighborhoods['labels'] = labels
print(labels)

[3 3 3 0 3 3 1 3 1 3 1 0 3 1 0 3 0 2 2 2 2 1 2 3 1 2 3 1 2 3 2 3 3 3 3 3 3
 0]


In [55]:
# add markers to map
color_map = {0:'blue',1:'red',2:'green',3:'orange'}
for lat, lng, borough, neighborhood,label_color in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], 
                                                 neighborhoods['Borough'], neighborhoods['Neighbourhood'],
                                                neighborhoods['labels']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color_map[label_color],
        fill=True,
        fill_color=color_map[label_color],
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [46]:
color_map = {0:'blue',1:'red',2:'green'}

In [47]:
color_map[0]

'blue'