# Segmenting and Clustering Neighborhoods in Toronto

## Step 1: Scraping Wikipedia

In [1]:
# Importing libraries
#!conda install -c conda-forge beautifulsoup4=4.6.3 --yes  # Installing beautifulsoup, the most popular Python web scraping tool
#!conda install -c conda-forge geocoder --yes 
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd 
from geopy.geocoders import Nominatim
import folium

print("Done !")

Done !


In [2]:
# Download the Wikipedia list of "M" postal codes and turn its table(s)
# into a DataFrame with one row per (postal code, borough, neighborhood).
pageurl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Context manager so the HTTP response is closed as soon as parsing is done.
with urlopen(pageurl) as page:
    soup = BeautifulSoup(page, 'lxml')

column_names = ['PostCode', 'Borough', 'Neighborhood']
n_cols = len(column_names)

lis = []
for table in soup.find_all('table', class_="wikitable sortable"):
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. the header row) carry no data.
            # Skipping them here replaces the old "drop the first record"
            # slice, which only handled the header of the first table.
            continue
        # Keep at most n_cols cells so an unexpectedly wide row cannot raise
        # IndexError, and strip surrounding whitespace instead of blindly
        # cutting the last character of the third cell.
        values = [cell.get_text().strip() for cell in cells[:n_cols]]
        # Pad short rows with empty strings so every record has n_cols items.
        values += [''] * (n_cols - len(values))
        lis.append(values)

toronto = pd.DataFrame(lis, columns=column_names)
toronto.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Step 2: Removing the 'Not assigned' entries

In [3]:
# Drop postal codes that have no borough. Filtering and re-indexing in one
# expression yields a fresh frame with a 0..n-1 index, rather than mutating
# a filtered slice in place.
toronto = toronto[toronto.Borough != 'Not assigned'].reset_index(drop=True)

# A neighborhood that is still 'Not assigned' takes the name of its borough.
# Assign through a boolean mask with .loc: values written to the rows yielded
# by iterrows() are not guaranteed to be written back to the DataFrame.
unnamed = toronto['Neighborhood'] == 'Not assigned'
toronto.loc[unnamed, 'Neighborhood'] = toronto.loc[unnamed, 'Borough']

## Step 3: Grouping Neighborhoods

In [4]:
# Merge all rows that share a postal code and borough into a single row whose
# Neighborhood column is the comma-separated list of their neighborhoods.
by_code_and_borough = toronto.groupby(['PostCode', 'Borough'])
joined_neighborhoods = by_code_and_borough['Neighborhood'].apply(', '.join)
clean_df = joined_neighborhoods.reset_index()
clean_df.shape

(103, 3)

## Step 4: Getting coordinates

In [5]:
# Load the pre-computed coordinates table; the columns used below are
# 'Postal Code', 'Latitude' and 'Longitude'. It stands in for a per-row
# geocoder lookup that this notebook no longer performs.
coordi = pd.read_csv("http://cocl.us/Geospatial_data")

# Attach coordinates by matching the postal code itself rather than by row
# position: a positional join silently pairs the wrong rows as soon as the
# two tables differ in order or length.
#   how='left'             keeps every row of clean_df, in its original order
#   validate='many_to_one' raises if the CSV lists a postal code twice
final_df = (
    clean_df
    .merge(
        coordi,
        how='left',
        left_on='PostCode',
        right_on='Postal Code',
        validate='many_to_one',
    )
    # 'Postal Code' duplicates 'PostCode' after the merge.
    .drop(columns='Postal Code')
)
final_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Step 5: Let's geolocate Toronto

In [6]:
# Look up the coordinates of the city centre; they are used to centre the map.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('No geocoding result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [7]:
# Create a map of Toronto centred on the geocoded latitude/longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Keep only the postal codes whose borough name contains "Toronto".
reduced_df = final_df[final_df['Borough'].str.contains("Toronto")]

# Add one circle marker per remaining postal code, labelled with its
# neighborhood name(s).
for lat, lng, name in zip(reduced_df['Latitude'], reduced_df['Longitude'], reduced_df['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is a Popup option, so it is set here only; it is no
        # longer also passed to CircleMarker, which has no such parameter.
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto