# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

### Goal: Transform the Wikipedia data into a pandas DataFrame

Import Libraries

In [7]:
import urllib.request
from io import StringIO

import bs4 as bs
import lxml.html as lh
import numpy as np
import pandas as pd
import requests

Extract Data

In [9]:
# Download the Wikipedia page and fail fast on HTTP errors, so that an error
# page is never parsed as if it were the postal-code listing.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)
response.raise_for_status()
data_extract = response.text

# Parse the page and take its first <table> element.
wikipedia_data = bs.BeautifulSoup(data_extract, 'lxml')
table = wikipedia_data.find('table')
if table is None:
    raise ValueError("No <table> element found at {}".format(url))

# read_html returns a list of DataFrames (one per parsed table). The markup is
# wrapped in StringIO because passing a literal HTML string is deprecated in
# recent pandas releases.
df = pd.read_html(StringIO(str(table)))
data = df[0]

In [10]:
# Preview the first five rows of the scraped table.
data.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Transform the data: drop the rows whose Borough is 'Not assigned' and combine the neighbourhoods that share a postal code.

In [11]:
# Keep only the postal codes that have been assigned to a borough.
assigned = data[data['Borough'] != 'Not assigned']

# Collapse to one row per (borough, postal code). Neighbourhoods from several
# rows are joined with ", " so they are formatted like the multi-neighbourhood
# cells that already come from the source table (e.g. "Regent Park, Harbourfront").
data_transform = (
    assigned
    .groupby(['Borough', 'Postal Code'], as_index=False)
    .agg(', '.join)
)
data_transform.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


In [13]:
# Where a row's neighbourhood is the placeholder 'Not assigned', use the
# borough name of that row instead; every other row is left untouched.
unassigned = data_transform['Neighbourhood'] == 'Not assigned'
data_transform.loc[unassigned, 'Neighbourhood'] = data_transform.loc[unassigned, 'Borough']

In [14]:
# Sanity check: (rows, columns) of the cleaned table.
n_rows, n_columns = data_transform.shape
(n_rows, n_columns)


(103, 3)

## Part 2 : Get the latitude and longitude coordinates of each neighborhood

In [15]:
# Load the CSV of per-postal-code coordinates and preview its first rows.
geospatial_url = "https://cocl.us/Geospatial_data"
geospatial_data = pd.read_csv(filepath_or_buffer=geospatial_url)
geospatial_data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
# Relabel the three columns positionally so that the key column is named
# 'Postal Code', matching the key used for the join with the scraped table.
coordinate_columns = ['Postal Code', 'Latitude', 'Longitude']
geospatial_data.columns = coordinate_columns

In [20]:
# Attach coordinates to every postal-code row. This is an inner join, so only
# postal codes present in both tables are kept.
data_fusion = data_transform.merge(geospatial_data, how='inner', on='Postal Code')
data_fusion.head(n=12)

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316
5,Central Toronto,M4V,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
6,Central Toronto,M5N,Roselawn,43.711695,-79.416936
7,Central Toronto,M5P,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,Downtown Toronto,M4W,Rosedale,43.679563,-79.377529


## Part 3 : Visualize the Toronto neighbourhoods

In [29]:
! pip install folium==0.5.0
import folium # plotting library


latitude = 43.651070
longitude = -79.347015



In [30]:
# Base map centred on the Toronto coordinates defined above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of data_fusion, with a popup that reads
# "<neighbourhood(s)>, <borough>".
marker_rows = zip(
    data_fusion['Latitude'],
    data_fusion['Longitude'],
    data_fusion['Borough'],
    data_fusion['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the interactive map.
map_toronto