# Segmenting and Clustering Neighbourhoods in Toronto

## Exercise 1

In [1]:
import time
import urllib
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the Wikipedia page of "M" postal codes and parse its first HTML table into df.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
canada_data = response.text

# Cut out the first <table>...</table> element; the +8 keeps the closing tag
# (len("</table>") == 8). str.find returns -1 when nothing matches, which would
# otherwise yield a silently wrong slice, so check both positions explicitly.
table_start = canada_data.find("<table")
if table_start == -1:
    raise ValueError("No <table> element found at {}".format(url))
table_end = canada_data.find("</table>", table_start)
if table_end == -1:
    raise ValueError("Unterminated <table> element at {}".format(url))
tabla = canada_data[table_start:table_end + 8]

# Wrap the markup in StringIO: passing literal HTML text to read_html is
# deprecated in recent pandas releases.
df = pd.read_html(StringIO(tabla), header=0)[0]

In [3]:
# Inspect the dtype pandas inferred for each column of the parsed table
df.dtypes

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

In [4]:
# Keep only the rows that have an assigned borough; rows whose Borough is
# "Not assigned" are dropped.
has_borough = df["Borough"] != "Not assigned"
df = df[has_borough]

In [5]:
# If a row has a borough but a "Not assigned" neighbourhood, the neighbourhood
# becomes the same as the borough.
# The column is rebuilt with Series.mask and df is rebound, instead of assigning
# through df.Neighbourhood[...]: that chained assignment triggers
# SettingWithCopyWarning and does not update df when copy-on-write is active.
unassigned = df["Neighbourhood"] == "Not assigned"
df = df.assign(Neighbourhood=df["Neighbourhood"].mask(unassigned, df["Borough"]))

In [6]:
# Rows that share a postal code are merged so that one row names all of its neighbourhoods
def neighbourhood_list(grouped):
    """Return the group's 'Neighbourhood' values, sorted and joined with ', '.

    Parameters
    ----------
    grouped : object supporting ``grouped['Neighbourhood']``
        One group of rows; its 'Neighbourhood' values must be strings.

    Returns
    -------
    str
        The sorted neighbourhood names separated by ", ".
    """
    names = list(grouped['Neighbourhood'])
    names.sort()
    return ', '.join(names)
                    
# Group rows by postal code and borough, then collapse each group's
# neighbourhoods into a single comma-separated string.
grp = df.groupby(['Postal Code', 'Borough'])
# Select the 'Neighbourhood' column explicitly before apply: letting
# DataFrameGroupBy.apply operate on the grouping columns is deprecated in
# recent pandas. neighbourhood_list only reads 'Neighbourhood', so the result
# is unchanged.
newDf = grp[['Neighbourhood']].apply(neighbourhood_list).reset_index(name='Neighbourhood')

In [7]:
# Shape of the grouped dataframe as (number of rows, number of columns)
newDf.shape

(103, 3)

In [8]:
# Cleaned dataframe preview.
# The previous rename of 'Postcode' to 'PostalCode' was removed: no column is
# called 'Postcode' (the scraped column is 'Postal Code'), so the rename was a
# silent no-op. The name 'Postal Code' is kept because the exported CSV is
# later re-read with set_index("Postal Code").
newDf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Export the dataframe to a new CSV file for Exercise 2
# (index=False leaves the row index out of the file)
newDf.to_csv('Toronto_Part_I_dataframe.csv', index=False)

## Exercise 2

In [10]:
!pip install geocoder



In [11]:
import numpy as np
import pandas as pd
import geocoder

In [12]:
#Use the Geocoder to find coordinates
#Take M5G as an example
def get_latlng(postal_code, max_attempts=10, retry_delay=1.0, geocode=None):
    """Look up the coordinates of a postal code in Toronto, Ontario.

    The lookup is retried while the geocoder result's ``latlng`` is ``None``,
    but only up to ``max_attempts`` times, so a service that never answers
    cannot keep the notebook spinning forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``'M5G'``.
    max_attempts : int, optional
        Maximum number of lookups before giving up (must be >= 1).
    retry_delay : float, optional
        Seconds to wait between two attempts.
    geocode : callable, optional
        Function taking the query string and returning an object with a
        ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns
    -------
    The ``latlng`` value of the first result for which it is not ``None``.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no attempt produced coordinates.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if geocode is None:
        # Resolved at call time so the default provider is only needed when used.
        geocode = geocoder.arcgis

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocode(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        if attempt < max_attempts:
            # Pause before retrying instead of issuing back-to-back requests.
            time.sleep(retry_delay)

    raise RuntimeError(
        "No coordinates returned for {!r} after {} attempts".format(query, max_attempts)
    )


# Example: look up the coordinates for postal code M5G
get_latlng('M5G')

[43.65609000000006, -79.38492999999994]

In [13]:
# Add coordinates from a CSV file instead:
# the linked CSV holds the geographical coordinates of each postal code.
url = 'http://cocl.us/Geospatial_data'
geo_raw = pd.read_csv(url)
# Index by postal code so the table can be joined on it.
geo_data = geo_raw.set_index("Postal Code")
geo_data.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [14]:
# Load the Toronto neighbourhood dataframe that Exercise 1 stored in a CSV file
# and index it by postal code.
toronto_raw = pd.read_csv('Toronto_Part_I_dataframe.csv')
toronto_data = toronto_raw.set_index("Postal Code")
toronto_data.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [15]:
# Attach latitude/longitude to each neighbourhood row. Both frames are indexed
# by postal code; a left join keeps every row of toronto_data.
joinData = toronto_data.join(geo_data, how='left')
joinData.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
# Export the dataframe to a new CSV file for Exercise 3.
# joinData is indexed by "Postal Code", so writing it with index=False alone
# would drop the postal codes from the file. reset_index() turns the index back
# into a regular column first, so the CSV keeps it.
joinData.reset_index().to_csv('Toronto_Part_II_dataframe.csv', index=False)