# Segmenting and Clustering Neighbourhoods in Toronto

## Excercise 1

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import urllib

In [2]:
# fetching data from wikipedia page and storing into canada_data
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
canada_data = requests.get(url).text
 
tabla = canada_data[canada_data.find("<table"):canada_data.find("</table>")+8]
df = pd.read_html(tabla, header = 0)[0]

In [3]:
df.dtypes

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

In [4]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
df = df[df.Borough != "Not assigned"]

In [5]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
df.Neighbourhood[df.Neighbourhood == "Not assigned"] = df.Borough[df.Neighbourhood == "Not assigned"]

In [6]:
#Rows will be combined by Postcode to compose the name of all neighbourhoods
def neighbourhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighbourhood'].tolist()))
                    
grp = df.groupby(['Postal Code', 'Borough'])
newDf = grp.apply(neighbourhood_list).reset_index(name='Neighbourhood')

In [7]:
#the number of rows of dataframe
newDf.shape

(103, 3)

In [8]:
#Cleaned Dataframe
# Changing column name from Postcode to PostalCode
newDf = newDf.rename(columns = {'Postcode':'PostalCode'})
newDf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
#Export dataframe to new CSV file for excercise 2
newDf.to_csv('Toronto_Part_I_dataframe.csv', index=False)