## Segmenting and Clustering Neighborhoods in Toronto

#### Importing libraries

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd



#### Data source

In [13]:
# Wikipedia page listing the "M" postal codes together with borough and neighbourhood.
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wikipage = requests.get(link, timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page as if it were the table.
wikipage.raise_for_status()
page = wikipage.text

In [14]:
# Parse the downloaded HTML held in `page` and turn the first table into a DataFrame.
soup = BeautifulSoup(page, 'html.parser')

column_names = ['Postcode', 'Borough', 'Neighbourhood']

neighbs = []

tableSoup = soup.find_all("table")
table = tableSoup[0]  # the postal-code table is taken to be the first table on the page
rows = table.find_all("tr")
for row in rows:
    cols = row.find_all("td")
    # Skip rows without data cells (e.g. a header row) and any malformed row
    # with fewer than three cells, which would otherwise raise IndexError below.
    if len(cols) < 3:
        continue
    neighbs.append({
        # Strip surrounding whitespace/newlines so later exact comparisons
        # (== 'Not assigned') and the merge on Postcode match reliably.
        'Postcode': cols[0].text.strip(),
        'Borough': cols[1].text.strip(),
        # Keep only the first line of the neighbourhood cell's text.
        'Neighbourhood': cols[2].text.strip().split("\n")[0],
    })

df = pd.DataFrame(neighbs, columns=column_names)




#### The dataframe consists of three columns: Postcode, Borough, and Neighbourhood

In [15]:
# (Re)apply the three column labels; the same list is kept in `column_names`.
df.columns = column_names = ['Postcode', 'Borough', 'Neighbourhood']



In [16]:

# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [17]:


# Keep only the rows whose borough is assigned, then renumber the index from 0.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [18]:
# Where the neighbourhood is 'Not assigned', fall back to that row's borough.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)

In [19]:
# (rows, columns) before grouping by postcode.
df.shape

(212, 3)

#### Grouping by the Postcode and Borough columns, concatenating the neighbourhoods into a comma-separated list

In [20]:
# One row per (Postcode, Borough); that group's neighbourhood names are joined with ", ".
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .agg(', '.join)
      .reset_index()
)

In [21]:
# Preview the first ten grouped rows.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [22]:
# (rows, columns) after grouping.
df.shape

(103, 3)

In [23]:
### Reading data from Geospatial_Data file downloaded from "http://cocl.us/Geospatial_data"

In [24]:
# Load the geospatial CSV directly from its URL into a DataFrame.
geo_url = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(filepath_or_buffer=geo_url)

In [25]:
### Merging Main data set with Geo Data

In [26]:

# Left-join the geo data onto the postcode table, drop the now-duplicate
# 'Postal Code' key column, and rename the remaining key to 'PostalCode'.
df = (
    df.merge(geo_df, how='left', left_on='Postcode', right_on='Postal Code')
      .drop(columns='Postal Code')
      .rename(columns={'Postcode': 'PostalCode'})
)

In [27]:
# Preview the first fifteen rows of the merged table.
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
