# <center><u> Clustering Neighbourhoods in Toronto </u><center>

### <center>Coursera Capstone Project - Neeraj Tripathi<center>


## <center><u>Part 1</u></center>

# Part 1
### Scraping html table from Wikipedia into a DataFrame

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 
req=requests.get(url)



### Parsing the web html file with BeautifulSoup package

In [23]:
# Parse the html with Soup
page=bsoup(req.text,"html.parser")
page.head()

[<meta charset="utf-8"/>,
 <title>List of postal codes of Canada: M - Wikipedia</title>,
 <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xc6xewpAME8AABFFwk0AAAAN","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":926306543,"wgRevisionId":926306543,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wg

### Extracting table from html

In [4]:
table=page.table

results=table.find_all('tr')
nrows=len(results)
print(nrows)

print(results[-1])
results[0:5]

288
<tr>
<td>M9Z</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>


[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>]

In [5]:
# columns for the dataframe
columns = results[0].text.split()
columns

['Postcode', 'Borough', 'Neighbourhood']

In [6]:
# Filling up the dataframe by extracting cells from html table

df=pd.DataFrame({}, columns=columns)
records =[]

for i in range(1, nrows):
    row = results[i].text.split('\n')
    record = pd.Series({columns[0]: row[1], columns[1]: row[2], columns[2]: row[3]}, name=i)
    
    df = df.append(record)
print(df.shape)
df.head(5)

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor
287,M9Z,Not assigned,Not assigned


In [8]:
# drops those rows where 'Not assigned' appears in column '[Borough]'
df = df[~df.Borough.str.contains("Not assigned")]
df = df.reset_index(drop=True)
df.shape

(210, 3)

### Combine rows with same PostCode but different neighborhood

In [9]:
df['Postcode'].nunique()

103

In [10]:
df2 = df.copy()                   # precaution to avoid corrupting the original dataframe

n=0
nrows2 = df.shape[0]-1

# fill using iteration
while n < nrows2 :
    post1=df2.iloc[n,0]
    #post1
    m=n+1
    post2=df2.iloc[m,0]
    #post2
    neigh1=df2.iloc[n,2]
    neigh2=df2.iloc[m,2]
    if post1==post2:
        df2.Neighbourhood[n,2] = neigh1=neigh1+','+neigh2
        #df2 = df2[df2.Neighbourhood != 'neigh2']
        df2=df2.drop(df2.index[m])
        nrows2=nrows2-1
        df2 = df2.reset_index(drop=True)
    else:
        n=n+1
print(df2.shape)
df2.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Kingsway Park South West,Mimico NW,The Queensw..."
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Assign same neighbourhood as borough where not assigned

In [11]:
(df2['Neighbourhood']=='Not assigned').sum()

1

In [19]:
df2.loc[df2['Neighbourhood']=='Not assigned', 'Neighbourhood'] = df2['Borough']
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Kingsway Park South West,Mimico NW,The Queensw..."
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [13]:
df2.shape

(103, 3)

# <hr>
## <center>End of Part 1</center>

### Part 1 Ends here

 <hr>
 <hr>

# <center>Part 2</center>

In [16]:
import json
from geopy.geocoders import Nominatim

### Retrieve the Latitude and Longitude coordinates for every Postal Code

In [17]:
url='http://cocl.us/Geospatial_data'
df_pcodes=pd.read_csv(url)
df_pcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the Latitude & Longitude data

In [22]:
# rename the column "Postal Code" to "PostalCode" in order to do a proper merger
df_pcodes.columns = ['Postcode', 'Latitude', 'Longitude']

# sort the original dataframe
df2.sort_values(by=['Postcode'], inplace=True)

# merge step
neighborhoods=pd.merge(df2,df_pcodes, how='right', on = 'Postcode')
neighborhoods.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# <hr>
## <center>End of Part 2</center>

 <hr>
 <hr>