# Toronto neighbourhoods and boroughs web scraping

## 1. First part of the assignment - 

First, I have to import the libraries needed

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np

Now, I get the contents of the Wikipedia page using the BeautifulSoup package

In [2]:
site = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(site, 'lxml')

#print(soup.prettify())

table= soup.find('table', class_='wikitable sortable')

rows_table = table.find_all('tr')

#print(rows_table)

list = []

for tr in rows_table:
    td=tr.find_all('td')
    row_txt= [pr.text for pr in td]
    list.append(row_txt)
    
#print (list)

My next step is to take only the rows that have an assigned Borough

In [3]:
df = pd.DataFrame(list, columns=['PostalCode', 'Borough', 'Neighborhood'])[1:]

df = df[df['Borough'] != 'Not assigned']

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n


Now, I need to make sure that more than one neighbourhood having the same 'PostalCode' is grouped together

In [4]:
same_pcode = df["PostalCode"].duplicated(keep= False)

df1 = df[-same_pcode]

print(df1.head())

df2 = df[same_pcode]

print(df2.head())

   PostalCode       Borough        Neighborhood
3         M3A    North York         Parkwoods\n
4         M4A    North York  Victoria Village\n
9         M7A  Queen's Park      Not assigned\n
11        M9A     Etobicoke  Islington Avenue\n
15        M3B    North York   Don Mills North\n
   PostalCode           Borough        Neighborhood
5         M5A  Downtown Toronto      Harbourfront\n
6         M5A  Downtown Toronto       Regent Park\n
7         M6A        North York  Lawrence Heights\n
8         M6A        North York    Lawrence Manor\n
12        M1B       Scarborough             Rouge\n


Due to some issues with the duplicated function on python, it didn't work in one go. Hence I needed to keep repeating it multiple times before it worked like it was supposed to. Instead of using loop, I hard coded this part.

In [5]:
bool1= df2.duplicated('PostalCode', keep='last')
df3= df2[bool1]
df4 = df2[-bool1]
bool2 = df3.duplicated('PostalCode', keep='last')
df5 = df3[bool2]
df6 = df3[-bool2]
#print(df4)
#print(df5)
bool3 = df5.duplicated('PostalCode', keep='last')
df7= df5[bool3]
df8 = df5[-bool3]
bool4= df7.duplicated('PostalCode', keep='last')
df9 = df7[bool4]
df10 = df7[-bool4]
bool5= df9.duplicated('PostalCode', keep='last')
df11 = df9[bool5]
df12 = df9[-bool5]
bool6= df11.duplicated('PostalCode', keep='last')
df13 = df11[bool6]
df14 = df11[-bool6]
bool7= df13.duplicated('PostalCode', keep='last')
df15 = df13[bool7]
df16 = df13[-bool7]
bool8= df15.duplicated('PostalCode', keep='last')
df17 = df15[bool8]
df18 = df15[-bool8]
final1 = pd.merge(df18, df16, on= ['PostalCode', 'Borough'],how='outer')
final2 = pd.merge(final1, df16, on= ['PostalCode', 'Borough'],how='outer')
final3 = pd.merge(final2, df14, on= ['PostalCode', 'Borough'],how='outer')
final4=  pd.merge(final3, df12, on= ['PostalCode', 'Borough'],how='outer')
final5=  pd.merge(final4, df10, on= ['PostalCode', 'Borough'],how='outer')
final6=  pd.merge(final5, df8, on= ['PostalCode', 'Borough'],how='outer')
final7=  pd.merge(final6, df6, on= ['PostalCode', 'Borough'],how='outer')
final8=  pd.merge(final7, df4, on= ['PostalCode', 'Borough'],how='outer')
final9=  pd.merge(final8, df1, on= ['PostalCode', 'Borough'],how='outer')
final9 = final9.replace(np.nan, '', regex=True)
columnNumbers = [x for x in range(final9.shape[1])]  

columnNumbers.remove(4) #removing column integer index 0
final9 = final9.iloc[:, columnNumbers]

final9



Unnamed: 0,PostalCode,Borough,Neighborhood_x,Neighborhood_y,Neighborhood_y.1,Neighborhood_x.1,Neighborhood_y.2,Neighborhood_x.2,Neighborhood_y.3,Neighborhood_x.3,Neighborhood_y.4
0,M9V,Etobicoke,Albion Gardens\n,Beaumond Heights\n,Humbergate\n,Jamestown\n,Mount Olive\n,Silverstone\n,South Steeles\n,Thistletown\n,
1,M8Y,Etobicoke,Humber Bay\n,King's Mill Park\n,Kingsway Park South East\n,Mimico NE\n,Old Mill South\n,The Queensway East\n,Royal York South East\n,Sunnylea\n,
2,M5V,Downtown Toronto,,CN Tower\n,Bathurst Quay\n,Island airport\n,Harbourfront West\n,King and Spadina\n,Railway Lands\n,South Niagara\n,
3,M9B,Etobicoke,,,,Cloverdale\n,Islington\n,Martin Grove\n,Princess Gardens\n,West Deane Park\n,
4,M4V,Central Toronto,,,,Deer Park\n,Forest Hill SE\n,Rathnelly\n,South Hill\n,Summerhill West\n,
5,M8Z,Etobicoke,,,,Kingsway Park South West\n,Mimico NW\n,The Queensway West\n,Royal York South West\n,South of Bloor\n,
6,M9C,Etobicoke,,,,,Bloordale Gardens\n,Eringate\n,Markland Wood\n,Old Burnhamthorpe\n,
7,M6M,York,,,,,Del Ray\n,Keelesdale\n,Mount Dennis\n,Silverthorn\n,
8,M9R,Etobicoke,,,,,Kingsview Village\n,Martin Grove Gardens\n,Richview Gardens\n,St. Phillips\n,
9,M1V,Scarborough,,,,,Agincourt North\n,L'Amoreaux East\n,Milliken\n,Steeles East\n,


Now, we need to combine all the different neigbourhood columns into a single one

In [6]:
final9.columns =['PostalCode', 'Borough', 'Neighborhood_a', 'Neighborhood_b',
       'Neighborhood_c', 'Neighborhood_d', 'Neighborhood_e', 'Neighborhood_f',
       'Neighborhood_g', 'Neighborhood_h', 'Neighborhood_i']

final9['combined']=final9['Neighborhood_i']+','+final9['Neighborhood_h']+','+final9['Neighborhood_g']+','+final9['Neighborhood_f']+','+final9['Neighborhood_e']+','+final9['Neighborhood_d']+','+final9['Neighborhood_c']+','+final9['Neighborhood_b']+','+final9['Neighborhood_a']

final9= final9.loc[:,['PostalCode', 'Borough', 'combined']]
final9.columns=['PostalCode', 'Borough', 'Neighborhood']
final9.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9V,Etobicoke,",Thistletown\n,South Steeles\n,Silverstone\n,M..."
1,M8Y,Etobicoke,",Sunnylea\n,Royal York South East\n,The Queens..."
2,M5V,Downtown Toronto,",South Niagara\n,Railway Lands\n,King and Spad..."
3,M9B,Etobicoke,",West Deane Park\n,Princess Gardens\n,Martin G..."
4,M4V,Central Toronto,",Summerhill West\n,South Hill\n,Rathnelly\n,Fo..."


Now, we need to clean the data in the Neighborhood column so that the \n is removed

In [7]:
def clean_df(x):
    x=x.replace("\n","").replace(",,,,","").replace(",,,","").replace(",,","")
    return x
final9['Neighborhood']= final9['Neighborhood'].apply(clean_df)
final9.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9V,Etobicoke,",Thistletown,South Steeles,Silverstone,Mount O..."
1,M8Y,Etobicoke,",Sunnylea,Royal York South East,The Queensway ..."
2,M5V,Downtown Toronto,",South Niagara,Railway Lands,King and Spadina,..."
3,M9B,Etobicoke,",West Deane Park,Princess Gardens,Martin Grove..."
4,M4V,Central Toronto,",Summerhill West,South Hill,Rathnelly,Forest H..."


I have cleaned the table contents and now I just need to print the table and its shape

In [8]:
final9

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9V,Etobicoke,",Thistletown,South Steeles,Silverstone,Mount O..."
1,M8Y,Etobicoke,",Sunnylea,Royal York South East,The Queensway ..."
2,M5V,Downtown Toronto,",South Niagara,Railway Lands,King and Spadina,..."
3,M9B,Etobicoke,",West Deane Park,Princess Gardens,Martin Grove..."
4,M4V,Central Toronto,",Summerhill West,South Hill,Rathnelly,Forest H..."
5,M8Z,Etobicoke,",South of Bloor,Royal York South West,The Quee..."
6,M9C,Etobicoke,",Old Burnhamthorpe,Markland Wood,Eringate,Bloo..."
7,M6M,York,",Silverthorn,Mount Dennis,Keelesdale,Del Ray"
8,M9R,Etobicoke,",St. Phillips,Richview Gardens,Martin Grove Ga..."
9,M1V,Scarborough,",Steeles East,Milliken,L'Amoreaux East,Agincou..."


In [9]:
final9.shape

(103, 3)

This part of the assignment is now complete

## 2. Second part of the assignment - 

In [12]:
Lat_file= pd.read_csv("Geospatial_Coordinates.csv")
Lat_file.columns=['PostalCode', 'Latitude', 'Longitude']

final10= pd.merge(final9, Lat_file, on=['PostalCode'])

final10.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M9V,Etobicoke,",Thistletown,South Steeles,Silverstone,Mount O...",43.739416,-79.588437
1,M8Y,Etobicoke,",Sunnylea,Royal York South East,The Queensway ...",43.636258,-79.498509
2,M5V,Downtown Toronto,",South Niagara,Railway Lands,King and Spadina,...",43.628947,-79.39442
3,M9B,Etobicoke,",West Deane Park,Princess Gardens,Martin Grove...",43.650943,-79.554724
4,M4V,Central Toronto,",Summerhill West,South Hill,Rathnelly,Forest H...",43.686412,-79.400049


In [13]:
final10

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M9V,Etobicoke,",Thistletown,South Steeles,Silverstone,Mount O...",43.739416,-79.588437
1,M8Y,Etobicoke,",Sunnylea,Royal York South East,The Queensway ...",43.636258,-79.498509
2,M5V,Downtown Toronto,",South Niagara,Railway Lands,King and Spadina,...",43.628947,-79.394420
3,M9B,Etobicoke,",West Deane Park,Princess Gardens,Martin Grove...",43.650943,-79.554724
4,M4V,Central Toronto,",Summerhill West,South Hill,Rathnelly,Forest H...",43.686412,-79.400049
5,M8Z,Etobicoke,",South of Bloor,Royal York South West,The Quee...",43.628841,-79.520999
6,M9C,Etobicoke,",Old Burnhamthorpe,Markland Wood,Eringate,Bloo...",43.643515,-79.577201
7,M6M,York,",Silverthorn,Mount Dennis,Keelesdale,Del Ray",43.691116,-79.476013
8,M9R,Etobicoke,",St. Phillips,Richview Gardens,Martin Grove Ga...",43.688905,-79.554724
9,M1V,Scarborough,",Steeles East,Milliken,L'Amoreaux East,Agincou...",43.815252,-79.284577


The Second Part of the assignment is now complete