## Week 3 Assignment - Toronto Neighborhood Segmentation

#### Step 1: Webscraping wikipedia page table

In [10]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [11]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [49]:
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'}).tbody
rows = table.find_all('tr')
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]
print(columns)

['Postal Code', 'Borough', 'Neighborhood']


#### Prepare Dataframe to store wikidata

In [50]:
df = pd.DataFrame(columns=columns)

In [51]:
print(df)

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []


In [52]:
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 3:
        values = [tds[0].text.replace('\n',""),tds[1].text.replace('\n',""),tds[2].text.replace('\n',"")]
    else:
        values = [td]
        
    print(values)
    
    df = df.append(pd.Series(values,index =columns), ignore_index = True)
    print(df)

['M1A', 'Not assigned', 'Not assigned']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
['M2A', 'Not assigned', 'Not assigned']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
1         M2A  Not assigned  Not assigned
['M3A', 'North York', 'Parkwoods']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
1         M2A  Not assigned  Not assigned
2         M3A    North York     Parkwoods
['M4A', 'North York', 'Victoria Village']
  Postal Code       Borough      Neighborhood
0         M1A  Not assigned      Not assigned
1         M2A  Not assigned      Not assigned
2         M3A    North York         Parkwoods
3         M4A    North York  Victoria Village
['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront']
  Postal Code           Borough               Neighborhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2

In [21]:
df.shape

(180, 3)

#### Remove data from dataframe where Borough is 'Not assigned"

In [53]:
df_borough = df[df['Borough'] != 'Not assigned']

In [54]:
df_borough.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### combine neighbors as mentioned "If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough." 

In [57]:
df1 = df_borough.groupby(['Postal Code','Borough'])['Neighborhood'].apply(lambda x: ', '.join(x.astype(str))).reset_index()

In [58]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [60]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

for i in range(len(df1.index)):
    if df1.iloc[i,1] == 'Not assigned':
        df1.iloc[i,1] = df1.iloc[i,2]
    else:
        df.iloc[i,1] = df1.iloc[i,1]

In [61]:
df1['Borough'].unique()     

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [62]:
df1.shape

(103, 3)

### ---------------------------------------------------------------------------------------------------------------------------

#### Get location attributes for each 

In [63]:
from geopy import geocoders
from geopy import Nominatim
import pgeocode

In [64]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [65]:
locator = pgeocode.Nominatim('CA')
df1['latitude'] = ""
df1['longitude'] = ""

In [66]:
for j in range(0, len(df1.index)):
    df1['latitude'][j] = locator.query_postal_code(df1.iloc[j,0]).latitude
    df1['longitude'][j] = locator.query_postal_code(df1.iloc[j,0]).longitude

In [67]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
