In [159]:
#import needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from sklearn.cluster import KMeans #for clustering
import matplotlib.cm as cm
import matplotlib.colors as colors


In [160]:
#import libraries for scraping website
import requests
from bs4 import BeautifulSoup

## Scraping Toronto Neighborhood Data from Wikipedia

The data for the neighborhoods in Toronto is available on this Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [161]:
#get the wikipedia page code
#a timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() stops the notebook on an HTTP error instead of parsing an error page
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout = 30)
response.raise_for_status()
source = response.text

#parse the page code
soup = BeautifulSoup(source, 'lxml')

In [162]:
#locate the table on the page; fail with a clear message if it is not there
#(soup.find returns None when nothing matches)
table = soup.find('table', class_ = 'wikitable sortable')
if table is None:
    raise ValueError("no table with class 'wikitable sortable' found on the page")

#name columns
columns = ['Postcode', 'Borough', 'Neighborhood']

#collect the text of every <td> cell, one list per <tr> row
#a row without <td> cells yields an empty list; it is kept on purpose because the
#cleaning step further down expects and removes that empty first record
table_rows = table.find_all('tr')
rows = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

In [163]:
#build the Toronto neighborhood dataframe from the scraped rows and preview it
neighborhood_df = pd.DataFrame(data = rows, columns = columns)
neighborhood_df.head(5)

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


Clean up the dataframe by dropping the rows whose borough is "Not assigned" and removing the trailing `\n` from the Neighborhood names.

In [164]:
#Clean the scraped table. Every step assigns a fresh object instead of mutating a
#filtered slice in place, so the cell raises no SettingWithCopyWarning and can be re-run.

#drop the empty record produced by the scrape (row 0 in the preview above, presumably
#the table header); dropping by content instead of drop(index = 0) avoids a KeyError on re-run
neighborhood_df = neighborhood_df.dropna(subset = ['Postcode'])

#Drop not assigned boroughs from the dataframe (.copy() so the column assignments
#below act on an independent frame)
neighborhood_df = neighborhood_df[neighborhood_df['Borough'] != 'Not assigned'].copy()

#remove \n from values in the neighborhood column: keep the text before the first newline
neighborhood_df['Neighborhood'] = neighborhood_df['Neighborhood'].str.split("\n", n = 1).str[0]

#replace not assigned in the Neighborhood column with the value in Borough
#mask() substitutes row by row (aligned on the index) and returns a new Series, which
#avoids an in-place replace on a column selection
not_assigned = neighborhood_df['Neighborhood'] == 'Not assigned'
neighborhood_df['Neighborhood'] = neighborhood_df['Neighborhood'].mask(not_assigned, neighborhood_df['Borough'])
neighborhood_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


To put all neighborhoods that share the same postal code in a single row, group by postcode and borough and join the neighborhood names together.

In [165]:
#collapse to one row per (Postcode, Borough) pair, joining that pair's
#neighborhood names into a single comma-separated string
neighborhood = (
    neighborhood_df
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
neighborhood.head(n = 10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [166]:
#sanity check: (rows, columns) of the grouped frame, one row per Postcode/Borough pair
neighborhood.shape

(103, 3)