In [1]:
# Import BeautifulSoup package; this package scrapes data quite easily
from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# json_normalize transforms (nested) JSON into a flat pandas dataframe.
# It lives in the top-level pandas namespace since pandas 1.0; the old
# pandas.io.json location was deprecated there and later removed, so try the
# modern path first and fall back only for old pandas installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
# Download the Wikipedia page listing Toronto ("M") postal codes and parse it
# with Beautiful Soup so the table can be scraped in the next cell.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # requests has no default timeout; avoid hanging the kernel forever
)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page

# NOTE: despite its name, wikiURL holds the page's HTML text, not a URL.
# The name is kept unchanged in case other cells refer to it.
wikiURL = response.text
torontodata = BeautifulSoup(wikiURL, 'lxml')

In [3]:
# Locate the first <table> element on the page and collect all of its <td> cells
table = torontodata.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')
fields = table.find_all('td')

# Let pandas display up to 200 rows so the whole result is shown rather than a
# truncated preview (the data is not expected to exceed this many rows)
pd.set_option('display.max_rows', 200)

# The cells are read as a flat sequence of (postcode, borough, neighborhood)
# triples; fail with a clear message if the table does not have that shape
# instead of raising an IndexError half-way through.
cell_text = [cell.text.strip() for cell in fields]
if not cell_text or len(cell_text) % 3 != 0:
    raise ValueError(
        'Expected a non-empty table with 3 cells per row, found {} cells'.format(len(cell_text))
    )

# Every third cell, starting at offsets 0, 1 and 2, forms one column
postcode = cell_text[0::3]
borough = cell_text[1::3]
neighborhood = cell_text[2::3]

# Build the raw dataframe directly from the three columns
df_toronto_pbn = pd.DataFrame({
    'PostalCode': postcode,
    'Borough': borough,
    'Neighborhood': neighborhood,
})

# Drop every row whose Borough is "Not assigned".
# (Calling .replace(..., inplace=True) on a selected column is a chained
# assignment: it is deprecated and does not modify the dataframe under
# pandas Copy-on-Write, so filter and re-assign explicitly.)
df_toronto_pbn = df_toronto_pbn[df_toronto_pbn['Borough'] != 'Not assigned'].copy()

# Combine the neighborhoods that share a postal code and borough into one
# comma-separated string; reset_index() restores PostalCode/Borough as columns
df_toronto = (
    df_toronto_pbn
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

# Give rows whose Neighborhood is exactly "Not assigned" the name "Queen's Park".
# NOTE(review): the name is hard-coded on the assumption that Queen's Park is
# the only such row; any other unnamed neighborhood would be mislabeled.
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].replace('Not assigned', "Queen's Park")

# Read the CSV file that has the geographical coordinates of each postal code in Toronto
# NOTE(review): fetched over plain http; consider a pinned or local copy.
df_geocodes = pd.read_csv('http://cocl.us/Geospatial_data')

# Rename the columns by position so the key matches df_toronto
# (assumes the CSV has exactly these three columns in this order - TODO confirm)
df_geocodes.columns = ['PostalCode', 'Latitude', 'Longitude']

# Inner-join neighborhoods with their coordinates on the postal code, then
# select and order the columns for the final dataframe
df_post = pd.merge(df_toronto, df_geocodes, on='PostalCode', how='inner')
df_torontogeocodes = df_post[['Borough', 'Neighborhood', 'PostalCode', 'Latitude', 'Longitude']].copy()

# Display the Toronto dataframe with the geographical coordinates for each neighborhood
df_torontogeocodes

Unnamed: 0,Borough,Neighborhood,PostalCode,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029
7,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
8,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476
9,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848
