**This notebook scrapes the wiki page of Toronto's Neighbourhood List and saves the data in a DataFrame**

> Importing libraries

In [0]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as BSoup

> Preparing the scraper-soup

In [0]:
BASE_URI = 'https://en.wikipedia.org'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page into an empty table.
page = requests.get(BASE_URI+'/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser
soup = BSoup(page.content, 'html.parser')

# Select every row (<tr>) inside the body of any table with class "wikitable".
# NOTE(review): this reads the live article, so results depend on the page's
# current layout -- confirm the table structure if the row count changes.
neighbourhood_on_page = soup.select('table.wikitable tbody tr')


> Extracting the cell values from each row of the table on the page.
    
    This step also involves cleaning the data on the basis of the following criteria:
    
    i) Ignoring rows whose Borough is 'Not assigned'
    
    ii) Setting a Neighbourhood to the name of its Borough when the Neighbourhood is 'Not assigned'
    
    iii) Merging the Neighbourhood values that share the same Postcode into one row

In [0]:
# Extracting values from target elements:
neighbourhood_data = list()
for neighbourhood in neighbourhood_on_page:
  neighbour_dict = dict()
  element_chain = list()
  for child in neighbourhood.children:
    if 'get_text' in dir(child):
      element = child.get_text()
    else:
      element = child.string
    element_chain.append(element)

  # Cleaning: encoding strings and replacing newline characters from returned HTML leaves
  element_chain = [element for element in map(lambda chain: str(chain).replace('\n', ''), element_chain)]

  # Cleaning: removing escape characters from returned list of elements
  elements = [element for element in filter(lambda element: element, element_chain)]
  if elements[1] != 'Not assigned': 
    # Filtering "Not assigned" Boroughs
    if elements[2] == 'Not assigned':
      # Cleaning "Not assigned" Neighbourhoods
      elements[2] = elements[1]
    neighbourhood_data.append(elements)

> Saving the data in a DataFrame and printing the number of rows of the dataframe

In [13]:
# The first scraped row holds the column names and the rest are data rows.
# Slicing (rather than pop) leaves neighbourhood_data untouched, so this cell
# can be re-run without a data row being consumed as the header.
df_cols = neighbourhood_data[0]
neighbourhood_df = pd.DataFrame(neighbourhood_data[1:], columns=df_cols)

# One row per Postcode, with all of its Neighbourhood values joined by commas
grouped_df = (
    neighbourhood_df
    .groupby('Postcode')['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

# Attach the Borough to each merged Postcode row. The left frame still has one
# row per original (Postcode, Neighbourhood) pair, so the merge repeats rows
# and the duplicates are dropped afterwards.
merged_df = pd.merge(
    neighbourhood_df[['Postcode', 'Borough']],
    grouped_df,
    how="inner",
    on="Postcode",
).drop_duplicates()

print("Number of rows in the dataframe = {}".format(merged_df.shape[0]))

Number of rows in the dataframe = 103
