# Capstone Project Week 3 - Almost There ;)

## Data scraping, data frame construction, data exploration, segmentation and clustering

In [1]:
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20181101100342-0000


In [4]:
# Get the neighborhood table from the wiki: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

neighborhoodDataWikiUrl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
neighborhoodDataRequest  = requests.get(neighborhoodDataWikiUrl)

neighborhoodDataSource = neighborhoodDataRequest.text
neighborhoodDataSoup = BeautifulSoup(neighborhoodDataSource, "lxml")

neighborhoodTableRaw = neighborhoodDataSoup.find('table', attrs={'class':'wikitable'})
neighborhoodTableRows = neighborhoodTableRaw.find_all('tr')
neighborhoodTableHeaders = neighborhoodTableRaw.find_all('th')

# First construct a dictionary that will be used to create the data frame. The Keys will be the row headers and the values will be lists 
# for example {'Postcode': [], 'Borough': [], 'Neighbourhood': []} etc
#
# This dictionary will need cleaned later to remove any data we don't want
neighborhoodDataDict = {}

# Add the headers as the dictionary keys 
for headerTag in neighborhoodTableHeaders:
    
    headerRawString = headerTag.string
    
    # The last header has a newline char, we want to remove this to keep everything neat and working
    headerRawStriped = headerRawString.strip()
    
    # Add the header as a key to the dictionary by assigning the header as an empty list into the dictionary
    neighborhoodDataDict[headerRawStriped] = []

# Debug/testing
# print(neighborhoodDataDict)
# print()

# Now add each of the table values into the related lists, skipping the first item in the list as this is the headers
for rowTag in neighborhoodTableRows[1:]:
    
    # Add the rows tableData to the related dictionary locations
    tdSet = rowTag.find_all('td')
    
    rowPostcodeTag = tdSet[0]
    rowBoroughTag = tdSet[1]
    rowNeighbourhoodTag = tdSet[2]

    # The tags for each row may have links, this will make the value embeded deeper into the tag   
    aSetPostcode = rowPostcodeTag.find_all('a')
    aSetBorough = rowBoroughTag.find_all('a')
    aSetNeighbourhood = rowNeighbourhoodTag.find_all('a')
    
    Postcode = rowPostcodeTag.string.strip() if not aSetPostcode else aSetPostcode[0].string.strip()
    Borough = rowBoroughTag.string.strip() if not aSetBorough else aSetBorough[0].string.strip()
    Neighbourhood = rowNeighbourhoodTag.string.strip() if not aSetNeighbourhood else aSetNeighbourhood[0].string.strip()
    
    # Housekeeping as per assignment rules
    if ( Borough == 'Not assigned' ): 
        # Skip this row
        continue
    
    # Add the rows to the dictionary
    neighborhoodDataDict['Postcode'].append(Postcode)
    neighborhoodDataDict['Borough'].append(Borough)
    neighborhoodDataDict['Neighbourhood'].append(Neighbourhood)

# Create the dataFrame
dataFrameRaw = pd.DataFrame.from_dict(neighborhoodDataDict)

# Housekeeping, combining multiple neighbourhoods for postcodes 
dataFrameCleanGroup = dataFrameRaw.groupby(['Postcode', 'Neighbourhood'])
dataFrameClean = dataFrameCleanGroup.agg('min')

print(dataFrameClean.to_string())

In [3]:
print(dataFrameClean.shape)

(212, 1)
