# Part 1 - Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import random
import requests

# module to convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

# import tools for webscraping
from bs4 import BeautifulSoup
from urllib.request import urlopen

import folium

## Scrape data from Wikipedia about Toronto

In the first phase of the project, the goal is to get the Postal Codes, the Boroughs and the names of all the Neighborhoods in Toronto. This data is parsed into a DataFrame for further analysis.

### 1. Getting the web content via BeautifulSoup

Defining a function to get html-content and using this function to get the content. Filling a DataFrame with the scraped content.

In [84]:
# Defining a function to get the html-content of a given webpage

def getHTMLContent(link):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(link, timeout=30)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [94]:
# Fetch the Wikipedia page and pick the first <table> element on it.

link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
content = getHTMLContent(link)
table = content.find('table')

# find() returns None when nothing matches; stop here with a clear message
# instead of failing later with an AttributeError on None.
if table is None:
    raise ValueError(f"No <table> element found at {link}")

In [95]:
# Define an empty DataFrame with the three target columns; the rows are
# filled in by a later cell. The bare `df` on the last line displays it.

column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = column_names)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [96]:
# fill DataFrame

# Collect the stripped text of every <td> per table row. Rows whose <td>
# count differs from the number of columns are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

# Build the frame once from the collected rows rather than appending row by
# row; this also makes the cell idempotent (re-running it no longer
# duplicates the data).
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [97]:
# (rows, columns) of the scraped table
df.shape

(180, 3)

### 2. Cleaning the Data

Cleaning the scraped data: Removing rows with no assigned Borough and Neighborhood. Merging Neighborhoods for PostalCodes that have multiple Neighborhoods. 

In [98]:
# Remove, in place, every row whose Borough is 'Not assigned'.
df.drop(index=df.index[df['Borough'] == 'Not assigned'], inplace=True)

In [99]:
# (rows, columns) after removing the 'Not assigned' boroughs
df.shape

(103, 3)

In [109]:
# One row per PostalCode: its Neighborhood values are concatenated with ', '
# and the resulting column is named 'Neighborhood_joined'.
df2 = (
    df.groupby('PostalCode')['Neighborhood']
      .apply(', '.join)
      .reset_index()
      .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [111]:
# Attach the joined neighborhood string to each row via PostalCode, swap it in
# for the original Neighborhood column, and collapse the resulting duplicates.
df_merge = (
    df.merge(df2, on='PostalCode')
      .drop(columns=['Neighborhood'])
      .drop_duplicates()
      .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)

In [118]:
# Display the merged frame
df_merge

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [119]:
# (rows, columns) of the merged frame
df_merge.shape

(103, 3)