# Part 1 - Segmenting and Clustering Neighborhoods in Toronto

In [112]:
# Importing libraries

import pandas as pd
import numpy as np
import random
import requests

# module to convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

# import tools for webscraping
from bs4 import BeautifulSoup
from urllib.request import urlopen

import folium

## Scrape data from Wikipedia about Toronto

In the first phase of the project, the goal is to get the Postal Codes, the Boroughs and the names of all the Neighborhoods in Toronto. This data is parsed into a DataFrame for further analysis.

### 1. Getting the web content via BeautifulSoup

Defining a function to get html-content and using this function to get the content. Filling a DataFrame with the scraped content.

In [6]:
# Defining a function to get the html-content of a given webpage

def getHTMLContent(link, timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may wait on the server forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never parsed as if it were the requested content.
    """
    response = requests.get(link, timeout=timeout)
    # fail loudly on HTTP errors instead of handing an error page to the parser
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [7]:
# getting the web content from Wikipedia

link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
content = getHTMLContent(link)

# .find() returns None when the page has no <table>; stop here with a clear
# message instead of an AttributeError in the cell that iterates over the rows
table = content.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(link))

In [8]:
# create an empty DataFrame that only carries the target column labels

column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# fill DataFrame
# Collect the stripped cell texts of every table row that has exactly one value
# per column, then build the frame in a single step. Appending row by row via
# df.loc[len(df)] is slow and would duplicate all rows if this cell is re-run.

rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# (rows, columns) of the scraped table
df.shape

(180, 3)

### 2. Cleaning the Data

Cleaning the scraped data: Removing rows with no assigned Borough and Neighborhood. Merging Neighborhoods for PostalCodes that have multiple Neighborhoods. 

In [11]:
# keep only the rows whose Borough is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')].copy()

In [12]:
# (rows, columns) left after removing the 'Not assigned' boroughs
df.shape

(103, 3)

In [13]:
# per PostalCode: join all Neighborhood names with ', ' and store the result
# in a column called Neighborhood_joined
df2 = (
    df.groupby('PostalCode')['Neighborhood']
    .agg(', '.join)
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [14]:
# attach the joined neighborhood names to df via PostalCode, replace the
# original Neighborhood column with them and drop the resulting duplicate rows
df_merge = (
    df.merge(df2, on='PostalCode')
    .drop(columns=['Neighborhood'])
    .drop_duplicates()
    .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)

In [15]:
# display the merged, de-duplicated table
df_merge

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [16]:
# (rows, columns) of the merged, de-duplicated table
df_merge.shape

(103, 3)

# Part 2 - Segmenting and Clustering Neighborhoods in Toronto


## Getting the Geolocation for the Neighborhoods

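Using the `geocode` tool of the GeoPandas package (with the Nominatim provider), the latitude and longitude for each Neighborhood in the DataFrame are looked up. The coordinates that are appended to the DataFrame are then read from the file `Geospatial_Coordinates.csv`.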

In [113]:
from geopandas.tools import geocode

In [177]:
# function to look up latitude and longitude via the geocoder for a collection
# of search terms (e.g. the postal codes in df_merge)
def get_loc(adress):
    """Geocode every entry of ``adress`` inside Toronto via Nominatim.

    Parameters
    ----------
    adress : iterable
        Search terms; each one is formatted as ``'<term>, Toronto'`` before
        it is passed to ``geocode``.

    Returns
    -------
    tuple of (list, list)
        ``(latitudes, longitudes)`` in the order of the input. An entry is
        NaN when the geocoder result holds no usable point.
    """
    nan = float('nan')
    cache = {}  # query string -> (lat, lon), so repeated terms cost one request
    lati = []
    longi = []

    for a in adress:
        query = '{}, Toronto'.format(a)
        if query not in cache:
            result = geocode(query, provider='nominatim')
            point = result.geometry.iloc[0]
            # NOTE(review): an unresolved query presumably yields a missing or
            # empty geometry - confirm against the geopandas version in use
            if point is None or getattr(point, 'is_empty', False):
                cache[query] = (nan, nan)
            else:
                cache[query] = (point.y, point.x)
        lat, lon = cache[query]
        lati.append(lat)
        longi.append(lon)

    return lati, longi

In [168]:
# the geocoding step is described as working on postal codes, so select the
# PostalCode column (the Borough column was selected here before)
postal_code = df_merge['PostalCode']

In [210]:
# get latitude and longitude from postal code via get_loc function
# (get_loc queries the Nominatim geocoder, so this cell needs network access)
latitude, longitude = get_loc(postal_code)

In [213]:
# read coordinates from file
# (the CSV is addressed by a relative path, so it has to be in the notebook's
# working directory)
df_coords = pd.read_csv('Geospatial_Coordinates.csv')
df_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [214]:
# give the key column the same name as in df_merge, then join both tables
# on the postal code
df_coords = df_coords.rename(columns={'Postal Code': 'PostalCode'})
coords_merged = df_coords.merge(df_merge, on='PostalCode')
coords_merged.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [218]:
# reorder the columns: descriptive fields first, coordinates last
df_geo = coords_merged.loc[
    :, ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
]
df_geo

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
