<h1 align='center'>Segmenting and Clustering Neighborhoods in Toronto </h1>
<p>Author: Long Han</p>
<p>Project: Data Science Capstone</p>
<p>Date: Feb 15 2020</p>

## Prepare Environment

In [1]:
# Imports and display configuration for the whole notebook.
# Non-default packages used here (install from conda-forge if missing):
#   geopy, folium=0.5.0, geocoder

# --- Standard library ---
import json  # library to handle JSON files

# --- Third-party: data handling ---
import numpy as np  # library to handle data in a vectorized manner
import pandas as pd  # library for data analysis
try:
    # Public location of json_normalize since pandas 1.0.
    from pandas import json_normalize  # transform JSON file into a pandas dataframe
except ImportError:
    # Older pandas only exposes it under pandas.io.json.
    from pandas.io.json import json_normalize

# --- Third-party: web scraping ---
import requests  # library to handle requests
from bs4 import BeautifulSoup
import lxml

# --- Third-party: geocoding and maps ---
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values
import geocoder  # import geocoder
import folium  # map rendering library

# --- Third-party: Matplotlib and associated plotting modules ---
import matplotlib.cm as cm
import matplotlib.colors as colors

# --- Third-party: k-means for the clustering stage ---
from sklearn.cluster import KMeans
try:
    from sklearn.datasets import make_blobs
except ImportError:
    # Legacy module path; it was removed in scikit-learn 0.24.
    from sklearn.datasets.samples_generator import make_blobs

# --- Display options: show every column and row of displayed frames ---
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print('Libraries imported.')

Libraries imported.


In [2]:
# Download the Wikipedia page listing the Toronto ("M") postal codes and
# locate the table that holds them.
r = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not let a stalled connection hang the notebook
)
r.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # Without this check the failure would surface later as an opaque
    # AttributeError on None.
    raise ValueError(
        "No table with class 'wikitable sortable' found; "
        "the page layout may have changed."
    )

#get headers:
# Use the visible text of every header cell so attributes on <th> or stray
# whitespace cannot leak into the column names.
headers = [th.get_text(strip=True) for th in table.find_all('th')]

#Find all items and skip first one:
# The first <tr> carries the header cells. Every data cell keeps its inner
# HTML (not just its text) because the title="..." attribute of links inside
# the cells is read further down in this cell.
rows = [
    [td.decode_contents().strip() for td in tr.find_all('td')][:len(headers)]
    for tr in table.find_all('tr')[1:]
]
rows = [cells for cells in rows if cells]  # ignore rows without any <td>

#1. make dataframe with one column per header:
df_raw = pd.DataFrame(rows, columns=headers)

#2. ignore if not assigned boroughs:
# .copy() gives an independent frame so the column assignment below is safe.
df_raw = df_raw.loc[df_raw['Borough'] != "Not assigned"].copy()

#4. give "Not assigned" Neighborhoods same name as Borough:
# Assign the column explicitly: in-place calls on `df_raw.Neighbourhood` act on
# a temporary Series and do not write back under pandas Copy-on-Write.
unnamed = df_raw['Neighbourhood'].isna() | (df_raw['Neighbourhood'] == "Not assigned")
df_raw['Neighbourhood'] = df_raw['Neighbourhood'].mask(unnamed, df_raw['Borough'])

# drop duplicate rows:
df_raw = df_raw.drop_duplicates()

# extract titles from columns
def _link_titles(col):
    """Return `col` with each cell that holds an `<a ... title="X">` link replaced by X.

    Cells in which the pattern does not match are returned unchanged.
    """
    return col.str.extract(r'title="([^"]*)', expand=False).fillna(col)


df_raw = df_raw.assign(
    Neighbourhood=_link_titles(df_raw['Neighbourhood']),
    Borough=_link_titles(df_raw['Borough']),
)

# delete Toronto annotation from Neighbourhood:
# regex=False makes both patterns literal on every pandas version (the default
# of `regex` changed in pandas 2.0). The final strip removes the blank that
# would otherwise remain after dropping a trailing "(Toronto)".
df_raw = df_raw.assign(
    Neighbourhood=(
        df_raw['Neighbourhood']
        .str.replace(", Toronto", "", regex=False)
        .str.replace("(Toronto)", "", regex=False)
        .str.strip()
    )
)

#3. combine multiple neighborhoods with the same post code
# One row per postcode, in order of first appearance (sort=False).
# dict.fromkeys de-duplicates the names while keeping their source order, so
# the joined string is identical on every run (a set would not guarantee that).
df = (
    df_raw
    .groupby('Postcode', sort=False)
    .agg({
        'Borough': 'first',
        'Neighbourhood': lambda names: ', '.join(dict.fromkeys(names.dropna())),
    })
    .reset_index()
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [3]:
# Number of distinct postcodes (one row each) in the combined frame.
len(df.index)

103

## Add Location Coordinates

In [4]:
#add Geo-spatial data
# Load the latitude/longitude of every postal code and align its key column
# name with the one used in `df`.
dfcoord = (
    pd.read_csv("http://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
)

# Join on the postcode explicitly. validate='one_to_one' raises if either
# side has duplicate postcodes, instead of silently multiplying rows.
toronto_data = pd.merge(df, dfcoord, on='Postcode', how='inner', validate='one_to_one')
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


## Create a Map of Toronto

In [10]:
# Centre the map on Toronto and draw one circle marker per postcode.
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError('Geocoder returned no result for {!r}'.format(address))

# create map using latitude and longitude values
toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(
        toronto_data['Latitude'],
        toronto_data['Longitude'],
        toronto_data['Borough'],
        toronto_data['Neighborhood']):
    # parse_html=True makes the popup render the label as plain text.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html belongs to Popup only, so it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map