## IBM Data Science Professional Certificate Capstone
### Week 3: Segmenting and Clustering Neighborhoods in Toronto
Peter F. 

Import all libraries that may be necessary

In [1]:
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

### Web Scraping 
Scrape the Wikipedia page with the table of all neighbourhoods in Toronto, Ontario. 
- Create a BeautifulSoup object and extract the table
- Create a pandas dataframe from the extracted table 

In [2]:
# The URL carries an explicit `oldid`, i.e. it requests one specific revision of the page.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1012118802'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page flow into the HTML parsing below.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# Every <table> element found on the page; the next cell picks the one it needs.
table = soup.find_all('table')

In [3]:
# read_html expects a path, URL or file-like object; passing literal HTML as a
# plain string is deprecated in recent pandas, so wrap the markup in StringIO.
# The first table parsed from the page is the postal-code listing.
df = pd.read_html(StringIO(str(table)))[0]

In [4]:
# (rows, columns) of the raw scraped table, before any cleaning.
df.shape

(180, 3)

In [5]:
# Preview the first rows of the raw scraped table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Cleaning
The table contains undefined values, 'Not assigned'. 
- Remove these by pattern matching a regular expression, and 
- create a new table where the regex pattern returned false
- reset the index of the new table

In [6]:
# Regex for the "Not assigned" placeholder, tolerating either case for the N and the A.
pattern = r"[Nn]ot [Aa].+"
# Boolean mask: True where the Borough text contains the placeholder.
unassigned = df.Borough.str.contains(pat=pattern, regex=True)

In [7]:
# Keep only the rows that were not flagged as unassigned and renumber them from 0.
toronto = df.loc[~unassigned].reset_index(drop=True)

In [8]:
# Spot-check the filtered table: the first rows should no longer show 'Not assigned'.
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# (rows, columns) after filtering, for comparison with the raw df.shape above.
toronto.shape

(103, 3)

77 entries removed, 103 rows remain. Table is now clean. 

### Importing GeoSpatial data 
- create a geospatial dataframe
- merge the geospatial dataframe into the existing dataframe

In [10]:
# Latitude/longitude lookup keyed by postal code, read from a local CSV file.
coords = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
coords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach coordinates to each postal code. pandas merges with an inner join by
# default, so only postal codes present in both frames are kept.
tor2 = pd.merge(toronto, coords, on='Postal Code')

In [12]:
# Preview the merged frame: Latitude and Longitude now sit beside each postal code.
tor2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Geographic Data 
Get the geographic coordinates of Toronto, ON

In [13]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message here rather than with an AttributeError on the lines below.
if location is None:
    raise ValueError(f"Geocoding returned no result for {address!r}")

latitude = location.latitude
longitude = location.longitude

print(f"The geographical coordinates of Toronto, ON are {latitude},{longitude}")

The geographical coordinates of Toronto, ON are 43.6534817,-79.3839347


#### Create a Map Centered on the Coordinates given above
Make this Notebook Trusted to load map: File -> Trust Notebook

In [14]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of tor2, with a "<neighbourhood>, <borough>" popup.
for lat, lon, borough, neighbourhood in zip(tor2['Latitude'], tor2['Longitude'], tor2['Borough'], tor2['Neighbourhood']):
    # parse_html is a Popup option, so it is set only here and not on the marker.
    label = folium.Popup(f"{neighbourhood}, {borough}", parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#89cc31',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

### Clustering

Run *k*-Means to group the neighbourhoods into 5 clusters
- drop non-numerical data otherwise the algorithm will fail

In [15]:
k = 5

# k-means needs purely numeric features, so cluster on the coordinates only.
# Selecting the two columns by name (rather than dropping the text columns with
# a positional axis argument, which pandas 2.x no longer accepts) also keeps
# 'Cluster Labels' out of the features when this cell is re-run.
toronto_cluster = tor2[['Latitude', 'Longitude']]

# n_init is pinned to 10, the historical scikit-learn default, so the result
# does not depend on which library version supplies the default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_cluster)

# DataFrame.insert raises ValueError if the column already exists, so discard
# labels left over from a previous run before inserting the fresh ones.
tor2 = tor2.drop(columns='Cluster Labels', errors='ignore')
tor2.insert(0, 'Cluster Labels', kmeans.labels_)

In [16]:
# Confirm that 'Cluster Labels' is now the leading column of tor2.
tor2.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Visualize Clusters

Preview the resulting clusters

In [17]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(tor2['Latitude'], tor2['Longitude'], tor2['Cluster Labels']):
    label = folium.Popup(f"Cluster {cluster}", parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly;
    # `cluster - 1` would send cluster 0 to rainbow[-1] via negative indexing.
    marker_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell: renders the map inline.
map_clusters