<h1>Capstone Project - Toronto Neighborhood</h1>
<h2>Week - 3: Part - 3</h2>

<strong>1. Import required libraries</strong>

In [18]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Libraries imported.


<strong>2. Scrape the Wikipedia page</strong>

In [19]:
# read_html returns one DataFrame per <table> found on the page; the first one
# is used as the postal-code listing. Its header row is parsed as ordinary data
# (row 0) and is promoted to real column names in the next step.
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df = wiki_tables[0]
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


<strong>3. Use the first row as the header</strong>

In [20]:
# Row 0 holds the real column titles (Postcode, Borough, Neighbourhood):
# capture it and keep only the rows after it, then install it as the header.
# Both right-hand sides are evaluated against the frame as scraped.
new_header, df = df.iloc[0], df.iloc[1:]
df.columns = new_header
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


<h3>Cleaning Data</h3>

<strong>4. Find and Process 'Not assigned' values</strong>

In [21]:
# Cleaning pipeline, one step per line:
#   1. drop rows with missing cells
#   2. drop postcodes whose borough is 'Not assigned'
#   3. strip newline characters from every text cell
#   4. collapse to one row per (Postcode, Borough), joining the neighbourhood
#      names with commas and keeping first-seen order (sort=False)
#   5. turn the group keys back into ordinary columns
df = (
    df.dropna()
      .loc[lambda frame: frame['Borough'] != 'Not assigned']
      .replace('\n', '', regex=True)
      .groupby(['Postcode', 'Borough'], sort=False)
      .agg(','.join)
      .reset_index()
)

# Rows that still lack a neighbourhood name after the clean-up.
df.loc[df['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
4,M7A,Queen's Park,Not assigned


<strong>5. Replace the 'Neighbourhood' values with 'Borough' if the value is 'Not assigned'</strong>

In [22]:
# Where a neighbourhood is 'Not assigned', fall back to the borough name of the same row.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [23]:
# Spot-check postcode M7A, the row that had no neighbourhood assigned.
df.loc[df['Postcode'] == 'M7A']

Unnamed: 0,Postcode,Borough,Neighbourhood
4,M7A,Queen's Park,Queen's Park


In [24]:
# Preview the first five rows of the cleaned table.
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


<strong>6. Number of rows in the dataframe</strong>

In [25]:
# (rows, columns) of the cleaned table: one row per postcode/borough pair.
df.shape

(103, 3)

<h3>Process Geospatial Data</h3>

<strong>7. Read Geospatial Data</strong>

In [26]:
# Reserve the coordinate columns as floats. NaN (rather than the string '0')
# keeps both columns numeric once real coordinates are written into them, and
# makes any postcode that never receives a coordinate show up as missing
# instead of silently carrying a placeholder value.
df['Latitude'] = np.nan
df['Longitude'] = np.nan

# Postal code -> latitude/longitude lookup table.
df_geo = pd.read_csv("https://cocl.us/Geospatial_data")
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<strong>8. Merge Geospatial Data with Dataframe</strong>

In [27]:
# Attach coordinates with a single keyed left join instead of comparing every
# postcode against every geodata row in Python. The join does not rely on `df`
# having a default 0..n-1 index, unlike positional .iloc writes driven by
# index labels.
geo_lookup = (
    df_geo
    # If a postal code is listed more than once, keep its last entry so the
    # join cannot multiply rows.
    .drop_duplicates(subset='Postal Code', keep='last')
    .rename(columns={'Postal Code': 'Postcode'})
    [['Postcode', 'Latitude', 'Longitude']]
)

df = (
    df
    # Discard placeholder coordinate columns if they exist, so the join yields
    # exactly one Latitude and one Longitude column.
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    # how='left' keeps every postcode row in its current order; a postcode
    # missing from the geodata ends up with NaN coordinates.
    .merge(geo_lookup, on='Postcode', how='left')
)

df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Rouge,Malvern",43.8067,-79.1944
7,M3B,North York,Don Mills North,43.7459,-79.3522
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.7064,-79.3099
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.6572,-79.3789


<h3>Clustering the neighborhoods in Toronto</h3>

<strong>9. Select the Toronto boroughs to cluster</strong>

In [28]:
# Keep only boroughs whose name contains "Toronto". The copy is taken after
# filtering so df_toronto is an independent frame: columns can be added to it
# later without SettingWithCopyWarning and without touching df.
df_toronto = df[df['Borough'].str.contains("Toronto")].copy()
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.6543,-79.3606
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.6572,-79.3789
15,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
19,M4E,East Toronto,The Beaches,43.6764,-79.293
20,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733


<h3>Generate Map for Toronto Neighborhood</h3>

<strong>10. Map to Visualize Toronto Neighborhoods</strong>

In [32]:
# Base map centred at latitude 43.65, longitude -79.4.
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# One fill colour per cluster. The number of clusters is derived from this
# palette so that colors[cluster] can never index past its end.
colors = ['red', 'green', 'blue', 'yellow']

# Feature matrix of shape (n_rows, 2). The explicit float cast guards against
# object-dtype coordinate columns.
X = df_toronto['Latitude'].astype(float)
Y = df_toronto['Longitude'].astype(float)
Z = np.stack((X, Y), axis=1)

# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=len(colors), n_init=10, random_state=0).fit(Z)

clusters = kmeans.labels_
# assign() returns a new frame, so this works even if df_toronto is a slice of
# another DataFrame.
df_toronto = df_toronto.assign(Cluster=clusters)

# One circle marker per postcode row, filled with its cluster colour and
# labelled with the borough name.
for latitude, longitude, borough, cluster in zip(
        X, Y, df_toronto['Borough'], df_toronto['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [float(latitude), float(longitude)],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[int(cluster)],
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map