<a href="https://colab.research.google.com/github/liwenjing2012/Cousera_Capstone/blob/main/Segmenting_and_Clustering_Neighborhoods_in_Toronto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Segmenting and Clustering Neighborhoods in the City of Toronto
Scrape the Wikipedia page and wrangle the data, clean it, and then read it into a pandas  dataframe so that it is in a structured format like the New York dataset.

In [75]:
#import packages needed for the assignment
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
#!pip install geocoder
import geocoder
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Scrape data from wiki page and wrangle the data

In [None]:
#scrape data from wiki page
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url,'lxml')
print(soup.title)

#read html tables into pandas data frame
df = pd.read_html(str(soup.table))[0]
df.head()

Read html clean the data and transform into _pandas_ data frame: 


*   Ignore cells with a borough that is Not assigned.
*   Combine neighborhoods from the cell where more than one nighborhoods exist.



In [61]:
#read html
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


Report the number of rows of the dataframe

In [62]:
df.shape

(103, 3)

There are 103 rows of the dataframe.

### Get the latitude and the longitude coordinates of each neighborhood

In [63]:
#Read geographical coordinates of the neighborhoods using the Geocoder package
geo_cord = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")
geo_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the geographical dataframe with the neighborhood dataframe

In [95]:
geo_cord.rename(columns = {'Postal Code':'PostalCode'},inplace = True)
df_merge = pd.merge(df,geo_cord, on = 'PostalCode')
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### Explore and cluster the neighborhoods in Toronto
Work with only boroughs that contain the word Toronto. 

In [96]:
#get only the postals with boroughs containing the word Toronto
df_toronto = df_merge[df_merge['Borough'].str.contains('Toronto',regex=False)].reset_index()
df_toronto

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,19,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
9,35,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


#### Create a map of Toronto with neighborhoods

In [97]:
lati = 43.662744
lngi = -79.321558
#create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lati,lngi],zoom_start=10)

#add markers to the map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Borough'],df_toronto['Neighborhood']):
  label = '{},{}'.format(neighborhood,borough)
  label = folium.Popup(label,parse_html=True)
  folium.CircleMarker(
      [lat,lng],
      radius = 5,
      popup = label,
      color = 'blue',
      fill = True,
      fill_color = '#3186cc',
      fill_opacity = 0.7,
      parse_html = False).add_to(map_toronto)

map_toronto


In [83]:
df_toronto.groupby('Borough').count()

Unnamed: 0_level_0,index,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Central Toronto,9,9,9,9,9
Downtown Toronto,17,17,17,17,17
Downtown Toronto Stn A,1,1,1,1,1
East Toronto,4,4,4,4,4
East Toronto Business,1,1,1,1,1
East York/East Toronto,1,1,1,1,1
West Toronto,6,6,6,6,6


#### Use _K-means_ tp cluster the neighborhoods 

In [98]:
kclusters = 5

toronto_clustering = df_toronto.drop(['PostalCode','Borough','Neighborhood'],1)
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(toronto_clustering)
kmeans.labels_
df_toronto.insert(0,'clusters',kmeans.labels_)
df_toronto

Unnamed: 0,clusters,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,4,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,4,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,4,19,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,4,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,4,24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,4,25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,1,30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,1,31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
9,1,35,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


visualize the clustering results

In [104]:
map_clusters = folium.Map(location=[lati,lngi],zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
#ys
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lng, neighborhood,cluster in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Neighborhood'],df_toronto['clusters']):
  label = folium.Popup(str(neighborhood) + 'Cluster' + str(cluster), parse_html=True)
  folium.CircleMarker(
      [lat,lng],
      radius = 5,
      popup = label,
      color = rainbow[cluster-1],
      fill = True,
      fill_color = rainbow[cluster-1],
      fill_opacity = 0.7).add_to(map_clusters)

map_clusters
