# Segmenting and Clustering Neighborhoods in Toronto

## 1. Transform the data in the table on the Wikipedia page into pandas dataframe.

In [1]:
# Download libraries

# library for data analysis
import pandas as pd 

# libraries for web scraping
import requests 
from bs4 import BeautifulSoup

# library for mapping tools
!pip install geopy 
import geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# map rendering library
!pip install folium
import folium 

print("Libraries Imported!")

Libraries Imported!


In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse it with Beautiful Soup

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an exception here
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
extracting_data = response.text
soup = BeautifulSoup(extracting_data, 'lxml')

print("Scrapping Complete!")

Scrapping Complete!


In [3]:
# Find the postal-code table in the parsed page

My_table = soup.find('table', {'class': 'wikitable sortable'})

# soup.find() returns None when nothing matches; stop here with a clear message
# rather than reporting success and failing in a later cell.
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

print("Table Found!")

Table Found!


In [4]:
# Collect the stripped text of every <td> cell, one list per <tr> row of the table.
# A row that has no <td> cells contributes an empty list.

WP_data = [
    [cell.text.strip() for cell in table_row.find_all("td")]
    for table_row in My_table.find_all("tr")
]

print("Data Extracted!")

Data Extracted!


In [5]:
# Build a dataframe from the scraped rows and name its first three columns
# Postcode, Borough and Neighborhood

column_names = {0: "Postcode", 1: "Borough", 2: "Neighborhood"}
df = pd.DataFrame(WP_data).rename(columns=column_names)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [6]:
# Only process the cells that have an assigned borough; ignore cells whose
# borough is "Not assigned".

#1 drop the first row (index 0 holds no data); drop() returns a new frame, so df is untouched
df2 = df.drop([0])

#2 keep only the rows whose borough is assigned, then renumber the index from 0
df2 = df2[df2['Borough'] != 'Not assigned'].reset_index(drop=True)

df2.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
#3 If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

df3 = df2.copy()

# Series.mask swaps in the Borough value wherever the condition is True,
# working on whole columns instead of calling a Python lambda once per row.
unassigned = df3['Neighborhood'] == 'Not assigned'
df3['Neighborhood'] = df3['Neighborhood'].mask(unassigned, df3['Borough'])

df3.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
#4 if there is more than one neighborhood in one postal code area, combine those rows and separate the neighborhoods with a comma (group the dataframe by postcode and borough, and then join the neighborhoods)

df4 = (
    df3.groupby(['Postcode', 'Borough'])['Neighborhood']
    # unique() removes duplicates while keeping first-appearance order, so the
    # joined string is the same on every run (iteration order of a set of
    # strings is not guaranteed to be).
    .apply(lambda names: ','.join(names.dropna().unique()))
    # reset_index() turns the grouped Series back into a dataframe with
    # Postcode and Borough as ordinary columns.
    .reset_index()
)

df4.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [9]:
# .shape is an attribute (not a method): a tuple of (number of rows, number of columns)

df4.shape

(103, 3)

## 2.  Get the latitude and the longitude coordinates of each neighborhood 

In [10]:
# Read the CSV file holding the geographical coordinates of each postal code
Geospacial_Coordinates = pd.read_csv('http://cocl.us/Geospatial_data', sep=',')

# Join the two tables on the postal code itself. Concatenating them side by side
# only pairs rows by position, which silently attaches the wrong coordinates if
# the tables are ordered differently or one of them is missing a postal code.
df_final = (
    df4.merge(
        Geospacial_Coordinates,
        how='left',               # keep every row of df4, even without coordinates
        left_on='Postcode',
        right_on='Postal Code',
        validate='many_to_one',   # fail if a postal code has several coordinate rows
    )
    # the key is already present as Postcode, so the duplicate column is dropped
    .drop(columns=['Postal Code'])
)
df_final.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## 3. Explore and cluster the neighborhoods in Toronto considering only boroughs that contain the word Toronto

In [11]:
# Find the geographical coordinates of Toronto, Canada

address = 'Toronto, Ontario'
# Nominatim() is created without arguments below, so it uses this default user agent.
geopy.geocoders.options.default_user_agent = "lmharlem"
geolocator = Nominatim()

# Give the lookup more time than geopy's short default timeout.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto Canada are', latitude, ',' ,longitude)


The geograpical coordinate of Toronto Canada are 43.6534817 , -79.3839347


In [12]:
# Load the merged dataframe from its CSV copy.
#
# Nothing earlier in the notebook writes df_final.csv, so on a fresh environment
# the file is missing. In that case the in-memory df_final is saved first and
# then read back; the round trip stores the index as an "Unnamed: 0" column.
try:
    df_final = pd.read_csv('df_final.csv')
except FileNotFoundError:
    df_final.to_csv('df_final.csv')
    df_final = pd.read_csv('df_final.csv')

In [13]:
# Keep only the boroughs whose name contains the word Toronto

is_toronto = df_final['Borough'].str.contains('Toronto')
df_toronto = (
    df_final.loc[is_toronto]
    .reset_index(drop=True)
    # discard the saved index column that came back from the CSV file
    .drop(columns=['Unnamed: 0'])
)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [14]:
# Draw a map centred on Toronto with one circle marker per row of df_toronto

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

marker_columns = df_toronto[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    # popup text is the neighborhood followed by its borough
    popup_text = '{},{}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto