# Segmenting and Clustering Neighborhoods in Toronto

## Import Libraries

In [8]:
# Needed libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import sklearn.cluster
import requests

import csv

## Scrap Wikipedia Page Content

In [137]:
# Scrapping data 
page_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page_content= requests.get(page_url)

## Create the Content Dataframe

In [142]:
df_page = pd.read_html(page_content.content, header=0)[0]
df_page

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


## Ignore cells with a borough that is Not assigned.

In [144]:
df_page_na=df_page[df_page.Borough != 'Not assigned']
df_page_na

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


## The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood 

> we create new index and delete the olde one

In [147]:
df_page_na = df_page_na.reset_index()
del df_page_na['index']
df_page_na

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


## Test if a cell has a "Not assigned" neighborhood

In [183]:
for na in df_page_na['Neighborhood']:
    if na == 'Not assigned':
        print('yes')
print("Nothing to do here")

Nothing to do here


## Test if  more than one neighborhood can exist in one postal code area

In [201]:
duplicate_code = df_page_na[df_page_na.duplicated()]
if duplicate_code.empty:
    print("Nothing to do here")
else:
    print("You have to do more coding...")

Nothing to do here


## Print Dataframe Shape

In [203]:
df_shape = df_page_na.shape
print("The Dataframe Shape is ", df_shape)

The Dataframe Shape is  (103, 3)


# Second Part
## Create Geospatial Dataframe

In [204]:
df_geo = pd.read_csv("Geospatial_Coordinates.csv") 
# Preview the first 5 lines of the loaded data 
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Concatenating Dataframes

In [225]:
df_concat= pd.concat([df_geo, df_page_na], ignore_index=True,  axis=1)
df_concat

Unnamed: 0,0,1,2,3,4,5
0,M1B,43.806686,-79.194353,M3A,North York,Parkwoods
1,M1C,43.784535,-79.160497,M4A,North York,Victoria Village
2,M1E,43.763573,-79.188711,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M1G,43.770992,-79.216917,M6A,North York,Lawrence Manor / Lawrence Heights
4,M1H,43.773136,-79.239476,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...,...,...,...
98,M9N,43.706876,-79.518188,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M9P,43.696319,-79.532242,M4Y,Downtown Toronto,Church and Wellesley
100,M9R,43.688905,-79.554724,M7Y,East Toronto,Business reply mail Processing CentrE
101,M9V,43.739416,-79.588437,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...
