# Segmenting and Clustering Neighborhoods in Toronto

I will explore, segment, and cluster the neighborhoods in the city of Toronto based on the postalcode and borough information. This analysis will be performed using scrapped web  Toronto neighborhood data from <a href="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M">Wikipedia.</a>   

### Scrapping Wikipedia Data

In [1]:
#!curl https://bootstrap.pypa.io/pip/2.7/get-pip.py -o get-pip.py
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Four
!pip install bs4
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html=requests.get(url).text
soup= BeautifulSoup(html, 'html.parser')

In [3]:
#Get table data for transformation
table=soup.find_all('table')
table_rows=table[0].find_all('tr')
table_rows

[<tr>
 <td style="width:11%; vertical-align:top; color:#ccc;">
 <p><b>M1A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>
 </td>
 <td style="width:11%; vertical-align:top; color:#ccc;">
 <p><b>M2A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
 </p>
 </td>
 <td style="width:11%; vertical-align:top;">
 <p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
 </p>
 </td>
 <td style="width:11%; vertical-align:top;">
 <p><b>M4A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
 </p>
 </td>
 <td style="width:11%; vertical-align:top;">
 <p><b>M5A</b><br/><span style="font-size:85%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/>(<a href="/wiki/Regent_Park" ti

#### Tranform the data into a _pandas_ dataframe
The next task is essentially transforming this data of scrapped Html tags into a pandas dataframe. 

In [4]:
hood_data=pd.DataFrame(columns=["PostalCode","Borough","Neighborhood"])

for row in table_rows:
    col=row.find_all('td')
    
    for row_item in col:
        
        if (col != []):
            row_item=row_item.get_text().replace('\n','')
            
            #Formatting Post Code
            values_0=row_item[0:3]
            postal_code = values_0
            
            #Formatting Borough & Neighbourhood
            values_1=row_item[3:].split('(')
            values_1[0]=values_1[0].replace('Not assigned','')
            borough = values_1[0]
            
            hood=np.NAN
            
            if len(values_1)>1:
                values_1[1]=values_1[1].replace(')','')
                values_1[1]=values_1[1].replace('/',',')
                hood = values_1[1] 
            
            hood_dict={"PostalCode":postal_code,"Borough":borough,"Neighborhood":hood}
            #print(hood_dict)
            hood_data=hood_data.append(hood_dict,ignore_index=True)
            
#Select only cells. without empty Borough
scraped_hood_data=hood_data[hood_data['Borough']!='']
scraped_hood_data

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
169,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [5]:
scraped_hood_data= scraped_hood_data.reset_index(drop=True)
scraped_hood_data.shape


(103, 3)

### Clean Data

In [6]:
scraped_hood_data['Borough']

0                                             North York
1                                             North York
2                                       Downtown Toronto
3                                             North York
4                                           Queen's Park
                             ...                        
98                                             Etobicoke
99                                      Downtown Toronto
100    East TorontoBusiness reply mail Processing Cen...
101                                            Etobicoke
102                                            Etobicoke
Name: Borough, Length: 103, dtype: object

In [7]:
scraped_hood_data.loc[scraped_hood_data['Borough'] == 'East TorontoBusiness reply mail Processing Centre969 Eastern', 'Neighborhood']

100    Enclave of M4L
Name: Neighborhood, dtype: object

In [8]:
scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'East TorontoBusiness reply mail Processing Centre969 Eastern'), 'Borough']='East Toronto'

scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'EtobicokeNorthwest'), 'Neighborhood']='Northwest'
scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'EtobicokeNorthwest'), 'Borough']='Etobicoke Northwest'

scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'East YorkEast Toronto'), 'Borough']='East York'

scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'Downtown TorontoStn A PO Boxes25 The Esplanadet'),'Neighborhood']='Stn A PO Boxes 25 The Esplanadet'
scraped_hood_data.loc[(scraped_hood_data['Borough'] == 'Downtown TorontoStn A PO Boxes25 The Esplanadet'),'Borough']='Downtown Toronto'

In [9]:
scraped_hood_data.shape

(103, 3)

## Get Coordinates
We need to get the latitude and the longitude coordinates of each neighborhood. 



In [10]:
def get_lat_( address):
    try:
        geolocator = Nominatim(user_agent="ny_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        return latitude
    except:
        return np.NAN

def get_long_( address):
    try:
        geolocator = Nominatim(user_agent="ny_explorer")
        location = geolocator.geocode(address)
        longitude = location.longitude
        return longitude
    except:
        return np.NAN


In [15]:
#Get Longitutde
scraped_hood_data['Longitude']=scraped_hood_data['PostalCode'].apply(get_long_)

#Get Latitutde
scraped_hood_data['Latitude']=scraped_hood_data['PostalCode'].apply(get_lat_)
scraped_hood_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-76.96231,-12.198334
1,M4A,North York,Victoria Village,8.467,49.48429
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",28.018025,45.440588
3,M6A,North York,"Lawrence Manor , Lawrence Heights",-1.752006,53.794164
4,M7A,Queen's Park,Ontario Provincial Government,26.165951,44.428198


In [16]:
scraped_hood_data['Longitude'].isnull().values.sum()

59

In [17]:
newhood= scraped_hood_data.dropna()
newhood.shape

(44, 5)

In [19]:
# create map of New York using latitude and longitude values
toronto_lat=get_lat_('Toronto')
toronto_long=get_long_('Toronto')
map_toronto = folium.Map(location=[toronto_lat,toronto_long], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(newhood['Latitude'], newhood['Longitude'], newhood['Borough'], newhood['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto