# Segmenting and Clustering Neighborhoods in Toronto

# Question1 - To create dataframe

In [1]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd



In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [5]:
page = requests.get(url)

In [6]:
soup = BeautifulSoup(page.content, 'html.parser')

In [7]:
wiki_table=soup.find('table',{'class':'wikitable sortable'})

Strip the text

In [8]:
cols=wiki_table.find_all('td')
data = [td.text.strip() for td in cols[:len(cols)]]
PostalCode = data[0:len(cols):3]
Borough = data[1:len(cols):3]
Neighbourhood = data[2:len(cols):3]

In [9]:
wt = pd.DataFrame({'PostalCode':PostalCode, 'Borough': Borough, 'Neighbourhood': Neighbourhood})
wt.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


Cleaning the dataframe

In [10]:
wt_clean = wt[(wt['Borough'] != 'Not assigned')]

In [11]:
#Replacing unassigned with borough
idx = wt_clean[wt_clean['Neighbourhood'] == 'Not assigned'].index.astype('float')
wt_clean['Neighbourhood'].loc[idx] = wt_clean['Borough'].loc[idx]
wt_clean.head(12)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Queen's Park
9,M9A,Downtown Toronto,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


Checking for same postal codes

In [12]:
wt_clean['PostalCode'].unique().shape
pcodes = wt_clean['PostalCode'].unique()

In [13]:
NH_idx =[]
for n in range(len(pcodes)):
    NH_idx.append(wt_clean[wt_clean['PostalCode'] == pcodes[n]].index.values)

In [15]:
nhs = []
bs = []
for nh in range(len(NH_idx)):
    nhs.append(wt_clean['Neighbourhood'].loc[NH_idx[nh]].values.astype('str'))
    bs.append(wt_clean['Borough'].loc[NH_idx[nh][0]])

In [16]:
result = pd.DataFrame({'PostalCode': pcodes, 'Borough': bs, 'Neighbourhood': nhs})
result.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,[Harbourfront]
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]"
4,M7A,Queen's Park,[Queen's Park]
5,M9A,Downtown Toronto,[Queen's Park]
6,M1B,Scarborough,"[Rouge, Malvern]"
7,M3B,North York,[Don Mills North]
8,M4B,East York,"[Woodbine Gardens, Parkview Hill]"
9,M5B,Downtown Toronto,"[Ryerson, Garden District]"


In [17]:
result.shape

(103, 3)

# Question2 - Map dataframe with latitude and longitude

In [32]:
!wget -q -O 'Toronto_long_lat_data.csv'  http://cocl.us/Geospatial_data.csv
print('Data downloaded!')

Data downloaded!


In [37]:
dfll= pd.read_csv("http://cocl.us/Geospatial_data")

In [38]:
dfll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [43]:
dfll.columns=['PostalCode','Latitude','Longitude']

In [44]:
dfll.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [47]:
dfll.set_index("PostalCode")
result.set_index("PostalCode")
toronto_data=pd.merge(result, dfll)
toronto_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,[Parkwoods],43.753259,-79.329656
1,M4A,North York,[Victoria Village],43.725882,-79.315572
2,M5A,Downtown Toronto,[Harbourfront],43.65426,-79.360636
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]",43.718518,-79.464763
4,M7A,Queen's Park,[Queen's Park],43.662301,-79.389494
5,M9A,Downtown Toronto,[Queen's Park],43.667856,-79.532242
6,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
7,M3B,North York,[Don Mills North],43.745906,-79.352188
8,M4B,East York,"[Woodbine Gardens, Parkview Hill]",43.706397,-79.309937
9,M5B,Downtown Toronto,"[Ryerson, Garden District]",43.657162,-79.378937


# Question3 - Generate maps

In [49]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    altair-4.0.0               |             py_0         606 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.1 MB

The following NEW packages will be 

In [51]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [52]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [54]:
map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

How do they cluster together

In [62]:
#Foursquare
CLIENT_ID = 'F5550W10YWYY3GC5JOT3S31A4FX5YOG2OWKRRUYJZCZG5HCL' 
CLIENT_SECRET = 'V3Y0M533TBT115CPQGYZEWWWM5W22STDFOUBCMWICS4EJ3SO' 
VERSION = '20180605'

assume the size of interest

In [63]:
radius=500
LIMIT=100

In [64]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [65]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

['Parkwoods']
['Victoria Village']
['Harbourfront']
['Lawrence Heights' 'Lawrence Manor']
["Queen's Park"]
["Queen's Park"]
['Rouge' 'Malvern']
['Don Mills North']
['Woodbine Gardens' 'Parkview Hill']
['Ryerson' 'Garden District']
['Glencairn']
['Cloverdale' 'Islington' 'Martin Grove' 'Princess Gardens'
 'West Deane Park']
['Highland Creek' 'Rouge Hill' 'Port Union']
['Flemingdon Park' 'Don Mills South']
['Woodbine Heights']
['St. James Town']
['Humewood-Cedarvale']
['Bloordale Gardens' 'Eringate' 'Markland Wood' 'Old Burnhamthorpe']
['Guildwood' 'Morningside' 'West Hill']
['The Beaches']
['Berczy Park']
['Caledonia-Fairbanks']
['Woburn']
['Leaside']
['Central Bay Street']
['Christie']
['Cedarbrae']
['Hillcrest Village']
['Bathurst Manor' 'Downsview North' 'Wilson Heights']
['Thorncliffe Park']
['Adelaide' 'King' 'Richmond']
['Dovercourt Village' 'Dufferin']
['Scarborough Village']
['Fairview' 'Henry Farm' 'Oriole']
['Northwood Park' 'York University']
['East Toronto']
['Harbourfront E

In [67]:
toronto_venues.shape

(2223, 7)