# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

 ### Block 1 *Import libraries*

In [73]:
import requests
import pandas as pd

### *Import data*

In [74]:
import io

# Wikipedia page holding the table of postal codes starting with "M".
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(wiki)
# Fail here with a clear HTTP error instead of trying to parse an error page.
wikipedia_page.raise_for_status()

# read_html expects a path, URL or file-like object; passing literal HTML is
# deprecated in recent pandas, so the page text is wrapped in StringIO.
df_raw = pd.read_html(io.StringIO(wikipedia_page.text), header=0)[0]
# .copy() makes df_new an independent frame, so the later column edits do not
# operate on a view of df_raw (avoids SettingWithCopyWarning).
df_new = df_raw[df_raw.Borough != 'Not assigned'].copy()
df_new.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Renaming columns

In [76]:
# Rename both columns in a single call and rebind the result instead of
# mutating in place: df_new was obtained by filtering df_raw, and in-place
# edits on such a selection trigger SettingWithCopyWarning.
df_new = df_new.rename(columns={'Postal Code': 'PostCode',
                                'Neighbourhood': 'Neighborhood'})
df_new.head()

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### *Finding neighborhoods that are 'Not assigned' (there are none in this data)*

In [77]:
# Show every row whose neighborhood is still the placeholder value.
df_new[df_new['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostCode,Borough,Neighborhood


In [79]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# A row-aligned mask is used instead of `df.col.replace(..., inplace=True)`:
# that form mutates a temporary column object (a no-op under pandas
# copy-on-write) and passes a whole Series as the replacement value.
not_assigned = df_new['Neighborhood'] == 'Not assigned'
df_new = df_new.assign(
    Neighborhood=df_new['Neighborhood'].mask(not_assigned, df_new['Borough'])
)
df_new.head()

Unnamed: 0,PostCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### *Finding duplicate postcodes (in this example there are none)*

In [80]:
# Collapse rows that share a postcode and borough into one row whose
# Neighborhood is the comma-separated list of the grouped names.
df_toronto = (
    df_new
    .groupby(['PostCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df_toronto.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [81]:
df_toronto.shape

(103, 3)

### Block 2 *Adding Coordinates of the postal code from CSV file*

In [82]:
# Read the CSV of postal-code coordinates from the URL into a DataFrame and
# preview the first rows.
url = 'http://cocl.us/Geospatial_data'
df_geo=pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [83]:
# Check the shape of the data read from the CSV file
df_geo.shape

(103, 3)

In [84]:
# Index the coordinates by postal code, then attach them to each row of
# df_toronto by matching its PostCode column against that index.
geo_by_postal_code = df_geo.set_index('Postal Code')
df_toronto = df_toronto.join(geo_by_postal_code, on='PostCode')
df_toronto.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Block 3 *Clustering*

### *Coordinates*

In [85]:
!conda install -c conda-forge geocoder --yes
import geocoder
from geopy.geocoders import Nominatim 

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Solving environment: / ^C
failed

CondaError: KeyboardInterrupt

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### *Create Map*

In [None]:
!pip install folium==0.5.0
import folium

# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_Toronto)  
    
map_Toronto

### *FourSquare Credentials*

In [98]:
import os

# Credentials are read from the environment so that they are never stored in
# the notebook source or echoed into its saved output.
# NOTE(review): keys that were previously committed in this notebook should
# be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20200728' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.'
    )

# Confirm that the credentials are present without printing their values.
print('Foursquare credentials loaded (values hidden).')

Your credentails:
CLIENT_ID: GEHGRDNDQ0CH0GJG3DJFG4B4QLLALBZVIVHR3MXRLGYGGGPG
CLIENT_SECRET:5BZJ0LMQSCST0P3YHL5NPQ4P0SYC11T5TQEXS0UNPAKLZK2S


### *Explore the data and get the venues within a 5000-meter radius of our first entry*

In [88]:
#df_toronto.loc[0, 'Neighborhood']

In [99]:
# Coordinates and name of the first row (index label 0) of df_toronto.
neighborhood_name = df_toronto.at[0, 'Neighborhood']
neighborhood_latitude = df_toronto.at[0, 'Latitude']
neighborhood_longitude = df_toronto.at[0, 'Longitude']

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Malvern, Rouge are 43.806686299999996, -79.19435340000001.


### *URL request*

In [100]:
# Request parameters: maximum number of venues asked for and the search
# radius passed to the API.
LIMIT = 1000
radius = 5000

# Same explore endpoint and query string as before, split over two literals
# for readability.
explore_template = (
    'https://api.foursquare.com/v2/venues/explore?'
    '&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
)
url = explore_template.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    neighborhood_latitude, neighborhood_longitude,
    radius, LIMIT,
)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=GEHGRDNDQ0CH0GJG3DJFG4B4QLLALBZVIVHR3MXRLGYGGGPG&client_secret=5BZJ0LMQSCST0P3YHL5NPQ4P0SYC11T5TQEXS0UNPAKLZK2S&v=20200728&ll=43.806686299999996,-79.19435340000001&radius=5000&limit=1000'

### *Results*

In [101]:
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status surfaces HTTP errors (bad credentials, quota) here
# rather than as a confusing KeyError when the JSON is indexed later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5f225789ba97fe39f18b53d1'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 148,
  'suggestedBounds': {'ne': {'lat': 43.85168634500004,
    'lng': -79.13211520730404},
   'sw': {'lat': 43.76168625499995, 'lng': -79.25659159269598}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '542858a0498e22b7cfa91070',
       'name': 'Toronto Pan Am Sports Centre',
       'location': {'address': '875 Morningside Ave',
        'lat': 43.79062336247366,
        'lng': -79.19386919338744,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.79062

### *Category of the venue*

In [102]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is looked up under ``row['categories']`` and, if that
    key is missing, under ``row['venue.categories']``.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must support ``row[key]`` and raise ``KeyError`` for a missing key.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the category
    list is empty or is not a sized container (e.g. ``None`` or NaN).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    # Catch only the "key missing" case; a bare except would also swallow
    # unrelated errors (including KeyboardInterrupt).
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    # len() raises TypeError for None/NaN, which is treated as "no category".
    try:
        if len(categories_list) == 0:
            return None
    except TypeError:
        return None

    return categories_list[0]['name']

### Summary of the venue

In [103]:
# function that extracts the Summary of the venue
def get_summary_type(row):
    """Return the summary text of the first "reason" listed for a venue row.

    The list of reasons is looked up under ``row['summary']`` and, if that
    key is missing, under ``row['reasons.items']``.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must support ``row[key]`` and raise ``KeyError`` for a missing key.

    Returns
    -------
    The ``'summary'`` entry of the first reason, or ``None`` when the list is
    empty or is not a sized container (e.g. ``None`` or NaN).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    # Catch only the "key missing" case instead of using a bare except.
    try:
        reasons_list = row['summary']
    except KeyError:
        reasons_list = row['reasons.items']

    # len() raises TypeError for None/NaN, which is treated as "no summary".
    try:
        if len(reasons_list) == 0:
            return None
    except TypeError:
        return None

    return reasons_list[0]['summary']

### *We can see that we had only 1 response for our first entry*

In [104]:
import json
# json_normalize is exposed at the pandas top level in newer releases and the
# pandas.io.json path is deprecated; fall back to it only on old installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# List of venue items from the first group of the explore response.
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns; .copy() makes the selection an independent frame so the
# column assignments below never act on a view of the full table
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'reasons.items']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# replace each row's reasons list with the summary of its first reason
nearby_venues['reasons.items'] = nearby_venues.apply(get_summary_type, axis=1)

# clean columns: keep only the last dotted component of each column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(2000)

Unnamed: 0,name,categories,lat,lng,items
0,Toronto Pan Am Sports Centre,Athletics & Sports,43.790623,-79.193869,This spot is popular
1,African Rainforest Pavilion,Zoo Exhibit,43.817725,-79.183433,This spot is popular
2,Toronto Zoo,Zoo,43.820582,-79.181551,This spot is popular
3,Polar Bear Exhibit,Zoo,43.823372,-79.185145,This spot is popular
4,Gorilla Exhibit,Zoo Exhibit,43.819080,-79.184235,This spot is popular
5,Morningside Park,Park,43.786546,-79.205322,This spot is popular
6,Australasia Pavillion,Zoo Exhibit,43.822563,-79.183286,This spot is popular
7,Images Salon & Spa,Spa,43.802283,-79.198565,This spot is popular
8,Orangutan Exhibit,Zoo Exhibit,43.818413,-79.182548,This spot is popular
9,Penguin Exhibit,Zoo Exhibit,43.819435,-79.185959,This spot is popular


In [105]:
filtered_columns

['venue.name',
 'venue.categories',
 'venue.location.lat',
 'venue.location.lng',
 'reasons.items']

### *For all venues*

In [106]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped, so the shortest one decides how
        many locations are queried.
    radius : number, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category' and
        'Venue Summary'. If no venue is returned at all, the frame is empty
        but still has these columns.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises for an HTTP error status,
    and ``KeyError`` if the JSON body lacks the expected response structure.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as it is processed.
    """
    columns = ['Neighbourhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category',
               'Venue Summary']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail fast on HTTP errors and never hang forever
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without categories or reasons
                _first_value(venue.get('categories'), 'name'),
                _first_value(item.get('reasons', {}).get('items'), 'summary'),
            ))

    # Passing columns= keeps the schema even when no venue was returned.
    return pd.DataFrame(rows, columns=columns)


def _first_value(entries, key):
    """Return ``entries[0][key]``, or None when ``entries`` is empty/None or lacks ``key``."""
    if not entries:
        return None
    return entries[0].get(key)

In [107]:
# Collect the venues around every row of df_toronto using the function's
# default radius (no radius argument is passed here).
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [114]:
# Print (rows, columns) of the collected venues, then preview the first rows.
print(toronto_venues.shape)
toronto_venues.head()

(2152, 8)


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Summary
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant,This spot is popular
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course,This spot is popular
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar,This spot is popular
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank,This spot is popular
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store,This spot is popular


### *Venues per neighbourhood*

In [185]:
# Count the venue rows per neighbourhood; the neighbourhood coordinates are
# part of the key so that they survive as columns in the result.
venue_group_keys = ['Neighbourhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
toronto_venues_grouped = (
    toronto_venues
    .groupby(venue_group_keys)
    .count()
    .reset_index()
)
toronto_venues_grouped.head()

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Summary
0,Agincourt,43.7942,-79.262029,4,4,4,4,4
1,"Alderwood, Long Branch",43.602414,-79.543484,9,9,9,9,9
2,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,22,22,22,22,22
3,Bayview Village,43.786947,-79.385975,4,4,4,4,4
4,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,27,27,27,27,27


In [186]:
toronto_venues_grouped.shape

(100, 8)

### Sorting from lowest to highest gives us an idea of the number of clusters

In [140]:
# Order the neighbourhoods by venue count, smallest first, then show the
# frame's shape and its first 20 rows.
toronto_venues_grouped_sorted = toronto_venues_grouped.sort_values(by='Venue', ascending=True)
print(toronto_venues_grouped_sorted.shape)
toronto_venues_grouped_sorted.head(20)

(100, 8)


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Summary
62,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,1,1,1,1,1
43,"Humberlea, Emery",43.724766,-79.532242,1,1,1,1,1
53,"Malvern, Rouge",43.806686,-79.194353,1,1,1,1,1
92,Weston,43.706876,-79.518188,1,1,1,1,1
90,"West Deane Park, Princess Gardens, Martin Grov...",43.650943,-79.554724,1,1,1,1,1
74,Scarborough Village,43.744734,-79.239476,1,1,1,1,1
54,"Milliken, Agincourt North, Steeles East, L'Amo...",43.815252,-79.284577,2,2,2,2,2
17,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,2,2,2,2,2
70,Roselawn,43.711695,-79.416936,2,2,2,2,2
99,York Mills West,43.752758,-79.400049,2,2,2,2,2


### Comment on the K-means logic

In [156]:
# import k-means from clustering stage
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 20

# Select the feature columns explicitly instead of dropping the others: if a
# 'Cluster Labels' column has already been inserted into the grouped frame
# (e.g. when this cell is re-run), it must not leak into the features.
# NOTE(review): the features are not scaled, so the 'Venue' count probably
# dominates the distance over latitude/longitude — confirm this is intended.
feature_columns = ['Neighborhood Latitude', 'Neighborhood Longitude', 'Venue']
toronto_venues_grouped_clustering = toronto_venues_grouped[feature_columns]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_venues_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([ 0,  7, 16,  0,  6, 13,  0, 16, 18, 18], dtype=int32)

### Insert the K-means cluster index

In [163]:
# Work on a copy: a plain assignment only creates a second name for the same
# DataFrame, so adding columns to the "merged" frame would also modify
# toronto_venues_grouped.
toronto_venues_grouped_merged = toronto_venues_grouped.copy()

In [None]:
# Put the k-means labels in a leading 'Cluster Labels' column. DataFrame.insert
# raises if the column already exists, so on a re-run the existing column is
# overwritten instead, which keeps this cell idempotent.
if 'Cluster Labels' in toronto_venues_grouped_merged.columns:
    toronto_venues_grouped_merged['Cluster Labels'] = kmeans.labels_
else:
    toronto_venues_grouped_merged.insert(0, 'Cluster Labels', kmeans.labels_)

In [179]:

# Labelled neighbourhoods ordered by venue count, smallest first.
toronto_venues_grouped_merged_sorted = toronto_venues_grouped_merged.sort_values(by='Venue')

In [180]:
toronto_venues_grouped_merged_sorted.head(100)

Unnamed: 0,Cluster Labels,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Summary
62,11,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,1,1,1,1,1
43,11,"Humberlea, Emery",43.724766,-79.532242,1,1,1,1,1
53,11,"Malvern, Rouge",43.806686,-79.194353,1,1,1,1,1
92,11,Weston,43.706876,-79.518188,1,1,1,1,1
90,11,"West Deane Park, Princess Gardens, Martin Grov...",43.650943,-79.554724,1,1,1,1,1
74,11,Scarborough Village,43.744734,-79.239476,1,1,1,1,1
54,11,"Milliken, Agincourt North, Steeles East, L'Amo...",43.815252,-79.284577,2,2,2,2,2
17,11,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,2,2,2,2,2
70,11,Roselawn,43.711695,-79.416936,2,2,2,2,2
99,11,York Mills West,43.752758,-79.400049,2,2,2,2,2


### Map the clusters

In [187]:
import numpy as np # library to handle data in a vectorized manner

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Base map centred on the city coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Colour palette: one hex colour per cluster, sampled evenly from the
# rainbow colormap (ys is only used for its length, which equals kclusters).
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# One circle marker per neighbourhood, coloured by its cluster label.
markers_colors = []
marker_columns = ['Neighborhood Latitude', 'Neighborhood Longitude',
                  'Neighbourhood', 'Cluster Labels', 'Venue']
marker_rows = zip(*(toronto_venues_grouped_merged[column] for column in marker_columns))
for lat, lon, poi, cluster, venue in marker_rows:
    # cluster-1 means label 0 wraps around to the last palette entry (index -1).
    cluster_color = rainbow[cluster-1]
    popup_text = str(poi) + ' Cluster ' + str(cluster) + '  Number of venues  ' + str(venue)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters

In [175]:
print(map_clusters)

<folium.folium.Map object at 0x7f8d20d20ef0>
