****Toronto's Borough study****

**First step - create the dataframe**

In [2]:
import pandas as pd

In [3]:
# import toronto's borough from the web by panda's function
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
df.head()

Unnamed: 0,Post Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# erase not assigned
df = df[df.Borough != "Not assigned"]

In [39]:
# group where Neighbohhood have the same postal code
df2 = df.groupby(['Post Code','Borough'],sort=False)['Neighborhood'].apply(lambda x: ','.join(x.astype(str))).reset_index()
df2.head()

Unnamed: 0,Post Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [40]:
# when Neighborhood is "not assigned" replace with Borough
df2.loc[df2['Neighborhood'] == ('Not assigned'), 'Neighborhood'] = df2['Borough']

In [8]:
# print the rows of the database
print("The final number of rows in this dataframe is", df2.shape[0])

The final number of rows in this dataframe is 103


**Second step - Add geolocalization to the dataframe**

In [9]:
# read the geolocalization of postal code from the web "geospatial"
da = pd.read_csv('http://cocl.us/Geospatial_data')

In [10]:
da.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# merge the two dataframes
df_toronto = pd.merge(df2, da, left_on = 'Post Code', right_on = 'Postal Code')
df_toronto.head()

Unnamed: 0,Post Code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


**Third step - Explore and cluster the neighborhoods in Toronto**

In [14]:
import numpy as np 

from geopy.geocoders import Nominatim
import requests 

from pandas.io.json import json_normalize 
import json

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

**Let's check out the neighbourhood of Etobicoke (my brother used to live there)**

In [15]:
df_Scarborough = df_toronto[df_toronto['Borough'] == 'Scarborough'].reset_index(drop=True)
df_Scarborough

Unnamed: 0,Post Code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848


In [20]:
#from wikipedia
e_lat = 43.77
e_long = -79.24

# create map of Scarborough using latitude and longitude values
map_Scarborough = folium.Map(location=[e_lat, e_long], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_Scarborough['Latitude'], df_Scarborough['Longitude'], df_Scarborough['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Scarborough)  
    
#map_Scarborough
map_Scarborough

In [21]:
CLIENT_ID = '22CWWLUJADQHFAL4HZ4TG4HQPY1VIPXMJLVCRUV4TXAJFMGP' 
CLIENT_SECRET = 'H05XZGGVBPQT3ZH3Q2WEYWA42QEPA23CZMKGULM4FADJ0LKR'
VERSION = '20200719'

**Write a function to make pulling nearby venues easier**

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=600):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            800, 
            300)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Call foursquare to get the nearby venues for each Neighbourhood**

In [24]:
Scarborough_venues = getNearbyVenues(names=df_Scarborough['Neighborhood'],
                                   latitudes=df_Scarborough['Latitude'],
                                   longitudes=df_Scarborough['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge


**Convert the Venue data to one hot encoding then built a data frame with it and their Neighbourhoods**

In [25]:
Scarborough_onehot = pd.get_dummies(Scarborough_venues[['Venue Category']], prefix="", prefix_sep="")
Scarborough_onehot['Neighbourhood'] = Scarborough_venues['Neighbourhood'] 
fixed_columns = [Scarborough_onehot.columns[-1]] + list(Scarborough_onehot.columns[:-1])
Scarborough_venuelist = Scarborough_onehot.groupby('Neighbourhood').mean().reset_index()

**Find the 5 most common venues for each Neighbourhood**

In [27]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
S_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
S_neighborhoods_venues_sorted['Neighbourhood'] = Scarborough_venuelist['Neighbourhood']

for ind in np.arange(Scarborough_venuelist.shape[0]):
    S_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Scarborough_venuelist.iloc[ind, :], num_top_venues)

S_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Chinese Restaurant,Lounge,Discount Store,Breakfast Spot,Pool Hall
1,"Birch Cliff, Cliffside West",College Stadium,Thai Restaurant,Café,General Entertainment,Diner
2,Cedarbrae,Indian Restaurant,Coffee Shop,Yoga Studio,Burger Joint,Gas Station
3,"Clarks Corners, Tam O'Shanter, Sullivan",Convenience Store,Pharmacy,Pizza Place,Golf Course,Fast Food Restaurant
4,"Cliffside, Cliffcrest, Scarborough Village West",Ice Cream Shop,Hardware Store,Restaurant,Pizza Place,Park


**Run k-means to cluster the neighbourhoods in 5 clusters**

In [28]:
# set number of clusters
kclusters = 5

Scarborough_venuelist_clustering = Scarborough_venuelist.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Scarborough_venuelist_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 3, 1, 1, 2, 1, 1, 1, 1, 1], dtype=int32)

In [29]:
S_neighborhoods_venues_sorted.insert(0, 'Clusters', kmeans.labels_)

**Merge the information back into one dataframe for easy analysis and future use**

In [41]:
df_Scarborough_merged = df_Scarborough

df_Scarborough_merged = df_Scarborough_merged.join(S_neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

df_Scarborough_merged.dropna(axis=0,inplace = True)
df_Scarborough_merged['Clusters'] = df_Scarborough_merged['Clusters'].astype('int')

df_Scarborough_merged

KeyError: ignored

**Visualize the Results**

In [42]:

# create map
map_clusters = folium.Map(location=[e_lat, e_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_Scarborough_merged['Latitude'], df_Scarborough_merged['Longitude'], df_Scarborough_merged['Neighbourhood'],df_Scarborough_merged['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

KeyError: ignored