Author: Nicolo' Sgobba

Task: Explore and cluster the neighborhoods in Toronto (Wikipedia Page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [261]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

from bs4 import BeautifulSoup

print('All libraries imported successfully.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.17.0                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
All libraries imported successfully.


In [262]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')

My_table = soup.find('table',{'class':'wikitable sortable'})

In [263]:
lst_dict = []

for items in My_table.find_all('tr')[1::1]:
    data = items.find_all(['th','td'])

    try:
        Postcode = data[0].text
        Borough = data[1].text
        Neighbourhood = data[2].text
    except IndexError:pass
    
    lst_dict.append({'Postcode':Postcode, 'Borough':Borough, 'Neighbourhood': Neighbourhood})

table = pd.DataFrame.from_dict(lst_dict)

In [264]:
TempTable = table[table.Borough != 'Not assigned']

In [265]:
TempTable.Neighbourhood[8] = "Queen's Park"
TempTable = TempTable.replace('\n', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [266]:
temp = TempTable.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [267]:
pd.set_option('expand_frame_repr', False)

print(temp)

    Postcode           Borough                                      Neighbourhood
0        M1B       Scarborough                                     Rouge, Malvern
1        M1C       Scarborough             Highland Creek, Rouge Hill, Port Union
2        M1E       Scarborough                  Guildwood, Morningside, West Hill
3        M1G       Scarborough                                             Woburn
4        M1H       Scarborough                                          Cedarbrae
5        M1J       Scarborough                                Scarborough Village
6        M1K       Scarborough        East Birchmount Park, Ionview, Kennedy Park
7        M1L       Scarborough                    Clairlea, Golden Mile, Oakridge
8        M1M       Scarborough    Cliffcrest, Cliffside, Scarborough Village West
9        M1N       Scarborough                        Birch Cliff, Cliffside West
10       M1P       Scarborough  Dorset Park, Scarborough Town Centre, Wexford ...
11       M1R    

In [268]:
temp.shape

(103, 3)

In [321]:
url="https://cocl.us/Geospatial_data"
code=pd.read_csv(url)
code=code.rename(index=str, columns={'Postal Code': 'Postcode'})
#code.head

In [270]:
neighborhoods = temp.merge(code, on='Postcode', how='left')
neighborhoods.shape

(103, 5)

In [271]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [272]:
address = 'Toronto, ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('Toronto coordinates: {}, {}.'.format(latitude, longitude))



Toronto coordinates: 43.653963, -79.387207.


In [273]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)

In [274]:
map_Toronto

In [275]:
#CLIENT_ID = 'your-client-ID' # your Foursquare ID
CLIENT_ID = 'OHOJGBQX3OYUYLZLB53Z2DIRCUPNMYDD4CXKUJNWWMP12H15' # your Foursquare ID

#CLIENT_SECRET = 'your-client-secret' # your Foursquare Secret
CLIENT_SECRET = 'TPO3OZUZX2ABG5XUJ4P5LCTCYWKWZGEVVNS1O1PGLVE305OM' # your Foursquare Secret

VERSION = '20180604'
LIMIT = 30

print('Credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Credentials:
CLIENT_ID: OHOJGBQX3OYUYLZLB53Z2DIRCUPNMYDD4CXKUJNWWMP12H15
CLIENT_SECRET:TPO3OZUZX2ABG5XUJ4P5LCTCYWKWZGEVVNS1O1PGLVE305OM


In [276]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
        
        for i in range(1,50):
            current_radius = radius + 50 * (i - 1)
            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                lat, 
                lng, 
                current_radius, 
                LIMIT)

            # make the GET request
            groups = requests.get(url).json()['response']['groups']

            selected_group = []
            for group in groups:
                if len(group) > len(selected_group):
                    selected_group = group
                  

            venue_list = []
            if len(selected_group) > 0:
                results = selected_group['items']

                # return only relevant information for each nearby venue
                venue_list = [(
                    name, 
                    lat, 
                    lng, 
                    v['venue']['name'], 
                    v['venue']['location']['lat'], 
                    v['venue']['location']['lng'],  
                    v['venue']['categories'][0]['name'],
                    current_radius
                ) for v in results if v['venue']['categories']]
            
            if len(venue_list) > 10:
                venues_list.append(venue_list)
                break
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category',
                  'Explore Radius']
    
    return(nearby_venues)

In [277]:
Toronto_venues = getNearbyVenues(names=neighborhoods['Neighbourhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

In [278]:
print(Toronto_venues.shape)

(1804, 8)


In [279]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Explore Radius
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Adelaide, King, Richmond",30,30,30,30,30,30,30
Agincourt,15,15,15,15,15,15,15
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",15,15,15,15,15,15,15
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11,11,11,11,11,11,11
"Alderwood, Long Branch",11,11,11,11,11,11,11
"Bathurst Manor, Downsview North, Wilson Heights",19,19,19,19,19,19,19
Bayview Village,12,12,12,12,12,12,12
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25,25
Berczy Park,30,30,30,30,30,30,30
"Birch Cliff, Cliffside West",11,11,11,11,11,11,11


In [280]:
Toronto_venues.groupby('Neighborhood').count().shape

(103, 7)

In [281]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 260 uniques categories.


In [282]:
Toronto_onehot = pd.get_dummies(Tornto_venues[['Venue Category']], prefix="", prefix_sep="")
Toronto_onehot['Neighborhood'] = Tornto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

In [283]:
#Toronto_onehot.shape

In [284]:
toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [285]:
#toronto_grouped.shape

In [286]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    #print(hood)
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    #print('\n')

In [287]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [313]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide\n, King\n, Richmond\n",American Restaurant,Steakhouse,Asian Restaurant,Hotel,Café,Pizza Place,Speakeasy,Smoke Shop,Seafood Restaurant,Plaza
1,Agincourt\n,Lounge,Breakfast Spot,Discount Store,Skating Rink,Sushi Restaurant,Motorcycle Shop,Supermarket,Mediterranean Restaurant,Pool Hall,Seafood Restaurant
2,"Agincourt North\n, L'Amoreaux East\n, Milliken...",BBQ Joint,Pizza Place,Fast Food Restaurant,Gym,Park,Noodle House,Chinese Restaurant,Caribbean Restaurant,Bubble Tea Shop,Shop & Service
3,"Albion Gardens\n, Beaumond Heights\n, Humberga...",Grocery Store,Pizza Place,Fried Chicken Joint,Beer Store,Discount Store,Sandwich Place,Japanese Restaurant,Fast Food Restaurant,Coffee Shop,Pharmacy
4,"Alderwood\n, Long Branch\n",Pizza Place,Pharmacy,Pool,Sandwich Place,Bank,Gas Station,Pub,Skating Rink,Gym,Coffee Shop
5,"Bathurst Manor\n, Downsview North\n, Wilson He...",Coffee Shop,Pharmacy,Frozen Yogurt Shop,Bridal Shop,Sandwich Place,Diner,Middle Eastern Restaurant,Bank,Supermarket,Restaurant
6,Bayview Village\n,Japanese Restaurant,Bank,Grocery Store,Chinese Restaurant,Café,Restaurant,Shopping Mall,Skate Park,Skating Rink,Pharmacy
7,"Bedford Park\n, Lawrence Manor East\n",Italian Restaurant,Fast Food Restaurant,Coffee Shop,Thai Restaurant,Pet Store,Pharmacy,Comfort Food Restaurant,Pub,Café,Butcher
8,Berczy Park\n,Cocktail Bar,Bakery,Café,Seafood Restaurant,Farmers Market,Park,Steakhouse,Jazz Club,Basketball Stadium,Beer Bar
9,"Birch Cliff\n, Cliffside West\n",College Stadium,Park,Skating Rink,Bank,Discount Store,Diner,Café,General Entertainment,Thai Restaurant,Convenience Store


In [316]:
# number of clusters
kclusters = 10
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [318]:
toronto_merged = neighborhoods

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each Neighbourhood neighborhood neighborhoods
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

#toronto_merged.head()

In [319]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [320]:
map_clusters