# Segmenting and Clustering Neighborhoods in Toronto
## Import all necessary libraries

In [21]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib3

# Part One

## Read in the Data
### Create a urllib3 pool, get the data, & convert to a BS4 soup
*Retrieves the HTML of the wiki page and converts it to a tag "soup" to be searched later*

In [204]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
http = urllib3.PoolManager()
response = http.request('GET', url)

# Stop here, with the status code in the message, rather than parsing an
# error page and failing later when no table can be found in it.
if response.status != 200:
    raise RuntimeError('GET {} returned HTTP status {}'.format(url, response.status))

# Parse the raw response bytes into a searchable tag tree.
soup = BeautifulSoup(response.data, 'html.parser')



### Extract the table text & convert to a Pandas DataFrame 'post'
*I do not assume that there is only one table in the page. I extract the data from the first table encountered.*
*I iterate over the rows of data in the html and convert them to a list of dictionaries that are used to populate the data in the DataFrame*

In [207]:
# Use the first table found in the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element was found in the downloaded page')

# Header cells become the column names of the DataFrame.
column_names = [th.text.strip() for th in table.find_all('th')]
table_data = []

for row in table.find_all('tr'):
    columns = [td.text.strip() for td in row.find_all('td')]
    # A row without <td> cells (the header row) carries no data; appending it
    # would add an empty record, i.e. a row of NaN values.
    if not columns:
        continue
    table_data.append(dict(zip(column_names, columns)))

post = pd.DataFrame(columns=column_names, data=table_data)

## Clean and Transform the data as necessary
### Take the 'post' frame, clean it, group it, & convert back to a normal DataFrame 'post_grouped'
*In this step, I use the replace function to turn 'Not assigned' into NaN, which makes it simpler to filter and fill values with the notnull and fillna functions afterwards*

*Additionally, the apply function takes the SeriesGroupBy object in the Neighbourhood column and converts it to a single string inside of a pandas Series object. The data can be extracted later by splitting the string in that column.*

In [213]:
# Turn the 'Not assigned' placeholder into NaN so the null-aware helpers below apply.
post = post.replace(to_replace='Not assigned', value=np.nan)

# Keep only rows that have a borough. The explicit copy makes the assignment
# below write to this frame instead of to a temporary slice.
post = post[post.Borough.notnull()].copy()

# A row with a borough but no neighbourhood takes the borough's name.
post['Neighbourhood'] = post['Neighbourhood'].fillna(post['Borough'])

# One row per (Postcode, Borough) pair, neighbourhood names joined with commas.
post_grouped = post.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

post_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [234]:
# Show the (rows, columns) shape of the grouped frame.
post_grouped.shape

(103, 3)

# Part Two

### Read in the data for the geospatial coordinates
*Please note that I have used the provided csv as the API calls demonstrated provided no data*

In [202]:
# Load the coordinates from the CSV file; the path is relative to the working directory.
geo_coords = pd.read_csv('Geospatial_Coordinates.csv')
# Preview the first rows.
geo_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the DataFrames into one
*Note that I am assuming that every 'Postcode' can be joined to a 'Postal Code' from one DataFrame to the other*

*Using the merge function with how='inner' will cause non-matching items to be dropped*

In [214]:
# Inner-join the two frames on the postal code, then remove the right-hand
# key column, which duplicates 'Postcode' after the join.
post_and_geo = post_grouped.merge(
    geo_coords,
    how='inner',
    left_on=['Postcode'],
    right_on=['Postal Code'],
).drop(['Postal Code'], axis=1)
post_and_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part Three
*I have chosen to follow along with the neighborhood clustering demo done in DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb*

### Import libraries needed for this section

In [285]:
try:
    import folium
except:
    !conda install -c conda-forge folium=0.5.0 --yes
    import folium

from sklearn.cluster import KMeans

import requests
import pickle

import matplotlib.cm as cm
import matplotlib.colors as colors

In [1]:
# The code was removed by Watson Studio for sharing.

### Strip out all data that is not relevant to Toronto directly
*Show the number of boroughs and neighborhoods as well*

In [256]:
# Keep the rows whose borough name contains 'Toronto' and renumber them from 0.
is_toronto = post_and_geo['Borough'].str.contains('Toronto')
toronto_only = post_and_geo[is_toronto].reset_index(drop=True)

# Distinct boroughs versus number of rows in the filtered frame.
borough_count = len(toronto_only['Borough'].unique())
row_count = toronto_only.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 4 boroughs and 39 neighborhoods.


### Manually enter the center of Toronto rather than rely on another library

In [257]:
# Hard-coded latitude/longitude, used below as the centre of the maps.
t_lat, t_long = 43.6532, -79.3832
print('The geograpical coordinates of Toronto are {}, {}.'.format(t_lat, t_long))

The geograpical coordinates of Toronto are 43.6532, -79.3832.


### Create map of Toronto using latitude and longitude values with neighborhood clusters overlaid

In [324]:
# Base map centred on the coordinates entered above.
map_toronto = folium.Map(location=[t_lat, t_long], zoom_start=11)

# One circle marker per row, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    toronto_only['Latitude'],
    toronto_only['Longitude'],
    toronto_only['Borough'],
    toronto_only['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Use the function created for the New York Neighborhoods Analysis
*Find the original function in DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb*

In [267]:
LIMIT = 100
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Args:
        names: Iterable of location labels; copied into the 'Neighborhood' column.
        latitudes: Iterable of latitudes, parallel to ``names``.
        longitudes: Iterable of longitudes, parallel to ``names``.
        radius: Search radius sent to the API for every location.
        limit: Maximum number of venues requested per location. ``None``
            (the default) uses the module-level ``LIMIT``.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is returned the frame is empty but still has these columns.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    if limit is None:
        limit = LIMIT

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # CLIENT_ID, CLIENT_SECRET and VERSION are not defined in this cell;
        # presumably they come from the credentials cell whose code was removed.
        # Passing them as params lets requests do the URL encoding.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # The timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue with an empty category list gets no category instead of raising.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(venue_rows, columns=venue_columns)

def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in ``row``, ignoring its first entry.

    Args:
        row: A pandas Series; its first entry is skipped and the remaining
            values are ranked.
        num_top_venues: Maximum number of labels to return.

    Returns:
        Array of at most ``num_top_venues`` index labels, ordered from the
        largest value to the smallest.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Extract the venue information for each neighborhood cluster in our dataset
*I am including an option to pickle the data locally in case we do not wish to use more API calls*

In [266]:
# NOTE: pickle.load can execute arbitrary code, so only load a cache file that
# this notebook wrote itself.
try:
    with open("toronto_venues.pandas", "rb") as cache_file:
        toronto_venues = pickle.load(cache_file)
    print("Loaded data from locally pickled information.")
except (OSError, pickle.UnpicklingError, EOFError, AttributeError, ImportError):
    # Missing, unreadable or incompatible cache: rebuild it from the API.
    # Anything else (for example an interrupt) is left to propagate.
    print("No Pickled data was found, running API calls and dumping to pickle.")

    toronto_venues = getNearbyVenues(names=toronto_only['Neighbourhood'],
                                     latitudes=toronto_only['Latitude'],
                                     longitudes=toronto_only['Longitude']
                                     )
    with open("toronto_venues.pandas", "wb") as cache_file:
        pickle.dump(toronto_venues, cache_file)

No Pickled data was found, running API calls and dumping to pickle.


### Do a one-hot encoding of the data in the venue information

In [269]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()
toronto_onehot.shape

(1706, 229)

### Group the resulting one-hot data for each venue by Neighborhood and get its mean value to determine a frequency

In [270]:
# Average every one-hot column within each neighborhood, then turn the group
# key back into an ordinary column.
venue_means = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = venue_means.reset_index()
toronto_grouped.shape

(39, 229)

### Extract the top 15 sorted venues for each neighborhood cluster

In [309]:
num_top_venues = 15

indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return ``n`` with its English ordinal suffix (1 -> '1st', 12 -> '12th', 21 -> '21st')."""
    last_digit = n % 10
    # 11, 12 and 13 (and 111, 112, ...) take 'th' despite ending in 1, 2 or 3.
    if 1 <= last_digit <= 3 and not 11 <= n % 100 <= 13:
        return '{}{}'.format(n, indicators[last_digit - 1])
    return '{}th'.format(n)


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# rank the categories of every neighborhood, then build the frame in one step
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues,
                                           columns=columns[1:],
                                           index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Steakhouse,Bar,Café,Cosmetics Shop,Breakfast Spot,Asian Restaurant,Burger Joint,Restaurant,Thai Restaurant,Bakery,Sushi Restaurant,Hotel,Gym,American Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Seafood Restaurant,Steakhouse,Café,Cheese Shop,Beer Bar,Bakery,Breakfast Spot,Concert Hall,Bistro,Shopping Mall,Hotel,Beach
2,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Nightclub,Yoga Studio,Bakery,Performing Arts Venue,Pet Store,Convenience Store,Climbing Gym,Burrito Place,Restaurant,Italian Restaurant,Intersection,Bar
3,Business Reply Mail Processing Centre 969 Eastern,Park,Burrito Place,Butcher,Fast Food Restaurant,Farmers Market,Auto Workshop,Recording Studio,Spa,Restaurant,Pizza Place,Smoke Shop,Comic Shop,Skate Park,Garden,Garden Center
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Service,Airport Terminal,Airport Lounge,Sculpture Garden,Rental Car Location,Harbor / Marina,Bar,Boat or Ferry,Coffee Shop,Airport Gate,Airport Food Court,Airport,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


### Set the desired number of clusters and fit the data using KMeans

In [310]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

### Add cluster labels to the venue data and merge it with the original data set

In [311]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_only

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Trail,Health Food Store,Pub,Women's Store,...,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,...,Furniture / Home Store,Yoga Studio,Pub,Bakery,Dessert Shop,Spa,Liquor Store,Diner,Brewery,Bookstore
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Park,Sushi Restaurant,Board Shop,Brewery,...,Liquor Store,Burger Joint,Burrito Place,Ice Cream Shop,Fast Food Restaurant,Fish & Chips Shop,Steakhouse,Pub,Sandwich Place,Pet Store
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Bakery,...,Italian Restaurant,American Restaurant,Comfort Food Restaurant,Sandwich Place,Cheese Shop,Pet Store,Park,Gay Bar,Convenience Store,Seafood Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Bus Line,Lake,Swim School,Park,...,Comfort Food Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant


### Generate a map of the clusters that were determined from the data fitting

In [316]:
# create map
map_clusters = folium.Map(location=[t_lat, t_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['Neighbourhood'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [322]:
from IPython.display import display

# Hand-written description for each cluster label, in label order.
cluster_names = [
    'Restaurant & Food',
    'Playground & Recreation',
    'Transportation & Recreation',
    'Park & Stores',
    'Recreation & Stores',
]

# The column at position 2 plus every column from position 6 onwards.
shown_columns = toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]

for i in np.arange(kclusters):
    print("Cluster {} ({}):".format(i,cluster_names[i]))
    in_cluster = toronto_merged['Cluster Labels'] == i
    display(toronto_merged.loc[in_cluster, shown_columns].head())

Cluster 0 (Restaurant & Food):


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
1,"The Danforth West,Riverdale",Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Pub,Bakery,Dessert Shop,Spa,Liquor Store,Diner,Brewery,Bookstore
2,"The Beaches West,India Bazaar",Park,Sushi Restaurant,Board Shop,Brewery,Italian Restaurant,Liquor Store,Burger Joint,Burrito Place,Ice Cream Shop,Fast Food Restaurant,Fish & Chips Shop,Steakhouse,Pub,Sandwich Place,Pet Store
3,Studio District,Café,Coffee Shop,Gastropub,Bakery,Brewery,Italian Restaurant,American Restaurant,Comfort Food Restaurant,Sandwich Place,Cheese Shop,Pet Store,Park,Gay Bar,Convenience Store,Seafood Restaurant
5,Davisville North,Hotel,Park,Gym,Breakfast Spot,Sandwich Place,Food & Drink Shop,Department Store,Women's Store,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
6,North Toronto West,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Salon / Barbershop,Restaurant,Rental Car Location,Café,Chinese Restaurant,Mexican Restaurant,Metro Station,Dessert Shop,Diner,Fast Food Restaurant,Gym / Fitness Center


Cluster 1 (Playground & Recreation):


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
8,"Moore Park,Summerhill East",Park,Playground,Tennis Court,Restaurant,Cuban Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio
10,Rosedale,Park,Playground,Trail,Cupcake Shop,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega


Cluster 2 (Transportation & Recreation):


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
4,Lawrence Park,Bus Line,Lake,Swim School,Park,Colombian Restaurant,Comfort Food Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant


Cluster 3 (Park & Stores):


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
22,Roselawn,Health & Beauty Service,Pool,Garden,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


Cluster 4 (Recreation & Stores):


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,The Beaches,Trail,Health Food Store,Pub,Women's Store,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store


### Results:
*It appears that the algorithm unfortunately lumped a large number of the items together into the first cluster, most likely because of the heavy emphasis on food in that category. It would have been better to see more separation and definition among the clusters, which would provide more nuanced information in the final model.*