In [9]:
import pandas as pd
import numpy as np
import requests
import json
from pandas.io.json import json_normalize

In [10]:
#read the html and create the dataframe. Using pandas to read HTML will bring a list. So only use first list [0] for data frame.
wiki_page_html=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df=pd.DataFrame(wiki_page_html[0])
df.head()

Unnamed: 0,Postal Code,Community,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
#Removing rows with 'not assigned' in them in the Borough column. This column matches the Neighbourhood column as well.
df=df[df.Community!='Not assigned']
df.head()

Unnamed: 0,Postal Code,Community,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [13]:
#rename first column to PostalCode 
df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,PostalCode,Community,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Using Nominatim and geocode to find coordinates.

In [14]:

from geopy.geocoders import Nominatim
locator=Nominatim(user_agent="MyGeocoder")
location=locator.geocode("M3A, North York, Parkwoods")
latitude=location.latitude
longitude=location.longitude
latitude, longitude

(43.7612239, -79.3239857)

In [15]:
#resetting index after dropping some rows with no values
df.reset_index(drop=True, inplace=True)
df.head(3)

Unnamed: 0,PostalCode,Community,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### **Importing the csv provided in lab to mix with my data. This was easier than using geocode to find latitude and longitude coordinates.**

In [16]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497


In [17]:
#sort values so we can simply combine CSV file values to dataframe
df.sort_values(by="PostalCode", inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,PostalCode,Community,Neighbourhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [18]:
#reset index just for better order
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Community,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
#add latitude and longitude columns to current df and add in CSV coordinates
df['Latitude']=df_data_1['Latitude']
df['Longitude']=df_data_1['Longitude']
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,PostalCode,Community,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [21]:
print("There are {} districts in Toronto.".format(len(df['Community'].unique())))
print("Some postal codes are in several neighbourhoods.")

There are 10 districts in Toronto.
Some postal codes are in several neighbourhoods.


In [22]:
#installing folium for maps
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 1.8MB/s ta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


## Creating a folium map of Toronto. 
We are filling in all the dataframe latitudes, longitudes and neighbourhoods with popups.

In [23]:

map_toronto=folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='3186cc',
        fill_opacity=.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

### Creating Foursquare Credentials

In [24]:

CLIENT_ID='2S0C1ZDL1WZLKVJU2TVHWMIRKM2APFJPUIFIWW1YNWOYV5E0'
CLIENT_SECRET='CD3AQMP0EHTTTGMDEXXCHSR4AYW514ZX2QMEUSP23FFF5XZ3'
VERSION='20200822'

Locating neighbourhoods. We will segment by Borough for simplicity. Row 70 is downtown Toronto

In [26]:
#locating neighbourhoods. We will segment by Borough for simplicity. 70 is downtown Toronto
print(df.loc[70, 'Neighbourhood'])
print(df.loc[70, 'Community'])

First Canadian Place, Underground city
Downtown Toronto


In [31]:
#creating coordinates for API foursquare.
community_latitude=df.loc[70, 'Latitude']
community_longitude=df.loc[70, 'Longitude']

borough_name=df.loc[70, 'Community']

print("Latitude and longitude values of {} are {}, {}.".format(borough_name,
                                                              community_latitude,
                                                              community_longitude))

Latitude and longitude values of Downtown Toronto are 43.6484292, -79.3822802.


##### Creating a top venue url 

In [32]:
LIMIT=100
radius=500

top_venues_url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    community_latitude,
    community_longitude,
    radius,
    LIMIT
    )

print(top_venues_url)

https://api.foursquare.com/v2/venues/explore?&client_id=2S0C1ZDL1WZLKVJU2TVHWMIRKM2APFJPUIFIWW1YNWOYV5E0&client_secret=CD3AQMP0EHTTTGMDEXXCHSR4AYW514ZX2QMEUSP23FFF5XZ3&v=20200822&ll=43.6484292,-79.3822802&radius=500&limit=100


#### Examining the results through Json.

In [33]:
top_venue_results=requests.get(top_venues_url).json()

top_venue_results

{'meta': {'code': 200, 'requestId': '5f596f37221cc778cc810293'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Financial District',
  'headerFullLocation': 'Financial District, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 165,
  'suggestedBounds': {'ne': {'lat': 43.652929204500005,
    'lng': -79.37607280281344},
   'sw': {'lat': 43.6439291955, 'lng': -79.38848759718655}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad4c05df964a52059f620e3',
       'name': 'Canoe',
       'location': {'address': '66 Wellington St West',
        'crossStreet': 'at Bay Street',
        'lat': 43.647452066183476,
        'lng': -79.38132001815676,
        'labeledL

In [34]:
def get_category_type(row):
    try:
        categories_list=row['categories']
    except:
        categories_list=row['venue.categories']
        
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']

### Cleaning data and structuring it into a pandas dataframe

In [35]:
venues=top_venue_results['response']['groups'][0]['items']

nearby_venues=json_normalize(venues)

filtered_columns=['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues=nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories']=nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns=[col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()


Unnamed: 0,name,categories,lat,lng
0,Canoe,Restaurant,43.647452,-79.38132
1,Equinox Bay Street,Gym,43.6481,-79.379989
2,Adelaide Club Toronto,Gym / Fitness Center,43.649279,-79.381921
3,Cactus Club Cafe,American Restaurant,43.649552,-79.381671
4,Pilot Coffee Roasters,Coffee Shop,43.648835,-79.380936


#### should be 100 since I set my API url to 100 limit!

In [36]:
print("{} venues were returned by Foursquare!".format(nearby_venues.shape[0]))

100 venues were returned by Foursquare!


# Time to explore all the area venues
Creating a process to pull all venues in all the areas of Toronto

In [37]:
def get_nearby_venues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng, in zip(names, latitudes, longitudes):
        print(name)
        
        #create the API request URL
        url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
        
        #make the GET request
        results=requests.get(url).json()["response"]['groups'][0]['items']
        
        
        #return only the relevenat information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns=['Neighbourhood',
                              'Neighbourhood Latitude',
                              'Neighbourhood Longitude',
                              'Venue',
                              'Venue Latitude',
                              'Venue Longitude',
                              'Venue Category']
    return(nearby_venues)

In [38]:
toronto_venues=get_nearby_venues(names=df['Neighbourhood'], latitudes=df['Latitude'], longitudes=df['Longitude'])

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

#### Size of resulting dataframe.

In [39]:
print(toronto_venues.shape)
toronto_venues.head()

(2151, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


_Checking how many venues were returned for each neighbourhood_

In [40]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",7,7,7,7,7,7
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
Berczy Park,58,58,58,58,58,58
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",15,15,15,15,15,15
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17


_The number of unique categories from the returned venues._

In [41]:
print("There are {} unique venues.".format(len(toronto_venues['Venue Category'].unique())))

There are 266 unique venues.


## Analyzing each Neighbourhood

In [42]:
#onehot encoding
toronto_onehot=pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#add Neighbourhood column back into dummy Dataframe
toronto_onehot['Neighbourhood']=toronto_venues['Neighbourhood']

#move Neighbourhood column to the first column
fixed_columns=[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot=toronto_onehot[fixed_columns]

toronto_onehot.head(3)

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Let's examine new DF size

In [43]:
toronto_onehot.shape

(2151, 267)

#### Grouping rows by Neighbourhood and by taking the mean of the frequency of occurence of each category

In [44]:
toronto_grouped=toronto_onehot.groupby(by='Neighbourhood').mean().reset_index()

In [45]:
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
1,"Alderwood, Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
3,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
4,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041667,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
5,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.017241,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
6,"Birch Cliff, Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
8,"Business reply mail Processing Centre, South C...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.000000,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000


__Confirming the new size__

In [46]:
toronto_grouped.shape

(96, 267)

### Printing each neighbourhood along with the top 5 most common venues

In [47]:
num_top_venues=5

for hood in toronto_grouped['Neighbourhood']:
    print('------'+hood+'------')
    temp=toronto_grouped[toronto_grouped['Neighbourhood']==hood].T.reset_index()
    temp.columns=['venue', 'freq']
    temp=temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp=temp.round({'freq':2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    
    print('\n')

------Agincourt------
                       venue  freq
0  Latin American Restaurant  0.25
1                     Lounge  0.25
2             Breakfast Spot  0.25
3             Clothing Store  0.25
4         Miscellaneous Shop  0.00


------Alderwood, Long Branch------
            venue  freq
0     Pizza Place  0.29
1     Coffee Shop  0.14
2  Sandwich Place  0.14
3             Pub  0.14
4            Pool  0.14


------Bathurst Manor, Wilson Heights, Downsview North------
           venue  freq
0           Bank  0.09
1    Coffee Shop  0.09
2          Diner  0.04
3  Deli / Bodega  0.04
4  Shopping Mall  0.04


------Bayview Village------
                 venue  freq
0  Japanese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3   Chinese Restaurant  0.25
4    Accessories Store  0.00


------Bedford Park, Lawrence Manor East------
                     venue  freq
0       Italian Restaurant  0.12
1              Coffee Shop  0.08
2           Sandwich Place  0.08
3  

In [48]:
def return_most_common_venues(row, num_top_venues):
    row_categories=row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

 Creating the new dataframe and displaying the top 10 venues for each neighbourhood 

In [49]:
num_top_venues=10

indicators=['st', 'nd', 'rd']

#creating number of columns according to top venues
columns=['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
#create a new dataframe
neighbourhoods_venues_sorted=pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood']=toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pub,Pool,Diner,Deli / Bodega,Department Store,Dessert Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Mobile Phone Shop,Health Food Store,Supermarket,Ice Cream Shop,Sushi Restaurant,Shopping Mall,Restaurant,Deli / Bodega
3,Bayview Village,Café,Japanese Restaurant,Bank,Chinese Restaurant,Yoga Studio,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Pharmacy,Thai Restaurant,Butcher,Pub,Restaurant,Café,Sushi Restaurant


## Cluster Neighbourhoods
Running k-means to cluster the neighbourhood into 5 clusters

In [50]:
from sklearn.cluster import KMeans
#set number of clusters
kclusters=5

toronto_grouped_clustering=toronto_grouped.drop('Neighbourhood', 1)

#run kmeans
kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 1, 1, 2, 3], dtype=int32)

### _Creating a new dataframe that includes a cluster as well as the top 10 venues for each neighbourhood_

In [51]:
#add clustering labels

neighbourhoods_venues_sorted.insert(0, "Cluster Labels", kmeans.labels_)

toronto_merged=df

#merge toronto grouped with toronto data to add latitude/longitude for each neighbourhood
toronto_merged=toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')


toronto_merged.head()

Unnamed: 0,PostalCode,Community,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,1.0,Fast Food Restaurant,Yoga Studio,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1.0,Home Service,Bar,Yoga Studio,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Deli / Bodega
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1.0,Electronics Store,Mexican Restaurant,Intersection,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Bank,Dim Sum Restaurant,Diner
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Soccer Field,Korean Restaurant,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Fried Chicken Joint,Gas Station,Bakery,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Dessert Shop,Dim Sum Restaurant


### Unfortunately, a few neighbourhoods did not pull through so we had to drop those without values. 

In [55]:
toronto_merged.dropna(axis=0, inplace=True)
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].astype(int)

Visualizing the clusters. We will create a map with the clusters for each neighbourhood.

In [57]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_toronto_clusters=folium.Map(location=[district_latitude, district_longitude], zoom_start=10)

x=np.arange(kclusters)
ys=[i + x + (i*x)**2 for i in range(kclusters)]
colors_array=cm.rainbow(np.linspace(0,1,len(ys)))
rainbow=[colors.rgb2hex(i) for i in colors_array]

markers_colors=[]
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label=folium.Popup(str(poi) + 'Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
    [lat, lon],
    radius=5,
    popup=label,
    color=rainbow[cluster-1],
    fill=True,
    fill_color=rainbow[cluster-1],
    fill_opacity=0.7).add_to(map_toronto_clusters)
    
map_toronto_clusters

In [91]:
toronto_merged.loc[toronto_merged['Cluster Labels']==0,  toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Community,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
90,Etobicoke,0,River,Yoga Studio,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center,Curling Ice


In [125]:
toronto_merged.loc[toronto_merged['Cluster Labels']==1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Community,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,1,Fast Food Restaurant,Yoga Studio,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run
1,Scarborough,1,Home Service,Bar,Yoga Studio,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Deli / Bodega
2,Scarborough,1,Electronics Store,Mexican Restaurant,Intersection,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Bank,Dim Sum Restaurant,Diner
3,Scarborough,1,Coffee Shop,Soccer Field,Korean Restaurant,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
4,Scarborough,1,Fried Chicken Joint,Gas Station,Bakery,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Dessert Shop,Dim Sum Restaurant
5,Scarborough,1,Playground,Construction & Landscaping,Yoga Studio,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
6,Scarborough,1,Train Station,Bus Station,Coffee Shop,Department Store,Discount Store,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Yoga Studio
7,Scarborough,1,Bakery,Ice Cream Shop,Metro Station,Soccer Field,Park,Intersection,Bus Station,Bus Line,Diner,Discount Store
8,Scarborough,1,Motel,American Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Dance Studio
9,Scarborough,1,College Stadium,Café,Skating Rink,General Entertainment,Yoga Studio,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store


In [137]:
toronto_merged.loc[toronto_merged['Cluster Labels']==2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Community,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,North York,2,Park,Yoga Studio,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
23,North York,2,Construction & Landscaping,Convenience Store,Park,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Yoga Studio
40,East York,2,Park,Convenience Store,Intersection,Yoga Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
50,Downtown Toronto,2,Park,Trail,Playground,Yoga Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
74,York,2,Park,Women's Store,Pool,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
98,York,2,Park,Convenience Store,Yoga Studio,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant


In [144]:
toronto_merged.loc[toronto_merged['Cluster Labels']==3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Community,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,North York,3,Cafeteria,Yoga Studio,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Dance Studio


In [157]:
toronto_merged.loc[toronto_merged['Cluster Labels']==4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Community,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,North York,4,Park,Food & Drink Shop,Yoga Studio,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
44,Central Toronto,4,Park,Bus Line,Swim School,Yoga Studio,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
100,Etobicoke,4,Park,Bus Line,Sandwich Place,Mobile Phone Shop,Yoga Studio,Discount Store,Dessert Shop,Dim Sum Restaurant,Diner,Distribution Center
