# Segmenting and Clustering Neighborhoods in Toronto

## Web scrapping Wikipedia for Canada neighbors and postal codes and transformation into dataframe

In [3]:
import requests
import urllib.request
import time
from urllib.request import urlopen
!pip install "ipython-beautifulsoup[bs4]"
from bs4 import BeautifulSoup
import pandas as pd
print('....... Done')

Collecting ipython-beautifulsoup[bs4]
  Downloading https://files.pythonhosted.org/packages/e8/3f/6bc064a5bde8256ef78f747f142be72f44252fbe6135ab9d2d11e1c7cb8c/ipython_beautifulsoup-0.3-py2.py3-none-any.whl
Collecting beautifulsoup4 (from ipython-beautifulsoup[bs4])
[?25l  Downloading https://files.pythonhosted.org/packages/1a/b7/34eec2fe5a49718944e215fde81288eec1fa04638aa3fb57c1c6cd0f98c3/beautifulsoup4-4.8.0-py3-none-any.whl (97kB)
[K     |████████████████████████████████| 102kB 4.0MB/s ta 0:00:011
Collecting soupsieve>=1.2 (from beautifulsoup4->ipython-beautifulsoup[bs4])
  Downloading https://files.pythonhosted.org/packages/0b/44/0474f2207fdd601bb25787671c81076333d2c80e6f97e92790f8887cf682/soupsieve-1.9.3-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4, ipython-beautifulsoup
Successfully installed beautifulsoup4-4.8.0 ipython-beautifulsoup-0.3 soupsieve-1.9.3
....... Done


## Defining url to scrape and saving the response

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
response = requests.get(url)
response

<Response [200]>

## Scrapping the response to grab the table and its values

In [6]:
soup = BeautifulSoup(response.text,'html.parser')# Parse the HTML as a string
  
table = soup.find_all('table')[0] # Grab the first table
#print(table)    
new_table = pd.DataFrame(columns=range(0,3), index = (range(0,290))) # I know the size 
 
row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
    
        #print(row_marker,column_marker)
        new_table.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1
    row_marker += 1    
    #if row_marker == 289:
     #   break
new_table.head()

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


## Adjusting the scrapped response to clean the data, include headers, drop na values and remove 'Not assigned' values 

In [7]:
new_table[2].replace('\\n','', regex=True, inplace=True)

In [8]:
new_table.columns= ['PostalCode','Borough','Neighborhood']

In [9]:
new_table.shape

(290, 3)

In [10]:
new_table.isnull().sum()

PostalCode      2
Borough         2
Neighborhood    2
dtype: int64

In [11]:
new_table.dropna(axis=0,how='any', inplace=True)

In [12]:
new_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [13]:
len(new_table.groupby("PostalCode"))

180

In [14]:
print('Boroughs with Not Assigned',len(new_table[new_table['Borough'] == 'Not assigned']))
print('Neighborhoods with Not Assigned',len(new_table[new_table['Borough'] == 'Not assigned']))

Boroughs with Not Assigned 77
Neighborhoods with Not Assigned 77


In [15]:
new_table = new_table[new_table.Borough != 'Not assigned'] 

In [16]:
new_table = new_table.reset_index()
new_table.drop(columns=['index'],inplace=True)

In [17]:
print('Boroughs with Not Assigned',len(new_table[new_table['Borough'] == 'Not assigned']))
print('Neighborhoods with Not Assigned',len(new_table[new_table['Borough'] == 'Not assigned']))

Boroughs with Not Assigned 0
Neighborhoods with Not Assigned 0


In [18]:
new_table.head(10)
new_table.shape

(211, 3)

## Dealing with case where Neighborhood is 'Not assigned' but Borough is (Queen's Park)

In [19]:
print(new_table[new_table['Neighborhood'] == 'Not assigned'])

  PostalCode       Borough  Neighborhood
6        M7A  Queen's Park  Not assigned


In [20]:
new_table.loc[new_table['Neighborhood'] == 'Not assigned', ['Neighborhood']] = "Queen's Park"

In [21]:
new_table.loc[(new_table['Borough'] == "Queen's Park")]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Queen's Park


## Creating grouped by Neighborhood dataframe

In [22]:
grouped = new_table.groupby('PostalCode')['Neighborhood'].apply(', '.join)

In [23]:
grouped = pd.DataFrame(grouped)
print(grouped)

                                                 Neighborhood
PostalCode                                                   
M1B                                            Rouge, Malvern
M1C                    Highland Creek, Rouge Hill, Port Union
M1E                         Guildwood, Morningside, West Hill
M1G                                                    Woburn
M1H                                                 Cedarbrae
...                                                       ...
M9N                                                    Weston
M9P                                                 Westmount
M9R         Kingsview Village, Martin Grove Gardens, Richv...
M9V         Albion Gardens, Beaumond Heights, Humbergate, ...
M9W                                                 Northwest

[103 rows x 1 columns]


In [24]:
grouped.columns
grouped['PostalCode'] = grouped.index
grouped.reset_index(drop=True,inplace=True)

In [25]:
new_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [26]:
grouped.head()

Unnamed: 0,Neighborhood,PostalCode
0,"Rouge, Malvern",M1B
1,"Highland Creek, Rouge Hill, Port Union",M1C
2,"Guildwood, Morningside, West Hill",M1E
3,Woburn,M1G
4,Cedarbrae,M1H


## Merging the original dataframe and the grouped by neighborhood to obtain the desired dataframe

In [27]:
new_table2 = pd.merge(new_table, grouped, on='PostalCode', how='outer')

In [28]:
new_table2.drop(columns=['Neighborhood_x'],inplace=True)
new_table2.rename(columns={'Neighborhood_y':'Neighborhood'}, inplace=True)

In [29]:
new_table2.drop_duplicates(inplace=True)

In [30]:
new_table2.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [31]:
new_table2.reset_index(drop=True,inplace=True)

In [32]:
new_table2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [33]:
new_table2.shape

(103, 3)

# Lat and Long for each Neighborhood

## Getting and adjusting the data

In [34]:
path='https://cocl.us/Geospatial_data'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
df.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [36]:
new_table3 = pd.merge(new_table2, df, on='PostalCode', how='outer')

In [37]:
new_table3

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


# Viewing data in map and filtering Borough

In [38]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import folium

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##

In [39]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [40]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(new_table3['Latitude'], new_table3['Longitude'], new_table3['Borough'], new_table3['Neighborhood']):
    label = '{}- {}'.format(borough, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Creating dataframe with Boroughs that contain the word Toronto 

In [41]:
new_table3.loc[new_table3['Borough'].str.contains("Toronto")], ['Borough',"Neighborhood"]

(    PostalCode           Borough  \
 2          M5A  Downtown Toronto   
 9          M5B  Downtown Toronto   
 15         M5C  Downtown Toronto   
 19         M4E      East Toronto   
 20         M5E  Downtown Toronto   
 24         M5G  Downtown Toronto   
 25         M6G  Downtown Toronto   
 30         M5H  Downtown Toronto   
 31         M6H      West Toronto   
 36         M5J  Downtown Toronto   
 37         M6J      West Toronto   
 41         M4K      East Toronto   
 42         M5K  Downtown Toronto   
 43         M6K      West Toronto   
 47         M4L      East Toronto   
 48         M5L  Downtown Toronto   
 54         M4M      East Toronto   
 61         M4N   Central Toronto   
 62         M5N   Central Toronto   
 67         M4P   Central Toronto   
 68         M5P   Central Toronto   
 69         M6P      West Toronto   
 73         M4R   Central Toronto   
 74         M5R   Central Toronto   
 75         M6R      West Toronto   
 79         M4S   Central Toronto   
 

In [42]:
toronto_data = new_table3[new_table3['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [43]:
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
8,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752


In [44]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}- {}'.format(borough, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

# Fetching Foursquare data

In [45]:
# @hidden_cell
CLIENT_ID = '3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH' # your Foursquare ID
CLIENT_SECRET = 'VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 200

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH
CLIENT_SECRET:VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF


In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        print(url)
        print('\n')
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [47]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Harbourfront, Regent Park
https://api.foursquare.com/v2/venues/explore?&client_id=3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH&client_secret=VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF&v=20180605&ll=43.6542599,-79.3606359&radius=1000&limit=200


Ryerson, Garden District
https://api.foursquare.com/v2/venues/explore?&client_id=3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH&client_secret=VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF&v=20180605&ll=43.6571618,-79.37893709999999&radius=1000&limit=200


St. James Town
https://api.foursquare.com/v2/venues/explore?&client_id=3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH&client_secret=VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF&v=20180605&ll=43.6514939,-79.3754179&radius=1000&limit=200


The Beaches
https://api.foursquare.com/v2/venues/explore?&client_id=3LZMCAS24EVTNLWZPTJQGXA0JNHZZPBFZG4KTCX52GGTS2ZH&client_secret=VDHVW1THKNTYNT5XVVQQ0ADUCGMGK5M4MNBH0ZDIBLHDFWIF&v=20180605&ll=43.67635739999999,-79.2930312&radius=1000&limi

In [48]:
print(toronto_venues.shape)
toronto_venues.head()

(3088, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,"Harbourfront, Regent Park",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site


In [49]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,47,47,47,47,47,47
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",15,15,15,15,15,15
"Cabbagetown, St. James Town",39,39,39,39,39,39
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,100,100,100,100,100,100
Church and Wellesley,100,100,100,100,100,100


In [50]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 uniques categories.


## Analizing each neighborhood and grouping venues type per Neighborhood

In [51]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,...,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.025641,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0.01,0.0,0.01
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.08,0.0,0.0,0.02,0.0,0.01,0.0,0.02
8,Christie,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.0
9,Church and Wellesley,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.01,0.01


In [53]:
num_top_venues = 7

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                 Café  0.05
1                Hotel  0.05
2          Coffee Shop  0.05
3              Theater  0.04
4     Sushi Restaurant  0.03
5  American Restaurant  0.03
6       Clothing Store  0.03


----Berczy Park----
                 venue  freq
0          Coffee Shop  0.09
1                Hotel  0.06
2                 Café  0.05
3  Japanese Restaurant  0.04
4             Beer Bar  0.04
5               Bakery  0.03
6         Cocktail Bar  0.03


----Brockton, Exhibition Place, Parkdale Village----
                    venue  freq
0                    Café  0.07
1             Coffee Shop  0.06
2  Furniture / Home Store  0.04
3                     Bar  0.04
4      Tibetan Restaurant  0.03
5              Restaurant  0.03
6     Arts & Crafts Store  0.02


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0                Park  0.09
1         Pizza Place  0.06
2         Coffee Shop  0.0

## Listing most common venues per Neighborhood

In [54]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [55]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Hotel,Coffee Shop,Café,Theater,American Restaurant,Sushi Restaurant,Clothing Store,Movie Theater,Restaurant,Gym
1,Berczy Park,Coffee Shop,Hotel,Café,Japanese Restaurant,Beer Bar,Cocktail Bar,Restaurant,Bakery,Park,Seafood Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Bar,Furniture / Home Store,Tibetan Restaurant,Restaurant,Beer Bar,Indian Restaurant,Sandwich Place,Italian Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Park,Pizza Place,Coffee Shop,Brewery,Pet Store,Sushi Restaurant,Italian Restaurant,Burrito Place,Snack Place,French Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Harbor / Marina,Café,Coffee Shop,Garden,Park,Sculpture Garden,Dog Run,Dance Studio,Track,Scenic Lookout


# Clustering and plotting clusters in a map

In [56]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:38] 

array([3, 3, 3, 1, 2, 3, 3, 3, 3, 1, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1,
       0, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1], dtype=int32)

In [57]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,1,Coffee Shop,Café,Theater,Italian Restaurant,Gym / Fitness Center,Restaurant,Bakery,Diner,Breakfast Spot,Park
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,3,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Diner,Gastropub,Tea Room,Café,Restaurant,Italian Restaurant,Park
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Café,Restaurant,Hotel,Bakery,Gastropub,Seafood Restaurant,Italian Restaurant,Breakfast Spot,Cosmetics Shop
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Pub,Coffee Shop,Beach,Japanese Restaurant,Bar,Bakery,Breakfast Spot,Pizza Place,Caribbean Restaurant,Burger Joint
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Coffee Shop,Hotel,Café,Japanese Restaurant,Beer Bar,Cocktail Bar,Restaurant,Bakery,Park,Seafood Restaurant


In [58]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters