# Capstone Project - The Battle of Neighborhoods - Week2 - Notebook

## Importing essential libraries

In [3]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

%matplotlib inline 

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

## Scraping the Data from Wikipedia with Pandas

### Copied the Data from the wikipedia page and used the below code the import it in this NoteBook:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
df_raw = pd.read_clipboard(header=None)

In [5]:
df_raw.shape

(180, 3)

In [6]:
df_raw.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Assigning the column names:

In [7]:
df = df_raw.copy()

In [8]:
column_names = ['Postal Code','Borough','Neighborhood']

In [9]:
df.columns = column_names

In [10]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Not cleaned data frame shape:

In [11]:
df.shape

(180, 3)

### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned, So we have to remove the rows that the column Borough is 'Not assigned':

In [12]:
df = df[df['Borough'] != 'Not assigned']

In [13]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [14]:
df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned',df['Borough'],df['Neighborhood'])

In [15]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### More than one neighborhood can exist in one postal code area. Combining the "Neighborhood" column, separated with a comma, for all the the mutual Postal Codes:

In [16]:
df = df.groupby(['Postal Code','Borough'],as_index=False).agg(','.join)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
df.shape

(103, 3)

### Retrieving the Geographical Coordinates with the provided csv file:

In [18]:
geospatial_data = pd.read_csv('https://cocl.us/Geospatial_data')

In [19]:
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the two DataFrame on the Postal Code column:

In [20]:
df_Toronto = pd.merge(df, geospatial_data, on='Postal Code')

In [21]:
df_Toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


##### All the essential packages are imported in the first cell of this Notebook

### Lets check the number of boroughs and neighborhoods in Toronto:

In [22]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    len(df_Toronto['Borough'].unique()),
    df_Toronto.shape[0]))

The dataframe has 10 boroughs and 103 neighborhoods.


### We can count the number of neighborhoods each borough contains:

In [23]:
df_Toronto['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 5
Mississauga          1
Name: Borough, dtype: int64

### Although there are 10 boroughs and overally 103 neighborhoods in Toronto, but we are only interested in the neighborhoods in Downtown Toronto. Thus, below is a new DataFrame that only contains the Downtown Toronto borough and its neighborhoods:

In [24]:
df_DT_Toronto = df_Toronto[df_Toronto['Borough'] == 'Downtown Toronto']
df_DT_Toronto.reset_index(drop=True,inplace=True)
df_DT_Toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


### Using geopy library to get the latitude and longitude values of Downtown Toronto

In [25]:
address = 'Downtown Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="DT_Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Dowtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Dowtown Toronto are 43.6563221, -79.3809161.


### Creating a map of Downtown Toronto with neighborhoods superimposed on top

In [27]:
# create map of Downtown Toronto using latitude and longitude values
map_DT_Toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_DT_Toronto['Latitude'], df_DT_Toronto['Longitude'], df_DT_Toronto['Borough'], df_DT_Toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='white',
        fill_opacity=0.7,
        parse_html=False).add_to(map_DT_Toronto)  
    
map_DT_Toronto

### In the above map we have demonstrated all the neighborhoods in Downtown Toronto
### Now Foursquare API comes in hand, we want to obtain the top 100 venues within 500 radius for all the neighborhoods in Downtown Toronto

### Define Foursquare Credentials and Version

In [28]:
CLIENT_ID = 'ULPXSFN5LXEZS5WN2ZMBESPQFSQ4WZTGHU1FCL4IQUXA45RQ' # your Foursquare ID
CLIENT_SECRET = '13EIWGTUEQF0QPB2HHHRJNJMM1T0JWXGFRIFDOQWQ3A4DALF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ULPXSFN5LXEZS5WN2ZMBESPQFSQ4WZTGHU1FCL4IQUXA45RQ
CLIENT_SECRET:13EIWGTUEQF0QPB2HHHRJNJMM1T0JWXGFRIFDOQWQ3A4DALF


### Let's examine the Foursquare API by exploring the top 100 venues that are in first row of our Downtown Torotno DataFrame within a radius of 500 meters.

In [29]:
print(df_DT_Toronto.loc[0,'Neighborhood'])
df_DT_Toronto.head()

Rosedale


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Get the latitude and longitude values for the 'Rosedale' Neighborhood

In [30]:
neighborhood_latitude = df_DT_Toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_DT_Toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_DT_Toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name,
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Rosedale are 43.6795626, -79.37752940000001.


### Now I will get the top 100 venues in Rosedale within a 500 meter radius

#### First, We have to create a GET request

In [31]:
LIMIT = 100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=ULPXSFN5LXEZS5WN2ZMBESPQFSQ4WZTGHU1FCL4IQUXA45RQ&client_secret=13EIWGTUEQF0QPB2HHHRJNJMM1T0JWXGFRIFDOQWQ3A4DALF&v=20180605&ll=43.6795626,-79.37752940000001&radius=500&limit=100'

#### Send the GET request and examine the resutls

In [32]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '600bede91ae38011348f09e8'},
 'response': {'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.6840626045, 'lng': -79.37131878274371},
   'sw': {'lat': 43.675062595499995, 'lng': -79.38374001725632}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4aff2d47f964a520743522e3',
       'name': 'Rosedale Park',
       'location': {'address': '38 Scholfield Ave.',
        'crossStreet': 'at Edgar Ave.',
        'lat': 43.68232820227814,
        'lng': -79.37893434347683,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.68232820227814,
          'lng': -79.37893434347683}],
        'distance': 32

### Now we have to retrieve, clean and structure the venues data from the JSON format to a pandas DataFrame
#### Below is a function that extracts the category of the venues

In [33]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [34]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Rosedale Park,Playground,43.682328,-79.378934
1,Whitney Park,Park,43.682036,-79.373788
2,Alex Murray Parkette,Park,43.6783,-79.382773
3,Milkman's Lane,Trail,43.676352,-79.373842


In [35]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


### Let's create a function to repeat the same process and retrieve the venues for all the neighborhoods in Downtown Toronto

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Create a DataFrame called DT_Toronto_venues that contains all the venues for all the neighborhoods in Downtown Toronto with running the above function

In [37]:
DT_Toronto_venues = getNearbyVenues(names=df_DT_Toronto['Neighborhood'],
                                   latitudes=df_DT_Toronto['Latitude'],
                                   longitudes=df_DT_Toronto['Longitude'])

Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Queen's Park, Ontario Provincial Government


### Let's check the size of the resulting dataframe

In [38]:
print(DT_Toronto_venues.shape)
DT_Toronto_venues.head()

(1225, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


### Let's check how many venues were returned for each neighborhood

In [39]:
DT_Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,61,61,61,61,61,61
Christie,16,16,16,16,16,16
Church and Wellesley,78,78,78,78,78,78
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",60,60,60,60,60,60


### how many unique venues were returned for all the Downtown neighborhoods:

In [40]:
print('There are {} uniques categories in all Downtown Neighborhoods'.format(len(DT_Toronto_venues['Venue Category'].unique())))

There are 206 uniques categories in all Downtown Neighborhoods


### Due to the fact that we are only interested in the venues that are in the restaurant category, so we have to clean the DataFrame and remove all the unnecessary venues, in this matter we will only consider the the rows that the column 'Venue Category' Contains the word 'Restaurant'

In [41]:
DT_Toronto_restaurant_venues = DT_Toronto_venues[DT_Toronto_venues['Venue Category'].str.contains('Restaurant')]

In [42]:
DT_Toronto_restaurant_venues.reset_index(drop=True,inplace=True)
print(DT_Toronto_restaurant_venues.shape)
DT_Toronto_restaurant_venues.head()

(291, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"St. James Town, Cabbagetown",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
1,"St. James Town, Cabbagetown",43.667967,-79.367675,Murgatroid,43.667381,-79.369311,Restaurant
2,"St. James Town, Cabbagetown",43.667967,-79.367675,F'Amelia,43.667536,-79.368613,Italian Restaurant
3,"St. James Town, Cabbagetown",43.667967,-79.367675,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Mr. Jerk,43.667328,-79.373389,Caribbean Restaurant


### I will remove the rows that the 'Venue Category' is named only 'Restaurant' because we are interested in rows that the type of the restaurant is specified

In [43]:
DT_Toronto_restaurant_venues = DT_Toronto_restaurant_venues[DT_Toronto_restaurant_venues['Venue Category'] != 'Restaurant']
DT_Toronto_restaurant_venues.reset_index(drop=True,inplace=True)
print(DT_Toronto_restaurant_venues.shape)
DT_Toronto_restaurant_venues.head()

(251, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"St. James Town, Cabbagetown",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
1,"St. James Town, Cabbagetown",43.667967,-79.367675,F'Amelia,43.667536,-79.368613,Italian Restaurant
2,"St. James Town, Cabbagetown",43.667967,-79.367675,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
3,"St. James Town, Cabbagetown",43.667967,-79.367675,Mr. Jerk,43.667328,-79.373389,Caribbean Restaurant
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Kanpai Snack Bar,43.664331,-79.368065,Taiwanese Restaurant


# Analyze each neighborhood's venues

In [44]:
# one hot encoding
DT_onehot = pd.get_dummies(DT_Toronto_restaurant_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
DT_onehot['Neighborhood'] = DT_Toronto_restaurant_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [DT_onehot.columns[-1]] + list(DT_onehot.columns[:-1])
DT_onehot = DT_onehot[fixed_columns]

print(DT_onehot.shape)
DT_onehot.head()

(251, 41)


Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Belgian Restaurant,Brazilian Restaurant,Caribbean Restaurant,Chinese Restaurant,Colombian Restaurant,Comfort Food Restaurant,Doner Restaurant,Dumpling Restaurant,Eastern European Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Moroccan Restaurant,New American Restaurant,Portuguese Restaurant,Ramen Restaurant,Seafood Restaurant,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"St. James Town, Cabbagetown",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [45]:
DT_onehot.shape[1]

41

### The above cell indicates that without considering the Neighborhood columns, we have 40 unique type of restaurants in Downtown Toronto

### The below DataFrame indicates the percentage of each type of retaurant in each neighborhood. In other words, the percetange of restaurant types for all the 17 neighborhoods which have at least 1 restaurant in 500 m readius.

In [54]:
DT_grouped_restaurant_type_frequency = DT_onehot.groupby('Neighborhood').mean().reset_index()
print(DT_grouped_restaurant_type_frequency.shape)
DT_grouped_restaurant_type_frequency.head()

(17, 41)


Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Belgian Restaurant,Brazilian Restaurant,Caribbean Restaurant,Chinese Restaurant,Colombian Restaurant,Comfort Food Restaurant,Doner Restaurant,Dumpling Restaurant,Eastern European Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Moroccan Restaurant,New American Restaurant,Portuguese Restaurant,Ramen Restaurant,Seafood Restaurant,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.0,0.1,0.0,0.1,0.0
1,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.066667,0.0,0.0,0.0,0.066667,0.2,0.066667,0.066667,0.0,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.066667,0.0,0.0,0.066667,0.0,0.066667,0.0,0.066667,0.0
2,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Church and Wellesley,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.130435,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.217391,0.0,0.0,0.086957,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.217391,0.0,0.043478,0.043478,0.0,0.0
4,"Commerce Court, Victoria Hotel",0.173913,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.043478,0.0,0.0,0.130435,0.130435,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.130435,0.0,0.0,0.086957,0.0,0.086957,0.0


### Let's create a new DataFrame that contains each neighborhood along with the top 3 most common venues in that neighborhood

### First, let's write a function to sort the venues in descending order:

In [47]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Now we can create the new dataframe and display the top 3 venues for each neighborhood:

In [48]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
DT_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
DT_neighborhoods_venues_sorted['Neighborhood'] = DT_grouped_restaurant_type_frequency['Neighborhood']

for ind in np.arange(DT_grouped_restaurant_type_frequency.shape[0]):
    DT_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(DT_grouped_restaurant_type_frequency.iloc[ind, :], num_top_venues)

DT_neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Berczy Park,Seafood Restaurant,Comfort Food Restaurant,Greek Restaurant
1,Central Bay Street,Italian Restaurant,Indian Restaurant,Chinese Restaurant
2,Christie,Italian Restaurant,Greek Restaurant,Gluten-free Restaurant
3,Church and Wellesley,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant
4,"Commerce Court, Victoria Hotel",American Restaurant,Seafood Restaurant,Italian Restaurant
5,"First Canadian Place, Underground city",Japanese Restaurant,American Restaurant,Asian Restaurant
6,"Garden District, Ryerson",Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant
7,"Harbourfront East, Union Station, Toronto Islands",Italian Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant
8,"Kensington Market, Chinatown, Grange Park",Vietnamese Restaurant,Mexican Restaurant,Vegetarian / Vegan Restaurant
9,"Queen's Park, Ontario Provincial Government",Sushi Restaurant,Italian Restaurant,Japanese Restaurant


## In this stage we could use the power of machine learning and cluster our neighborhoods based on their restaurant category frequency. Undoubtedly, the neighborhoods which seafood retaurants are in their top 3 venues, could be considered a potential neighborhood for opening a new seafood restaurant.

### Cluster the Neighborhoods of Downtown Toronto

#### Run k-means to cluster the neighborhoods in Downtown Toronto into 10 clusters

In [49]:
# set number of clusters
kclusters = 10

DT_grouped_clustering = DT_grouped_restaurant_type_frequency.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(DT_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([8, 1, 2, 5, 3, 3, 9, 1, 6, 5])

### Let's create a new dataframe that includes the cluster as well as the top 3 venues for each neighborhood

In [50]:
# add clustering labels
DT_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

DT_merge = df_DT_Toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
DT_merge = DT_merge.join(DT_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print(DT_merge.shape)
DT_merge.head() # check the last columns!

(19, 9)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,,,,
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,0.0,Italian Restaurant,Indian Restaurant,Thai Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,5.0,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Asian Restaurant,French Restaurant,Vietnamese Restaurant
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,9.0,Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant


In [51]:
# Ignore/drop NaNs
DT_merge.dropna(axis=0, how='any',inplace=True)
DT_merge['Cluster Labels'].astype(int)
DT_merge.reset_index(inplace=True, drop=True)
print("Number of rows after dropping NaNs:", len(DT_merge))
print("Number of NaNs:", DT_merge.isna().sum())

Number of rows after dropping NaNs: 17
Number of NaNs: Postal Code              0
Borough                  0
Neighborhood             0
Latitude                 0
Longitude                0
Cluster Labels           0
1st Most Common Venue    0
2nd Most Common Venue    0
3rd Most Common Venue    0
dtype: int64


### Finally, let's visualize the resulting clusters

In [52]:
# create map
map_clusters = folium.Map(location=[43.656322, -79.380916], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(DT_merge['Latitude'], DT_merge['Longitude'], DT_merge['Neighborhood'], DT_merge['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster+1), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### To conclude, We clustered our Downtown neighborhoods based on their retaurant frequency type, in other words, we created a dataframe that contains each neighborhood along with their top 3 most common restaurant categories and then we clusterd them based on that. As we can see in the above map, in the orange cluster (cluster number 9), the most common retaurant in these two neighborhoods are 'seafood restaurant' that could be a good choice for opening a new seafood restaurant in these neighborhoods.

##### Cluster 1

In [57]:
DT_merge.loc[DT_merge['Cluster Labels'] == 0, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"St. James Town, Cabbagetown",0.0,Italian Restaurant,Indian Restaurant,Thai Restaurant


##### Cluster 2

In [58]:
DT_merge.loc[DT_merge['Cluster Labels'] == 1, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
6,Central Bay Street,1.0,Italian Restaurant,Indian Restaurant,Chinese Restaurant
8,"Harbourfront East, Union Station, Toronto Islands",1.0,Italian Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant


##### Cluster 3

In [59]:
DT_merge.loc[DT_merge['Cluster Labels'] == 2, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
15,Christie,2.0,Italian Restaurant,Greek Restaurant,Gluten-free Restaurant


##### Cluster 4

In [60]:
DT_merge.loc[DT_merge['Cluster Labels'] == 3, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
4,St. James Town,3.0,American Restaurant,Moroccan Restaurant,Italian Restaurant
7,"Richmond, Adelaide, King",3.0,Thai Restaurant,American Restaurant,Sushi Restaurant
9,"Toronto Dominion Centre, Design Exchange",3.0,American Restaurant,Seafood Restaurant,Italian Restaurant
10,"Commerce Court, Victoria Hotel",3.0,American Restaurant,Seafood Restaurant,Italian Restaurant
14,"First Canadian Place, Underground city",3.0,Japanese Restaurant,American Restaurant,Asian Restaurant


##### Cluster 5

In [61]:
DT_merge.loc[DT_merge['Cluster Labels'] == 4, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
2,"Regent Park, Harbourfront",4.0,Asian Restaurant,French Restaurant,Vietnamese Restaurant


##### Cluster 6

In [62]:
DT_merge.loc[DT_merge['Cluster Labels'] == 5, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
1,Church and Wellesley,5.0,Sushi Restaurant,Japanese Restaurant,Fast Food Restaurant
16,"Queen's Park, Ontario Provincial Government",5.0,Sushi Restaurant,Italian Restaurant,Japanese Restaurant


##### Cluster 7

In [63]:
DT_merge.loc[DT_merge['Cluster Labels'] == 6, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,"Kensington Market, Chinatown, Grange Park",6.0,Vietnamese Restaurant,Mexican Restaurant,Vegetarian / Vegan Restaurant


##### Cluster 8

In [64]:
DT_merge.loc[DT_merge['Cluster Labels'] == 7, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
11,"University of Toronto, Harbord",7.0,Japanese Restaurant,Italian Restaurant,French Restaurant


##### Cluster 9

In [53]:
DT_merge.loc[DT_merge['Cluster Labels'] == 8, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
5,Berczy Park,8.0,Seafood Restaurant,Comfort Food Restaurant,Greek Restaurant
13,Stn A PO Boxes,8.0,Seafood Restaurant,Italian Restaurant,Japanese Restaurant


##### Cluster 10

In [66]:
DT_merge.loc[DT_merge['Cluster Labels'] == 9, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
3,"Garden District, Ryerson",9.0,Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant
