### Importing Libraries and Data

### Task - 1 Fetching the data and converting the given format

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
# uncomment this line if you haven't completed the Foursquare API lab

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
#Reading data using pandas read_html function
tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header=0)

#Reading the data using pandas 
raw_data=tables[0]

#Removing rows having borough as 'Not assigned'
toronto_df=raw_data.drop(raw_data.loc[raw_data['Borough']=='Not assigned'].index).reset_index(drop=True)
toronto_df.rename(columns={'Postal Code':'PostalCode'},inplace=True)

#Updating neighborhood column value if there is 'Not assigned'
toronto_df['Neighborhood']=toronto_df.apply(lambda x: x['Borough'] if x['Neighborhood']=='Not assigned' else x['Neighborhood'],axis=1)

#combing the values of neighborhood if there exist more than 1 for single zip code
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist())) 
grp = toronto_df.groupby(['PostalCode', 'Borough'])
toronto_df2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')
#shape of the dataframe
toronto_df2.shape

(103, 3)

### Task - 2  Adding the co-ordinate columns from csv (The API is not working) and taking the records which contains 'Toronto' in borough



In [3]:
#Reading the csv file having the coordinate of the zip codes 
lat_lng=pd.read_csv('Geospatial_Coordinates.csv')
lat_lng.rename(columns={'Postal Code':'PostalCode'},inplace=True)

#Merging the coordinate to the toronto dataframe
toronto_df2=toronto_df2.merge(lat_lng, on ='PostalCode',how='inner')

#Records where the Borough column contains 'Toronto'
#df_toronto=toronto_df2[toronto_df2['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_toronto=toronto_df2
df_toronto.shape

(103, 5)

### Task - 3 Segmenting and Clustering

In [4]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

In [5]:
#Co-ordinates for Toronto
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [6]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [7]:
# Foursquare Credentials
CLIENT_ID = '0HA1RJT3KT2DJATDGIB4JNXPVW5FJ4BWKX2HDEJTRADF5CFF' # your Foursquare ID
CLIENT_SECRET = 'KDP2KMUOFEMACIEGNR5GVXX03YU5QDDX5CQRHJOIH4Q3K3I3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0HA1RJT3KT2DJATDGIB4JNXPVW5FJ4BWKX2HDEJTRADF5CFF
CLIENT_SECRET:KDP2KMUOFEMACIEGNR5GVXX03YU5QDDX5CQRHJOIH4Q3K3I3


In [8]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                   
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [15]:
# Venues
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude'])

#### Let's check the size of the resulting dataframe

In [16]:
print(toronto_venues.shape)
toronto_venues.head()

(2130, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


In [18]:
toronto_venues_restaurant=toronto_venues[toronto_venues['Venue Category'].str.contains('Restaurant')]

In [54]:
# Top 5 Neighborhoods
a=toronto_venues_restaurant['Neighborhood'].value_counts()[0:5]
a=pd.DataFrame(a)
a.reset_index(inplace=True)
a.columns=['neighborhood','restaurant_count']
ng=a['neighborhood'].to_list()

In [66]:
#Data of Top 5 Neighborhoods
toronto_venues_restaurant_top_5=toronto_venues_restaurant[toronto_venues_restaurant['Neighborhood'].isin(ng)].reset_index(drop=True)

#### Let's find out how many unique categories can be curated from all the returned venues

In [68]:
print('There are {} unique type of restuarant.'.format(len(toronto_venues_restaurant_top_5['Venue Category'].unique())))

There are 25 unique type of restuarant.


### Analyze Each Neighborhood

In [69]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues_restaurant_top_5[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues_restaurant_top_5['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
print(toronto_onehot.shape)
toronto_onehot.head()

(130, 26)


Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Brazilian Restaurant,Chinese Restaurant,Colombian Restaurant,Ethiopian Restaurant,Fast Food Restaurant,French Restaurant,Gluten-free Restaurant,Greek Restaurant,Italian Restaurant,Japanese Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,New American Restaurant,Ramen Restaurant,Restaurant,Seafood Restaurant,Sushi Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Garden District, Ryerson",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [70]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Brazilian Restaurant,Chinese Restaurant,Colombian Restaurant,Ethiopian Restaurant,Fast Food Restaurant,French Restaurant,Gluten-free Restaurant,Greek Restaurant,Italian Restaurant,Japanese Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,New American Restaurant,Ramen Restaurant,Restaurant,Seafood Restaurant,Sushi Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,"Commerce Court, Victoria Hotel",0.133333,0.033333,0.0,0.0,0.0,0.0,0.033333,0.033333,0.033333,0.0,0.1,0.1,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.233333,0.1,0.0,0.066667,0.066667,0.0
1,"First Canadian Place, Underground city",0.1,0.1,0.033333,0.0,0.033333,0.0,0.033333,0.0,0.033333,0.033333,0.033333,0.1,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.166667,0.1,0.066667,0.066667,0.033333,0.0
2,"Garden District, Ryerson",0.0,0.0,0.0,0.045455,0.0,0.045455,0.090909,0.0,0.0,0.0,0.136364,0.136364,0.0,0.0,0.045455,0.136364,0.045455,0.045455,0.090909,0.045455,0.045455,0.0,0.045455,0.0,0.045455
3,"Richmond, Adelaide, King",0.045455,0.045455,0.045455,0.0,0.045455,0.0,0.045455,0.0,0.045455,0.0,0.0,0.045455,0.045455,0.045455,0.0,0.0,0.045455,0.045455,0.0,0.181818,0.045455,0.090909,0.136364,0.045455,0.0
4,"Toronto Dominion Centre, Design Exchange",0.115385,0.038462,0.0,0.038462,0.0,0.0,0.038462,0.038462,0.038462,0.038462,0.115385,0.115385,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.153846,0.115385,0.076923,0.0,0.038462,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [71]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Commerce Court, Victoria Hotel----
                 venue  freq
0           Restaurant  0.23
1  American Restaurant  0.13
2   Italian Restaurant  0.10
3   Seafood Restaurant  0.10
4  Japanese Restaurant  0.10


----First Canadian Place, Underground city----
                 venue  freq
0           Restaurant  0.17
1  American Restaurant  0.10
2   Seafood Restaurant  0.10
3     Asian Restaurant  0.10
4  Japanese Restaurant  0.10


----Garden District, Ryerson----
                       venue  freq
0         Italian Restaurant  0.14
1        Japanese Restaurant  0.14
2  Middle Eastern Restaurant  0.14
3       Fast Food Restaurant  0.09
4           Ramen Restaurant  0.09


----Richmond, Adelaide, King----
                 venue  freq
0           Restaurant  0.18
1      Thai Restaurant  0.14
2     Sushi Restaurant  0.09
3  American Restaurant  0.05
4  Japanese Restaurant  0.05


----Toronto Dominion Centre, Design Exchange----
                 venue  freq
0           Restaurant  0.15
1

#### Let's put that into a *pandas* dataframe

In [72]:
#First, let's write a function to sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [73]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Commerce Court, Victoria Hotel",Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Thai Restaurant,Asian Restaurant,Fast Food Restaurant,French Restaurant
1,"First Canadian Place, Underground city",Restaurant,American Restaurant,Seafood Restaurant,Asian Restaurant,Japanese Restaurant,Thai Restaurant,Sushi Restaurant,Gluten-free Restaurant,New American Restaurant,Brazilian Restaurant
2,"Garden District, Ryerson",Italian Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Ramen Restaurant,Vietnamese Restaurant,Chinese Restaurant,Mexican Restaurant,Modern European Restaurant,New American Restaurant
3,"Richmond, Adelaide, King",Restaurant,Thai Restaurant,Sushi Restaurant,Latin American Restaurant,Vegetarian / Vegan Restaurant,Asian Restaurant,Brazilian Restaurant,Colombian Restaurant,Fast Food Restaurant,Gluten-free Restaurant
4,"Toronto Dominion Centre, Design Exchange",Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Sushi Restaurant,Fast Food Restaurant,Greek Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant


## Cluster Neighborhoods

In [74]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 4, 1, 2, 0])

In [75]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Dropping data not having any clusters
toronto_merged=toronto_merged[toronto_merged['Cluster Labels'].notnull()].reset_index(drop=True)
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].apply(lambda x : int(x))
toronto_merged # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Italian Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Ramen Restaurant,Vietnamese Restaurant,Chinese Restaurant,Mexican Restaurant,Modern European Restaurant,New American Restaurant
1,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,2,Restaurant,Thai Restaurant,Sushi Restaurant,Latin American Restaurant,Vegetarian / Vegan Restaurant,Asian Restaurant,Brazilian Restaurant,Colombian Restaurant,Fast Food Restaurant,Gluten-free Restaurant
2,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0,Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Sushi Restaurant,Fast Food Restaurant,Greek Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant
3,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,3,Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Thai Restaurant,Asian Restaurant,Fast Food Restaurant,French Restaurant
4,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,4,Restaurant,American Restaurant,Seafood Restaurant,Asian Restaurant,Japanese Restaurant,Thai Restaurant,Sushi Restaurant,Gluten-free Restaurant,New American Restaurant,Brazilian Restaurant


In [76]:
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Italian Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Ramen Restaurant,Vietnamese Restaurant,Chinese Restaurant,Mexican Restaurant,Modern European Restaurant,New American Restaurant
1,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,2,Restaurant,Thai Restaurant,Sushi Restaurant,Latin American Restaurant,Vegetarian / Vegan Restaurant,Asian Restaurant,Brazilian Restaurant,Colombian Restaurant,Fast Food Restaurant,Gluten-free Restaurant
2,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0,Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Sushi Restaurant,Fast Food Restaurant,Greek Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant
3,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,3,Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Thai Restaurant,Asian Restaurant,Fast Food Restaurant,French Restaurant
4,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,4,Restaurant,American Restaurant,Seafood Restaurant,Asian Restaurant,Japanese Restaurant,Thai Restaurant,Sushi Restaurant,Gluten-free Restaurant,New American Restaurant,Brazilian Restaurant


### Finally, let's visualize the resulting clusters

In [77]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       

map_clusters

In [None]:
print("We can see the top 5 Neighbors where a person can open a restuarant ")