# Segmenting and Clustering Neighborhoods in Toronto - Notebook 03

## First insert your Foursquare API credentials, please:

In [None]:
# Foursquare API credentials: replace the placeholders with your own values.
CLIENT_ID = 'XXXXX'
CLIENT_SECRET = 'XXXXX'
# Value sent as the `v` query parameter of every Foursquare request
# (looks like a YYYYMMDD date; confirm against the API documentation).
VERSION = '20180605'

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)

## Installing the (possibly) missing libraries:

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes

In [None]:
!conda install -c conda-forge geopy --yes

## Importing all the libraries at one time:

In [None]:
import folium

from geopy.geocoders import Nominatim

import json

import matplotlib.cm as cm
import matplotlib.colors as colors

import numpy as np

import pandas as pd
from pandas.io.json import json_normalize

import requests

from sklearn.cluster import KMeans

print('Libraries imported!')

## Reading the data from the URL and selecting the first returned table:

In [None]:
# Parse every HTML table on the Wikipedia page and keep the first one.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_url)
postal_df = wiki_tables[0]

print('Data read!')

## Naming the columns according to what is expected:

In [None]:
# Replace the scraped header labels with the names used by the rest of the notebook.
# The assignment requires the table to have exactly three columns.
postal_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

## A first look at our data:

In [None]:
postal_df.head(10)

## Removing the rows with invalid borough entries: 

In [None]:
# Drop the rows whose borough is the 'Not assigned' placeholder and renumber
# the remaining rows from zero.
# Chaining reset_index() yields a fresh frame instead of renumbering a filtered
# slice in place, so later .loc assignments on postal_df do not trigger
# pandas' SettingWithCopyWarning.
postal_df = postal_df[postal_df['Borough'] != 'Not assigned'].reset_index(drop = True)

## Fixing the invalid neighborhood entries:

In [None]:
# A neighbourhood left as 'Not assigned' takes the name of its borough.
# One masked assignment replaces the row-by-row loop and does not depend on
# the index being exactly 0..n-1.
unassigned = postal_df['Neighborhood'] == 'Not assigned'

postal_df.loc[unassigned, 'Neighborhood'] = postal_df.loc[unassigned, 'Borough']

## All right, let's look at our dataframe with the fixed entries for boroughs and neighborhoods:

In [None]:
postal_df.head(10)

## Creating a new dataframe to contain the grouped neighborhood entries:

In [None]:
# Collapse postal_df to one row per (PostalCode, Borough) pair, joining the
# neighbourhood names of each pair into a single comma-separated string.
#
# sort = False keeps the pairs in order of first appearance, which is the row
# order drop_duplicates() produced before; reset_index() turns the two group
# keys back into ordinary columns, giving PostalCode, Borough, Neighborhood.
output_df = (
    postal_df
    .groupby(['PostalCode', 'Borough'], sort = False)['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

## Looking at the first ten results of our operations:

In [None]:
output_df.head(10)

## Checking the number of rows of our dataframe with grouped neighborhoods: 

In [None]:
# Report how many grouped rows the dataframe holds.
row_count = len(output_df)
print("Number of rows: {}".format(row_count))

## Downloading the geospatial database:

In [None]:
!wget -q -O Geospatial_data.csv https://cocl.us/Geospatial_data
    
print('Data downloaded!')

## Loading the geospatial database into a Pandas dataframe and checking the data:

In [None]:
# Load the downloaded CSV and rename its columns so the postal-code column
# shares the 'PostalCode' name used by the main dataframe.
geo_columns = ['PostalCode', 'Latitude', 'Longitude']

geospatial_df = pd.read_csv('Geospatial_data.csv')
geospatial_df.columns = geo_columns

geospatial_df.head(10)

## Creating two new columns in our main dataframe:

In [None]:
# Add two all-NaN placeholder columns for the coordinates.
# The dtype is given explicitly: a bare pd.Series() emits a DeprecationWarning
# on pandas 1.x and defaults to object dtype on pandas 2.x, which would leave
# the coordinate columns non-numeric.
output_df['Latitude'] = pd.Series(dtype = 'float64')
output_df['Longitude'] = pd.Series(dtype = 'float64')

## Filling the new columns with data gathered from the geospatial dataframe: 

In [None]:
# Look the coordinates up by postal code.
# The geospatial frame is indexed by PostalCode once (keeping the first row per
# code, as the previous `.values[0]` lookup did) so every postal code is
# resolved with an index lookup instead of two full boolean scans per row.
coords_by_code = geospatial_df.drop_duplicates('PostalCode').set_index('PostalCode')

# Fail with a message naming the offending codes instead of an opaque
# "index 0 is out of bounds" error.
known_code = output_df['PostalCode'].isin(coords_by_code.index)
if not known_code.all():
    missing_codes = output_df.loc[~known_code, 'PostalCode'].tolist()
    raise KeyError('No coordinates found for postal codes: {}'.format(missing_codes))

output_df['Latitude'] = output_df['PostalCode'].map(coords_by_code['Latitude'])
output_df['Longitude'] = output_df['PostalCode'].map(coords_by_code['Longitude'])

## Checking the results of our operations:

In [None]:
output_df.head(10)

## Creating the Toronto map:

In [None]:
# Resolve the city name to coordinates with the Nominatim geocoder.
address = 'Toronto'

geolocator = Nominatim(user_agent = "myExplorer")

location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop here with a
# clear message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude

print('The geographical coordinates of Toronto are {}º and {}º.'.format(latitude, longitude))

In [None]:
# Base map centred on the `latitude` / `longitude` pair:

mapToronto = folium.Map(
    location = [latitude, longitude],
    zoom_start = 10,
    min_zoom = 9,
    max_zoom = 14)

# Plotting one circle marker per Postal Code:

marker_rows = zip(
    output_df['PostalCode'],
    output_df['Borough'],
    output_df['Neighborhood'],
    output_df['Latitude'],
    output_df['Longitude'])

for postal_code, borough_name, neighborhood_names, marker_lat, marker_lng in marker_rows:

    # Popup text: "<postal code> (<neighbourhoods>) - <borough>"
    popup_text = '{} ({}) - {}'.format(postal_code, neighborhood_names, borough_name)

    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius = 5,
        popup = folium.Popup(popup_text, parse_html = True),
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(mapToronto)

# Showing the map:

mapToronto

## Tasks related to the THIRD part of the project:

### Recycled function to get the venues by location using the Foursquare API:

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500, limit = 100):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Consumed in lockstep; each (name, lat, lng) triple produces one request.
        The name is copied into the 'PostalCode' column of every venue row
        found for that location.
    radius : int, optional
        Sent as the `radius` query parameter (metres according to the
        Foursquare API docs; confirm).
    limit : int, optional
        Sent as the `limit` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'PostalCode', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but has these columns) when no
        venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If a request does not complete within the timeout.
    """

    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    column_names = [
        'PostalCode',
        'Latitude',
        'Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category']

    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):

        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit}

        # A timeout keeps one stalled request from hanging the notebook, and
        # raise_for_status() surfaces API errors (bad credentials, quota) as
        # such instead of as a KeyError on the missing payload.
        response = requests.get(endpoint, params = query, timeout = 30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        for item in items:

            venue = item['venue']

            # Some venues come back without any category; keep the venue and
            # leave its category empty rather than failing on categories[0].
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None

            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch.
    return pd.DataFrame(venue_rows, columns = column_names)

### Recycled function to get the most common venues by location:

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped (in this notebook it holds the
    PostalCode); the remaining entries are ranked by value in descending
    order and the index labels of the first `num_top_venues` are returned
    as a NumPy array.
    """

    ranked = row.iloc[1:].sort_values(ascending = False)
    ranked_labels = ranked.index.values

    return ranked_labels[:num_top_venues]

## I decided to work with the boroughs that contain the word "Toronto" in their name, since they are the "core" of the city. So, let's select them:

In [None]:
# Alphabetical list of the distinct borough names that contain "Toronto".
mentions_toronto = output_df['Borough'].str.contains('Toronto')
boroughs = sorted(output_df.loc[mentions_toronto, 'Borough'].unique())

print(boroughs)

## And now we generate a new dataframe selecting only the Postal Codes that are located in the selected boroughs: 

In [None]:
# Keep the rows that belong to the selected boroughs, renumbered from zero.
in_selected_borough = output_df['Borough'].isin(boroughs)
centralTorontoData = output_df.loc[in_selected_borough].reset_index(drop = True)

centralTorontoData.head(10)

## NOTE: I decided to keep working with the Postal Codes because the Geospatial Database is more stable than the Geocoder. 

## Now let's create a first map of the studied region, including the four central boroughs of Toronto:

In [None]:
# The map is centred on the mean coordinates of the studied region:

meanLatitude = centralTorontoData['Latitude'].mean()
meanLongitude = centralTorontoData['Longitude'].mean()

mapCentralToronto = folium.Map(
    location = [meanLatitude, meanLongitude],
    zoom_start = 12,
    min_zoom = 10,
    max_zoom = 14)

# Plotting one circle marker per Postal Code:

central_rows = zip(
    centralTorontoData['PostalCode'],
    centralTorontoData['Borough'],
    centralTorontoData['Neighborhood'],
    centralTorontoData['Latitude'],
    centralTorontoData['Longitude'])

for postal_code, borough_name, neighborhood_names, marker_lat, marker_lng in central_rows:

    # Popup text: "<postal code> (<neighbourhoods>) - <borough>"
    popup_text = '{} ({}) - {}'.format(postal_code, neighborhood_names, borough_name)

    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius = 5,
        popup = folium.Popup(popup_text, parse_html = True),
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(mapCentralToronto)

# Showing the map:

mapCentralToronto

## Using the declared function getNearbyVenues to search for nearby venues:

In [None]:
# Fetch the venues around every central Postal Code; radius and limit keep
# the defaults of getNearbyVenues.
central_codes = centralTorontoData['PostalCode']
central_latitudes = centralTorontoData['Latitude']
central_longitudes = centralTorontoData['Longitude']

centralTorontoVenues = getNearbyVenues(central_codes, central_latitudes, central_longitudes)

centralTorontoVenues.head(10)

## Checking the number of venues by Postal Code:

In [None]:
# count() tallies the non-null 'Venue' entries of each PostalCode group.
centralTorontoVenues.groupby('PostalCode')['Venue'].count()

## Checking the total number of venues:

In [None]:
# count() is the number of non-null venue rows, not the number of distinct
# venues: a venue returned for two postal codes is counted twice. The message
# is worded to match what is actually computed.
n = centralTorontoVenues['Venue'].count()
print('There are {} venues in total.'.format(n))

## Checking the number of different categories of venues:

In [None]:
# Number of distinct values in the 'Venue Category' column.
n = len(centralTorontoVenues['Venue Category'].unique())
print('There are {} unique categories.'.format(n))

## Using one hot encoding to generate a new dataframe:

In [None]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
category_dummies = pd.get_dummies(centralTorontoVenues[['Venue Category']], prefix = "", prefix_sep = "")
category_columns = list(category_dummies.columns)

# Attach the Postal Code of each venue and move it to the first position.
category_dummies['PostalCode'] = centralTorontoVenues['PostalCode']
centralTorontoVenues_onehot = category_dummies[['PostalCode'] + category_columns]

centralTorontoVenues_onehot.head(10)

## Grouping and normalizing the new dataframe:

In [None]:
# Averaging the indicator columns per Postal Code gives the share of each
# category among the venues of that Postal Code.
venues_by_code = centralTorontoVenues_onehot.groupby('PostalCode')
centralTorontoVenues_onehot = venues_by_code.mean().reset_index()

centralTorontoVenues_onehot.head(10)

In [None]:
# Print, for every Postal Code, its most frequent venue categories.
num_top_venues = 5

for code in centralTorontoVenues_onehot['PostalCode']:
    print("---- " + code + " ----")
    # Select the row of this Postal Code and transpose it, so every column of
    # the one-hot frame becomes one row of `temp`.
    temp = centralTorontoVenues_onehot[centralTorontoVenues_onehot['PostalCode'] == code].T.reset_index()
    temp.columns = ['venue', 'freq']
    # The first transposed row is the PostalCode itself, not a category.
    temp = temp.iloc[1:]
    # Convert the frequencies to float before rounding them to two decimals.
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

## Creating a new dataframe with the sorted most common venues by Postal Code:

In [None]:
# Build a frame with one row per Postal Code listing its `num_top_venues`
# most frequent venue categories, in decreasing order of frequency.
num_top_venues = 10

# Ordinal suffixes of the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

columns = ['PostalCode']

for ind in range(num_top_venues):
    # An explicit bounds test replaces the bare `except:`, which would also
    # have hidden any unrelated error raised while building the name.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of every row, then assemble the frame in one step
# instead of filling an empty frame cell by cell.
top_venues = [
    return_most_common_venues(row, num_top_venues)
    for _, row in centralTorontoVenues_onehot.iterrows()]

centralTorontoVenues_onehot_sorted = pd.DataFrame(
    top_venues,
    columns = columns[1:],
    index = centralTorontoVenues_onehot.index)
centralTorontoVenues_onehot_sorted.insert(0, 'PostalCode', centralTorontoVenues_onehot['PostalCode'])

centralTorontoVenues_onehot_sorted.head(10)

## Using the KMeans algorithm to cluster the Postal Codes by their (dis)similarity: 

In [None]:
# Number of clusters to fit.
k = 5

# Only the category frequencies are features. The column is dropped by
# keyword: pandas 2.x rejects the positional axis argument of
# DataFrame.drop('PostalCode', 1).
centralTorontoVenues_clustering = centralTorontoVenues_onehot.drop(columns = 'PostalCode')

# random_state is fixed so repeated runs produce the same labels.
kmeans = KMeans(n_clusters = k, random_state = 0).fit(centralTorontoVenues_clustering)

print('Generated KMeans labels:\n')
print(kmeans.labels_)

## Merging the generated data to assign the cluster label and the most common venues to each Postal Code:

In [None]:
# kmeans.labels_ follows the row order of centralTorontoVenues_onehot (the
# frame the model was fitted on), which is grouped by PostalCode and only
# holds the postal codes that returned venues. centralTorontoData is neither
# guaranteed to be in that order nor to have the same number of rows, so the
# labels are paired with their postal codes and aligned by key, not by
# position.
cluster_by_code = pd.Series(
    kmeans.labels_,
    index = centralTorontoVenues_onehot['PostalCode'].values)

# Postal codes without venues were never clustered, so they are left out.
has_cluster = centralTorontoData['PostalCode'].isin(cluster_by_code.index)
centralTorontoMerged = centralTorontoData[has_cluster].reset_index(drop = True)

# Position 1 keeps the column layout that the cluster-inspection cells
# address by column number.
centralTorontoMerged.insert(
    1,
    'Cluster Label',
    centralTorontoMerged['PostalCode'].map(cluster_by_code).astype(int))

# Append the ranked venue columns of each Postal Code.
centralTorontoMerged = centralTorontoMerged.join(
    centralTorontoVenues_onehot_sorted.set_index('PostalCode'),
    on = 'PostalCode')

centralTorontoMerged.head(10)

## Creating the map of the clusters:

In [None]:
# Creating the map:

mapClusters = folium.Map(location = [meanLatitude, meanLongitude], zoom_start = 12, min_zoom = 10, max_zoom = 14)

# Defining the clusters colors: k evenly spaced samples of the rainbow
# colormap, converted to hex strings, one per cluster label.

colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Plotting the clusters:

for code, cluster, borough, neighborhood, lat, long in zip(
    centralTorontoMerged['PostalCode'],
    centralTorontoMerged['Cluster Label'],
    centralTorontoMerged['Borough'],
    centralTorontoMerged['Neighborhood'],
    centralTorontoMerged['Latitude'], 
    centralTorontoMerged['Longitude']):
    
    # Labels are 0-based; the popup shows them 1-based to match the
    # "Cluster 1" ... "Cluster 5" headings.
    label = 'Cluster {}: {} ({}) - {}'.format((cluster + 1), code, neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    
    # The 0-based label indexes the palette directly. `cluster - 1` only
    # worked because index -1 wraps around to the last color.
    cluster_color = rainbow[cluster]
    
    marker = folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = label,
        color = cluster_color,
        fill = True,
        fill_color = cluster_color,
        fill_opacity = 0.7)
    
    marker.add_to(mapClusters)
    
# Showing the map:
       
mapClusters

## Examining the clusters:

### Cluster 1:

In [None]:
# Rows labelled 0, showing column 0 (PostalCode), column 3 (Neighborhood)
# and every column from position 6 onwards (the ranked venues).
in_cluster = centralTorontoMerged['Cluster Label'] == 0
shown_positions = [0, 3] + list(range(6, centralTorontoMerged.shape[1]))

centralTorontoMerged.loc[in_cluster, centralTorontoMerged.columns[shown_positions]]

### Cluster 2:

In [None]:
# Rows labelled 1, showing column 0 (PostalCode), column 3 (Neighborhood)
# and every column from position 6 onwards (the ranked venues).
in_cluster = centralTorontoMerged['Cluster Label'] == 1
shown_positions = [0, 3] + list(range(6, centralTorontoMerged.shape[1]))

centralTorontoMerged.loc[in_cluster, centralTorontoMerged.columns[shown_positions]]

### Cluster 3:

In [None]:
# Rows labelled 2, showing column 0 (PostalCode), column 3 (Neighborhood)
# and every column from position 6 onwards (the ranked venues).
in_cluster = centralTorontoMerged['Cluster Label'] == 2
shown_positions = [0, 3] + list(range(6, centralTorontoMerged.shape[1]))

centralTorontoMerged.loc[in_cluster, centralTorontoMerged.columns[shown_positions]]

### Cluster 4:

In [None]:
# Rows labelled 3, showing column 0 (PostalCode), column 3 (Neighborhood)
# and every column from position 6 onwards (the ranked venues).
in_cluster = centralTorontoMerged['Cluster Label'] == 3
shown_positions = [0, 3] + list(range(6, centralTorontoMerged.shape[1]))

centralTorontoMerged.loc[in_cluster, centralTorontoMerged.columns[shown_positions]]

### Cluster 5:

In [None]:
# Rows labelled 4, showing column 0 (PostalCode), column 3 (Neighborhood)
# and every column from position 6 onwards (the ranked venues).
in_cluster = centralTorontoMerged['Cluster Label'] == 4
shown_positions = [0, 3] + list(range(6, centralTorontoMerged.shape[1]))

centralTorontoMerged.loc[in_cluster, centralTorontoMerged.columns[shown_positions]]