## IMPORT STATEMENTS

In [1]:
import numpy as np
import pandas as pd
import urllib
from urllib import error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
import requests
import urllib
from geopy.geocoders import Nominatim
import geopy
import folium
import webbrowser
import time
import pyproj
import math
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib notebook

## DEFINE GENERAL CONSTANTS

In [2]:
TORONTO_UTM_ZONE = 17
USE_GITHUB_DATA_FILES = True
GITHUB_GEODATA_URL = "https://raw.githubusercontent.com/mauromariotto/w3_data_science_capstone/master" \
                     "/Geospatial_Coordinates.csv"
TORONTO_CITY_WIKI_LINK = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
TORONTO_GEO_DATA_CSV = "./data/Geospatial_Coordinates.csv"

# Foursquare
FS_LIMIT = 100
FS_RADIUS = 500
FS_VERSION = '20180605'
FS_CLIENT_ID = 'S302XSEPLIYOI3IRSNR5YQMFW2XQAZ51C4BEMKTEIP0UIKBX'
FS_CLIENT_SECRET = 'HIYXXEYLBSJ1ZJHKIKXJQAITZSKKEGK1NKGEN15WKEQJV14G'

In [3]:
TORONTO_NEIGH_CSV = "./data/toronto_neigh_df.csv"
TORONTO_VENUES_CSV = "./data/toronto_venues.csv"
TORONTO_NEIGH_DEF_CSV = "./data/toronto_neigh_def_df.csv"
TORONTO_VENUES_CATEGORIES_CSV = "./data/toronto_venues_categories.csv"
TORONTO_WIKIPEDIA_HTML_FILE = "./output/toronto-wiki.html"
FINAL_TORONTO_NEIGH_CSV = "./data/final_toronto_neigh_def_df.csv"

## GENERAL FUNCTION USED IN THE PROJECT


In [4]:
# drops unamed columns from the dataframe df
def drop_unnamed_columns(df):
    df.drop(df.columns[df.columns.str.contains('unnamed', case=False)],
            axis=1, inplace=True)


In [5]:
# function to save content to a file
def save2file(content: str, fn: str):
    with open(fn, 'w') as f:
        f.write(str(content))

In [6]:
# function to create a path
def makedir(path):
    if not os.path.exists(path):
        os.makedirs(path)

In [7]:
# function checks if filename fn exists 
def file_exists(fn):
    result = os.path.isfile(fn)
    return result

In [8]:
def rename_unnamed_colums(current_colname, new_colname):
    if "UNNAMED" in current_colname.upper():
        result = new_colname
    else:
        result = current_colname
    return result

In [9]:
# function for parsing wiki page of Toronto
def parse_wiki_page(html):
    bs = BeautifulSoup(html, 'html.parser')
    table = bs.find('table', {'class': 'wikitable sortable'})
    result = []
    for district in table.tr.next_siblings:
        if hasattr(district, 'contents'):
            data = district.contents
            for element in data:
                if hasattr(element, 'contents'):
                    result.append(str(element.contents[0]).strip())
    return result

In [10]:
# function to load the dataframe with the information obtained from the wiki 
# page about Toronto
def load_city_dataframe(city_info):
    start = 0
    delta = 3
    stop = delta
    city_data_len = len(city_info)
    total_rows = city_data_len // 3
    row = 0
    t_index = row
    # Create empty dataframe
    # The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
    df = pd.pandas.DataFrame(columns=('PostalCode', 'Borough', 'Neighborhood'))

    while row < total_rows:
        data = city_info[start:stop]
        t_postal_code, t_borough, t_neigh = data
        t_borough = t_borough.strip()
        t_neigh = t_neigh.strip()
        start += delta
        stop += delta
        # Only process the cells that have an assigned borough.
        # Ignore cells with a borough that is Not assigned.
        if t_borough.upper() != "NOT ASSIGNED":
            # If a cell has a borough but a Not assigned neighborhood,
            # then the neighborhood will be the same as the borough.
            if t_neigh.upper() == "NOT ASSIGNED":
                t_neigh = t_borough
            # Verify if postalcode already in dataframe
            found_list = df.index[df['PostalCode'] == t_postal_code].tolist()
            if len(found_list) == 0:
                df.loc[t_index] = [t_postal_code, t_borough, t_neigh]
                t_index += 1
            else:
                print("A Duplicate postalcode has been found. Update Dataframe without appending")
                found_index = found_list[0]
                s = df.loc[found_index]['Neighborhood']
                df.loc[found_index]['Neighborhood'] = s + ", " + t_neigh
        row += 1
    return df

In [11]:
# function that transforms longitude and latitude into UTM 2D 
# coordinates for zone 17 corresponding to Totonto city
def latlng_2_utm(t_lat, t_lon):
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=TORONTO_UTM_ZONE, datum='WGS84')
    xy = pyproj.transform(proj_latlon, proj_xy, t_lon, t_lat)
    return xy[0], xy[1]

In [12]:
# function that transforms the UTM 2D coordinates for TORONTO zone=17 to longitude 
# and latitude
def utm_2_latlng(t_x, t_y):
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=TORONTO_UTM_ZONE, datum='WGS84')
    lonlat = pyproj.transform(proj_xy, proj_latlon, t_x, t_y)
    return lonlat[1], lonlat[0]

In [13]:
# function that calculate the euclidean distance between two points
def euclidean_distance_meter(x1, y1, x2, y2):
    dx = x2 - x1
    dy = y2 - y1
    return math.sqrt(dx*dx + dy*dy)

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [15]:
# function to repeat the same search process to all the neighborhoods using foursquare API
# Returns a dataframe with the info for each neighborhood
def get_nearby_venues(names, latitudes, longitudes, radius, limit):
    venues_list = []
    for name, latitude, longitude in zip(names, latitudes, longitudes):
        print("Calling foursquare API for neighborhood: {name}".format(name=name))
        # create the API request URL
        tmp_url = 'https://api.foursquare.com/v2/venues/explore?'
        tmp_url = tmp_url + '&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
        tmp_url = tmp_url.format(
            FS_CLIENT_ID,
            FS_CLIENT_SECRET,
            FS_VERSION,
            latitude,
            longitude,
            radius,
            limit)
        # make the GET request
        try:
            results = requests.get(tmp_url).json()["response"]['groups'][0]['items']
            # return only relevant information for each nearby venue
            venues_list.append([(
                name, latitude, longitude,
                v['venue']['name'],
                v['venue']['location']['lat'], v['venue']['location']['lng'],
                v['venue']['categories'][0]['name']) for v in results])
        except KeyError as t_error:
            print("Error retrieving Foursquare Venues info for neighborhood {neigh}".format(neigh=name))
            print(t_error.args)
    result = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    result.columns = ['Neighborhood', 'Neigh_lat', 'Neigh_lng', 'Venue_name', 'Venue_lat', 'Venue_lng', 'Venue_cat']
    return result


In [16]:
def get_most_common_venues(row, num):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num]


In [17]:
def load_local_wiki_file(fn):
    result = ""
    if file_exists(fn):
        with open(fn, 'r') as f:
            result = f.read()
    return result

In [18]:
# General function for creating a folium map. 
# Returns the map created
def create_folium_map(df, t_lat, t_lng, zoom_start=10):
    result = folium.Map(location=[t_lat, t_lng], zoom_start=zoom_start)
    # add markers to the toronto map
    for pc, t_borough, t_neigh, t_neigh_lat, t_neigh_lng in zip(df['PostalCode'],
                                                                df['Borough'],
                                                                df['Neighborhood'],
                                                                df['Neigh_lat'],
                                                                df['Neigh_lng']):
        t_label = 'Neigh: {neigh}, Postalcode: {pc}, District: {borough}'.format(pc=pc,
                                                                                 neigh=t_neigh,
                                                                                 borough=t_borough)
        t_label = folium.Popup(t_label, parse_html=True)
        folium.CircleMarker([t_neigh_lat, t_neigh_lng],
                            radius=5,
                            popup=t_label,
                            color='blue',
                            fill=True,
                            fill_color='#3186cc',
                            fill_opacity=0.7,
                            parse_html=False).add_to(result)
    return result

## HERE BEGINS PART 1 WEEK 3 

In [19]:
print(" ****** Part 1 W3 - Segmenting and Clustering Neighborhoods in Toronto")
print(" ****** Current page of wikipedia for city toronto has no duplicates!! ***")

 ****** Part 1 W3 - Segmenting and Clustering Neighborhoods in Toronto
 ****** Current page of wikipedia for city toronto has no duplicates!! ***


In [20]:
save_toronto_wiki_html_locally = False
html_text = ""

In [21]:
try:
    url = urlopen(TORONTO_CITY_WIKI_LINK)
    html_text = url.read()
except error.HTTPError as err:
    print("Error opening url {link} ".format(link=TORONTO_CITY_WIKI_LINK))
    html_text = load_local_wiki_file(TORONTO_WIKIPEDIA_HTML_FILE)
except error.URLError as err:
    print("Error opening url {link} ".format(link=TORONTO_CITY_WIKI_LINK))
    html_text = load_local_wiki_file(TORONTO_WIKIPEDIA_HTML_FILE)

if save_toronto_wiki_html_locally:
    makedir("./output")
    save2file(html_text, TORONTO_WIKIPEDIA_HTML_FILE)

# Parse wikipedia page
toronto_city_data = parse_wiki_page(html_text)

In [22]:
# Load dataframe
toronto_city_df = load_city_dataframe(toronto_city_data)

In [23]:
toronto_city_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## HERE BEGINS PART 2 WEEK 3

Use the Geocoder package or the csv file to create the following dataframe:
add column: Latitude
add column: Longitude 


In [24]:
print(" ****** Part 2 W3 - Segmenting and Clustering Neighborhoods in Toronto")

 ****** Part 2 W3 - Segmenting and Clustering Neighborhoods in Toronto


In [25]:
if USE_GITHUB_DATA_FILES:
    try:
        filename, headers = urllib.request.urlretrieve(GITHUB_GEODATA_URL, filename="./Geospatial_Coordinates.csv")
        geo_spatial_csv = filename
        print(geo_spatial_csv)
    except error.URLError as e:
        print(" ****** Error opening file {file}".format(file=GITHUB_GEODATA_URL))
        if file_exists(TORONTO_GEO_DATA_CSV):
            geo_spatial_csv = TORONTO_GEO_DATA_CSV
        else:
            geo_spatial_csv = ""
else:
    geo_spatial_csv = "./data/Geospatial_Coordinates.csv"

./Geospatial_Coordinates.csv


In [26]:
toronto_geo_df = pd.read_csv(geo_spatial_csv)
print(toronto_geo_df.head())

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476


In [27]:
toronto_city_df = pd.merge(toronto_city_df, toronto_geo_df, on=["PostalCode"])

## Merge the geo spatial information with the dataframe obtained from the wiki page

In [28]:
toronto_city_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [29]:
print("Shape of Toronto city dataframe {shape}".format(shape=toronto_city_df.shape)) 

Shape of Toronto city dataframe (103, 5)


## HERE BEGINS PART 3 WEEK 3

## Explore and cluster the neighborhoods in Toronto. 

In [30]:
toronto_address = 'Toronto, Canada'
geo_locator = Nominatim(user_agent="toronto_explorer")

In [31]:
try:
    toronto_location = geo_locator.geocode(toronto_address)
except geopy.exc.GeocoderServiceError as e:
    print(e)

In [32]:
toronto_coordinates = {'centre': [43.653982, -79.380319],
                       'uni':    [43.656997372, -79.390331772],
                       'town':   [43.70011, -79.4163]}

In [39]:
print(' ****** The geograpical coordinates of Toronto city are ({}, {}).'.format(toronto_coordinates['town'][0],
                                                                                toronto_coordinates['town'][1]))

 ****** The geograpical coordinates of Toronto city are (43.70011, -79.4163).


In [40]:
# Check if function is bijective
utm_x, utm_y = latlng_2_utm(toronto_coordinates['town'][0], toronto_coordinates['town'][1])

In [41]:
utm_x, utm_y

(627610.2971325599, 4839784.21551608)

In [42]:
check_lat, check_lng = utm_2_latlng(utm_x, utm_y)
check_lat, check_lng

(43.700109999999995, -79.4163)

In [43]:
if abs(toronto_coordinates['town'][0] - check_lat) > 1.E-05 or abs(toronto_coordinates['town'][1] - check_lng) > 1.E-05:
    print(" ****** Error - Function is not bijective!")
    exit(0)
else:
    print(" ****** Function is bijective.")

 ****** Function is bijective.


In [104]:
# Get the GPS Coordinates of each Neighborhood and build a new dataframe with these new informations
# Calculate the distance in UTM meters from the city center
toronto_neigh_df = pd.DataFrame(columns=('PostalCode',
                                         'Borough',
                                         'Neighborhood',
                                         'PostalCode_lat',
                                         'PostalCode_lng',
                                         'Neigh_lat',
                                         'Neigh_lng',
                                         'Neigh_UTM_x',
                                         'Neigh_UTM_y',
                                         'Distance_from_centre'))


In [105]:
index = 0
toronto_center_utm_x, toronto_center_utm_y = latlng_2_utm(toronto_coordinates['centre'][0],
                                                          toronto_coordinates['centre'][1])

In [106]:
toronto_center_utm_x, toronto_center_utm_y

(630609.6272817928, 4834717.0830447655)

In [107]:
for postal_code, borough, neighs, pc_lat, pc_long in zip(toronto_city_df['PostalCode'],
                                                         toronto_city_df['Borough'],
                                                         toronto_city_df['Neighborhood'],
                                                         toronto_city_df['Latitude'],
                                                         toronto_city_df['Longitude']):
    for neigh in neighs.split(","):
        neigh = neigh.strip()
        borough = borough.strip()
        address = "{}, Toronto, Canada".format(neigh)
        try:
            # Wait because Geolocator has troubles with too many calls
            time.sleep(0.5)

            location = geo_locator.geocode(address)
            neigh_lat = location.latitude
            neigh_lng = location.longitude
            print(" ******  Latitude {lat}, Longitude {lng} for neigh {neigh}".format(lat=neigh_lat,
                                                                                      lng=neigh_lng,
                                                                                      neigh=neigh))
            neigh_utm_x, neigh_utm_y = latlng_2_utm(neigh_lat, neigh_lng)
            distance_centre = euclidean_distance_meter(neigh_utm_x, neigh_utm_y,
                                                       toronto_center_utm_x,
                                                       toronto_center_utm_y)
            toronto_neigh_df.loc[index] = [postal_code,
                                           borough,
                                           neigh,
                                           pc_lat,
                                           pc_long,
                                           neigh_lat,
                                           neigh_lng,
                                           neigh_utm_x,
                                           neigh_utm_y,
                                           distance_centre]
            index += 1
        except AttributeError as e:
            print(" ****** No Coordinates GPS found for neighborhood {neigh}".format(neigh=neigh))
        except geopy.exc.GeocoderUnavailable as e:
            print(" ****** Geo Service not available.")
            print(" ****** Wait 1 second")
            time.sleep(1)


 ******  Latitude 43.7587999, Longitude -79.3201966 for neigh Parkwoods
 ******  Latitude 43.732658, Longitude -79.3111892 for neigh Victoria Village
 ******  Latitude 43.6607056, Longitude -79.3604569 for neigh Regent Park
 ******  Latitude 43.6400801, Longitude -79.3801495 for neigh Harbourfront
 ******  Latitude 43.7220788, Longitude -79.4375067 for neigh Lawrence Manor
 ******  Latitude 43.7227784, Longitude -79.4509332 for neigh Lawrence Heights
 ******  Latitude 43.659659, Longitude -79.3903399 for neigh Queen's Park
 ****** No Coordinates GPS found for neighborhood Ontario Provincial Government
 ******  Latitude 43.6794838, Longitude -79.5389092 for neigh Islington Avenue
 ******  Latitude 43.6664717, Longitude -79.5243136 for neigh Humber Valley Village
 ******  Latitude 43.8091955, Longitude -79.2217008 for neigh Malvern
 ******  Latitude 43.8049304, Longitude -79.1658374 for neigh Rouge
 ******  Latitude 43.775347, Longitude -79.3459439 for neigh Don Mills
 ******  Latitude 4

 ******  Latitude 43.6918051, Longitude -79.2644935 for neigh Birch Cliff
 ******  Latitude 43.7111699, Longitude -79.2481769 for neigh Cliffside West
 ******  Latitude 43.7615095, Longitude -79.4109234 for neigh Willowdale
 ******  Latitude 43.7615095, Longitude -79.4109234 for neigh Willowdale East
 ******  Latitude 43.7492988, Longitude -79.462248 for neigh Downsview
 ******  Latitude 43.729199, Longitude -79.4032525 for neigh Lawrence Park
 ******  Latitude 43.6994563, Longitude -79.4546164 for neigh Roselawn
 ******  Latitude 43.6517026, Longitude -79.4759978 for neigh Runnymede
 ******  Latitude 43.6654775, Longitude -79.470352 for neigh The Junction North
 ******  Latitude 43.7001608, Longitude -79.5162474 for neigh Weston
 ******  Latitude 43.7528467, Longitude -79.282067 for neigh Dorset Park
 ******  Latitude 43.7432421, Longitude -79.304641 for neigh Wexford Heights
 ******  Latitude 43.7761341, Longitude -79.25843763592165 for neigh Scarborough Town Centre
 ******  Latitude

 ******  Latitude 43.6481827, Longitude -79.5112961 for neigh Royal York South West


In [117]:
print("Process of discovering GPS coordinates terminated!")            

Process of discovering GPS coordinates terminated!


In [118]:
toronto_neigh_df.shape

(202, 10)

In [119]:
toronto_neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,PostalCode_lat,PostalCode_lng,Neigh_lat,Neigh_lng,Neigh_UTM_x,Neigh_UTM_y,Distance_from_centre
0,M3A,North York,Parkwoods,43.753259,-79.329656,43.7588,-79.320197,635222.011341,4846455.0,12611.614627
1,M4A,North York,Victoria Village,43.725882,-79.315572,43.732658,-79.311189,636006.297239,4843566.0,10364.971928
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,43.660706,-79.360457,632196.55475,4835495.0,1767.465727
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015,630653.432519,4833173.0,1544.3424
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,43.722079,-79.437507,625855.506783,4842192.0,8858.449803


## Create a folium Map with all neighborhoods of Toronto

In [120]:
map_toronto = create_folium_map(toronto_neigh_df,
                                toronto_coordinates['town'][0],
                                toronto_coordinates['town'][1])

In [121]:
map_toronto

## Get Venues for all Neighborhoods found in Toronto

In [122]:
toronto_venues_df = get_nearby_venues(names=toronto_neigh_df['Neighborhood'],
                                          latitudes=toronto_neigh_df['Neigh_lat'],
                                          longitudes=toronto_neigh_df['Neigh_lng'],
                                          radius=FS_RADIUS,
                                          limit=FS_LIMIT)

Calling foursquare API for neighborhood: Parkwoods
Calling foursquare API for neighborhood: Victoria Village
Calling foursquare API for neighborhood: Regent Park
Calling foursquare API for neighborhood: Harbourfront
Calling foursquare API for neighborhood: Lawrence Manor
Calling foursquare API for neighborhood: Lawrence Heights
Calling foursquare API for neighborhood: Queen's Park
Calling foursquare API for neighborhood: Islington Avenue
Calling foursquare API for neighborhood: Humber Valley Village
Calling foursquare API for neighborhood: Malvern
Calling foursquare API for neighborhood: Rouge
Calling foursquare API for neighborhood: Don Mills
Calling foursquare API for neighborhood: Parkview Hill
Calling foursquare API for neighborhood: Woodbine Gardens
Calling foursquare API for neighborhood: Garden District
Calling foursquare API for neighborhood: Ryerson
Calling foursquare API for neighborhood: Glencairn
Calling foursquare API for neighborhood: West Deane Park
Calling foursquare AP

Calling foursquare API for neighborhood: Kensington Market
Calling foursquare API for neighborhood: Chinatown
Calling foursquare API for neighborhood: Grange Park
Calling foursquare API for neighborhood: Milliken
Calling foursquare API for neighborhood: Agincourt North
Calling foursquare API for neighborhood: Steeles East
Calling foursquare API for neighborhood: L'Amoreaux East
Calling foursquare API for neighborhood: Summerhill West
Calling foursquare API for neighborhood: Rathnelly
Calling foursquare API for neighborhood: South Hill
Calling foursquare API for neighborhood: Forest Hill SE
Calling foursquare API for neighborhood: Deer Park
Calling foursquare API for neighborhood: CN Tower
Calling foursquare API for neighborhood: King and Spadina
Calling foursquare API for neighborhood: Harbourfront West
Calling foursquare API for neighborhood: Bathurst Quay
Calling foursquare API for neighborhood: South Niagara
Calling foursquare API for neighborhood: New Toronto
Calling foursquare API

In [125]:
print("Gathering Venues information for all neighborhoods of Toronto has been terminated!")

Gathering Venues information for all neighborhoods of Toronto has been terminated!


In [126]:
toronto_venues_df.shape

(5937, 7)

In [127]:
toronto_venues_df

Unnamed: 0,Neighborhood,Neigh_lat,Neigh_lng,Venue_name,Venue_lat,Venue_lng,Venue_cat
0,Parkwoods,43.758800,-79.320197,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant
1,Parkwoods,43.758800,-79.320197,LCBO,43.757774,-79.314257,Liquor Store
2,Parkwoods,43.758800,-79.320197,Petro-Canada,43.757950,-79.315187,Gas Station
3,Parkwoods,43.758800,-79.320197,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy
4,Parkwoods,43.758800,-79.320197,Pizza Pizza,43.760231,-79.325666,Pizza Place
...,...,...,...,...,...,...,...
5932,Royal York South West,43.648183,-79.511296,The Old Sod,43.648297,-79.507642,Pub
5933,Royal York South West,43.648183,-79.511296,Tim Hortons,43.646678,-79.513700,Coffee Shop
5934,Royal York South West,43.648183,-79.511296,Rogers,43.647080,-79.511550,Mobile Phone Shop
5935,Royal York South West,43.648183,-79.511296,Gabby's Grill & Taps,43.648452,-79.506482,Bar


## Let's find out how many unique categories can be extracted from all the returned venues


In [128]:
print(' ****** There are {} uniques venues categories.'.format(len(toronto_venues_df['Venue_cat'].unique())))
venues_categories = list(toronto_venues_df['Venue_cat'].unique())
venues_categories.sort(reverse=False)

 ****** There are 333 uniques venues categories.


In [129]:
venues_categories

['Accessories Store',
 'Afghan Restaurant',
 'African Restaurant',
 'Airport',
 'Airport Service',
 'American Restaurant',
 'Animal Shelter',
 'Antique Shop',
 'Aquarium',
 'Argentinian Restaurant',
 'Art Gallery',
 'Art Museum',
 'Arts & Crafts Store',
 'Arts & Entertainment',
 'Asian Restaurant',
 'Athletics & Sports',
 'Auto Dealership',
 'Auto Workshop',
 'Automotive Shop',
 'BBQ Joint',
 'Bagel Shop',
 'Bakery',
 'Bank',
 'Bar',
 'Baseball Field',
 'Baseball Stadium',
 'Basketball Stadium',
 'Beach',
 'Beach Bar',
 'Bed & Breakfast',
 'Beer Bar',
 'Beer Store',
 'Belgian Restaurant',
 'Big Box Store',
 'Bike Shop',
 'Bike Trail',
 'Bistro',
 'Boat or Ferry',
 'Bookstore',
 'Boutique',
 'Bowling Alley',
 'Brazilian Restaurant',
 'Breakfast Spot',
 'Brewery',
 'Bridal Shop',
 'Bubble Tea Shop',
 'Buffet',
 'Building',
 'Burger Joint',
 'Burrito Place',
 'Bus Line',
 'Bus Station',
 'Bus Stop',
 'Business Service',
 'Butcher',
 'Café',
 'Camera Store',
 'Cantonese Restaurant',
 'Cari

##  Update toronto_neigh_df with the total count of venues for each neighborhood

In [130]:
venues_by_neighborhood_df = toronto_venues_df.groupby('Neighborhood').count().reset_index()
columns_to_drop = [2, 3, 4, 5, 6]

In [131]:
venues_by_neighborhood_df.drop(venues_by_neighborhood_df.columns[columns_to_drop], axis=1, inplace=True)
cols = venues_by_neighborhood_df.columns

# venues_by_neighborhood_df.columns = [rename_unnamed_colums(col, "Neigh_venues_count") for col in cols]
venues_by_neighborhood_df.columns = ["Neighborhood", "Neigh_venues_count"]

In [132]:
venues_by_neighborhood_df.shape

(191, 2)

In [133]:
venues_by_neighborhood_df

Unnamed: 0,Neighborhood,Neigh_venues_count
0,Adelaide,100
1,Agincourt,12
2,Agincourt North,25
3,Albion Gardens,11
4,Alderwood,8
...,...,...
186,Woodbine Heights,7
187,York Mills,16
188,York Mills West,16
189,York University,2


## Merge dataframe toronto_neigh_df with the relevant vanues

In [134]:
toronto_neigh_venues_df = pd.merge(toronto_neigh_df, venues_by_neighborhood_df, on=["Neighborhood"])

In [136]:
toronto_neigh_venues_df.shape

(200, 11)

In [137]:
toronto_neigh_venues_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,PostalCode_lat,PostalCode_lng,Neigh_lat,Neigh_lng,Neigh_UTM_x,Neigh_UTM_y,Distance_from_centre,Neigh_venues_count
0,M3A,North York,Parkwoods,43.753259,-79.329656,43.7588,-79.320197,635222.011341,4846455.0,12611.614627,12
1,M4A,North York,Victoria Village,43.725882,-79.315572,43.732658,-79.311189,636006.297239,4843566.0,10364.971928,5
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,43.660706,-79.360457,632196.55475,4835495.0,1767.465727,21
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015,630653.432519,4833173.0,1544.3424,100
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,43.722079,-79.437507,625855.506783,4842192.0,8858.449803,5


## Filter the dataset with the following rules and then cluster what remains
As a business man has to leave New York and go to Toronto for a new job he asked to find a neighborhood
in Toronto with the following characteristics for an apartment to rent temporaly

 <ol> <li>The neighborhood must be within 4 Km from the city centre </li>
 <li>The neighborhood must have restaurants or pizza </li>
 <li>The neighborhood must have at least 30 venues available </li>
 <li>Retrieve the best neighborhoods clusters with KMeans after filtering </li>
   </ol>

As you can see from the dataset toronto_neigh_venues_df this information is already available in the columns <b>Distance_from_centre</b>, expressed as meters,  and in the column <b>Neigh_venues_count</b>

In [138]:
max_distance_from_centre_meter = 4000
min_other_venues_per_neighborhood = 30

In [147]:
filtered_toronto_neigh_df = toronto_neigh_venues_df.loc[toronto_neigh_venues_df["Distance_from_centre"]
                                                        < max_distance_from_centre_meter]

In [148]:
filtered_toronto_neigh_df = filtered_toronto_neigh_df.loc[filtered_toronto_neigh_df["Neigh_venues_count"]
                                                          > min_other_venues_per_neighborhood]

In [149]:
filtered_total_venues_in_toronto = filtered_toronto_neigh_df["Neigh_venues_count"].sum()

In [150]:
print("***** Total venues after filtering: {total}".format(total=filtered_total_venues_in_toronto))

***** Total venues after filtering: 3276


In [151]:
restaurants_venues_categories = ('Restaurant', 'Pizza')
venues_categories = restaurants_venues_categories

## Venues being Restaurant or Pizza

In [155]:
filtered_toronto_venues_df = toronto_venues_df.loc[
    toronto_venues_df["Venue_cat"].str.contains("|".join(venues_categories), case=False)]

In [156]:
filtered_toronto_venues_df.shape

(1675, 7)

In [157]:
filtered_toronto_venues_df

Unnamed: 0,Neighborhood,Neigh_lat,Neigh_lng,Venue_name,Venue_lat,Venue_lng,Venue_cat
0,Parkwoods,43.758800,-79.320197,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant
4,Parkwoods,43.758800,-79.320197,Pizza Pizza,43.760231,-79.325666,Pizza Place
11,Parkwoods,43.758800,-79.320197,Spicy Chicken House,43.760639,-79.325671,Chinese Restaurant
12,Victoria Village,43.732658,-79.311189,Armenian Kitchen,43.731071,-79.305390,Middle Eastern Restaurant
13,Victoria Village,43.732658,-79.311189,Jatujak,43.736208,-79.307668,Thai Restaurant
...,...,...,...,...,...,...,...
5919,Royal York South West,43.648183,-79.511296,La Veranda Osteria,43.647638,-79.511442,Italian Restaurant
5923,Royal York South West,43.648183,-79.511296,Chutneys Fine Indian Cuisine,43.646860,-79.514037,Indian Restaurant
5925,Royal York South West,43.648183,-79.511296,Swiss Chalet,43.647888,-79.508356,Restaurant
5927,Royal York South West,43.648183,-79.511296,Just Greek,43.647676,-79.510533,Greek Restaurant


In [159]:
# Now extract from toronto only those neighborhoods having these features combined
neigh_of_interest = list(filtered_toronto_venues_df["Neighborhood"].unique())


In [160]:
final_toronto_neigh_df = filtered_toronto_neigh_df[filtered_toronto_neigh_df["Neighborhood"].isin(neigh_of_interest)]
final_toronto_neigh_df.drop_duplicates(subset='Neighborhood', keep="last", inplace=True)

In [163]:
final_toronto_neigh_df.shape

(43, 11)

In [165]:
total_rows = final_toronto_neigh_df.shape[0]
final_toronto_neigh_df.head(total_rows)

Unnamed: 0,PostalCode,Borough,Neighborhood,PostalCode_lat,PostalCode_lng,Neigh_lat,Neigh_lng,Neigh_UTM_x,Neigh_UTM_y,Distance_from_centre,Neigh_venues_count
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015,630653.432519,4833173.0,1544.3424,100
6,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,43.659659,-79.39034,629789.309083,4835332.0,1025.123558,72
13,M4B,East York,Parkview Hill,43.706397,-79.309937,43.653482,-79.383935,630319.138862,4834656.0,296.875492,77
15,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,43.6565,-79.377114,630862.602373,4835002.0,380.818539,65
16,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,43.658469,-79.378993,630706.797594,4835218.0,509.824145,100
18,M9B,Etobicoke,Princess Gardens,43.650943,-79.554724,43.640466,-79.391224,629759.36975,4833199.0,1740.068889,80
19,M9B,Etobicoke,Martin Grove,43.650943,-79.554724,43.653482,-79.383935,630319.138862,4834656.0,296.875492,77
27,M4X,Downtown Toronto,St. James Town,43.667967,-79.367675,43.669403,-79.372704,631190.100884,4836442.0,1819.805908,62
36,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,43.647984,-79.375396,631019.666822,4834059.0,775.607178,100
39,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,43.667342,-79.388457,629924.548909,4836188.0,1622.741286,100


## Show the selected neighborhoods in the folium map
 Create a map of Toronto with neighborhoods after filtering.

In [174]:
map_toronto_final = create_folium_map(final_toronto_neigh_df,
                                      toronto_coordinates['town'][0],
                                      toronto_coordinates['town'][1],12)

In [175]:
map_toronto_final
                 

In [176]:
print("***** Final neighborhoods selected {count}:".format(count=final_toronto_neigh_df.shape[0]))

***** Final neighborhoods selected 43:


## KMeans clustering

In [177]:
print("***** Applying the KMeans algorithm for clustering")

***** Applying the KMeans algorithm for clustering


In [181]:
max_number_of_clusters = 10
include_venues_count = False
use_mms_scaler = True
if include_venues_count:
    columns_for_kmeans = ['Neigh_UTM_x', 'Neigh_UTM_y', 'Neigh_venues_count']
else:
    columns_for_kmeans = ['Neigh_UTM_x', 'Neigh_UTM_y']


In [182]:
X = final_toronto_neigh_df[columns_for_kmeans].values
sil_scores = []
std_scaler = StandardScaler()
mms = MinMaxScaler()
if use_mms_scaler:
    X_normalized = mms.fit_transform(X)
else:
    X_normalized = std_scaler.fit_transform(X)

In [183]:
X_normalized

array([[0.75979431, 0.12879977],
       [0.5539618 , 0.48292164],
       [0.68016621, 0.37201258],
       [0.80961816, 0.42876078],
       [0.77250576, 0.46416916],
       [0.54683031, 0.13298831],
       [0.68016621, 0.37201258],
       [0.88762766, 0.66502111],
       [0.8470306 , 0.27405173],
       [0.58617566, 0.62339848],
       [0.01268246, 0.55689416],
       [0.76693765, 0.31857839],
       [0.80123592, 0.29102553],
       [0.68016621, 0.37201258],
       [0.75979431, 0.12879977],
       [0.75774843, 0.21286272],
       [0.45672735, 0.58447547],
       [0.73254309, 0.26143657],
       [0.75651094, 0.2684009 ],
       [0.04142729, 0.        ],
       [0.7772494 , 0.27513155],
       [0.87675985, 0.66607644],
       [0.55253641, 0.29927872],
       [0.        , 0.23657947],
       [0.22630823, 0.67321444],
       [0.55124907, 0.69664477],
       [0.0910809 , 0.51056138],
       [0.53951384, 0.8840798 ],
       [0.32727518, 0.39887443],
       [0.40967117, 0.358229  ],
       [0.

In [184]:
kmeans_model = None
bestK = 0
best_score = 0
for k in range(2, max_number_of_clusters+1):
    kmeans = KMeans(n_clusters=k,
                    random_state=0)
    kmeans_model = kmeans.fit(X_std)
    labels = kmeans_model.labels_
    score = silhouette_score(X_std, labels, metric='euclidean')
    if score > best_score:
        bestK = k
        best_score = score
    sil_scores.append(score)

In [185]:
bestK = bestK - 1
print("***** Silhoutte score for KMeans: {scores}".format(scores=sil_scores))
print("***** Best K for kMeans: {k}".format(k=bestK))

***** Silhoutte score for KMeans: [0.478713707995623, 0.4711127650250308, 0.42372578916394693, 0.41335651592870554, 0.4448387692222902, 0.4940092546992579, 0.4806961718346695, 0.49493620022185353, 0.4949946836666868]
***** Best K for kMeans: 9


## Use Silhoutte score for finding the best cluster number

In [186]:
# Use KMeans with the best K
kmeans = KMeans(n_clusters=bestK,
                random_state=0)
kmeans_model = kmeans.fit(X_std)
final_toronto_neigh_df.insert(0, 'Cluster Labels', kmeans_model.labels_)

[0 1 2 2 2 0 2 6 8 1 4 8 8 2 0 8 1 8 8 3 8 6 7 3 4 1 4 5 7 7 7 5 5 0 7 0 3
 2 2 3 6 8 1]


In [187]:
print("Labels from KMeans:",kmeans_model.labels_)

Labels from KMeans: [0 1 2 2 2 0 2 6 8 1 4 8 8 2 0 8 1 8 8 3 8 6 7 3 4 1 4 5 7 7 7 5 5 0 7 0 3
 2 2 3 6 8 1]


In [189]:
final_toronto_neigh_df

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,PostalCode_lat,PostalCode_lng,Neigh_lat,Neigh_lng,Neigh_UTM_x,Neigh_UTM_y,Distance_from_centre,Neigh_venues_count
3,0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015,630653.432519,4833173.0,1544.3424,100
6,1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,43.659659,-79.39034,629789.309083,4835332.0,1025.123558,72
13,2,M4B,East York,Parkview Hill,43.706397,-79.309937,43.653482,-79.383935,630319.138862,4834656.0,296.875492,77
15,2,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,43.6565,-79.377114,630862.602373,4835002.0,380.818539,65
16,2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,43.658469,-79.378993,630706.797594,4835218.0,509.824145,100
18,0,M9B,Etobicoke,Princess Gardens,43.650943,-79.554724,43.640466,-79.391224,629759.36975,4833199.0,1740.068889,80
19,2,M9B,Etobicoke,Martin Grove,43.650943,-79.554724,43.653482,-79.383935,630319.138862,4834656.0,296.875492,77
27,6,M4X,Downtown Toronto,St. James Town,43.667967,-79.367675,43.669403,-79.372704,631190.100884,4836442.0,1819.805908,62
36,8,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,43.647984,-79.375396,631019.666822,4834059.0,775.607178,100
39,1,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,43.667342,-79.388457,629924.548909,4836188.0,1622.741286,100


## Create Map Folium for the clusters

In [190]:
map_clusters = folium.Map(location=[toronto_coordinates['town'][0],
                                    toronto_coordinates['town'][1]],
                          zoom_start=10)
# set color scheme for the clusters
x = np.arange(bestK)
ys = [i + x + (i*x)**2 for i in range(bestK)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []

In [191]:
for lat, lon, neigh, venues, distance, cluster in zip(final_toronto_neigh_df['Neigh_lat'],
                                                      final_toronto_neigh_df['Neigh_lng'],
                                                      final_toronto_neigh_df['Neighborhood'],
                                                      final_toronto_neigh_df['Neigh_venues_count'],
                                                      final_toronto_neigh_df['Distance_from_centre'],
                                                      final_toronto_neigh_df['Cluster Labels']):
    message = "{neigh}, Distance: {distance}m, Venues: {venues}, Cluster {cluster}"
    message = message.format(neigh=neigh, distance=math.floor(distance), cluster=str(cluster), venues=venues)
    label = folium.Popup(message, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [192]:
map_clusters