## This is the project file for capstone project

*IBM Course*

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import requests

## Get the Table

In [2]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

tables = pd.read_html(r.text)

### convert in dataframe

In [3]:
toronto_df =  pd.DataFrame(tables[0])

toronto_df.columns = ['PostalCode','Borough','Neighborhood']

### Drop NaN Values

In [4]:
#clean data
toronto_df.dropna(axis=0,inplace=True)
toronto_df.reset_index()
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
### More than one neighborhood can exist in one postal code area.

In [5]:
toronto_dropNotAssigned = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)

toronto_grouped = toronto_dropNotAssigned.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [6]:
mask = toronto_grouped['Neighborhood'] == "Not assigned"
toronto_grouped.loc[mask, 'Neighborhood'] = toronto_grouped.loc[mask, 'Borough']

### shape of the dataframe

In [7]:
toronto_grouped.shape

(103, 3)

## Read GEO data

In [8]:
coordenades = pd.read_csv("geodata.csv")
coordenades.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [9]:
# rename PostalCode as PostalCode for easy query purpose
coordenades.rename(index=str, columns={"Postal Code": "PostalCode"}, inplace = True)

In [10]:
neighborhood = pd.merge(toronto_grouped, coordenades, on='PostalCode', how='inner')

In [31]:
neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [13]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighe in zip(neighborhood['Latitude'], neighborhood['Longitude'], neighborhood['Borough'], neighborhood['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare Credentials and Version¶


In [95]:
CLIENT_ID = 'PNNTTIIPRW4E5HOYFVP2JU20KUJUX0YF3A15WYIRHRHZHIF3' # your Foursquare ID
CLIENT_SECRET = '3H5OSLL0KIWIXIOUVXQX1XJHYIDXZKLJKELUXXJ5PSQQSF4N' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PNNTTIIPRW4E5HOYFVP2JU20KUJUX0YF3A15WYIRHRHZHIF3
CLIENT_SECRET:3H5OSLL0KIWIXIOUVXQX1XJHYIDXZKLJKELUXXJ5PSQQSF4N


In [96]:
#Function to apply it to every Postcode we have.
global exitlimit
exitlimit = 0
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    global exitlimit

    exitlimit = exitlimit+1
    if exitlimit==105:
        return
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        api_response = requests.get(url).json()
        if  "groups" in api_response:
            results = api_response["response"]['groups'][0]['items']
            # return only relevant information for each nearby venue
            
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [97]:
toronto_venues = getNearbyVenues(names=neighborhood['Neighborhood'],
                                   latitudes=neighborhood['Latitude'],
                                   longitudes=neighborhood['Longitude']
                                  )


Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence P

ValueError: Length mismatch: Expected axis has 0 elements, new values have 7 elements

In [85]:
# print(toronto_venues.shape)
# toronto_venues.head()