## Coursera Capstone Project
#### IBM Data Science Specialization
#### March, 2019, Ming




In [1]:
# import libraries

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


In [2]:
# Foursquare credentials are read from the environment so that the keys are
# never stored in the notebook source. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present: printing the values would
# persist them in the saved cell output of the notebook.
print('Your credentials:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'missing'))
print('CLIENT_SECRET: ' + ('set' if CLIENT_SECRET else 'missing'))

radius = 500  # search radius in metres around each neighborhood centre
LIMIT = 100   # maximum number of venues requested per API call

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


## 1. Introduction 

##### In this project, we will use clustering to compare communities of Manhattan, New York, and Toronto. Using Foursquare venue data for the two cities, we will 1) identify which neighborhoods are similar between the two cities, 2) visualize where similar neighborhoods are located in each city, and 3) describe the similarities and differences in lifestyle between Manhattan and Toronto. 

##### People interested in this project would be residents in either city who are interested in the other, and people who are interested in moving to one of the cities.


## 2. Data 

##### We will use the Foursquare API as the main data source. Neighborhood data for Manhattan comes from the Coursera class data file (https://cocl.us/new_york_dataset), and neighborhood data for Toronto comes from the Wikipedia page we used in Week 3 (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M). We use Foursquare API calls to get the venues within 500 m of each neighborhood's coordinates, and cluster on the frequency of venue categories within each neighborhood. The neighborhoods of Manhattan and Toronto are clustered together, so that similar neighborhoods end up in the same cluster.


#### Let's load our data here

#### First Manhattan venues data

In [3]:
# open downloaded New York Data
with open('nyu_geojson.json') as json_data:
    newyork_data = json.load(json_data)

# create dataframe
ny_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect the Manhattan rows in a plain list and build the frame once.
# Growing a DataFrame with .append() copies the whole frame on every
# iteration, and the method was removed in pandas 2.0.
manhattan_rows = []
for data in ny_data:
    borough = data['properties']['borough']
    if borough != 'Manhattan':
        continue
    neighborhood_name = data['properties']['name']
    # the coordinate pair is stored longitude first, latitude second
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    manhattan_rows.append({'Borough': borough,
                           'Neighborhood': neighborhood_name,
                           'Latitude': neighborhood_lat,
                           'Longitude': neighborhood_lon})

# passing columns= keeps the expected layout even if no row matched
mht_neighborhoods = pd.DataFrame(manhattan_rows, columns=column_names)
mht_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [4]:
# Sanity check: plot every loaded neighborhood on a map centred on Manhattan
address = 'Manhattan, NY'
geolocator = Nominatim(user_agent="mht_explorer")
location = geolocator.geocode(address)
latitude1, longitude1 = location.latitude, location.longitude

map_manhattan = folium.Map(location=[latitude1, longitude1], zoom_start=11)

marker_points = zip(
    mht_neighborhoods['Latitude'],
    mht_neighborhoods['Longitude'],
    mht_neighborhoods['Neighborhood'],
)
for lat, lng, label in marker_points:
    # one blue circle per neighborhood, with its name as the popup text
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_manhattan)

map_manhattan

In [5]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping-like
        Indexed by 'categories'; when that key is absent,
        'venue.categories' is used instead. The value is expected to be a
        sequence of dicts that each carry a 'name' key.

    Returns
    -------
    str or None
        The 'name' of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other failure
        # (including KeyboardInterrupt) is allowed to propagate instead of
        # being silently swallowed by a bare except.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
# function to repeat the same process to all the neighborhoods in the dataframe
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving each neighborhood's label and coordinates.
    radius : int, default 500
        Value sent as the `radius` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example bad
        credentials or an exhausted quota).

    Notes
    -----
    Reads the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        # a timeout keeps one stalled request from hanging the whole notebook
        response = requests.get(endpoint, params=params, timeout=30)
        # surface API errors here instead of as an opaque KeyError below
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # a venue may carry an empty category list; record None for it
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # passing columns= avoids a length-mismatch error when rows is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return nearby_venues

In [6]:
# Fetch the venues around every Manhattan neighborhood via the API
mht_venues = getNearbyVenues(
    names=mht_neighborhoods['Neighborhood'],
    latitudes=mht_neighborhoods['Latitude'],
    longitudes=mht_neighborhoods['Longitude'],
)
mht_venues.shape

(3307, 7)

#### Then Toronto venues data

In [7]:
# Column layout of the Toronto postal-code table
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
# Start from an empty frame that already has those columns
trt_data = pd.DataFrame(columns=column_names)

In [8]:
# use BeautifulSoup to scrape the data table from wikipedia
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Close the HTTP response as soon as the page has been parsed, and name the
# parser explicitly so the result does not depend on which optional parsers
# happen to be installed (and bs4 does not warn about guessing one).
with urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')
table = soup.find('table', class_="wikitable")
# Fail here with a clear message rather than later with an AttributeError
# on None if the page layout no longer contains the expected table.
if table is None:
    raise ValueError('No table with class "wikitable" found at ' + url)

In [9]:
# fill in dataframe
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append copied the frame on every call and was removed in
# pandas 2.0. Rebuilding the frame from scratch also makes this cell safe
# to re-run without duplicating rows.
toronto_rows = []
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [x.text.strip() for x in td]
    # skip header rows and anything that is not a 3-cell data row
    if len(row) != 3:
        continue
    postcode, borough, neighborhood = row
    if borough == 'Not assigned':
        continue
    # keep only boroughs whose name contains "Toronto"
    if 'Toronto' not in borough:
        continue
    # an unassigned neighborhood falls back to its borough name
    if neighborhood == 'Not assigned':
        neighborhood = borough
    toronto_rows.append({'PostalCode': postcode,
                         'Borough': borough,
                         'Neighborhood': neighborhood})

trt_data = pd.DataFrame(toronto_rows,
                        columns=['PostalCode', 'Borough', 'Neighborhood'])

In [10]:
# Collapse rows that share a PostalCode and Borough into a single row whose
# Neighborhood is the comma-separated list of the original names
neighborhoods_by_code = trt_data.groupby(['PostalCode', 'Borough'])['Neighborhood']
joined_names = neighborhoods_by_code.apply(lambda names: ', '.join(names))
trt_data = joined_names.reset_index()
trt_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4E,East Toronto,The Beaches
1,M4K,East Toronto,"The Danforth West, Riverdale"
2,M4L,East Toronto,"The Beaches West, India Bazaar"
3,M4M,East Toronto,Studio District
4,M4N,Central Toronto,Lawrence Park


In [11]:
# Attach coordinates to each Toronto postal code, using the CSV file
# downloaded from the class site
coords = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'PostalCode'}
)
trt_data = trt_data.merge(coords, on='PostalCode')
trt_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [12]:
# Sanity check: plot every loaded neighborhood on a map centred on Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="trt_explorer")
location = geolocator.geocode(address)
latitude2, longitude2 = location.latitude, location.longitude

map_trt = folium.Map(location=[latitude2, longitude2], zoom_start=12)

marker_points = zip(
    trt_data['Latitude'],
    trt_data['Longitude'],
    trt_data['Borough'],
    trt_data['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_points:
    # popup text reads "<neighborhood>, <borough>"
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_trt)

map_trt

In [13]:
# Fetch the venues around every Toronto neighborhood via the API
trt_venues = getNearbyVenues(
    names=trt_data['Neighborhood'],
    latitudes=trt_data['Latitude'],
    longitudes=trt_data['Longitude'],
)
trt_venues.shape

(1693, 7)