# Coursera Capstone Project

This is my jupyter notebook for the projects of Applied Data Science Capstone Course

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


## Import libaries

In [14]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
%matplotlib inline 
import matplotlib.pyplot as plt
from scipy import stats



## Import data

#### Get NYC Neighborhoods data

In [36]:
import json

with open('./newyork_data.json') as json_data:
    nyc_data = json.load(json_data)
nyc_neighborhoods_data = nyc_data['features']

#Transform to a pandas dataframe
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
nyc_neighborhoods = pd.DataFrame(columns=column_names)
for data in nyc_neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    nyc_neighborhoods = nyc_neighborhoods.append({'Borough': borough,
                                                  'Neighborhood': neighborhood_name,
                                                  'Latitude': neighborhood_lat,
                                                  'Longitude': neighborhood_lon}, ignore_index=True)

print(nyc_neighborhoods.shape)
nyc_neighborhoods.head()

(306, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


#### Get Toronto Neighborhoods data

In [88]:
#Get Canada neighborhoods data from website
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r  = requests.get(url)
canada_neighborhoods_data = r.text

soup = BeautifulSoup(canada_neighborhoods_data, 'html.parser')
table = soup.find('table').find_all('tr')

#Transform to a pandas dataframe
column_names = ['PostalCode', 'Borough', 'Neighborhood']
canada_neighborhoods = pd.DataFrame(columns=column_names)

for index, item in enumerate(table):
    if index > 0:
        data = item.find_all('td')
        postalcode = data[0].text
        borough = data[1].text
        neighborhood_name  = data[2].text.replace('\n', '')
        if neighborhood_name == 'Not assigned':
            neighborhood_name = borough
    
        canada_neighborhoods = canada_neighborhoods.append({'PostalCode': postalcode,
                                                              'Borough': borough,
                                                              'Neighborhood': neighborhood_name,
                                                              'Latitude': np.nan,
                                                              'Longitude': np.nan}, ignore_index=True)
canada_neighborhoods = canada_neighborhoods[canada_neighborhoods['Borough'] != 'Not assigned']
canada_neighborhoods = canada_neighborhoods.groupby(['PostalCode','Borough']).agg( ','.join).reset_index()

#Add coordinates columns
coords = pd.read_csv('./Geospatial_Coordinates.csv')
canada_neighborhoods = canada_neighborhoods.set_index('PostalCode').join(coords.set_index('Postal Code'), how='inner').reset_index()

#Get Toronto neighborhoods data
toronto_neighborhoods = canada_neighborhoods[canada_neighborhoods['Borough'].str.contains('Toronto')].reset_index(drop=True)

print(toronto_neighborhoods.shape)
toronto_neighborhoods.head()

(38, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Get client restaurant data in Toronto

In [93]:
client_restaurant = pd.read_csv('./Client.csv')

print(client_restaurant.shape)
client_restaurant.head()

(7, 2)


Unnamed: 0,Neighborhood,Efficiency
0,Moore Park,0.7
1,Forest Hill West,0.8
2,Regent Park,0.6
3,Rosedale,0.74
4,Island airport,0.88


#### Define Foursquare Credentials and Version

In [5]:
CLIENT_ID = '5NMOAVIALBOSTNQVCJJRVWT0ZHT1URBDTWHOYBLIENYSQQDD' # your Foursquare ID
CLIENT_SECRET = 'R3NSNU03XXZQLAFGUG2JYKEYL1MM3Y5AX2O2X53HGJXY4ECW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5NMOAVIALBOSTNQVCJJRVWT0ZHT1URBDTWHOYBLIENYSQQDD
CLIENT_SECRET:R3NSNU03XXZQLAFGUG2JYKEYL1MM3Y5AX2O2X53HGJXY4ECW


#### Get venues data

In [6]:
import requests
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [70]:
# Get NYC venues data
#toronto_venues = getNearbyVenues(names=nyc_neighborhoods['Neighborhood'],
#                                 latitudes=nyc_neighborhoods['Latitude'],
#                                 longitudes=nyc_neighborhoods['Longitude']
#                                )
#nyc_venues_backup = nyc_venues #backup dataframe, to limit the api calls

nyc_venues = nyc_venues_backup

print(nyc_venues.shape)
nyc_venues.head()

(10242, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop
4,Wakefield,40.894705,-73.847201,SUBWAY,40.890656,-73.849192,Sandwich Place


In [96]:
# Get Toronto venues data
#toronto_venues = getNearbyVenues(names=toronto_neighborhoods['Neighborhood'],
#                                 latitudes=toronto_neighborhoods['Latitude'],
#                                 longitudes=toronto_neighborhoods['Longitude']
#                                )
#toronto_venues_backup = toronto_venues

toronto_venues = toronto_venues_backup

print(toronto_venues.shape)
toronto_venues.head()

(1707, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,The Beaches,43.676357,-79.293031,Beaches Fitness,43.680319,-79.290991,Gym / Fitness Center
4,The Beaches,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop


## Data pre-processing

### New York dataset

#### Get number of same type restaurant foreach neighborhood

In [72]:
nyc_competitors = nyc_venues[nyc_venues['Venue Category'] == 'Chinese Restaurant'].groupby('Neighborhood').size().to_frame('Competitors').reset_index()

print(nyc_competitors.shape)
nyc_competitors.head()

(128, 2)


Unnamed: 0,Neighborhood,Competitors
0,Allerton,2
1,Astoria Heights,1
2,Bath Beach,4
3,Battery Park City,1
4,Bay Ridge,2


#### Get top 10 venue types foreach neighborhood

In [73]:
# one hot encoding
nyc_onehot = pd.get_dummies(nyc_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood'] 

# group by neighborhood and calculate the average of each type
nyc_grouped = nyc_onehot.groupby('Neighborhood').mean().reset_index()

print(nyc_grouped.shape)
nyc_grouped.head()

(301, 431)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,Airport Tram,American Restaurant,Animal Shelter,Antique Shop,...,Waste Facility,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = nyc_grouped['Neighborhood']

for ind in np.arange(nyc_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyc_grouped.iloc[ind, :], num_top_venues)
    
#Add Competitors
neighborhoods_venues_sorted = neighborhoods_venues_sorted.set_index('Neighborhood').join(nyc_competitors.set_index('Neighborhood'), how='left').fillna(0).reset_index()

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(301, 12)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Competitors
0,Allerton,Pizza Place,Spa,Supermarket,Chinese Restaurant,Deli / Bodega,Food,Fast Food Restaurant,Bakery,Electronics Store,Pharmacy,2.0
1,Annadale,Pub,Cosmetics Shop,Diner,Train Station,Liquor Store,Sports Bar,Pizza Place,Restaurant,Pet Store,Exhibit,0.0
2,Arden Heights,Pharmacy,Deli / Bodega,Pizza Place,Coffee Shop,Home Service,Filipino Restaurant,Event Space,Exhibit,Eye Doctor,Factory,0.0
3,Arlington,Bus Stop,Deli / Bodega,Intersection,American Restaurant,Food Service,Boat or Ferry,Coffee Shop,Yoga Studio,Fish & Chips Shop,Filipino Restaurant,0.0
4,Arrochar,Deli / Bodega,Bus Stop,Italian Restaurant,Liquor Store,Middle Eastern Restaurant,Taco Place,Sandwich Place,Food Truck,Pizza Place,Cosmetics Shop,0.0


In [22]:
#Get NYC geodata
from geopy.geocoders import Nominatim

address = 'New York City, United States'

geolocator = Nominatim(user_agent="ny_explorer")
nyc_location = geolocator.geocode(address)
nyc_latitude = nyc_location.latitude
nyc_longitude = nyc_location.longitude

# create map of NYC using latitude and longitude values
import folium

map_restaurants_nyc = folium.Map(location=[nyc_latitude, nyc_longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(restaurants['Venue Latitude'], restaurants['Venue Longitude'], restaurants['Venue']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,).add_to(map_restaurants_nyc)  

map_restaurants_nyc