# Coursera Capstone - FINAL PROJECT - Where to open a pizzeria in Zürich?
Author: Nicolo' (Nick) Sgobba

### Import all necessary libraries

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import numpy as np # library to handle data in a vectorized manner

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('All libraries have been successfully imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
All libraries have been successfully imported.


### Wikipedia page where all Zürich boroughs can be found

In [5]:
# Download the page listing the Zürich districts. A timeout keeps the cell from
# hanging forever, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting a later cell parse an error page.
wikiPage = requests.get("https://en.wikipedia.org/wiki/Subdivisions_of_Z%C3%BCrich", timeout=30)
wikiPage.raise_for_status()

### Import and extract borough data

In [6]:
# import it in a soup object
# (parses the body of the response downloaded into `wikiPage` with the html5lib parser)
soup = BeautifulSoup(wikiPage.content, 'html5lib')

# get the table with the needed info
# NOTE(review): the table is selected purely by position (second <table> in the
# document), so this breaks if the page layout changes — confirm it still holds.
table = soup.find_all('table')[1]

# extract all <tr> from the table
# (`trs` is filtered for district rows in the cell that populates the dataframe)
trs = table.find_all('tr')

In [117]:
# function to parse the <tr> and return an entry for the dataframe
def parseTableRow(tr, index):
    """Build one (borough, neighborhood) entry from a table row.

    Parameters
    ----------
    tr : object exposing ``find_next``
        The row element to read.
    index : int
        Running district number, used to build the borough label.

    Returns
    -------
    tuple of (str, str)
        ``('District <index>', '<name>, <name>, ...')``.
    """
    # Start at the first <td> located from the row, then hop over six sibling
    # nodes to land on the cell whose text lists the neighborhoods.
    cell = tr.find_next('td')
    for _ in range(6):
        cell = cell.next_sibling

    # Join the cell's text fragments into one comma-separated string.
    return 'District ' + str(index), ', '.join(cell.stripped_strings)

### Create dataframe with borough data

In [118]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,Borough,Neighborhood


In [119]:
# populate dataframe
# Collect every district row first and build the frame in one go. Appending to
# a DataFrame inside a loop copies the whole frame on each iteration (and
# DataFrame.append no longer exists in recent pandas). Rebuilding from scratch
# also makes this cell safe to re-run without duplicating rows.
district_rows = [tr for tr in trs if 'District ' in str(tr)]

# parseTableRow returns a (borough, neighborhood) tuple; districts are numbered from 1
records = [parseTableRow(tr, index) for index, tr in enumerate(district_rows, start=1)]

neighborhoods = pd.DataFrame(records, columns=['Borough', 'Neighborhood'])

In [120]:
neighborhoods

Unnamed: 0,Borough,Neighborhood
0,District 1,"Rathaus, Hochschulen, Lindenhof, City"
1,District 2,"Wollishofen, Leimbach, Enge"
2,District 3,"Alt-Wiedikon, Friesenberg, Sihlfeld"
3,District 4,"Werd, Langstrasse, Hard"
4,District 5,"Gewerbeschule, Escher Wyss"
5,District 6,"Unterstrass, Oberstrass"
6,District 7,"Fluntern, Hottingen, Hirslanden, Witikon"
7,District 8,"Seefeld, Mühlebach, Weinegg"
8,District 9,"Albisrieden, Altstetten"
9,District 10,"Höngg, Wipkingen"


### Get the location of Zürich and its boroughs

In [138]:
# The code was removed by Watson Studio for sharing.

In [139]:
def getLocation(district, retries=5):
    """Geocode a Zürich district and return its coordinates.

    Parameters
    ----------
    district : str
        District label, e.g. ``'District 1'``.
    retries : int, optional
        Maximum number of requests to attempt (default 5).

    Returns
    -------
    tuple of (float, float) or None
        ``(latitude, longitude)`` taken from the first result of the
        geocoding response, or ``None`` (after printing an error) when no
        attempt produced a usable result.

    Notes
    -----
    ``baseUrl`` is a notebook global defined in a cell whose code is not
    visible here. Presumably it already carries a query string (the address
    used to be appended with ``&address=``) -- confirm.
    """
    # Let requests URL-encode the address (non-ASCII letter, spaces) and append
    # it to baseUrl's query string. The space keeps the city name and the
    # district label apart; they used to be glued together ("ZürichDistrict 1").
    query = {'address': 'Zürich ' + district}

    # Each attempt sends a fresh request; re-reading one stored response
    # could never turn a failure into a success.
    for _ in range(retries):
        try:
            response = requests.get(baseUrl, params=query, timeout=30).json()
            geographical_data = response['results'][0]['geometry']['location']  # get coordinates
            return geographical_data['lat'], geographical_data['lng']
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # network failure, invalid JSON, or a response without results
            continue

    print('ERROR while getting data!')
    return None

In [141]:
nh = neighborhoods.copy()

latitudeCln = []
longitudeCln = []

# Iterate over the borough labels by column name; positional access on a row
# (row[0]) depends on the column order and is deprecated in recent pandas.
for borough in nh['Borough']:
    print(borough)
    coordinates = getLocation(borough)
    if coordinates is None:
        # getLocation returns None when the lookup failed; record NaN so the
        # coordinate lists stay aligned with the dataframe rows instead of
        # crashing on tuple unpacking.
        coordinates = (np.nan, np.nan)
    lat, lng = coordinates
    latitudeCln.append(lat)
    longitudeCln.append(lng)

nh['Latitude'] = latitudeCln
nh['Longitude'] = longitudeCln

nh.shape

District 1
District 2
District 3
District 4
District 5
District 6
District 7
District 8
District 9
District 10
District 11
District 12


(12, 4)

In [142]:
nh

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,District 1,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632
1,District 2,"Wollishofen, Leimbach, Enge",47.343046,8.527225
2,District 3,"Alt-Wiedikon, Friesenberg, Sihlfeld",47.364101,8.509246
3,District 4,"Werd, Langstrasse, Hard",47.380756,8.517289
4,District 5,"Gewerbeschule, Escher Wyss",47.387768,8.522438
5,District 6,"Unterstrass, Oberstrass",47.393446,8.545447
6,District 7,"Fluntern, Hottingen, Hirslanden, Witikon",47.368398,8.583532
7,District 8,"Seefeld, Mühlebach, Weinegg",47.387768,8.522438
8,District 9,"Albisrieden, Altstetten",47.386046,8.479356
9,District 10,"Höngg, Wipkingen",47.404459,8.497033


In [143]:
address = 'Zürich, Switzerland'

geolocator = Nominatim(user_agent="capstoneProject")
location = geolocator.geocode(address, timeout=60, exactly_one=True)

# geocode() gives back None when the address cannot be resolved; fail with a
# clear message here rather than with an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

# City-centre coordinates; the map cells below centre their maps on these.
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Zürich are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Zürich are 47.3723957, 8.5423216.


### Display boroughs of Zürich on map

In [144]:
# make a copy of the dataframe to get it simply back if needed
nhz = nh.copy()

In [145]:
# Base map centred on the coordinates of Zürich
map_zh = folium.Map(location=[latitude, longitude], zoom_start=12)

# Appearance shared by every borough marker
borough_marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False)

# One marker per borough; the popup reads "<neighborhoods>, <borough>"
borough_rows = nhz[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in borough_rows.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **borough_marker_style).add_to(map_zh)

map_zh

### Get venues data from Foursquare

In [146]:
# The code was removed by Watson Studio for sharing.

In [147]:
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    """Query the Foursquare venue search around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each search centre.
    radius : int, optional
        Search radius passed to the API (default 5000).
    categoryIds : str, optional
        Foursquare category id(s) to restrict the search; an empty string
        (default) applies no category filter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Notes
    -----
    Reads the notebook globals CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    A location whose request fails is reported and skipped, so the venues of
    the remaining locations are still returned.
    """
    search_url = 'https://api.foursquare.com/v2/venues/search'
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Hand the query to requests as a dict so every value is URL-encoded
        # and the credentials never have to pass through str.format.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        try:
            response = requests.get(search_url, params=params, timeout=30).json()
            results = response['response']['venues']
        except (requests.RequestException, ValueError, LookupError, TypeError) as error:
            # Report only the exception type: the full message or URL may
            # contain the client secret.
            print('WARNING: venue search failed for {!r} ({})'.format(name, type(error).__name__))
            continue

        # keep only the relevant information of each nearby venue
        for v in results:
            try:
                row = (name,
                       lat,
                       lng,
                       v['name'],
                       v['location']['lat'],
                       v['location']['lng'],
                       v['categories'][0]['name'])
            except (KeyError, IndexError, TypeError):
                # venue without a category (or another required field): skip it
                continue
            rows.append(row)

    # Passing the column names to the constructor keeps the schema stable even
    # when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [148]:
# Requested maximum number of venues per query; read as a global by getNearbyVenues.
# NOTE(review): the per-borough counts displayed later in this notebook never
# exceed 50, which suggests the API caps results below this value — confirm
# against the Foursquare documentation.
LIMIT = 500 # limit of number of venues returned by Foursquare API
# NOTE(review): this notebook-level radius is not passed to any getNearbyVenues
# call visible here (each call passes radius=1000 explicitly) — confirm whether
# it is still needed.
radius = 5000 # define radius

### Get the pizzerie from the different boroughs of Zürich

In [151]:
# Use category id 4bf58dd8d48988d1ca941735 to only get the pizzerie
zh_venues_pizzeria = getNearbyVenues(names=nhz['Neighborhood'], latitudes=nhz['Latitude'], longitudes=nhz['Longitude'], radius=1000, categoryIds='4bf58dd8d48988d1ca941735')
zh_venues_pizzeria.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Sam's Pizza Land,47.375924,8.538561,Pizza Place
1,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Zvr Schtung,47.372066,8.539356,Pizza Place
2,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,10' dieci - Gelateria & Take-Away,47.374437,8.543818,Pizza Place
3,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Il Golosone,47.374172,8.536232,Italian Restaurant
4,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Santa Lucia,47.372221,8.543363,Pizza Place


In [152]:
# function to add markers for the given venues to a map
def addToMap(df, color, existingMap):
    """Draw one circle marker per venue of ``df`` on ``existingMap``.

    Parameters
    ----------
    df : table-like
        Must provide the columns 'Venue Latitude', 'Venue Longitude',
        'Neighborhood', 'Venue' and 'Venue Category'.
    color : str
        Used for both the outline and the fill of every marker.
    existingMap : map object
        Target the markers are added to.

    Each popup reads ``<venue> (<category>) - <neighborhood>``.
    """
    venue_columns = zip(
        df['Venue Latitude'],
        df['Venue Longitude'],
        df['Neighborhood'],
        df['Venue'],
        df['Venue Category'])

    for venue_lat, venue_lng, area, venue_name, category in venue_columns:
        popup_text = '{} ({}) - {}'.format(venue_name, category, area)
        marker = folium.CircleMarker(
            [venue_lat, venue_lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            parse_html=False)
        marker.add_to(existingMap)

In [153]:
map_zh_pizzerie = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(zh_venues_pizzeria, 'red', map_zh_pizzerie)
map_zh_pizzerie

### Get schools from the different boroughs of Zürich

In [154]:
zh_venues_schools = getNearbyVenues(names=nhz['Neighborhood'], latitudes=nhz['Latitude'], longitudes=nhz['Longitude'], radius=1000, categoryIds='4d4b7105d754a06372d81259')
zh_venues_schools.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,UBS Grünenhof,47.371274,8.537292,College Auditorium
1,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Zentralbibliothek,47.374171,8.545037,College Library
2,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,KV Zurich Business School Sihlpost,47.377245,8.535293,College Academic Building
3,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Pädagogische Hochschule,47.377789,8.534336,College Auditorium
4,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Google Europaallee Gym,47.378407,8.532103,College Gym


In [155]:
map_zh_schools = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(zh_venues_schools, 'green', map_zh_schools)
map_zh_schools

### Get enterprises from the different boroughs of Zürich

In [156]:
zh_venues_enterprises = getNearbyVenues(names=nhz['Neighborhood'], latitudes=nhz['Latitude'], longitudes=nhz['Longitude'], radius=1000, categoryIds='4d4b7105d754a06375d81259')
zh_venues_enterprises.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Parkhaus Urania,47.374427,8.541004,Parking
1,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,St. Peter,47.371495,8.540753,Church
2,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Swisslinx AG,47.37479,8.535883,Office
3,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Literaturhaus,47.37193,8.54278,Event Space
4,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,Grossmünster,47.370142,8.543919,Church


In [158]:
map_zh_enterprises = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(zh_venues_enterprises, 'yellow', map_zh_enterprises)
map_zh_enterprises

### Process data to determine number of pizzerie, schools and enterprises in each borough of Zürich

In [159]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighborhood venue count to ``startDf`` (modified in place).

    Parameters
    ----------
    startDf : pandas.DataFrame
        Frame with a 'Neighborhood' column; receives the new column.
    columnTitle : str
        Name of the column to create (or overwrite).
    dataDf : pandas.DataFrame
        Venue frame with the columns 'Neighborhood' and 'Venue'.

    Returns
    -------
    None

    Neighborhoods that do not occur in ``dataDf`` get a count of 0. Counts are
    stored as integers. A missing 'Neighborhood' or 'Venue' column in
    ``dataDf`` raises a KeyError rather than silently producing zeros.
    """
    # number of venues recorded for each neighborhood
    counts = dataDf.groupby('Neighborhood')['Venue'].count()

    # Look up each neighborhood's count in one vectorised step; neighborhoods
    # without any venue come back as NaN and are turned into 0.
    startDf[columnTitle] = startDf['Neighborhood'].map(counts).fillna(0).astype(int)

In [160]:
zh_data = nhz.copy()
addColumn(zh_data, 'Pizzerie', zh_venues_pizzeria)
addColumn(zh_data, 'Schools', zh_venues_schools)
addColumn(zh_data, 'Enterprises', zh_venues_enterprises)
zh_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Pizzerie,Schools,Enterprises
0,District 1,"Rathaus, Hochschulen, Lindenhof, City",47.374156,8.539632,32.0,29.0,50.0
1,District 2,"Wollishofen, Leimbach, Enge",47.343046,8.527225,2.0,7.0,50.0
2,District 3,"Alt-Wiedikon, Friesenberg, Sihlfeld",47.364101,8.509246,24.0,26.0,29.0
3,District 4,"Werd, Langstrasse, Hard",47.380756,8.517289,41.0,50.0,45.0
4,District 5,"Gewerbeschule, Escher Wyss",47.387768,8.522438,25.0,50.0,50.0
5,District 6,"Unterstrass, Oberstrass",47.393446,8.545447,8.0,50.0,50.0
6,District 7,"Fluntern, Hottingen, Hirslanden, Witikon",47.368398,8.583532,0.0,1.0,10.0
7,District 8,"Seefeld, Mühlebach, Weinegg",47.387768,8.522438,25.0,50.0,50.0
8,District 9,"Albisrieden, Altstetten",47.386046,8.479356,7.0,20.0,50.0
9,District 10,"Höngg, Wipkingen",47.404459,8.497033,3.0,50.0,50.0


### Define a weight according to what you expect the most

In [161]:
# negative weight, because Paolo wants to open a pizzeria and thus wants to avoid competition as much as possible
weight_pizzerie = -1

# positive weight, because high school students are good customers
weight_schools = 1

# positive weight because employees are even better customers
weight_enterprises = 1.5

In [162]:
zh_weighted = zh_data[['Borough', 'Neighborhood']].copy()

### Based on the chosen weights, compute the score of each borough

In [163]:
zh_weighted['Score'] = zh_data['Pizzerie'] * weight_pizzerie + zh_data['Schools'] * weight_schools + zh_data['Enterprises'] * weight_enterprises
zh_weighted = zh_weighted.sort_values(by=['Score'], ascending=False)
zh_weighted

Unnamed: 0,Borough,Neighborhood,Score
9,District 10,"Höngg, Wipkingen",122.0
5,District 6,"Unterstrass, Oberstrass",117.0
4,District 5,"Gewerbeschule, Escher Wyss",100.0
7,District 8,"Seefeld, Mühlebach, Weinegg",100.0
8,District 9,"Albisrieden, Altstetten",88.0
1,District 2,"Wollishofen, Leimbach, Enge",80.0
11,District 12,"Saatlen, Schwamendingen Mitte, Hirzenbach",79.5
3,District 4,"Werd, Langstrasse, Hard",76.5
0,District 1,"Rathaus, Hochschulen, Lindenhof, City",72.0
10,District 11,"Affoltern, Oerlikon, Seebach",67.5


## Borough "District 10 - Höngg, Wipkingen" is the best choice for Paolo to open his pizzeria

In [164]:
map_zh_result = folium.Map(location=[latitude, longitude], zoom_start=12)

# Take the winning borough from the computed scores instead of hard-coding its
# name, so the map stays correct when the weights or the venue data change.
winner = zh_weighted.loc[zh_weighted['Score'].idxmax()]

nhz_win = nhz[nhz['Borough'] == winner['Borough']]

# mark the winning borough itself
for lat, lng, borough, neighborhood in zip(nhz_win['Latitude'], nhz_win['Longitude'], nhz_win['Borough'], nhz_win['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_zh_result)

# overlay the venues found for the winning borough, one colour per venue type
for venues, color in [(zh_venues_pizzeria, 'red'),
                      (zh_venues_schools, 'green'),
                      (zh_venues_enterprises, 'yellow')]:
    addToMap(venues[venues['Neighborhood'] == winner['Neighborhood']], color, map_zh_result)

map_zh_result

## Thank you very much