In [8]:
import requests # library to handle requests
from bs4 import BeautifulSoup
import pandas as pd

In [9]:
response = requests.get("http://www.healthysf.org/bdi/outcomes/zipmap.htm")
soup = BeautifulSoup(response.text, "lxml")
table = soup.find_all("table")
df = pd.read_html(str(table))
df = pd.DataFrame(df[4])

In [10]:
df.columns = df.iloc[0]
df = df.iloc[1:-1, :-1]
sf_data = df
sf_data.head()

Unnamed: 0,Zip Code,Neighborhood
1,94102,Hayes Valley/Tenderloin/North of Market
2,94103,South of Market
3,94107,Potrero Hill
4,94108,Chinatown
5,94109,Polk/Russian Hill (Nob Hill)


Convert Addresses into Latitude and Longitude

In [12]:
!pip install uszipcode
from uszipcode import SearchEngine

search = SearchEngine(simple_zipcode=True)

latitude = []
longitude = []

for index, row in df.iterrows():
    zipcode = search.by_zipcode(row["Zip Code"]).to_dict()
    latitude.append(zipcode.get("lat"))
    longitude.append(zipcode.get("lng"))

sf_data["Latitude"] = latitude
sf_data["Longitude"] = longitude

sf_data.head()



Unnamed: 0,Zip Code,Neighborhood,Latitude,Longitude
1,94102,Hayes Valley/Tenderloin/North of Market,37.78,-122.42
2,94103,South of Market,37.78,-122.41
3,94107,Potrero Hill,37.77,-122.39
4,94108,Chinatown,37.791,-122.409
5,94109,Polk/Russian Hill (Nob Hill),37.79,-122.42


In [17]:
!pip install geopy



Create a map of San Francisco with neighborhoods

In [18]:

from geopy.geocoders import Nominatim
address = 'San Francisco'

geolocator = Nominatim(user_agent = "san_francisco_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of San Francisco are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of San Francisco are 37.7790262, -122.419906.


In [19]:
!pip install folium



In [20]:

import folium
map_sf = folium.Map(location = [latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(sf_data['Latitude'], sf_data['Longitude'], sf_data['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_sf)  
    
map_sf

Define Foursquare Credentials and Version


In [21]:
CLIENT_ID = 'PK4E3AX1HWYOAYAJXBCEN5FAIYYBI2YQMJTCM3DJTC0CUD2L' # your Foursquare ID
CLIENT_SECRET = 'GLWH10Z34GDBJB1J5T2UW5J0KSTXQIWBRLCFBD2LIM5LGGCA' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PK4E3AX1HWYOAYAJXBCEN5FAIYYBI2YQMJTCM3DJTC0CUD2L
CLIENT_SECRET:GLWH10Z34GDBJB1J5T2UW5J0KSTXQIWBRLCFBD2LIM5LGGCA


Get the top 100 venues that are in each neighborhood within a radius of 500 meters.


In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [23]:
sf_venues = getNearbyVenues(names = sf_data['Neighborhood'],
                                   latitudes = sf_data['Latitude'],
                                   longitudes = sf_data['Longitude']
                                  )
                                  
sf_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,War Memorial Opera House,37.778601,-122.420816,Opera House
1,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,Herbst Theater,37.779548,-122.420953,Concert Hall
2,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,San Francisco Ballet,37.77858,-122.420798,Dance Studio
3,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,Louise M. Davies Symphony Hall,37.777976,-122.420157,Concert Hall
4,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,War Memorial Court,37.779042,-122.420971,Park


In [24]:
sf_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bayview-Hunters Point,3,3,3,3,3,3
Castro/Noe Valley,60,60,60,60,60,60
Chinatown,100,100,100,100,100,100
Haight-Ashbury,29,29,29,29,29,29
Hayes Valley/Tenderloin/North of Market,100,100,100,100,100,100
Ingelside-Excelsior/Crocker-Amazon,44,44,44,44,44,44
Inner Mission/Bernal Heights,50,50,50,50,50,50
Inner Richmond,65,65,65,65,65,65
Lake Merced,18,18,18,18,18,18
Marina,71,71,71,71,71,71


In [25]:
print('There are {} uniques categories.'.format(len(sf_venues['Venue Category'].unique())))

There are 212 uniques categories.


Analyze Each Neighborhood

In [26]:
sf_onehot = pd.get_dummies(sf_venues[['Venue Category']], prefix = "", prefix_sep = "")

# add neighborhood column back to dataframe
sf_onehot['Neighborhood'] = sf_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sf_onehot.columns[-1]] + list(sf_onehot.columns[:-1])
sf_onehot = sf_onehot[fixed_columns]

sf_onehot.head()

Unnamed: 0,Neighborhood,ATM,Adult Boutique,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yemeni Restaurant,Yoga Studio
0,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Hayes Valley/Tenderloin/North of Market,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
sf_grouped = sf_onehot.groupby('Neighborhood').mean().reset_index()
sf_grouped.head()

Unnamed: 0,Neighborhood,ATM,Adult Boutique,Alternative Healer,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yemeni Restaurant,Yoga Studio
0,Bayview-Hunters Point,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Castro/Noe Valley,0.0,0.016667,0.0,0.016667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.016667,0.0,0.0,0.0,0.0,0.033333,0.016667,0.0,0.033333
2,Chinatown,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01,0.01,...,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.01
3,Haight-Ashbury,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483
4,Hayes Valley/Tenderloin/North of Market,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.04,0.0,0.03,0.01,0.0,0.0


In [30]:
import numpy as np
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [31]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        # append 'st', 'nd', 'rd' to the top 3 venues
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = sf_grouped['Neighborhood']

for ind in np.arange(sf_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sf_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bayview-Hunters Point,Coffee Shop,Motorcycle Shop,Gelato Shop,Yoga Studio,Fried Chicken Joint,French Restaurant,Fountain,Food Truck,Food & Drink Shop,Fondue Restaurant
1,Castro/Noe Valley,Gay Bar,Park,Coffee Shop,Grocery Store,Thai Restaurant,Yoga Studio,Playground,Wine Bar,Pizza Place,Pilates Studio
2,Chinatown,Hotel,Coffee Shop,Boutique,American Restaurant,Sushi Restaurant,Hotel Bar,Cocktail Bar,Electronics Store,Bar,Men's Store
3,Haight-Ashbury,Coffee Shop,Grocery Store,Yoga Studio,Gastropub,Restaurant,Recreation Center,Pizza Place,Pet Store,Park,Mexican Restaurant
4,Hayes Valley/Tenderloin/North of Market,Cocktail Bar,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Hotel,Café,Theater,Sandwich Place,Boutique,French Restaurant


Cluster Neighborhoods

In [33]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5
sf_grouped_clustering = sf_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(sf_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 0, 0, 0, 0, 0, 0, 3, 0])

In [34]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

sf_merged = sf_data
sf_merged = sf_merged.merge(neighborhoods_venues_sorted, on = 'Neighborhood')

sf_merged.head()

Unnamed: 0,Zip Code,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,94102,Hayes Valley/Tenderloin/North of Market,37.78,-122.42,0,Cocktail Bar,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Hotel,Café,Theater,Sandwich Place,Boutique,French Restaurant
1,94103,South of Market,37.78,-122.41,0,Coffee Shop,Bakery,Vietnamese Restaurant,Sandwich Place,Theater,Performing Arts Venue,Bar,Mexican Restaurant,Marijuana Dispensary,Pizza Place
2,94107,Potrero Hill,37.77,-122.39,0,Food Truck,Coffee Shop,Gym,Park,Pharmacy,Café,Harbor / Marina,Sandwich Place,Pizza Place,Bank
3,94108,Chinatown,37.791,-122.409,0,Hotel,Coffee Shop,Boutique,American Restaurant,Sushi Restaurant,Hotel Bar,Cocktail Bar,Electronics Store,Bar,Men's Store
4,94109,Polk/Russian Hill (Nob Hill),37.79,-122.42,0,Grocery Store,Café,Massage Studio,Bar,Gym / Fitness Center,Thai Restaurant,Sushi Restaurant,Bakery,Pet Store,Diner


visualize the resulting clusters

In [39]:
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sf_merged['Latitude'], sf_merged['Longitude'], sf_merged['Neighborhood'], sf_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_clusters)
       
map_clusters