In [2]:
# Importing all dependencies & installing each package needed for analysis

import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
import lxml
from pandas.io.json import json_normalize

!pip install folium
import folium

!pip install scikit-learn
import sklearn
from sklearn.cluster import KMeans

In [28]:
# This capstone project explores the regions around Pittsburgh, PA. Using Foursquare
# venue data, it builds an interactive map that pinpoints existing areas of competition
# for prospective owners of a conceptual pizzeria/bar restaurant (i.e. where Foursquare
# shows pizza restaurants and bars as the top venues).
# The URLs below provide the zip-code listing for Allegheny County, PA and a
# latitude/longitude lookup keyed by zip code:

url = 'http://www.ciclt.net/sn/clt/capitolimpact/gw_ziplist.aspx?FIPS=42003'
find_lat_long_url = 'https://public.opendatasoft.com/api/records/1.0/search/?dataset=us-zip-code-latitude-and-longitude&q=zip%3D{zip}'

# pandas.read_html returns one DataFrame per HTML table on the page; the tables at
# positions 2-5 are the ones concatenated into the zip code / city listing.
pittsburgh_postal = pd.read_html(url)
df = pittsburgh_postal[2]
df2 = pittsburgh_postal[3]
df3 = pittsburgh_postal[4]
df4 = pittsburgh_postal[5]

df_pittsburgh = pd.concat([df, df2, df3, df4], axis = 0, ignore_index = True)

# Drop the county column and add 0.0 placeholders that mark "not geocoded yet".
df_pittsburgh = df_pittsburgh.drop(['County'], axis = 1)
df_pittsburgh.insert(2, "Latitude", 0.0)
df_pittsburgh.insert(3, "Longitude", 0.0)
# NOTE: this is an alias, not a copy -- both names refer to the same DataFrame.
df_geo_pittsburgh = df_pittsburgh

# Look up each distinct zip code once (the listing repeats zip codes), then write the
# coordinates back onto every row carrying that zip code.
zip_coordinates = {}
for zipcode in df_geo_pittsburgh['Zip Code'].unique():
    response = requests.get(find_lat_long_url.format(zip = zipcode), timeout = 30)
    # An error payload may lack 'records'; treat that like an empty result.
    records_json = response.json().get('records')
    if not records_json:
        continue
    fields_json = records_json[0]['fields']
    zip_coordinates[zipcode] = (fields_json['latitude'], fields_json['longitude'])

for index, zipcode in df_geo_pittsburgh['Zip Code'].items():
    if zipcode not in zip_coordinates:
        continue  # leave the 0.0 placeholders in place
    latitude, longitude = zip_coordinates[zipcode]
    df_geo_pittsburgh.at[index, 'Latitude'] = latitude
    df_geo_pittsburgh.at[index, 'Longitude'] = longitude

# Remove ungeocoded rows FIRST so a placeholder row can never shadow (and thereby
# eliminate) a geocoded row for the same city during de-duplication.
df_geo_pittsburgh_final = (
    df_geo_pittsburgh[df_geo_pittsburgh.Latitude != 0]
    .drop_duplicates(subset = ["Latitude"])
    .drop_duplicates(subset = ["City"])
    .drop_duplicates(subset = ["Zip Code"])
)
df_geo_pittsburgh_final.tail(50)


Unnamed: 0,Zip Code,City,Latitude,Longitude
51,15116,Glenshaw,40.533806,-79.96303
52,15120,Homestead,40.395336,-79.90635
56,15122,Pittsburgh,40.366177,-79.89418
59,15126,Imperial,40.454423,-80.26324
61,15129,Library,40.289752,-80.00756
64,15131,McKeesport,40.341469,-79.81105
69,15135,Boston,40.306819,-79.8145
71,15136,McKees Rocks,40.471618,-80.07695
72,15137,North Versailles,40.380219,-79.81017
73,15139,Oakmont,40.519518,-79.83762


In [10]:

#Creating a folium map of Pittsburgh with one marker per geocoded zip-code row in Allegheny County (rows are consolidated later):
map_pittsburgh = folium.Map(location=[40.463, -80.011], zoom_start = 10)

# Rows whose coordinate lookup returned nothing still hold the 0.0 placeholders;
# plotting them would drop markers at (0, 0), so they are skipped here.
geocoded_rows = df_geo_pittsburgh[df_geo_pittsburgh['Latitude'] != 0]

for lat, lng, label in zip(geocoded_rows['Latitude'], geocoded_rows['Longitude'], geocoded_rows['City']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_pittsburgh)

map_pittsburgh

In [12]:
# Foursquare API settings used by the venue lookup below.
# Credentials are read from the environment instead of being committed with the
# notebook; any key pair that was previously hard-coded here should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will not be authorised.'
    )
VERSION = '20180604'  # sent as the `v` query parameter
LIMIT = 100  # sent as the `limit` query parameter

def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect Foursquare venues around each named location.

    One request is sent to the Foursquare ``venues/explore`` endpoint per
    (name, latitude, longitude) triple. Locations for which the reply holds no
    response body, no groups or no items are skipped.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped together, so extra elements in a
        longer sequence are ignored.
    radius : number, default 500
        Sent as the ``radius`` query parameter (presumably metres -- confirm
        against the Foursquare API documentation).
    timeout : number, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when nothing was found.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        #API Request to Foursquare
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        #GET request; a payload without a usable "response" is treated as "no venues"
        results = requests.get(url, timeout=timeout).json().get("response")
        if not results:
            continue

        groups_json = results.get('groups')
        if not groups_json:
            continue

        items = groups_json[0].get('items')
        if not items:
            continue

        #Record the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of raising IndexError.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps the schema intact even
    # when no venue was found at all.
    return pd.DataFrame(rows, columns=columns)

#Collecting venues for every geocoded row, then checking the first (5) returned Pittsburgh neighborhood venues.
# Rows still holding the 0.0 placeholder latitude were never geocoded; querying
# Foursquare at (0, 0) for them would waste requests and could attach unrelated venues.
geocoded_neighborhoods = df_geo_pittsburgh[df_geo_pittsburgh['Latitude'] != 0]

pittsburgh_venues = getNearbyVenues(
    names = geocoded_neighborhoods['City'],
    latitudes = geocoded_neighborhoods['Latitude'],
    longitudes = geocoded_neighborhoods['Longitude'])

pittsburgh_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bairdford,40.630894,-79.88057,United States Postal Service,40.632229,-79.879144,Post Office
1,Bairdford,40.630894,-79.88057,Bairdford Park,40.629222,-79.877576,Park
2,Bairdford,40.630894,-79.88057,Brunch,40.632473,-79.87648,Breakfast Spot
3,Bakerstown,40.652311,-79.93303,Richland Township Community Park,40.655932,-79.930172,Playground
4,Bakerstown,40.652311,-79.93303,United States Postal Service,40.650729,-79.935447,Post Office


In [783]:
#Next, utilizing one-hot coding to show the distribution of venue categories per neighborhood (to allow for later clustering)
pittsburgh_onehot = pd.get_dummies(pittsburgh_venues[['Venue Category']], prefix="", prefix_sep="")
pittsburgh_onehot['Neighborhood'] = pittsburgh_venues['Neighborhood']

# Move the neighborhood label (appended last) to the front.
fixed_columns = [pittsburgh_onehot.columns[-1]] + list(pittsburgh_onehot.columns[:-1])
pittsburgh_onehot = pittsburgh_onehot[fixed_columns]

# Mean of the one-hot rows = share of each category among a neighborhood's venues.
pittsburgh_grouped = pittsburgh_onehot.groupby('Neighborhood').mean().reset_index()

#Keeping only the neighborhoods with enough distinct venue categories in the Foursquare data (again allowing for later clustering); per the original analysis this leaves 33 neighborhoods.  Sparse rural areas with only a handful of venues tend to skew the grouping.
# The count is taken over the category columns only. The earlier version also counted the
# (always non-empty) 'Neighborhood' label and compared against 8, which is the same rule.
MIN_DISTINCT_CATEGORIES = 7

category_columns = pittsburgh_grouped.columns.drop('Neighborhood')
category_counts = (pittsburgh_grouped[category_columns] != 0).sum(axis=1)

# Boolean selection keeps the original row labels of the retained neighborhoods.
pittsburgh_grouped = pittsburgh_grouped[category_counts >= MIN_DISTINCT_CATEGORIES]
pittsburgh_grouped


Unnamed: 0,Neighborhood,ATM,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Astrologer,Athletics & Sports,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Water Park,Wings Joint,Women's Store,Yoga Studio
2,Arsenal,0.0,0.021277,0.021277,0.0,0.0,0.042553,0.0,0.0,0.0,...,0.0,0.0,0.0,0.021277,0.021277,0.0,0.0,0.0,0.021277,0.0
10,Bethel Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,Bloomfield,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.021277,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0
18,Bridgeville,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,Brookline,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,Carnegie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,Carson,0.0,0.081081,0.0,0.0,0.0,0.0,0.054054,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,Caste Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,Castle Shannon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,Crafton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [784]:
#Quick look at the five most frequent venue categories for each neighborhood kept in scope
num_top_venues = 5

for hood in pittsburgh_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Transpose the neighborhood's single row into a (venue, freq) table.
    hood_row = pittsburgh_grouped[pittsburgh_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']

    # The first record is the neighborhood label itself, not a venue category.
    freq_table = freq_table.iloc[1:].copy()
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Arsenal----
                 venue  freq
0                  Bar  0.06
1       Sandwich Place  0.06
2           Taco Place  0.04
3  Arts & Crafts Store  0.04
4          Karaoke Bar  0.04


----Bethel Park----
            venue  freq
0  Breakfast Spot  0.11
1  Sandwich Place  0.11
2             Bar  0.11
3             Gym  0.11
4     Pizza Place  0.11


----Bloomfield----
             venue  freq
0      Pizza Place  0.06
1      Coffee Shop  0.06
2    Grocery Store  0.04
3              Bar  0.04
4  Thai Restaurant  0.04


----Bridgeville----
                 venue  freq
0          Gas Station  0.18
1  American Restaurant  0.18
2       Sandwich Place  0.09
3   Chinese Restaurant  0.09
4          Coffee Shop  0.09


----Brookline----
                venue  freq
0         Pizza Place  0.17
1  Italian Restaurant  0.08
2        Liquor Store  0.08
3                 Spa  0.08
4       Grocery Store  0.08


----Carnegie----
                venue  freq
0  Italian Restaurant  0.11
1             

In [785]:
#Helper that ranks a neighborhood's venue categories from most to least frequent
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` highest-valued entries of `row`.

    The first element of `row` is skipped (callers pass rows whose first entry
    is the neighborhood name); the remaining entries are sorted in descending
    order and the index labels of the leading ones are returned as an array.
    """
    return row.iloc[1:].sort_values(ascending=False).index.values[:num_top_venues]

#Subsequently, setting up a separate df to display the top 10 venues/neighborhood (see following table)
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

#Columns according to n-th top venues: '1st', '2nd', '3rd', then 'th' for every later rank.
# An explicit bounds check replaces the former bare `except:`, which would also have
# hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

#New df arranging by most common venues per neighborhood - thus starting to more easily see where pizza restaurants and bars are concentrated:
pittsburgh_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
pittsburgh_neighborhoods_venues_sorted['Neighborhood'] = pittsburgh_grouped['Neighborhood']

# Fill the rank columns row by row (positional access, since the row labels are not contiguous).
for ind in range(pittsburgh_grouped.shape[0]):
    pittsburgh_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(pittsburgh_grouped.iloc[ind, :], num_top_venues)

pittsburgh_neighborhoods_venues_sorted.head(30)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Arsenal,Bar,Sandwich Place,Taco Place,Arts & Crafts Store,Karaoke Bar,Indie Movie Theater,Bus Station,Sports Bar,Coffee Shop,Gym
10,Bethel Park,Breakfast Spot,Sandwich Place,Bar,Gym,Pizza Place,Home Service,Mexican Restaurant,Beer Garden,Ice Cream Shop,Other Repair Shop
12,Bloomfield,Pizza Place,Coffee Shop,Grocery Store,Bar,Thai Restaurant,Art Gallery,Bookstore,Sandwich Place,New American Restaurant,Burger Joint
18,Bridgeville,Gas Station,American Restaurant,Sandwich Place,Chinese Restaurant,Coffee Shop,Mexican Restaurant,Performing Arts Venue,Pharmacy,Hotel,Post Office
19,Brookline,Pizza Place,Italian Restaurant,Liquor Store,Spa,Grocery Store,Bike Shop,Convenience Store,Bakery,Lounge,Mediterranean Restaurant
23,Carnegie,Italian Restaurant,Library,Bar,Bank,Food & Drink Shop,Pizza Place,Miscellaneous Shop,Theater,Gift Shop,Other Great Outdoors
24,Carson,Bar,American Restaurant,Asian Restaurant,Sushi Restaurant,Thai Restaurant,Ice Cream Shop,Pub,Burger Joint,Dance Studio,Boutique
25,Caste Village,Baseball Field,Chinese Restaurant,Bar,Bakery,Convenience Store,Grocery Store,Laundromat,Other Repair Shop,Other Great Outdoors,Outdoors & Recreation
26,Castle Shannon,Dessert Shop,Light Rail Station,Coffee Shop,Train Station,Bakery,Playground,Bar,Print Shop,Pub,Noodle House
32,Crafton,Pizza Place,Pharmacy,Discount Store,Supermarket,Chinese Restaurant,Grocery Store,Fried Chicken Joint,Shoe Store,Bar,Bank


In [786]:
#Setting up to divide the neighborhoods into 6 clusters; optimized amount to filter out the heaviest concentration of pizzerias/bars:
kclusters = 6
# `columns=` keyword instead of a positional axis argument, which newer pandas rejects.
pittsburgh_grouped_clustering = pittsburgh_grouped.drop(columns='Neighborhood')

#Implementing K-means clustering fit (KMeans is imported in the notebook's import cell):
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(pittsburgh_grouped_clustering)

#Adding Clustering Labels 0 through 5 -- later setting up corresponding colored markers.
# Any label column left over from a previous run is removed first so the cell can be re-executed.
if 'Cluster Labels' in pittsburgh_neighborhoods_venues_sorted.columns:
    pittsburgh_neighborhoods_venues_sorted = pittsburgh_neighborhoods_venues_sorted.drop(columns='Cluster Labels')
pittsburgh_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#Merging both df's so each clustered neighborhood carries its latitude/longitude for the subsequent map.
# The left join leaves NaN for cities that were not clustered; dropna removes those rows, and
# the labels (turned into floats by the NaNs) are cast back to integers.
pittsburgh_merged = (
    df_geo_pittsburgh_final
    .join(pittsburgh_neighborhoods_venues_sorted.set_index('Neighborhood'), on='City')
    .dropna(axis=0)
    .astype({'Cluster Labels': int})
)

In [787]:
#Initiating a folium map in order to add the final clustering results, based on top venues
map_clusters = folium.Map(location=[40.463, -80.011], zoom_start=10)

#Cluster coloring scheme: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Adding colored markers to the map alongside cluster numbering, based on respective shared/top venues.
# The palette is rotated by one position so that, with 6 clusters, Label 4 gets the last (red)
# color, which best emphasizes the pizzeria/bar centric areas. The modulo form gives the same
# colors as the former fixed `index-5` offset for 6 clusters and stays in range for any other count.
for lat, lon, poi, cluster in zip(pittsburgh_merged['Latitude'], pittsburgh_merged['Longitude'], pittsburgh_merged['City'], pittsburgh_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker_color = rainbow[(int(cluster) + 1) % kclusters]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [788]:
#Reading the map together with the tables below, each cluster label is characterised by its most common venues:

#Label 0 = Italian/Library
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 0].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
45,Carnegie,Italian Restaurant,Library,Bar,Bank,Food & Drink Shop,Pizza Place,Miscellaneous Shop,Theater,Gift Shop,Other Great Outdoors


In [789]:
#Label 1 = Gas Station/Yoga/American
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 1].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Bridgeville,Gas Station,American Restaurant,Sandwich Place,Chinese Restaurant,Coffee Shop,Mexican Restaurant,Performing Arts Venue,Pharmacy,Hotel,Post Office
73,Oakmont,Yoga Studio,Bakery,American Restaurant,Ice Cream Shop,Hobby Shop,Spa,Bank,Italian Restaurant,Video Game Store,Public Art


In [790]:
#Label 2 = Fish & Chips/Convenience Store/Fast Food
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 2].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
107,Mt Oliver,Fish & Chips Shop,Convenience Store,Fast Food Restaurant,American Restaurant,Fried Chicken Joint,Video Store,Hardware Store,Pizza Place,Performing Arts Venue,Park


In [791]:
#Label 3 = Baseball Field/Chinese Restaurant
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 3].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
158,Caste Village,Baseball Field,Chinese Restaurant,Bar,Bakery,Convenience Store,Grocery Store,Laundromat,Other Repair Shop,Other Great Outdoors,Outdoors & Recreation


In [792]:
#Label 4 = Pizzeria/Bar/Coffee Shop -- these red-dotted neighborhoods are where, by far, the most pizzerias/bars exist
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 4].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Wexford,Fast Food Restaurant,Deli / Bodega,Bank,Moving Target,Bakery,Taco Place,Cosmetics Shop,ATM,Paper / Office Supplies Store,Performing Arts Venue
42,Bethel Park,Breakfast Spot,Sandwich Place,Bar,Gym,Pizza Place,Home Service,Mexican Restaurant,Beer Garden,Ice Cream Shop,Other Repair Shop
56,Pittsburgh,Bar,Pizza Place,Coffee Shop,Sandwich Place,American Restaurant,Moving Target,Italian Restaurant,Boat or Ferry,Tunnel,Bakery
64,McKeesport,Bar,Moving Target,Pizza Place,Boat or Ferry,Tunnel,Disc Golf,Optical Shop,Coffee Shop,Gym / Fitness Center,Shipping Store
71,McKees Rocks,Pizza Place,Breakfast Spot,Indie Movie Theater,Bakery,Pharmacy,Café,Spa,Bar,Bank,Plaza
74,Monroeville,Park,Pool,Construction & Landscaping,Bar,Bakery,Pub,Pizza Place,Flower Shop,Paper / Office Supplies Store,ATM
79,Springdale,Salon / Barbershop,Pizza Place,Discount Store,General Travel,Soup Place,Chinese Restaurant,Construction & Landscaping,Bar,Post Office,Pool
80,Turtle Creek,Pizza Place,Italian Restaurant,Bank,Convenience Store,Pharmacy,Smoke Shop,Garden Center,Discount Store,Astrologer,Library
85,Arsenal,Bar,Sandwich Place,Taco Place,Arts & Crafts Store,Karaoke Bar,Indie Movie Theater,Bus Station,Sports Bar,Coffee Shop,Gym
93,Carson,Bar,American Restaurant,Asian Restaurant,Sushi Restaurant,Thai Restaurant,Ice Cream Shop,Pub,Burger Joint,Dance Studio,Boutique


In [793]:
#Label 5 = Dessert/Trolley Stations
# Show the city (column 1) plus every ranked-venue column (column 5 onward) for this label.
pittsburgh_merged[pittsburgh_merged['Cluster Labels'] == 5].iloc[:, [1, *range(5, pittsburgh_merged.shape[1])]]

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
154,Castle Shannon,Dessert Shop,Light Rail Station,Coffee Shop,Train Station,Bakery,Playground,Bar,Print Shop,Pub,Noodle House
