# Where to open a bar restaurant in the Bay Area

In [155]:
import requests
import pandas as pd
import numpy as np
import io
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
print('-----------------------------------------------------------')
from geopy.geocoders import Nominatim 
print('-----------------------------------------------------------')
#!conda install -c conda-forge folium=0.5.0 --yes 
print('-----------------------------------------------------------')
import folium 
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

-----------------------------------------------------------
-----------------------------------------------------------
-----------------------------------------------------------


### First, we will scrape the cities and towns of the Bay Area from wikipedia 

In [93]:
URL = 'https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_the_San_Francisco_Bay_Area'
r = requests.get(URL)

In [94]:
soup = BeautifulSoup(r.content, 'html5lib') 
#print(soup.prettify())

In [95]:
mytable = soup.find('table',{'class':'wikitable plainrowheaders sortable'})
A=[]
B=[]
C=[]
D=[]
E=[]
for row in mytable.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==6:
        B.append(cells[0].find(text=True))
        C.append(cells[1].find(text=True))
        D.append(cells[2].find(text=True))
        E.append(cells[3].find(text=True))

for row in mytable.findAll('tr'):
    cells=row.findAll('th')
    if len(cells)==1:
        A.append(cells[0].find(text=True))


In [101]:
df = pd.DataFrame({'Name':A, 'Type':B, 'County':C, 'Population':D, 'Area':E}, columns=['Name','Type','County','Population','Area'])
typedrop = df[df['Type']=='Town\n'].index
df.drop(typedrop, inplace=True)


In [97]:
cities = np.array(df['Name'])

In [98]:
lat = []
lon = []
for city in cities:
    address = city + ', CA'
    geolocator = Nominatim(user_agent=city)
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    lat.append(latitude)
    lon.append(longitude)


Lets only use proper cities instead of towns to reduce comptutational cost

In [108]:
df['Latitude'] = lat
df['Longitude'] = lon
df.reset_index(inplace=True)
df.drop(['index'],axis=1)

Unnamed: 0,level_0,Name,Type,County,Population,Area,Lattitude,Longitude,Latitude
0,0,Alameda,City,Alameda,73812,10.61,37.609029,-121.899142,37.609029
1,1,Albany,City,Alameda,18539,1.79,37.886870,-122.297747,37.886870
2,2,American Canyon,City,Napa,19454,4.84,38.174918,-122.260804,38.174918
3,3,Antioch,City,Contra Costa,102372,28.35,38.004921,-121.805789,38.004921
4,4,Belmont,City,San Mateo,25835,4.62,37.520215,-122.275801,37.520215
5,5,Belvedere,City,Marin,2068,0.52,37.872704,-122.464417,37.872704
6,6,Benicia,City,Solano,26997,12.93,38.049365,-122.158578,38.049365
7,7,Berkeley,City,Alameda,112580,10.47,37.870839,-122.272864,37.870839
8,8,Brentwood,City,Contra Costa,51481,14.79,37.931777,-121.696027,37.931777
9,9,Brisbane,City,San Mateo,4282,3.10,37.680766,-122.399972,37.680766


In [109]:
# save the DataFrame as CSV file
df.to_csv("bayarea.csv", index=False)

### Now Lets get the Foursquare data

In [112]:
CLIENT_ID = 'LSNSTGI2UHCS1YMXK3HKZBMF53QIOE2W1KS43PQK1LZQ4RKM' # your Foursquare ID
CLIENT_SECRET = 'PTXRQDXDDICOIZU5IFRBNEGTVXBXB2USKKTNIEETGJPRN5ZX' # your Foursquare Secret
VERSION = '20180604'
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LSNSTGI2UHCS1YMXK3HKZBMF53QIOE2W1KS43PQK1LZQ4RKM
CLIENT_SECRET:PTXRQDXDDICOIZU5IFRBNEGTVXBXB2USKKTNIEETGJPRN5ZX


In [113]:
venues = []
LIMIT = 50
radius = 2 * 1000 * 1.61
for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Name']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [115]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)
venues_df.columns = ['City', 'Latitude', 'Longitude', 'Venue Name', 'Latitude', 'Longitude', 'Category']

print(venues_df.shape)
venues_df.head()

(4023, 7)


Unnamed: 0,City,Latitude,Longitude,Venue Name,Latitude.1,Longitude.1,Category
0,Alameda,37.609029,-121.899142,Pleasanton Ridge Regional Park,37.614761,-121.881874,Trail
1,Alameda,37.609029,-121.899142,Elliston Vineyards,37.601171,-121.890326,Winery
2,Alameda,37.609029,-121.899142,Castlewood Country Club,37.637041,-121.895049,Golf Course
3,Alameda,37.609029,-121.899142,Augustin Bernal Park,37.635379,-121.903425,Park
4,Alameda,37.609029,-121.899142,Bosco's Bones & Brew,37.593791,-121.888016,American Restaurant


In [116]:
print('There are {} uniques categories.'.format(len(venues_df['Category'].unique())))

There are 309 uniques categories.


In [227]:
# print out the list of categories
venues_df['Category'].unique()[::10]

array(['Trail', 'Museum', 'Coffee Shop', 'Thai Restaurant',
       'Frozen Yogurt Shop', 'Video Game Store', 'Supermarket',
       'Food Court', 'Arts & Crafts Store', 'Dance Studio',
       'Japanese Restaurant', 'Fish & Chips Shop', 'Theater',
       'Southern / Soul Food Restaurant', 'Water Park', 'Factory', 'Spa',
       'Furniture / Home Store', 'Hawaiian Restaurant',
       'Department Store', 'Airport', 'Massage Studio',
       'Miscellaneous Shop', 'Reservoir', 'Gluten-free Restaurant',
       'Pool Hall', 'Szechuan Restaurant', 'Theme Park',
       'General College & University', 'Zoo', 'Hobby Shop'], dtype=object)

### Here we begin our K means clustering

In [140]:
# one hot encoding
BA_onehot = pd.get_dummies(venues_df[['Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
BA_onehot['Cities'] = venues_df['City'] 

# move neighborhood column to the first column
fixed_columns = [BA_onehot.columns[-1]] + list(BA_onehot.columns[:-1])
BA_onehot = BA_onehot[fixed_columns]

print(BA_onehot.shape)
BA_onehot.head()

(4023, 310)


Unnamed: 0,Cities,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,American Restaurant,Animal Shelter,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Alameda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alameda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Alameda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Alameda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alameda,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
BA_grouped = BA_onehot.groupby(['Cities']).mean().reset_index()

print(BA_grouped.shape)
BA_grouped

(84, 310)


Unnamed: 0,Cities,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,American Restaurant,Animal Shelter,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Alameda,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.071429,0.0,...,0.00,0.00,0.00,0.00,0.071429,0.00,0.00,0.00,0.00,0.00
1,Albany,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.00
2,American Canyon,0.02,0.00,0.00,0.00,0.00,0.00,0.00,0.000000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.02,0.00,0.00,0.00,0.00
3,Antioch,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.020000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.02,0.00,0.00,0.00,0.00
4,Belmont,0.00,0.00,0.02,0.00,0.00,0.00,0.00,0.020000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.00
5,Belvedere,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.040000,0.0,...,0.00,0.00,0.02,0.00,0.000000,0.00,0.00,0.00,0.00,0.00
6,Benicia,0.00,0.02,0.00,0.00,0.00,0.00,0.00,0.020000,0.0,...,0.00,0.00,0.02,0.00,0.000000,0.00,0.00,0.00,0.00,0.00
7,Berkeley,0.00,0.00,0.00,0.02,0.00,0.00,0.00,0.040000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.04,0.00,0.00
8,Brentwood,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.080000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.00
9,Brisbane,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000,0.0,...,0.00,0.00,0.00,0.00,0.000000,0.00,0.00,0.00,0.00,0.00


In [144]:
len(BA_grouped[BA_grouped["Bar"] > 0])

27

In [145]:
bars = BA_grouped[["Cities","Bar"]]
bars

Unnamed: 0,Cities,Bar
0,Alameda,0.000000
1,Albany,0.020000
2,American Canyon,0.000000
3,Antioch,0.000000
4,Belmont,0.020000
5,Belvedere,0.000000
6,Benicia,0.000000
7,Berkeley,0.000000
8,Brentwood,0.040000
9,Brisbane,0.020000


In [149]:
# set number of clusters
kclusters = 3

BA_clustering = bars.drop(["Cities"], axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(BA_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 0, 0, 2, 0, 0, 0, 1, 2], dtype=int32)

In [150]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
BA_merged = bars.copy()

# add clustering labels
BA_merged["Cluster Labels"] = kmeans.labels_

In [151]:
BA_merged.rename(columns={"Cities": "City"}, inplace=True)
BA_merged.head()

Unnamed: 0,City,Bar,Cluster Labels
0,Alameda,0.0,0
1,Albany,0.02,2
2,American Canyon,0.0,0
3,Antioch,0.0,0
4,Belmont,0.02,2


In [152]:
BA_merged = BA_merged.join(df.set_index("Name"), on="City")

print(BA_merged.shape)
BA_merged.head() # check the last columns!

(84, 12)


Unnamed: 0,City,Bar,Cluster Labels,level_0,index,Type,County,Population,Area,Lattitude,Longitude,Latitude
0,Alameda,0.0,0,0,0,City,Alameda,73812,10.61,37.609029,-121.899142,37.609029
1,Albany,0.02,2,1,1,City,Alameda,18539,1.79,37.88687,-122.297747,37.88687
2,American Canyon,0.0,0,2,2,City,Napa,19454,4.84,38.174918,-122.260804,38.174918
3,Antioch,0.0,0,3,3,City,Contra Costa,102372,28.35,38.004921,-121.805789,38.004921
4,Belmont,0.02,2,4,5,City,San Mateo,25835,4.62,37.520215,-122.275801,37.520215


In [153]:
# sort the results by Cluster Labels
print(BA_merged.shape)
BA_merged.sort_values(["Cluster Labels"], inplace=True)
BA_merged

(84, 12)


Unnamed: 0,City,Bar,Cluster Labels,level_0,index,Type,County,Population,Area,Lattitude,Longitude,Latitude
0,Alameda,0.000000,0,0,0,City,Alameda,73812,10.61,37.609029,-121.899142,37.609029
54,Pittsburg,0.000000,0,55,64,City,Contra Costa,63264,17.22,38.018175,-121.890123,38.018175
53,Pinole,0.000000,0,54,63,City,Contra Costa,18390,5.32,38.004367,-122.298859,38.004367
52,Piedmont,0.000000,0,53,62,City,Alameda,10667,1.68,37.824371,-122.231635,37.824371
51,Petaluma,0.000000,0,52,61,City,Sonoma,57941,14.38,38.270022,-122.606122,38.270022
50,Palo Alto,0.000000,0,51,60,City,Santa Clara,64403,23.88,37.444329,-122.159847,37.444329
49,Pacifica,0.000000,0,50,59,City,San Mateo,37234,12.66,37.613825,-122.486919,37.613825
55,Pleasant Hill,0.000000,0,56,65,City,Contra Costa,33152,7.07,37.947979,-122.060796,37.947979
48,Orinda,0.000000,0,49,58,City,Contra Costa,17643,12.68,37.877148,-122.179689,37.877148
44,Newark,0.000000,0,45,54,City,Alameda,42573,13.87,37.529659,-122.040240,37.529659


In [229]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=9)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(BA_merged['Latitude'], BA_merged['Longitude'], BA_merged['City'], BA_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
#map_clusters.save('BAmap.png')

### Compare clusters
We will also define major cities to be cities with a population of more than 100,000

In [206]:
c0 = BA_merged.loc[BA_merged['Cluster Labels'] == 0]
c0

Unnamed: 0,City,Bar,Cluster Labels,level_0,index,Type,County,Population,Area,Lattitude,Longitude,Latitude
0,Alameda,0.0,0,0,0,City,Alameda,73812,10.61,37.609029,-121.899142,37.609029
54,Pittsburg,0.0,0,55,64,City,Contra Costa,63264,17.22,38.018175,-121.890123,38.018175
53,Pinole,0.0,0,54,63,City,Contra Costa,18390,5.32,38.004367,-122.298859,38.004367
52,Piedmont,0.0,0,53,62,City,Alameda,10667,1.68,37.824371,-122.231635,37.824371
51,Petaluma,0.0,0,52,61,City,Sonoma,57941,14.38,38.270022,-122.606122,38.270022
50,Palo Alto,0.0,0,51,60,City,Santa Clara,64403,23.88,37.444329,-122.159847,37.444329
49,Pacifica,0.0,0,50,59,City,San Mateo,37234,12.66,37.613825,-122.486919,37.613825
55,Pleasant Hill,0.0,0,56,65,City,Contra Costa,33152,7.07,37.947979,-122.060796,37.947979
48,Orinda,0.0,0,49,58,City,Contra Costa,17643,12.68,37.877148,-122.179689,37.877148
44,Newark,0.0,0,45,54,City,Alameda,42573,13.87,37.529659,-122.04024,37.529659


In [223]:
c0 = c0.replace(r'\n','', regex=True)
c0 = c0.replace(r',','',regex=True)
pop0 = np.array(c0['Population'])
pop0 = list(map(int, pop0))
mpop0 = int(np.mean(pop0))
print('mean population is: ' + str(mpop0))

major_cities0 = []
cl0 = np.array(c0)
for i in range(len(cl0)):
    if pop0[i] > 100000:
        major_cities0.append(cl0[i,0])
major_cities0

mean population is: 67501


['Vallejo',
 'Richmond',
 'Sunnyvale',
 'Santa Rosa',
 'San Francisco',
 'Santa Clara',
 'Antioch',
 'Berkeley',
 'Fairfield',
 'Concord',
 'Fremont']

In [210]:
c1 = BA_merged.loc[BA_merged['Cluster Labels'] == 1]
c1

Unnamed: 0,City,Bar,Cluster Labels,level_0,index,Type,County,Population,Area,Lattitude,Longitude,Latitude
64,San Jose,0.06,1,66,78,City,Santa Clara,945942,176.53,37.336191,-121.890583,37.336191
8,Brentwood,0.04,1,8,9,City,Contra Costa,51481,14.79,37.931777,-121.696027,37.931777
68,San Rafael,0.04,1,70,82,City,Marin,57713,16.47,37.973535,-122.531087,37.973535
28,Hayward,0.04,1,29,34,City,Alameda,144186,45.32,37.668821,-122.080796,37.668821
22,Emeryville,0.06,1,23,27,City,Alameda,10080,1.25,37.831409,-122.286527,37.831409
74,Sebastopol,0.04,1,76,88,City,Sonoma,7379,1.85,38.384512,-122.83325,38.384512
46,Oakland,0.08,1,47,56,City,Alameda,390724,55.79,37.804456,-122.271356,37.804456
33,Livermore,0.04,1,34,40,City,Alameda,80968,25.17,37.682058,-121.768053,37.682058
60,Rohnert Park,0.04,1,61,71,City,Sonoma,40971,7.0,38.339637,-122.701098,38.339637
15,Cotati,0.04,1,16,19,City,Sonoma,7265,1.88,38.32668,-122.706844,38.32668


In [225]:
c1 = c1.replace(r'\n','', regex=True)
c1 = c1.replace(r',','',regex=True)
pop1 = np.array(c1['Population'])
pop1 = list(map(int, pop1))
mpop1 = int(np.mean(pop1))
print('mean population is: ' + str(mpop1))

major_cities1 = []
cl1 = np.array(c1)
for i in range(len(cl1)):
    if pop1[i] > 100000:
        major_cities1.append(cl1[i,0])
major_cities1

mean population is: 151393


['San Jose', 'Hayward', 'Oakland']

In [212]:
c2 = BA_merged.loc[BA_merged['Cluster Labels'] == 2]
c2

Unnamed: 0,City,Bar,Cluster Labels,level_0,index,Type,County,Population,Area,Lattitude,Longitude,Latitude
1,Albany,0.02,2,1,1,City,Alameda,18539,1.79,37.88687,-122.297747,37.88687
4,Belmont,0.02,2,4,5,City,San Mateo,25835,4.62,37.520215,-122.275801,37.520215
81,Vacaville,0.02,2,82,95,City,Solano,92428,28.37,38.356577,-121.987744,38.356577
17,Daly City,0.02,2,18,21,City,San Mateo,101123,7.66,37.705767,-122.461921,37.705767
9,Brisbane,0.02,2,9,10,City,San Mateo,4282,3.1,37.680766,-122.399972,37.680766
29,Healdsburg,0.02,2,30,35,City,Sonoma,11254,4.46,38.610516,-122.881341,38.610516
18,Dixon,0.022222,2,19,23,City,Solano,18351,7.0,38.445464,-121.823296,38.445464
19,Dublin,0.02,2,20,24,City,Alameda,46036,14.91,37.702152,-121.935792,37.702152
65,San Leandro,0.02,2,67,79,City,Alameda,84950,13.34,37.72493,-122.156077,37.72493
27,Half Moon Bay,0.02,2,28,33,City,San Mateo,11324,6.42,37.463552,-122.428586,37.463552


In [226]:
c2 = c2.replace(r'\n','', regex=True)
c2 = c2.replace(r',','',regex=True)
pop2 = np.array(c2['Population'])
pop2 = list(map(int, pop2))
mpop2 = int(np.mean(pop2))
print('mean population is: ' + str(mpop2))

major_cities2 = []
cl2 = np.array(c2)
for i in range(len(cl2)):
    if pop2[i] > 100000:
        major_cities2.append(cl2[i,0])
major_cities2

mean population is: 38215


['Daly City']