# Data Science Capstone Final Project 

### Opening a Hotel in Miami

### Data

We need data from reliable sources for analysis:
 - Wikipedia List of Miami Neighborhoods: https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Miami \
 - Foursquare Developers Access to venue data: https://foursquare.com/

### Methodology

The methodology will include:
 - Acquire data from Wikipedia List of Miami Neighborhoods
 - Use web scraping techniques in Python to extract list of neighborhoods
 - Use Geocoder package to obtain geocoordinates for each neighborhood
 - Populate neighbhood data into a dataframe
 - Plot the neighborhoods on a map using Folium
 - Call the Foursquare API to obtain venue data and see how many venues returned for each neighborhood
 - Group the venues by neighborhood and take the mean frequency ofoccurence for each type of venue, focusin on yoga studios
 - Cluster the data using K-Means clustering to see which area has higher or lower number of yoga studios

In [151]:
import pandas as pd
import numpy as np
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
!pip install folium
import folium # map rendering library
from sklearn.cluster import KMeans
import urllib.request
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
!conda install -c conda-forge geocoder -y
import geocoder # to get coordinates

Solving environment: done

# All requested packages already installed.



### Import the Neighborhoods from Wikipedia and create a dataframe

In [325]:
#Create dataframe of Miami Neighborhoods
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Miami").text
soup = BeautifulSoup(data, 'html.parser')
neighborhoodList = []
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)
mm_df.reset_index()
mm_df1 = mm_df.drop([2], axis =0)
mm_df2 = mm_df1.drop([0,1], axis = 0)
miamidf = mm_df2.reset_index()
miamidf.shape
miamidf.head(5)

Unnamed: 0,index,Neighborhood
0,3,Allapattah
1,4,Arts & Entertainment District
2,5,Biscayne Boulevard Historic District
3,6,"Biscayne Island, Miami"
4,7,Brickell


### Find the Latitudes and Longitudes

In [154]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Miami, Florida'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in miamidf["Neighborhood"].tolist() ]
coords

[[25.80215000000004, -80.22362999999996],
 [25.77481000000006, -80.19772999999998],
 [25.878823222621985, -80.16766140320783],
 [25.815823554243593, -80.18854193012609],
 [25.76428691089276, -80.19255749520372],
 [25.765775006273287, -80.18609485539598],
 [25.81052219359692, -80.1931328210923],
 [25.796809582114925, -80.27359595987755],
 [25.732330000000047, -80.25413999999995],
 [25.755330000000072, -80.22217999999998],
 [25.812234148114165, -80.19465301057959],
 [25.707075817050484, -80.25961786278785],
 [25.762710000000027, -80.31586999999996],
 [25.77505000000008, -80.19565999999998],
 [25.788190000000043, -80.25610999999998],
 [25.769130000000075, -80.18997999999993],
 [25.77481000000006, -80.19772999999998],
 [25.910870000000045, -80.20006999999998],
 [25.828640000000064, -80.19832999999994],
 [25.893779988891556, -80.16231994445744],
 [25.846002424251907, -80.17506343519538],
 [26.631030000000067, -81.87631999999996],
 [25.77481000000006, -80.19772999999998],
 [25.80899395598187

In [159]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
# merge the coordinates into the original dataframe
miamidf['Latitude'] = df_coords['Latitude']
miamidf['Longitude'] = df_coords['Longitude']
# check the neighborhoods and the coordinates
print(miamidf.shape)
miamidf

(36, 4)


Unnamed: 0,index,Neighborhood,Latitude,Longitude
0,3,Allapattah,25.80215,-80.22363
1,4,Arts & Entertainment District,25.77481,-80.19773
2,5,Biscayne Boulevard Historic District,25.878823,-80.167661
3,6,"Biscayne Island, Miami",25.815824,-80.188542
4,7,Brickell,25.764287,-80.192557
5,8,Brickell Key,25.765775,-80.186095
6,9,Buena Vista (Miami),25.810522,-80.193133
7,10,Central Business District (Miami),25.79681,-80.273596
8,11,Coconut Grove,25.73233,-80.25414
9,12,Coral Way,25.75533,-80.22218


### Create a Map of Miami

In [160]:
#Get lat, lng of Miami
address = 'Miami, FL'

geolocator = Nominatim(user_agent="mm_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Miami are {}, {}.'.format(latitude, longitude)) 

The geograpical coordinate of Miami are 25.7742658, -80.1936589.


In [218]:
# create map of Miami using latitude and longitude values
map_miami = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(miamidf['Latitude'], miamidf['Longitude'], miamidf['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_miami)  
    
map_miami

### Use Foursquare to find venues

In [326]:
# define Foursquare Credentials and Version
CLIENT_ID = 'my client id' # your Foursquare ID
CLIENT_SECRET = 'my client secret' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: my client id
CLIENT_SECRET:my client secret


In [239]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

miami_venues = getNearbyVenues(names=miamidf['Neighborhood'],
                                   latitudes=miamidf['Latitude'],
                                   longitudes=miamidf['Longitude']
                                  )
print(miami_venues.shape)
miami_venues.head()

Allapattah
Arts & Entertainment District
Biscayne Boulevard Historic District
Biscayne Island, Miami
Brickell
Brickell Key
Buena Vista (Miami)
Central Business District (Miami)
Coconut Grove
Coral Way
Miami Design District
Edgewater (Miami)
Flagami
Government Center (Miami)
Grapeland Heights
Greater Downtown Miami
Health District (Miami)
Liberty City (Miami)
Little Haiti
Little Havana
Little River (Miami)
Lummus Park Historic District
Miami Ironside
Midtown Miami
Riverside (Miami)
Overtown (Miami)
Park West (Miami)
Spring Garden (Miami)
The Roads
Umoja Village
Upper Eastside
Virginia Key
Watson Island
West Flagler
Wynwood
Wynwood Art District
(712, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allapattah,25.80215,-80.22363,Papo Llega y Pon,25.803466,-80.223886,Cuban Restaurant
1,Allapattah,25.80215,-80.22363,Plaza Seafood Market,25.805638,-80.223992,Seafood Restaurant
2,Allapattah,25.80215,-80.22363,YMCA,25.799132,-80.224291,Gym / Fitness Center
3,Arts & Entertainment District,25.77481,-80.19773,HistoryMiami,25.774827,-80.196559,Museum
4,Arts & Entertainment District,25.77481,-80.19773,Cane Á Sucre,25.774573,-80.194696,Sandwich Place


In [245]:
#count how many venues in each neighborhood
miami_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Allapattah,3,3,3,3,3,3
Arts & Entertainment District,15,15,15,15,15,15
Biscayne Boulevard Historic District,7,7,7,7,7,7
"Biscayne Island, Miami",33,33,33,33,33,33
Brickell,53,53,53,53,53,53
Brickell Key,35,35,35,35,35,35
Buena Vista (Miami),39,39,39,39,39,39
Central Business District (Miami),59,59,59,59,59,59
Coconut Grove,28,28,28,28,28,28
Coral Way,3,3,3,3,3,3


In [274]:
#Unique venues
print('There are {} uniques categories.'.format(len(miami_venues['Venue Category'].unique())))
# print out the list of categories
miami_venues['Venue Category'].unique()[:50]

There are 168 uniques categories.


True

### Analyze each neighborhood

In [247]:
# one hot encoding
miami_onehot = pd.get_dummies(miami_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
miami_onehot['Neighborhood'] = miami_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [miami_onehot.columns[-1]] + list(miami_onehot.columns[:-1])
miami_onehot = miami_onehot[fixed_columns]

miami_onehot.head()

Unnamed: 0,Zoo,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Arcade,Arepa Restaurant,...,Tex-Mex Restaurant,Thai Restaurant,Theater,Trail,Train Station,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [259]:
miami_grouped = miami_onehot.groupby('Neighborhood').mean().reset_index()
miami_grouped.head()

Unnamed: 0,Neighborhood,Zoo,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Arcade,...,Tex-Mex Restaurant,Thai Restaurant,Theater,Trail,Train Station,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Allapattah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Arts & Entertainment District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0
2,Biscayne Boulevard Historic District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Biscayne Island, Miami",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.060606,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0
4,Brickell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018868,0.0,...,0.018868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037736


In [317]:
len(miami_grouped[miami_grouped["Hotel"] > 0])

miami_hotel = miami_grouped[["Neighborhood", "Hotel"]]
miami_hotel.head()


Unnamed: 0,Neighborhood,Hotel
0,Allapattah,0.0
1,Arts & Entertainment District,0.066667
2,Biscayne Boulevard Historic District,0.0
3,"Biscayne Island, Miami",0.0
4,Brickell,0.09434


In [318]:
#Cluster the neighborhoods
# set number of clusters
kclusters = 3

miami_grouped_clustering = miami_hotel.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(miami_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

miami_merged = miami_hotel.copy()

miami_merged["Cluster Labels"]= kmeans.labels_
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood

miami_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Hotel,Cluster Labels
0,Allapattah,0.0,0
1,Arts & Entertainment District,0.066667,1
2,Biscayne Boulevard Historic District,0.0,0
3,"Biscayne Island, Miami",0.0,0
4,Brickell,0.09434,2


In [319]:
# merge miami_grouped with miamidf to add latitude/longitude for each neighborhood
miami_merged = miami_merged.join(miamidf.set_index("Neighborhood"), on="Neighborhood")

print(miami_merged.shape)
miami_merged.head() # check the last columns!

(34, 6)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,index,Latitude,Longitude
0,Allapattah,0.0,0,3,25.80215,-80.22363
1,Arts & Entertainment District,0.066667,1,4,25.77481,-80.19773
2,Biscayne Boulevard Historic District,0.0,0,5,25.878823,-80.167661
3,"Biscayne Island, Miami",0.0,0,6,25.815824,-80.188542
4,Brickell,0.09434,2,7,25.764287,-80.192557


In [320]:
## create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(miami_merged['Latitude'], miami_merged['Longitude'], miami_merged['Neighborhood'], miami_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [321]:
#Examine clusters
miami_merged.loc[miami_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,index,Latitude,Longitude
0,Allapattah,0.0,0,3,25.80215,-80.22363
2,Biscayne Boulevard Historic District,0.0,0,5,25.878823,-80.167661
3,"Biscayne Island, Miami",0.0,0,6,25.815824,-80.188542
6,Buena Vista (Miami),0.0,0,9,25.810522,-80.193133
7,Central Business District (Miami),0.0,0,10,25.79681,-80.273596
9,Coral Way,0.0,0,12,25.75533,-80.22218
10,Edgewater (Miami),0.0,0,14,25.707076,-80.259618
11,Flagami,0.0,0,15,25.76271,-80.31587
13,Grapeland Heights,0.0,0,17,25.78819,-80.25611
16,Liberty City (Miami),0.0,0,20,25.91087,-80.20007


In [322]:
miami_merged.loc[miami_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,index,Latitude,Longitude
1,Arts & Entertainment District,0.066667,1,4,25.77481,-80.19773
5,Brickell Key,0.057143,1,8,25.765775,-80.186095
8,Coconut Grove,0.035714,1,11,25.73233,-80.25414
12,Government Center (Miami),0.047619,1,16,25.77505,-80.19566
14,Greater Downtown Miami,0.070707,1,18,25.76913,-80.18998
15,Health District (Miami),0.066667,1,19,25.77481,-80.19773
19,Little River (Miami),0.0625,1,23,25.846002,-80.175063
22,Miami Ironside,0.066667,1,25,25.77481,-80.19773
27,Spring Garden (Miami),0.055556,1,30,25.78558,-80.211706


In [323]:
miami_merged.loc[miami_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,index,Latitude,Longitude
4,Brickell,0.09434,2,7,25.764287,-80.192557
26,Riverside (Miami),0.105263,2,27,25.76986,-80.20749
