### Importing required libraries

In [4]:
import numpy as np 
import pandas as pd
# !pip install geocoder
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from pandas.io.json import json_normalize
import requests
print('All libraries imported!')

All libraries imported!


### Use web scrapping to import data from Wikipedia Page

In [24]:
# get wiki page
Wiki = requests.get("https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Denver").text
soup = BeautifulSoup(Wiki, 'html.parser')
# retrieve neighbourhood data
neighbours =[]
for row in soup.find_all("div", class_="mw-parser-output")[0].findAll('li'):
    neighbours.append(row.text)
neighbours
# remove unwanted entries from list
del neighbours[0:13]
neighbours

# add the neighbourhood data into pandas dataframe

df = pd.DataFrame({"Neighborhood": neighbours})
# quick data check
df.shape
#df.head()

(98, 1)

### Getting lat long values of the neighbourhood for further analysis

In [60]:
latlng =[]
for index,row in df.iterrows():
    latlng.append(geocoder.arcgis('{}, Denver CO'.format(row[0])).latlng)
latlng

[[39.71117000000004, -104.99208999999996],
 [39.733700000000056, -104.97956999999997],
 [39.743670000000066, -104.99066999999997],
 [39.717820000000074, -104.94872999999995],
 [39.72911000000005, -104.96816999999999],
 [39.74379000000005, -104.94824999999997],
 [39.73359000000005, -104.95000999999996],
 [39.74545000000006, -104.96719999999999],
 [39.73526000000004, -104.99057999999997],
 [39.721960000000024, -104.96551999999997],
 [39.72896000000003, -105.00221999999997],
 [39.74425000000008, -104.98107999999996],
 [39.71933000000007, -104.97999999999996],
 [39.753490000000056, -104.99887999999999],
 [39.70384000000007, -104.94973999999996],
 [39.691140000000075, -104.95060999999998],
 [39.741310000000055, -104.89399999999995],
 [39.732710000000054, -104.93041999999997],
 [39.71861000000007, -104.92459999999994],
 [36.24127000000004, -76.66049999999996],
 [39.71623840920247, -104.9080830819771],
 [39.73166000000003, -104.91336999999999],
 [39.60465163062182, -105.09523084869765],
 [39.

In [65]:
# add these into dataset and merge them in original dataset
df_latlng = pd.DataFrame(latlng, columns=['Latitude', 'Longitude'])
df_latlng
df['Latitude'] = df_latlng['Latitude']
df['Longitude'] = df_latlng['Longitude']

In [67]:
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Baker,39.71117,-104.99209
1,Capitol Hill,39.73370,-104.97957
2,Central Business District,39.74367,-104.99067
3,Cherry Creek,39.71782,-104.94873
4,Cheesman Park,39.72911,-104.96817
...,...,...,...
93,"^ ""Denver Statistical Neighborhood Boundaries ...",39.74001,-104.99202
94,"^ ""Find A City To Love - Where to Live in Denv...",39.74001,-104.99202
95,City of Denver: Maps - Neighborhoods,40.79911,-111.98411
96,City of Denver: Community Planning and Develop...,39.74001,-104.99202


### Now we have got the required dataset, let us explore the neighbourhoods using Foursqare api

In [70]:
CLIENT_ID = 'masked' # your Foursquare ID
CLIENT_SECRET = 'masked' # your Foursquare Secret
ACCESS_TOKEN = 'masked' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30

### Let us try to pull up top 50 Venues in 5000 radius of the neighbourhoods

In [71]:
radius = 5000
LIMIT = 50

venues = []

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    results = requests.get(url).json()["response"]['groups'][0]['items']
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

### Get the venue list in dataframe

In [74]:
v_df = pd.DataFrame(venues)
v_df
# change the column names for better understanding
v_df.columns = ['Neighborhood', 'Neighbourhood_Lat', 'Neighbourhood_Long', 'Venue_Name', 'Venue_Lat', 'Venue_Long', 'Category']
v_df.head()

Unnamed: 0,Neighborhood,Neighbourhood_Lat,Neighbourhood_Long,Venue_Name,Venue_Lat,Venue_Long,Category
0,Baker,39.71117,-104.99209,Denver Biscuit Company,39.71377,-104.987778,Breakfast Spot
1,Baker,39.71117,-104.99209,Stranahan's Colorado Whiskey,39.712253,-104.998576,Distillery
2,Baker,39.71117,-104.99209,MUTINY Information Cafe,39.716474,-104.987317,Bookstore
3,Baker,39.71117,-104.99209,Sweet Action Ice Cream,39.717544,-104.987352,Ice Cream Shop
4,Baker,39.71117,-104.99209,Mayan Theatre,39.718704,-104.987342,Indie Movie Theater


In [75]:
v_df['Category'].unique()
# unique values shows we have Indian Restaurant as a category

array(['Breakfast Spot', 'Distillery', 'Bookstore', 'Ice Cream Shop',
       'Indie Movie Theater', 'Thrift / Vintage Store',
       'Italian Restaurant', 'Donut Shop', 'Yoga Studio',
       'Sandwich Place', 'Brewery', 'Japanese Restaurant', 'Bar',
       'Mexican Restaurant', 'Park', 'Pizza Place',
       'Vietnamese Restaurant', 'Grocery Store', 'Argentinian Restaurant',
       'Bagel Shop', 'Arts & Entertainment', 'Trail',
       'Furniture / Home Store', 'Sushi Restaurant',
       'Fried Chicken Joint', 'Taco Place', 'Marijuana Dispensary',
       'Ski Shop', 'Art Museum', 'Cuban Restaurant', 'Coffee Shop',
       'Café', 'Cocktail Bar', 'Hotel', 'Vegetarian / Vegan Restaurant',
       'American Restaurant', 'Bakery', 'Liquor Store', 'Juice Bar',
       'History Museum', 'Concert Hall', 'Smoke Shop', 'Butcher',
       'Burger Joint', 'Bike Shop', 'New American Restaurant', 'Theater',
       'Chiropractor', 'Tex-Mex Restaurant', 'Botanical Garden', 'Lounge',
       'Clothing Store'

### Let us try to analyse the neighbourhood in depth using Categories
#### As Categories are nominal categorical variable, it would be best to run onehot encoding

In [85]:
onehot = pd.get_dummies(v_df[['Category']])
onehot['Neighborhood'] = v_df['Neighborhood']
onehot.head()

Unnamed: 0,Category_ATM,Category_Accessories Store,Category_Adult Education Center,Category_African Restaurant,Category_Airport,Category_Airport Gate,Category_Airport Lounge,Category_Airport Service,Category_Alternative Healer,Category_American Restaurant,...,Category_Video Store,Category_Vietnamese Restaurant,Category_Warehouse Store,Category_Wine Bar,Category_Wine Shop,Category_Wings Joint,Category_Yoga Studio,Category_Zoo,Category_Zoo Exhibit,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Baker
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Baker
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Baker
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Baker
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Baker


### To get the unique records for given neighbourhood let us group by

In [86]:
Grouped_data = onehot.groupby(["Neighborhood"]).mean().reset_index()
Grouped_data

Unnamed: 0,Neighborhood,Category_ATM,Category_Accessories Store,Category_Adult Education Center,Category_African Restaurant,Category_Airport,Category_Airport Gate,Category_Airport Lounge,Category_Airport Service,Category_Alternative Healer,...,Category_Vegetarian / Vegan Restaurant,Category_Video Store,Category_Vietnamese Restaurant,Category_Warehouse Store,Category_Wine Bar,Category_Wine Shop,Category_Wings Joint,Category_Yoga Studio,Category_Zoo,Category_Zoo Exhibit
0,"Alamo Placita — A historic district, part of t...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.00,0.00,0.00,0.00,0.0,0.04,0.00,0.00
1,Athmar Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.04,0.00,0.00,0.00,0.0,0.02,0.00,0.00
2,Auraria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.00,0.00,0.02,0.00,0.0,0.00,0.00,0.00
3,Baker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.04,0.00,0.00,0.00,0.0,0.02,0.00,0.00
4,Barnum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.08,0.00,0.00,0.02,0.0,0.04,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Westwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.10,0.00,0.00,0.02,0.0,0.00,0.00,0.00
94,Whittier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.00,0.00,0.00,0.00,0.0,0.02,0.02,0.04
95,Windsor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.00,0.02,0.00,0.00,0.0,0.00,0.00,0.00
96,"^ ""Denver Statistical Neighborhood Boundaries ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.00,0.00,0.02,0.00,0.0,0.02,0.00,0.00


### As we are interested in Indian Restaurants only, let us filter out the data for Indian restaurants

In [87]:
IR_data=Grouped_data[['Neighborhood','Category_Indian Restaurant']]
IR_data

Unnamed: 0,Neighborhood,Category_Indian Restaurant
0,"Alamo Placita — A historic district, part of t...",0.00
1,Athmar Park,0.00
2,Auraria,0.00
3,Baker,0.00
4,Barnum,0.00
...,...,...
93,Westwood,0.00
94,Whittier,0.00
95,Windsor,0.02
96,"^ ""Denver Statistical Neighborhood Boundaries ...",0.00


### Let us run k-means to cluster

In [93]:
num_clusters = 5
ip_cluster = IR_data.drop(["Neighborhood"],1)
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(ip_cluster)

## Add clusters into Indian restaurants dataset

In [94]:
IR_data['Custers'] = kmeans.labels_
IR_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IR_data['Custers'] = kmeans.labels_


Unnamed: 0,Neighborhood,Category_Indian Restaurant,Custers
0,"Alamo Placita — A historic district, part of t...",0.00,0
1,Athmar Park,0.00,0
2,Auraria,0.00,0
3,Baker,0.00,0
4,Barnum,0.00,0
...,...,...,...
93,Westwood,0.00,0
94,Whittier,0.00,0
95,Windsor,0.02,1
96,"^ ""Denver Statistical Neighborhood Boundaries ...",0.00,0


### Let us also add lat long values from our original dataset

In [99]:
IR_data = IR_data.join(df.set_index("Neighborhood"), on="Neighborhood")
IR_data

Unnamed: 0,Neighborhood,Category_Indian Restaurant,Custers,Latitude,Longitude
0,"Alamo Placita — A historic district, part of t...",0.00,0,39.71933,-104.98000
1,Athmar Park,0.00,0,39.70396,-105.01039
2,Auraria,0.00,0,39.74575,-105.00997
3,Baker,0.00,0,39.71117,-104.99209
4,Barnum,0.00,0,39.71816,-105.03262
...,...,...,...,...,...
93,Westwood,0.00,0,39.70625,-105.03984
94,Whittier,0.00,0,39.75634,-104.96595
95,Windsor,0.02,1,39.70581,-104.89235
96,"^ ""Denver Statistical Neighborhood Boundaries ...",0.00,0,39.74001,-104.99202


### its time to visualize maps

In [110]:
map = folium.Map(location=[39.7392, -104.9903], zoom_start=11) # Denver Lat long
x = np.arange(num_clusters)
ys = [i+x+(i*x)**2 for i in range(num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(IR_data['Latitude'], IR_data['Longitude'], IR_data['Neighborhood'], IR_data['Custers']):

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup="Denver Neighborhood",
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map)
       
map

#### showcase different clusters

In [117]:
IR_data.loc[IR_data['Custers'] == 0]

Unnamed: 0,Neighborhood,Category_Indian Restaurant,Custers,Latitude,Longitude
0,"Alamo Placita — A historic district, part of t...",0.0,0,39.71933,-104.98000
1,Athmar Park,0.0,0,39.70396,-105.01039
2,Auraria,0.0,0,39.74575,-105.00997
3,Baker,0.0,0,39.71117,-104.99209
4,Barnum,0.0,0,39.71816,-105.03262
...,...,...,...,...,...
89,Washington Virginia Vale,0.0,0,39.70381,-104.91463
93,Westwood,0.0,0,39.70625,-105.03984
94,Whittier,0.0,0,39.75634,-104.96595
96,"^ ""Denver Statistical Neighborhood Boundaries ...",0.0,0,39.74001,-104.99202


In [120]:
IR_data.loc[IR_data['Custers'] == 1].count()

Neighborhood                  17
Category_Indian Restaurant    17
Custers                       17
Latitude                      17
Longitude                     17
dtype: int64

In [122]:
IR_data.groupby(['Custers']).size()

Custers
0    78
1    17
2     1
3     1
4     1
dtype: int64

We can easily observe from the data set that most of the Indian restauarants are concentrated around cluster 0 and 1 whereas for cluster 2,3 and 4 there are only 1 Indian restaurnt for each one. If we look further into the dataset and try to understand the locations of the clusters from map visualization, these seem to be at outskirt of the city, near to Englewood and Centinneal. Though the population is not dense in these areas but there is still potential of opening up restaurants because of lesser competition across. I would certainly recommend opening up restaurants in cluster 2,3 and 4.