# Capstone Project - The Battle of Neighborhoods (Week 2)

# New Shopping Centre in Bangkok, Thailand

### Importing required libraries

In [33]:
import numpy as np

import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib for visualization
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


### Scraping date from Wikipedia and creating dataframe

In [34]:
data = requests.get("https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Bangkok").text
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.findAll("li"):
    neighborhoodList.append(row.text.strip())
    
# create a new DataFrame from the list
df_all = pd.DataFrame({"Neighborhood": neighborhoodList})

df_all

Unnamed: 0,Neighborhood
0,1 Bang Kapi
1,2 Bang Sue
2,3 Chatuchak
3,4 Don Mueang
4,5 Dusit
5,6 Pathum Wan (well known as Ratchaprasong shop...
6,7 Phetchaburi
7,8 Phra Nakhon (well known as Rattanakosin Island)
8,9 Pom Prap Sattru Phai
9,10 Ratchadaphisek or Ratchada


### Cleaning data by droping unwanted cells

In [35]:
df = df_all.iloc[20:235, ]

# resetting index 
bk_df = df.reset_index(drop=True)
bk_df

Unnamed: 0,Neighborhood
0,Bang Kapi
1,Chok Chai 4
2,Happy Land
3,Hua Mak
4,Khlong Chan
5,Lam Sali
6,Lat Phrao
7,Ramkhamhaeng
8,Bang Pho
9,Bang Son


In [36]:
#printing shape of Bangkok dataframe
bk_df.shape

(215, 1)

### Get Goegraphical coordinates for each neighborhood and add to the dataframe

In [37]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Bangkok, Thailand'.format(neighborhood))
        lat_lng_coords = g.latlng
        return lat_lng_coords

In [39]:
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in bk_df["Neighborhood"].tolist()]
coords

[[13.765598784353244, 100.64767284099298],
 [13.794170000000065, 100.59506000000005],
 [13.770930000000021, 100.64161000000007],
 [13.754508449034972, 100.61393366633621],
 [13.78163659765778, 100.6446876891406],
 [13.835283415177043, 100.84698907061161],
 [13.8035387102816, 100.60751297499417],
 [13.755710000000022, 100.62906000000004],
 [13.809420000000046, 100.52420000000006],
 [13.853469915986285, 100.87419395691877],
 [13.809685152626628, 100.5374660443427],
 [13.807690000000036, 100.53367000000009],
 [13.753360000000043, 100.50483000000008],
 [13.827247166472318, 100.52689644791752],
 [13.698690000000056, 100.47144000000009],
 [13.804800000000057, 100.55314000000004],
 [13.809685152626628, 100.5374660443427],
 [13.797594065148186, 100.54588809453887],
 [13.8035387102816, 100.60751297499417],
 [13.813500000000033, 100.54986000000008],
 [13.813500000000033, 100.54986000000008],
 [13.799910000000068, 100.55158000000006],
 [13.793039078707338, 100.53214499041933],
 [13.82836370753142

In [41]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
bk_df['Latitude'] = df_coords['Latitude']
bk_df['Longitude'] = df_coords['Longitude']

# check merged dataframe and its shape
print(bk_df.shape)
bk_df

(215, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Bang Kapi,13.765599,100.647673
1,Chok Chai 4,13.79417,100.59506
2,Happy Land,13.77093,100.64161
3,Hua Mak,13.754508,100.613934
4,Khlong Chan,13.781637,100.644688
5,Lam Sali,13.835283,100.846989
6,Lat Phrao,13.803539,100.607513
7,Ramkhamhaeng,13.75571,100.62906
8,Bang Pho,13.80942,100.5242
9,Bang Son,13.85347,100.874194


### Imposing Bangkok dataframe on map

In [42]:
# get the coordinates of Bangkok
address = 'Bangkok, Thailand'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bangkok, Thailand {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Bangkok, Thailand 13.7542529, 100.493087.


In [8]:
# create map of Bangkok using latitude and longitude values
bk_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(bk_df['Latitude'], bk_df['Longitude'], bk_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue', fill=True, 
                        fill_color='#3186cc', fill_opacity=0.7).add_to(bk_map)  
bk_map

NameError: name 'folium' is not defined

In [9]:
# save the map as HTML file
bk_map.save('Bangkok_map.html')

NameError: name 'bk_map' is not defined

### Use the Foursquare API to explore the neighborhoods

In [10]:
# define Foursquare Credentials and Version
CLIENT_ID = 'FDVC2Y3PJ3OQZJ21SXKXJBMOR5L0SI4KIYHH5EI3EM3HWQYR' # your Foursquare ID
CLIENT_SECRET = '03BSXWIB25WISNSXS5EEZQVYMY1FTPUC3IYEMBTMSWRBDVTZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FDVC2Y3PJ3OQZJ21SXKXJBMOR5L0SI4KIYHH5EI3EM3HWQYR
CLIENT_SECRET:03BSXWIB25WISNSXS5EEZQVYMY1FTPUC3IYEMBTMSWRBDVTZ


### get the top 100 venues that are within a radius of 2000 meters

In [13]:
radius = 2000
LIMIT = 100

SyntaxError: 'return' outside function (<ipython-input-13-2618fe63e3cb>, line 41)

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                             'Neighborhood Latitude', 
                             'Neighborhood Longitude', 
                             'Venue', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
    
    return(nearby_venues)

NameError: name 'pd' is not defined

In [14]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

NameError: name 'pd' is not defined

### Count of venues returned for each neighorhood

In [15]:
venues_df.groupby(["Neighborhood"]).count()

NameError: name 'venues_df' is not defined

### unique categories can be curated from all the returned venues

In [16]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

NameError: name 'venues_df' is not defined

In [17]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

NameError: name 'venues_df' is not defined

In [18]:
# check if the results contain "Shopping Mall"
"Neighborhood" in venues_df['VenueCategory'].unique()

NameError: name 'venues_df' is not defined

### Analyze Each Neighborhood

In [19]:
# one hot encoding
bk_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
bk_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [bk_onehot.columns[-1]] + list(bk_onehot.columns[:-1])
bk_onehot = bk_onehot[fixed_columns]

print(bk_onehot.shape)
bk_onehot.head()

NameError: name 'pd' is not defined

### group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [20]:
bk_grouped = bk_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(bk_grouped.shape)
bk_grouped

NameError: name 'bk_onehot' is not defined

In [21]:
len(bk_grouped[bk_grouped["Shopping Mall"] > 0])

NameError: name 'bk_grouped' is not defined

### Create a new DataFrame for Shopping Mall data only

In [22]:
bk_mall = bk_grouped[["Neighborhoods","Shopping Mall"]]
bk_mall.head()

NameError: name 'bk_grouped' is not defined

## Cluster Neighborhoods

### Run k-means to cluster the neighborhoods in Bangkok into 3 clusters

In [23]:
# set number of clusters
kclusters = 3

bk_clustering = bk_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(bk_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

NameError: name 'bk_mall' is not defined

In [24]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
bk_merged = bk_mall.copy()

# add clustering labels
bk_merged["Cluster Labels"] = kmeans.labels_

bk_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
bk_merged.head()

NameError: name 'bk_mall' is not defined

In [25]:
# merge bangkok_grouped with bangkok_data to add latitude/longitude for each neighborhood
bk_merged = bk_merged.join(bk_df.set_index("Neighborhood"), on="Neighborhood")

print(bk_merged.shape)
bk_merged.head()  # check the last columns!

NameError: name 'bk_merged' is not defined

In [26]:
# sort the results by Cluster Labels
print(bk_merged.shape)
bk_merged.sort_values(["Cluster Labels"], inplace=True)
bk_merged

NameError: name 'bk_merged' is not defined

## Visualize the data

In [27]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(bk_merged['Latitude'], bk_merged['Longitude'], bk_merged['Neighborhood'], bk_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'folium' is not defined

In [28]:
# save the map as HTML file
map_clusters.save('Bangkok_map_clusters.html')

NameError: name 'map_clusters' is not defined

## Examine Clusters

### Cluster 0

In [29]:
bk_merged.loc[bk_merged['Cluster Labels'] == 0]

NameError: name 'bk_merged' is not defined

### Cluster 1

In [30]:
bk_merged.loc[bk_merged['Cluster Labels'] == 1]

NameError: name 'bk_merged' is not defined

### Cluster 2

In [31]:
bk_merged.loc[bk_merged['Cluster Labels'] == 2]

NameError: name 'bk_merged' is not defined