In [299]:
import urllib.request
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import folium
import requests
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import geopy.distance

In [175]:
html = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_Singapore_MRT_stations').read()
soup = BeautifulSoup(html,"html.parser")

In [176]:
text = soup.find_all('td')

In [177]:
MRT_list = []
for i in range(3,len(text),1):
    if ('\xa0' in list(text[i].text)and 'Station' not in text[i+1].text.split(' ')):
        MRT_list.append(text[i+1].text)

In [178]:
MRT_list = MRT_list[0:MRT_list.index('Expo')+1]

In [179]:
len(MRT_list)

61

In [180]:
MRT_list_formated = []
for item in MRT_list:
    MRT_list_formated.append(item.replace(" ","_")+"_MRT_station")

In [181]:
# function that extracts the geo information of the MRT station
def get_geo_info(station):
    
    html = urllib.request.urlopen('https://en.wikipedia.org/wiki/'+station).read()
    soup = BeautifulSoup(html,"html.parser")
    a = re.search('Coordinates.+Operated', soup.text)
    longitude  = a[0].split(';')[1].split('\n')[0].split('Operated')[0].split('Owned')[0].split('Coordinates')[0].strip()
    latitude = a[0].split(';')[0].split('/')[-1].strip()

    return [latitude,longitude]

In [182]:
station_geo = []
station_geo.clear()
for station in MRT_list_formated:
    geo = get_geo_info(station)
    station_geo.append([station,float(geo[0]),float(geo[1])])

In [247]:
df = pd.DataFrame(station_geo) 
df.columns = ["Station","latitude","longitude"]

In [248]:
df.head()

Unnamed: 0,Station,latitude,longitude
0,Jurong_East_MRT_station,1.333415,103.742119
1,Bukit_Batok_MRT_station,1.349073,103.749664
2,Bukit_Gombak_MRT_station,1.358702,103.751787
3,Choa_Chu_Kang_MRT_station,1.385092,103.744322
4,Yew_Tee_MRT_station,1.396986,103.747239


In [258]:
df = df.drop_duplicates(subset=['Station'], keep='first')
df = df.reset_index(drop=True)

In [259]:
df.shape

(58, 3)

### Create a map of SG with MRT Station superimposed on top

In [260]:
latitude = 1.283333
longitude = 103.833333

map_sg = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, Station in zip(df['latitude'], df['longitude'], df['Station']):
    label = '{}'.format(Station)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sg)  

    
map_sg

### Define Foursquare Credentials and Version

In [261]:
CLIENT_ID = 'CHRPLPEJVPFL1WP3YOX0ENKPTM5OR3JAHUMJVVKIPMAN1VLK' # your Foursquare ID
CLIENT_SECRET = 'UMAPXBBDAMKD2WBGCGGTFDELZ1MB1SE31CNOS3KZALIXX3NF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CHRPLPEJVPFL1WP3YOX0ENKPTM5OR3JAHUMJVVKIPMAN1VLK
CLIENT_SECRET:UMAPXBBDAMKD2WBGCGGTFDELZ1MB1SE31CNOS3KZALIXX3NF


### Let's explore the first neighborhood in our dataframe

In [262]:
df.loc[0, 'Station']

'Jurong_East_MRT_station'

In [263]:
station_latitude = df.loc[0, 'latitude'] # neighborhood latitude value
station_longitude = df.loc[0, 'longitude'] # neighborhood longitude value

station_name = df.loc[0, 'Station'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(station_name, 
                                                               station_latitude, 
                                                               station_longitude))

Latitude and longitude values of Jurong_East_MRT_station are 1.333415, 103.742119.


### Now, let's get the top 100 venues that are in Jurong_East_MRT_station within a radius of 500 meters

In [264]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

 # create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    station_latitude, 
    station_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=CHRPLPEJVPFL1WP3YOX0ENKPTM5OR3JAHUMJVVKIPMAN1VLK&client_secret=UMAPXBBDAMKD2WBGCGGTFDELZ1MB1SE31CNOS3KZALIXX3NF&v=20180605&ll=1.333415,103.742119&radius=500&limit=100'

In [265]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c4832064c1f671cfa174265'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Jurong East',
  'headerFullLocation': 'Jurong East, Singapore',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 70,
  'suggestedBounds': {'ne': {'lat': 1.3379150045000046,
    'lng': 103.7466118190961},
   'sw': {'lat': 1.3289149954999955, 'lng': 103.7376261809039}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '51c0356c498e19c820f5e48e',
       'name': 'UNIQLO',
       'location': {'address': '#02-37 & #03-30, Jem',
        'crossStreet': '50 Jurong Gateway Rd',
        'lat': 1.333175096970959,
        'lng': 103.74316037528905,
        'labeledLatLn

In [266]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [267]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,UNIQLO,Clothing Store,1.333175,103.74316
1,Tonkatsu by Ma Maison とんかつ マメゾン (Tonkatsu by M...,Japanese Restaurant,1.333668,103.742818
2,The Rink,Skating Rink,1.333424,103.740345
3,Johan Paris,Bakery,1.334083,103.742384
4,MUJI 無印良品,Furniture / Home Store,1.333187,103.743064


In [268]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

70 venues were returned by Foursquare.


### Explore Neighborhoods in SG MRT Stations

In [269]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(df.index[df['Station']==name].tolist())
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [271]:
sg_venues = getNearbyVenues(names=df['Station'],
                                   latitudes=df['latitude'],
                                   longitudes=df['longitude']
                                  )

[0]
Jurong_East_MRT_station
[1]
Bukit_Batok_MRT_station
[2]
Bukit_Gombak_MRT_station
[3]
Choa_Chu_Kang_MRT_station
[4]
Yew_Tee_MRT_station
[5]
Kranji_MRT_station
[6]
Marsiling_MRT_station
[7]
Woodlands_MRT_station
[8]
Admiralty_MRT_station
[9]
Sembawang_MRT_station
[10]
Canberra_MRT_station
[11]
Yishun_MRT_station
[12]
Khatib_MRT_station
[13]
Yio_Chu_Kang_MRT_station
[14]
Ang_Mo_Kio_MRT_station
[15]
Bishan_MRT_station
[16]
Braddell_MRT_station
[17]
Toa_Payoh_MRT_station
[18]
Novena_MRT_station
[19]
Newton_MRT_station
[20]
Orchard_MRT_station
[21]
Somerset_MRT_station
[22]
Dhoby_Ghaut_MRT_station
[23]
City_Hall_MRT_station
[24]
Raffles_Place_MRT_station
[25]
Marina_Bay_MRT_station
[26]
Marina_South_Pier_MRT_station
[27]
Pasir_Ris_MRT_station
[28]
Tampines_MRT_station
[29]
Simei_MRT_station
[30]
Tanah_Merah_MRT_station
[31]
Bedok_MRT_station
[32]
Kembangan_MRT_station
[33]
Eunos_MRT_station
[34]
Paya_Lebar_MRT_station
[35]
Aljunied_MRT_station
[36]
Kallang_MRT_station
[37]
Lavender_MRT_s

### Let's check the size of the resulting dataframe

In [272]:
print(sg_venues.shape)
sg_venues=sg_venues.rename(columns = {'Neighborhood':'Station'})
sg_venues.head()


(2110, 7)


Unnamed: 0,Station,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Jurong_East_MRT_station,1.333415,103.742119,UNIQLO,1.333175,103.74316,Clothing Store
1,Jurong_East_MRT_station,1.333415,103.742119,Tonkatsu by Ma Maison とんかつ マメゾン (Tonkatsu by M...,1.333668,103.742818,Japanese Restaurant
2,Jurong_East_MRT_station,1.333415,103.742119,The Rink,1.333424,103.740345,Skating Rink
3,Jurong_East_MRT_station,1.333415,103.742119,Johan Paris,1.334083,103.742384,Bakery
4,Jurong_East_MRT_station,1.333415,103.742119,MUJI 無印良品,1.333187,103.743064,Furniture / Home Store


In [286]:
sg_venues.to_pickle('sg_venues.pkl')

In [273]:
sg_venues.groupby('Station').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Admiralty_MRT_station,10,10,10,10,10,10
Aljunied_MRT_station,45,45,45,45,45,45
Ang_Mo_Kio_MRT_station,40,40,40,40,40,40
Bedok_MRT_station,60,60,60,60,60,60
Bishan_MRT_station,42,42,42,42,42,42
Boon_Lay_MRT_station,66,66,66,66,66,66
Braddell_MRT_station,39,39,39,39,39,39
Bugis_MRT_station,100,100,100,100,100,100
Bukit_Batok_MRT_station,22,22,22,22,22,22
Bukit_Gombak_MRT_station,22,22,22,22,22,22


### Let's find out how many unique categories can be curated from all the returned venues


In [274]:
print('There are {} uniques categories.'.format(len(sg_venues['Venue Category'].unique())))

There are 238 uniques categories.


### Analyze Each Neighborhood

In [275]:
# one hot encoding
sg_onehot = pd.get_dummies(sg_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sg_onehot['Station'] = sg_venues['Station'] 

# move neighborhood column to the first column
fixed_columns = [sg_onehot.columns[-1]] + list(sg_onehot.columns[:-1])
sg_onehot = sg_onehot[fixed_columns]

sg_onehot.head()

Unnamed: 0,Station,ATM,Accessories Store,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Australian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Jurong_East_MRT_station,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Jurong_East_MRT_station,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jurong_East_MRT_station,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Jurong_East_MRT_station,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Jurong_East_MRT_station,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [276]:
sg_onehot.shape

(2110, 239)

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [277]:
sg_grouped = sg_onehot.groupby('Station').mean().reset_index()
sg_grouped

Unnamed: 0,Station,ATM,Accessories Store,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Australian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Admiralty_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aljunied_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ang_Mo_Kio_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bedok_MRT_station,0.0,0.0,0.016667,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.016667,0.0,0.0
4,Bishan_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Boon_Lay_MRT_station,0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.0,0.0
6,Braddell_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bugis_MRT_station,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.02,0.0,...,0.03,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01
8,Bukit_Batok_MRT_station,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Bukit_Gombak_MRT_station,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's print each neighborhood along with the top 5 most common venues

In [278]:
num_top_venues = 5

for hood in sg_grouped['Station']:
    print("----"+hood+"----")
    temp = sg_grouped[sg_grouped['Station'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Admiralty_MRT_station----
                  venue  freq
0           Supermarket   0.2
1          Dessert Shop   0.1
2  Fast Food Restaurant   0.1
3      Sushi Restaurant   0.1
4                  Café   0.1


----Aljunied_MRT_station----
                           venue  freq
0             Chinese Restaurant  0.13
1                   Noodle House  0.09
2  Vegetarian / Vegan Restaurant  0.07
3               Asian Restaurant  0.07
4                    Coffee Shop  0.04


----Ang_Mo_Kio_MRT_station----
                 venue  freq
0           Food Court  0.08
1         Dessert Shop  0.08
2      Bubble Tea Shop  0.08
3          Coffee Shop  0.08
4  Japanese Restaurant  0.05


----Bedok_MRT_station----
                 venue  freq
0          Coffee Shop  0.08
1   Chinese Restaurant  0.07
2  Japanese Restaurant  0.05
3           Food Court  0.05
4       Sandwich Place  0.03


----Bishan_MRT_station----
                 venue  freq
0          Coffee Shop  0.12
1      Bubble Tea Shop  0.07


                 venue  freq
0   Chinese Restaurant  0.26
1          Coffee Shop  0.11
2  Japanese Restaurant  0.07
3           Food Court  0.07
4                 Café  0.07


----Toa_Payoh_MRT_station----
                venue  freq
0  Chinese Restaurant  0.11
1         Snack Place  0.11
2         Coffee Shop  0.11
3        Dessert Shop  0.07
4      Sandwich Place  0.04


----Tuas_Crescent_MRT_station----
             venue  freq
0       Food Truck  0.25
1         Ski Area  0.25
2    Train Station  0.25
3  Harbor / Marina  0.25
4              ATM  0.00


----Tuas_Link_MRT_station----
                venue  freq
0         Sports Club  0.14
1           Gastropub  0.14
2     Harbor / Marina  0.14
3       Train Station  0.14
4  Chinese Restaurant  0.14


----Tuas_West_Road_MRT_station----
                     venue  freq
0              Coffee Shop   1.0
1                      ATM   0.0
2                Pet Store   0.0
3             Noodle House   0.0
4  North Indian Restaurant   0.0


---

### Let's put that into a pandas dataframe

In [279]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [280]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Station']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
sg_venues_sorted = pd.DataFrame(columns=columns)
sg_venues_sorted['Station'] = sg_grouped['Station']

for ind in np.arange(sg_grouped.shape[0]):
    sg_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sg_grouped.iloc[ind, :], num_top_venues)

sg_venues_sorted.head()

Unnamed: 0,Station,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Admiralty_MRT_station,Supermarket,Noodle House,Food Court,Bakery,Fast Food Restaurant,Café,Sushi Restaurant,Park,Dessert Shop,Flea Market
1,Aljunied_MRT_station,Chinese Restaurant,Noodle House,Vegetarian / Vegan Restaurant,Asian Restaurant,Seafood Restaurant,Coffee Shop,Café,Food Court,Hostel,Dim Sum Restaurant
2,Ang_Mo_Kio_MRT_station,Dessert Shop,Coffee Shop,Food Court,Bubble Tea Shop,Supermarket,Sandwich Place,Japanese Restaurant,Fast Food Restaurant,Malay Restaurant,Miscellaneous Shop
3,Bedok_MRT_station,Coffee Shop,Chinese Restaurant,Japanese Restaurant,Food Court,Sandwich Place,Asian Restaurant,Bakery,Thrift / Vintage Store,Fried Chicken Joint,Café
4,Bishan_MRT_station,Coffee Shop,Bubble Tea Shop,Café,Chinese Restaurant,Supermarket,Japanese Restaurant,Food Court,Pet Store,Ice Cream Shop,Seafood Restaurant


### Cluster Neighborhoods

In [287]:
# set number of clusters
kclusters = 5

sg_grouped_clustering = sg_grouped.drop('Station', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sg_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 0, 4, 4, 4, 4, 0, 4, 4, 4])

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [288]:
sg_merged = df

# add clustering labels
sg_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
sg_merged = sg_merged.join(sg_venues_sorted.set_index('Station'), on='Station')

sg_merged.head() # check the last columns!

Unnamed: 0,Station,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Jurong_East_MRT_station,1.333415,103.742119,4,Japanese Restaurant,Chinese Restaurant,Shopping Mall,Department Store,Food Court,Bakery,Sandwich Place,Sporting Goods Shop,Clothing Store,Coffee Shop
1,Bukit_Batok_MRT_station,1.349073,103.749664,0,Coffee Shop,Chinese Restaurant,Food Court,Spa,Malay Restaurant,Multiplex,Sandwich Place,Fast Food Restaurant,Park,Bowling Alley
2,Bukit_Gombak_MRT_station,1.358702,103.751787,4,Food Court,Vegetarian / Vegan Restaurant,ATM,Stadium,Supermarket,Fast Food Restaurant,Sandwich Place,Malay Restaurant,Steakhouse,Coffee Shop
3,Choa_Chu_Kang_MRT_station,1.385092,103.744322,4,Asian Restaurant,Fast Food Restaurant,Coffee Shop,Food Court,Park,Thai Restaurant,Shopping Mall,Sandwich Place,Supermarket,Bakery
4,Yew_Tee_MRT_station,1.396986,103.747239,4,Fast Food Restaurant,Pool,Café,Mexican Restaurant,Diner,Japanese Restaurant,Sandwich Place,Shopping Mall,Miscellaneous Shop,Train Station


### Finally, let's visualize the resulting clusters

In [289]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sg_merged['latitude'], sg_merged['longitude'], sg_merged['Station'], sg_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

In [291]:
sg_merged.head()

Unnamed: 0,Station,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Jurong_East_MRT_station,1.333415,103.742119,4,Japanese Restaurant,Chinese Restaurant,Shopping Mall,Department Store,Food Court,Bakery,Sandwich Place,Sporting Goods Shop,Clothing Store,Coffee Shop
1,Bukit_Batok_MRT_station,1.349073,103.749664,0,Coffee Shop,Chinese Restaurant,Food Court,Spa,Malay Restaurant,Multiplex,Sandwich Place,Fast Food Restaurant,Park,Bowling Alley
2,Bukit_Gombak_MRT_station,1.358702,103.751787,4,Food Court,Vegetarian / Vegan Restaurant,ATM,Stadium,Supermarket,Fast Food Restaurant,Sandwich Place,Malay Restaurant,Steakhouse,Coffee Shop
3,Choa_Chu_Kang_MRT_station,1.385092,103.744322,4,Asian Restaurant,Fast Food Restaurant,Coffee Shop,Food Court,Park,Thai Restaurant,Shopping Mall,Sandwich Place,Supermarket,Bakery
4,Yew_Tee_MRT_station,1.396986,103.747239,4,Fast Food Restaurant,Pool,Café,Mexican Restaurant,Diner,Japanese Restaurant,Sandwich Place,Shopping Mall,Miscellaneous Shop,Train Station


In [293]:
sg_merged.loc[sg_merged['Cluster Labels'] == 0, sg_merged.columns[[0] + list(range(3, sg_merged.shape[1]))]]

Unnamed: 0,Station,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Bukit_Batok_MRT_station,0,Coffee Shop,Chinese Restaurant,Food Court,Spa,Malay Restaurant,Multiplex,Sandwich Place,Fast Food Restaurant,Park,Bowling Alley
6,Marsiling_MRT_station,0,Pizza Place,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Bus Station,Food Court,Supermarket,Asian Restaurant,Bakery,Train Station
11,Yishun_MRT_station,0,Chinese Restaurant,Food Court,Supermarket,Coffee Shop,Pet Store,Fast Food Restaurant,Bus Station,Snack Place,Furniture / Home Store,Burger Joint
12,Khatib_MRT_station,0,Coffee Shop,Supermarket,Food Court,Bus Line,American Restaurant,Shopping Mall,Bus Stop,Fast Food Restaurant,Bakery,Grocery Store
16,Braddell_MRT_station,0,Bakery,Food Court,Chinese Restaurant,Asian Restaurant,Café,Noodle House,Thai Restaurant,Fast Food Restaurant,Hakka Restaurant,Metro Station
19,Newton_MRT_station,0,Chinese Restaurant,Seafood Restaurant,Hotel Bar,Dance Studio,Bakery,Italian Restaurant,Hotel,Gym / Fitness Center,Movie Theater,Noodle House
24,Raffles_Place_MRT_station,0,Café,Cocktail Bar,Italian Restaurant,Chinese Restaurant,Salad Place,Gym / Fitness Center,Coffee Shop,Bar,Japanese Restaurant,Sandwich Place
25,Marina_Bay_MRT_station,0,Mexican Restaurant,Yoga Studio,Gym,Pub,Restaurant,Sandwich Place,Coffee Shop,Seafood Restaurant,Chinese Restaurant,Museum
33,Eunos_MRT_station,0,Chinese Restaurant,Noodle House,Bubble Tea Shop,Train Station,Neighborhood,Breakfast Spot,Mediterranean Restaurant,Food Court,Seafood Restaurant,Gym
40,Outram_Park_MRT_station,0,Café,Italian Restaurant,Chinese Restaurant,Japanese Restaurant,Coffee Shop,Seafood Restaurant,Tapas Restaurant,French Restaurant,Bakery,Food Court


In [294]:
sg_merged.loc[sg_merged['Cluster Labels'] == 1, sg_merged.columns[[0] + list(range(3, sg_merged.shape[1]))]]

Unnamed: 0,Station,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,Somerset_MRT_station,1,Hotel,Japanese Restaurant,Café,Shopping Mall,Clothing Store,Bakery,Korean Restaurant,Coffee Shop,Spa,Department Store


In [295]:
sg_merged.loc[sg_merged['Cluster Labels'] == 2, sg_merged.columns[[0] + list(range(3, sg_merged.shape[1]))]]

Unnamed: 0,Station,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
53,Gul_Circle_MRT_station,2,Restaurant,Arcade,Yoga Studio,Field,Gaming Cafe,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Truck


In [296]:
sg_merged.loc[sg_merged['Cluster Labels'] == 3, sg_merged.columns[[0] + list(range(3, sg_merged.shape[1]))]]

Unnamed: 0,Station,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,Bedok_MRT_station,3,Coffee Shop,Chinese Restaurant,Japanese Restaurant,Food Court,Sandwich Place,Asian Restaurant,Bakery,Thrift / Vintage Store,Fried Chicken Joint,Café


In [297]:
sg_merged.loc[sg_merged['Cluster Labels'] == 4, sg_merged.columns[[0] + list(range(3, sg_merged.shape[1]))]]

Unnamed: 0,Station,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Jurong_East_MRT_station,4,Japanese Restaurant,Chinese Restaurant,Shopping Mall,Department Store,Food Court,Bakery,Sandwich Place,Sporting Goods Shop,Clothing Store,Coffee Shop
2,Bukit_Gombak_MRT_station,4,Food Court,Vegetarian / Vegan Restaurant,ATM,Stadium,Supermarket,Fast Food Restaurant,Sandwich Place,Malay Restaurant,Steakhouse,Coffee Shop
3,Choa_Chu_Kang_MRT_station,4,Asian Restaurant,Fast Food Restaurant,Coffee Shop,Food Court,Park,Thai Restaurant,Shopping Mall,Sandwich Place,Supermarket,Bakery
4,Yew_Tee_MRT_station,4,Fast Food Restaurant,Pool,Café,Mexican Restaurant,Diner,Japanese Restaurant,Sandwich Place,Shopping Mall,Miscellaneous Shop,Train Station
5,Kranji_MRT_station,4,Train Station,Racetrack,Bus Line,Bakery,Go Kart Track,Food Court,Café,Yoga Studio,Flower Shop,Food Stand
7,Woodlands_MRT_station,4,Japanese Restaurant,Café,Shopping Mall,Asian Restaurant,Supermarket,Coffee Shop,Clothing Store,Mobile Phone Shop,Chinese Restaurant,Indian Restaurant
8,Admiralty_MRT_station,4,Supermarket,Noodle House,Food Court,Bakery,Fast Food Restaurant,Café,Sushi Restaurant,Park,Dessert Shop,Flea Market
9,Sembawang_MRT_station,4,Café,Playground,Fast Food Restaurant,Bus Station,Coffee Shop,Convenience Store,Chinese Restaurant,Sushi Restaurant,Supermarket,Steakhouse
10,Canberra_MRT_station,4,Asian Restaurant,American Restaurant,Coffee Shop,Thai Restaurant,Food Truck,Flea Market,Flower Shop,Food Court,Food Stand,Yoga Studio
13,Yio_Chu_Kang_MRT_station,4,Chinese Restaurant,Coffee Shop,Food Court,Fast Food Restaurant,Vegetarian / Vegan Restaurant,Gym / Fitness Center,Tennis Court,College Auditorium,Stadium,Train Station


### Internaltional business park (1.329583, 103.7475) move to  Changi Business Park (1.334972, 103.965167)

### find the nearest MRT station of the Internaltional business park (1.329583, 103.7475)

In [303]:
df.head()

Unnamed: 0,Station,latitude,longitude,Cluster Labels
0,Jurong_East_MRT_station,1.333415,103.742119,4
1,Bukit_Batok_MRT_station,1.349073,103.749664,0
2,Bukit_Gombak_MRT_station,1.358702,103.751787,4
3,Choa_Chu_Kang_MRT_station,1.385092,103.744322,4
4,Yew_Tee_MRT_station,1.396986,103.747239,4


In [309]:
distance_IBP = []
distance_CBP = []

for i in range(df.shape[0]):
    distance_IBP.append(geopy.distance.vincenty((df['latitude'][i],df['longitude'][i]),(1.329583, 103.7475)))
    distance_CBP.append(geopy.distance.vincenty((df['latitude'][i],df['longitude'][i]),(1.334972, 103.965167)))
df['distance_IBP'] = pd.Series(distance_IBP).values
df['distance_CBP'] = pd.Series(distance_CBP).values

In [310]:
df.head()

Unnamed: 0,Station,latitude,longitude,Cluster Labels,distance_IBP,distance_CBP
0,Jurong_East_MRT_station,1.333415,103.742119,4,0.7335951448354043 km,24.82350034651711 km
1,Bukit_Batok_MRT_station,1.349073,103.749664,0,2.1685189219704255 km,24.033779063733693 km
2,Bukit_Gombak_MRT_station,1.358702,103.751787,4,3.2549849218938185 km,23.891362720492534 km
3,Choa_Chu_Kang_MRT_station,1.385092,103.744322,4,6.1480832428495376 km,25.194564704806506 km
4,Yew_Tee_MRT_station,1.396986,103.747239,4,7.453147984565796 km,25.20355828724639 km


### Sort by the distance_IBP and choose the nearest MRT station to check the cluster it belons to

In [314]:
df.sort_values(by=['distance_IBP'])

Unnamed: 0,Station,latitude,longitude,Cluster Labels,distance_IBP,distance_CBP
0,Jurong_East_MRT_station,1.333415,103.742119,4,0.7335951448354043 km,24.82350034651711 km
1,Bukit_Batok_MRT_station,1.349073,103.749664,0,2.1685189219704255 km,24.033779063733693 km
48,Chinese_Garden_MRT_station,1.342711,103.732467,4,2.2150260057012545 km,25.911189342120448 km
47,Clementi_MRT_station,1.315303,103.765244,0,2.528453228889151 km,22.355418339121304 km
2,Bukit_Gombak_MRT_station,1.358702,103.751787,4,3.2549849218938185 km,23.891362720492534 km
49,Lakeside_MRT_station,1.344589,103.721139,0,3.370438113021149 km,27.178518032370143 km
46,Dover_MRT_station,1.311314,103.778658,4,4.013121533717504 km,20.920754400058126 km
50,Boon_Lay_MRT_station,1.338883,103.706208,4,4.708996799316189 km,28.822600781255616 km
45,Buona_Vista_MRT_station,1.306817,103.790428,4,5.400113625949741 km,19.69434508450969 km
51,Pioneer_MRT_station,1.337578,103.697217,4,5.665401522234432 km,29.82142534757868 km


In [316]:
df.loc[df['Cluster Labels'] == 4].sort_values(by=['distance_CBP'])

Unnamed: 0,Station,latitude,longitude,Cluster Labels,distance_IBP,distance_CBP
57,Expo_MRT_station,1.335469,103.961767,4,23.854534364992535 km,0.3823935718146434 km
29,Simei_MRT_station,1.343444,103.953172,4,22.940397836036276 km,1.630827283896074 km
30,Tanah_Merah_MRT_station,1.327358,103.946344,4,22.130714712778573 km,2.257607204769997 km
28,Tampines_MRT_station,1.352528,103.945322,4,22.16118477784085 km,2.94038955308073 km
27,Pasir_Ris_MRT_station,1.372411,103.949369,4,22.95950441956012 km,4.497670432713844 km
32,Kembangan_MRT_station,1.320983,103.912842,4,18.42541721175361 km,6.025203719505898 km
34,Paya_Lebar_MRT_station,1.317767,103.892381,4,16.17665414351162 km,8.320748349721612 km
35,Aljunied_MRT_station,1.316442,103.882981,4,15.147496582975782 km,9.373211297455079 km
36,Kallang_MRT_station,1.311469,103.8714,4,13.93355398904876 km,10.754062123832822 km
37,Lavender_MRT_station,1.307167,103.863008,4,13.091731605937248 km,11.777642004417013 km
