<b> 1. Import Libraries </b>

In [11]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

<b> 2. Scrape data from Wikipedia page into a DataFrame </b>

In [12]:
# Download the Wikipedia category page that lists the districts.
# A timeout prevents the cell from hanging forever, and raise_for_status() stops
# the notebook early instead of parsing an HTTP error page as if it were data.
response = requests.get(
    "https://en.wikipedia.org/wiki/Category:Districts_of_Ho_Chi_Minh_City",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [13]:
# Parse the downloaded markup into a navigable tree using Python's built-in HTML parser.
soup = BeautifulSoup(markup=data, features='html.parser')

In [14]:
# Start with an empty list that will hold the scraped district names.
DistrictList = list()

In [15]:
# Locate the category listing; every <li> inside it is one district entry.
category_listing = soup.find("div", class_="mw-category")
if category_listing is None:
    raise ValueError("No 'mw-category' listing found; the Wikipedia page layout may have changed.")

# Rebuild the list from scratch so that re-running this cell does not append
# duplicate districts. Only the text before the first comma is kept.
DistrictList = [row.text.split(",")[0] for row in category_listing.find_all("li")]

In [16]:
# One row per scraped district name, in a single "District" column.
hcmc_df = pd.DataFrame(DistrictList, columns=["District"])

In [17]:
# Render the scraped district table as the cell output for a visual sanity check.
hcmc_df

Unnamed: 0,District
0,Bình Chánh District
1,Bình Tân District
2,Bình Thạnh District
3,Cần Giờ District
4,Củ Chi District
5,District 1
6,District 2
7,District 3
8,District 4
9,District 5


In [18]:
# (rows, columns) of the district table: one row per district, one column so far.
hcmc_df.shape

(24, 1)

<b> 3. Get the geographical coordinates </b>

In [19]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5):
    """Return the ``latlng`` value the ArcGIS geocoder reports for a district.

    The query sent to the geocoder is ``'<neighborhood>, Ho Chi Minh, VietNam'``.
    The lookup is retried while the geocoder reports no coordinates, but only
    up to ``max_attempts`` times, so a name that cannot be resolved (or a
    service outage) can no longer hang the notebook in an endless loop.

    Parameters
    ----------
    neighborhood : str
        District name to look up.
    max_attempts : int, optional
        Maximum number of geocoding requests to make (default 5).

    Returns
    -------
    The first non-None ``latlng`` value returned by ``geocoder.arcgis``.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` requests.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    query = '{}, Ho Chi Minh, VietNam'.format(neighborhood)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts))

In [20]:
# Geocode every district in table order; the result is one coordinate pair per row.
coords = list(map(get_latlng, hcmc_df["District"].tolist()))

In [21]:
# Show the raw geocoding results: one coordinate pair per district, in table order.
coords

[[10.679220000000043, 106.57654000000008],
 [10.759090000000072, 106.59150000000005],
 [10.805180000000064, 106.69280000000003],
 [10.41566000000006, 106.96130000000005],
 [10.977340000000027, 106.50223000000005],
 [10.780950000000075, 106.69911000000008],
 [10.791990000000055, 106.74985000000004],
 [10.775670000000048, 106.68670000000009],
 [10.766700000000071, 106.70650000000006],
 [10.755690000000072, 106.66637000000009],
 [10.745970000000057, 106.64769000000007],
 [10.70515000000006, 106.73748000000006],
 [10.74771000000004, 106.66334000000006],
 [10.820040000000063, 106.83185000000009],
 [10.768670000000043, 106.66564000000005],
 [10.763160000000028, 106.64314000000007],
 [10.850430000000074, 106.62732000000005],
 [10.833790000000022, 106.66557000000006],
 [10.888390000000072, 106.59642000000008],
 [10.701530000000048, 106.73818000000006],
 [10.795650000000023, 106.67464000000007],
 [10.759090000000072, 106.59150000000005],
 [10.782320000000027, 106.63667000000004],
 [10.846260000

In [22]:
# Temporary two-column frame that gives the coordinate pairs named columns.
df_coords = pd.DataFrame(data=coords, columns=['Latitude', 'Longitude'])

In [23]:
# Copy both coordinate columns onto the district table (rows line up by index).
for coordinate_column in ('Latitude', 'Longitude'):
    hcmc_df[coordinate_column] = df_coords[coordinate_column]

In [24]:
# check the districts and the coordinates
# (the shape is printed first; the frame itself is rendered as the cell output)
print(hcmc_df.shape)
hcmc_df

(24, 3)


Unnamed: 0,District,Latitude,Longitude
0,Bình Chánh District,10.67922,106.57654
1,Bình Tân District,10.75909,106.5915
2,Bình Thạnh District,10.80518,106.6928
3,Cần Giờ District,10.41566,106.9613
4,Củ Chi District,10.97734,106.50223
5,District 1,10.78095,106.69911
6,District 2,10.79199,106.74985
7,District 3,10.77567,106.6867
8,District 4,10.7667,106.7065
9,District 5,10.75569,106.66637


In [25]:
# Persist the district table, without the row index, next to the notebook.
hcmc_df.to_csv(path_or_buf="hcmc_df.csv", index=False)

<b> 4. Create a map of Ho Chi Minh City with neighborhoods superimposed on top </b>

In [26]:
# get the coordinates of Ho Chi Minh City
address = 'Ho Chi Minh, VietNam'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Ho Chi Minh, VietNam {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Ho Chi Minh, VietNam 10.7758439, 106.7017555.


In [27]:
# Base map centred on the city coordinates looked up earlier.
map_hcmc = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one blue circle per district; clicking it pops up the district name.
district_points = zip(hcmc_df['Latitude'], hcmc_df['Longitude'], hcmc_df['District'])
for point_lat, point_lng, district_name in district_points:
    popup_text = '{}'.format(district_name)
    district_marker = folium.CircleMarker(
        location=[point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    district_marker.add_to(map_hcmc)

map_hcmc

In [29]:
# Write the district map to a standalone HTML file next to the notebook.
map_hcmc_path = 'map_hcmc.html'
map_hcmc.save(map_hcmc_path)

<b> 5. Use the Foursquare API to explore the neighborhoods </b>

In [33]:
import os  # standard library; imported here so this cell works on its own

# Read the Foursquare credentials from environment variables so real keys never
# have to be saved inside the notebook. The original placeholders remain the
# fallback values, so behaviour is unchanged when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'YourClientIDHere')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'YourClientSecretHere')
VERSION = os.environ.get('FOURSQUARE_VERSION', 'YourVersionHere')

In [36]:
radius = 2000
LIMIT = 100

# The endpoint is constant; the query string is passed via `params` so that
# requests takes care of URL encoding.
explore_url = "https://api.foursquare.com/v2/venues/explore"

venues = []

for lat, long, neighborhood in zip(hcmc_df['Latitude'], hcmc_df['Longitude'], hcmc_df['District']):

    # make the GET request; the timeout keeps a stalled connection from hanging
    # the notebook, and raise_for_status() reports HTTP errors (for example,
    # rejected credentials) directly instead of as a KeyError further down
    response = requests.get(
        explore_url,
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(lat, long),
            "radius": radius,
            "limit": LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        # A venue may come back without any category; record None for it
        # rather than aborting the whole download with an IndexError.
        categories = details.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        venues.append((
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            category_name))

In [38]:
# Name the columns at construction time: assigning `.columns` afterwards raises a
# length-mismatch ValueError when `venues` is empty (the frame then has 0 columns).
venue_columns = ['District', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)

# one row per venue returned by the API; show the first ten
print(venues_df.shape)
venues_df.head(10)

(1051, 7)


Unnamed: 0,District,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Bình Chánh District,10.67922,106.57654,Kedai Sarah,10.688974,106.574965,Women's Store
1,Bình Chánh District,10.67922,106.57654,Lò Bánh Mì Vạn Hoà,10.665982,106.570857,Bakery
2,Bình Chánh District,10.67922,106.57654,Xí Nghiep Sx Hang Thu Cong My Nghe 27-7,10.683414,106.562306,Arts & Crafts Store
3,Bình Chánh District,10.67922,106.57654,National Road 1A,10.683168,106.561552,Bus Station
4,Bình Chánh District,10.67922,106.57654,Ốc chị Lượm,10.66373,106.570333,Seafood Restaurant
5,Bình Chánh District,10.67922,106.57654,Quán Sáu Thoảng,10.662458,106.572261,Diner
6,Bình Tân District,10.75909,106.5915,老郷水餃,10.752553,106.597498,Chinese Restaurant
7,Bình Tân District,10.75909,106.5915,Cafe Mộc Lan,10.749942,106.595815,Café
8,Bình Tân District,10.75909,106.5915,Cháo lòng gỏi lòng Ba Cầm,10.758473,106.580485,Soup Place
9,Bình Tân District,10.75909,106.5915,Pho Bac Hai,10.752251,106.601315,Asian Restaurant


In [39]:
# Number of non-null entries per column for each district, i.e. venues found per district.
venues_df.groupby(by="District").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bình Chánh District,6,6,6,6,6,6
Bình Thạnh District,86,86,86,86,86,86
Bình Tân District,10,10,10,10,10,10
Cần Giờ District,4,4,4,4,4,4
Củ Chi District,6,6,6,6,6,6
District 1,100,100,100,100,100,100
District 10,100,100,100,100,100,100
District 11,55,55,55,55,55,55
District 12,9,9,9,9,9,9
District 2,38,38,38,38,38,38


<b> Check unique categories </b>

In [56]:
# Count the distinct venue categories that occur in the data.
unique_category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(unique_category_count))

There are 124 uniques categories.


In [57]:
# show the first 50 distinct categories, in order of first appearance in the data
venues_df['VenueCategory'].unique()[:50]

array(["Women's Store", 'Bakery', 'Arts & Crafts Store', 'Bus Station',
       'Seafood Restaurant', 'Diner', 'Chinese Restaurant', 'Café',
       'Soup Place', 'Asian Restaurant', 'Vietnamese Restaurant',
       'Karaoke Bar', 'Coffee Shop', 'Snack Place', 'French Restaurant',
       'Spa', 'Supermarket', 'BBQ Joint', 'Japanese Restaurant', 'Park',
       'Convention Center', 'Travel Agency',
       'Vegetarian / Vegan Restaurant', 'Yoga Studio',
       'Gym / Fitness Center', 'Design Studio', 'Breakfast Spot', 'Bar',
       'Museum', 'Dessert Shop', 'Steakhouse', 'Noodle House',
       'Food Truck', 'Bookstore', 'Sushi Restaurant', 'Beer Garden',
       'Ice Cream Shop', 'Dim Sum Restaurant', 'Flea Market', 'Beach',
       'Health & Beauty Service', 'Restaurant', 'Pizza Place',
       'Cupcake Shop', 'Hotel', 'Hotel Bar', 'Hotpot Restaurant',
       'Brewery', 'Massage Studio', 'North Indian Restaurant'],
      dtype=object)

In [58]:
# check whether any venue carries the exact category name "Restaurant"
bool((venues_df['VenueCategory'].unique() == "Restaurant").any())

True

<b> 6. Analyze Each Neighborhood </b>

In [59]:
# one hot encoding: one indicator column per venue category, named after the category
hcmc_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# attach the district of each venue (appended as the last column)
hcmc_onehot['District'] = venues_df['District']

# rotate the column order by one so the last column becomes the first
column_order = list(hcmc_onehot.columns)
column_order.insert(0, column_order.pop())
hcmc_onehot = hcmc_onehot[column_order]

print(hcmc_onehot.shape)
hcmc_onehot.head()

(1051, 125)


Unnamed: 0,District,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Buffet,Burger Joint,Bus Station,Café,Cantonese Restaurant,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,Convenience Store,Convention Center,Cupcake Shop,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Dumpling Restaurant,Electronics Store,Fast Food Restaurant,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Gastropub,German Restaurant,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym,Gym / Fitness Center,Health & Beauty Service,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Korean Restaurant,Market,Massage Studio,Mattress Store,Mexican Restaurant,Middle Eastern Restaurant,Motel,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,North Indian Restaurant,Outdoors & Recreation,Paintball Field,Park,Pizza Place,Plaza,Pool,Ramen Restaurant,Residential Building (Apartment / Condo),Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Snack Place,Soup Place,Spa,Spanish Restaurant,Speakeasy,Sports Club,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Temple,Thai Restaurant,Theme Park,Travel Agency,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Women's Store,Yoga Studio
0,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Bình Chánh District,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bình Chánh District,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [60]:
# Average the indicator columns per district: each value becomes the share of
# that district's venues that fall in the category.
venues_by_district = hcmc_onehot.groupby(by="District")
hcmc_grouped = venues_by_district.mean().reset_index()
print(hcmc_grouped.shape)
hcmc_grouped

(24, 125)


Unnamed: 0,District,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Buffet,Burger Joint,Bus Station,Café,Cantonese Restaurant,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,Convenience Store,Convention Center,Cupcake Shop,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Dumpling Restaurant,Electronics Store,Fast Food Restaurant,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Gastropub,German Restaurant,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym,Gym / Fitness Center,Health & Beauty Service,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Korean Restaurant,Market,Massage Studio,Mattress Store,Mexican Restaurant,Middle Eastern Restaurant,Motel,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,North Indian Restaurant,Outdoors & Recreation,Paintball Field,Park,Pizza Place,Plaza,Pool,Ramen Restaurant,Residential Building (Apartment / Condo),Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Snack Place,Soup Place,Spa,Spanish Restaurant,Speakeasy,Sports Club,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Temple,Thai Restaurant,Theme Park,Travel Agency,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Women's Store,Yoga Studio
0,Bình Chánh District,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
1,Bình Thạnh District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046512,0.011628,0.0,0.011628,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.174419,0.0,0.0,0.0,0.0,0.104651,0.0,0.011628,0.0,0.0,0.011628,0.011628,0.011628,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0,0.023256,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.023256,0.011628,0.023256,0.0,0.0,0.0,0.011628,0.011628,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.05814,0.22093,0.0,0.0,0.0,0.0,0.011628
2,Bình Tân District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.0
3,Cần Giờ District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
4,Củ Chi District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
5,District 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.05,0.0,0.0,0.02,0.01,0.06,0.01,0.01,0.01,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.03,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.11,0.03,0.02,0.0,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.02,0.03,0.0,0.0,0.01,0.0,0.01,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.05,0.09,0.0,0.0,0.01,0.0,0.0
6,District 10,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.08,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.04,0.03,0.0,0.01,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01,0.01,0.02,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.2,0.0,0.0,0.0,0.0,0.0
7,District 11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.018182,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218182,0.036364,0.145455,0.0,0.0,0.018182,0.0,0.0,0.0,0.018182,0.0,0.036364,0.018182,0.0,0.018182,0.0,0.018182,0.0,0.0,0.018182,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036364,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.018182,0.0,0.0,0.072727,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.018182,0.0,0.0,0.018182,0.072727,0.0,0.018182,0.0,0.0,0.0
8,District 12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,District 2,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.052632,0.052632,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.026316,0.0,0.0,0.0,0.026316,0.026316,0.026316,0.0,0.131579,0.0,0.0,0.0,0.0,0.078947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.026316,0.026316,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.026316,0.0,0.0,0.0,0.052632,0.026316,0.0,0.0,0.0,0.0,0.0


In [61]:
# Number of districts in which at least one venue is categorised as "Restaurant".
int((hcmc_grouped["Restaurant"] > 0).sum())

7

<b> Create a new DataFrame for Restaurant data only </b>

In [79]:
# Keep only the district name and its "Restaurant" frequency.
hcmc_restaurant = hcmc_grouped.loc[:, ["District", "Restaurant"]]

In [80]:
# Preview the first rows of the restaurant-only table.
hcmc_restaurant.head()

Unnamed: 0,District,Restaurant
0,Bình Chánh District,0.0
1,Bình Thạnh District,0.0
2,Bình Tân District,0.0
3,Cần Giờ District,0.0
4,Củ Chi District,0.333333


<b> 7. Cluster Neighborhoods </b>

Run k-means to cluster the neighborhoods in Ho Chi Minh City into 3 clusters.

In [81]:
# set number of clusters
kclusters = 3

# Features for clustering: everything except the district name. The keyword form
# is used because pandas 2.0 no longer accepts `axis` as a positional argument
# of drop(), so the old `drop(["District"], 1)` call fails there.
hcmc_clustering = hcmc_restaurant.drop(columns=["District"])

# run k-means clustering; n_init is pinned to the long-standing default of 10 so
# the result does not depend on scikit-learn's newer 'auto' default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(hcmc_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 2, 1, 1, 1, 2, 0])

In [82]:
# create a new dataframe that holds each district's Restaurant frequency together
# with its cluster label; working on a copy leaves hcmc_restaurant untouched
hcmc_merged = hcmc_restaurant.copy()

# add clustering labels (kmeans.labels_ follows the row order of the frame it was fitted on)
hcmc_merged["Cluster Labels"] = kmeans.labels_

In [83]:
# Preview the labelled table. The earlier rename of "District" to "District" was a
# no-op that mutated the frame in place for no effect, so it has been removed.
hcmc_merged.head()

Unnamed: 0,District,Restaurant,Cluster Labels
0,Bình Chánh District,0.0,1
1,Bình Thạnh District,0.0,1
2,Bình Tân District,0.0,1
3,Cần Giờ District,0.0,1
4,Củ Chi District,0.333333,2


In [84]:
# Look up each district's latitude/longitude by name and append them as new columns.
district_coordinates = hcmc_df.set_index("District")
hcmc_merged_final = hcmc_merged.join(district_coordinates, on="District")
print(hcmc_merged_final.shape)
hcmc_merged_final.head() # check the last columns!

(24, 5)


Unnamed: 0,District,Restaurant,Cluster Labels,Latitude,Longitude
0,Bình Chánh District,0.0,1,10.67922,106.57654
1,Bình Thạnh District,0.0,1,10.80518,106.6928
2,Bình Tân District,0.0,1,10.75909,106.5915
3,Cần Giờ District,0.0,1,10.41566,106.9613
4,Củ Chi District,0.333333,2,10.97734,106.50223


In [85]:
# Order the rows by cluster label so that members of a cluster sit together.
print(hcmc_merged_final.shape)
hcmc_merged_final = hcmc_merged_final.sort_values(by=["Cluster Labels"])
hcmc_merged_final

(24, 5)


Unnamed: 0,District,Restaurant,Cluster Labels,Latitude,Longitude
21,Thủ Đức District,0.052632,0,10.84626,106.76992
9,District 2,0.105263,0,10.79199,106.74985
0,Bình Chánh District,0.0,1,10.67922,106.57654
20,Phú Nhuận District,0.0,1,10.79565,106.67464
19,Nhà Bè District,0.0,1,10.70153,106.73818
18,Hóc Môn District,0.0,1,10.88839,106.59642
17,Gò Vấp District,0.0,1,10.83379,106.66557
16,District 9,0.0,1,10.82004,106.83185
15,District 8,0.0,1,10.74771,106.66334
14,District 7,0.0,1,10.70515,106.73748


<b> Visualize the clusters </b>

In [86]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour from the rainbow
# colormap per cluster label
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(hcmc_merged_final['Latitude'], hcmc_merged_final['Longitude'], hcmc_merged_final['District'], hcmc_merged_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so index the palette directly instead of using
    # `cluster - 1`, which only worked because -1 wraps around to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [87]:
# Write the cluster map to a standalone HTML file next to the notebook.
map_clusters_path = 'map_clusters.html'
map_clusters.save(map_clusters_path)

<b> 8. Examine Clusters </b>

Cluster 0

In [88]:
# Districts assigned to cluster 0. The mask is built from the same frame that is
# being filtered instead of from hcmc_merged, so it no longer depends on the two
# frames happening to share index labels.
hcmc_merged_final.loc[hcmc_merged_final['Cluster Labels'] == 0]

Unnamed: 0,District,Restaurant,Cluster Labels,Latitude,Longitude
21,Thủ Đức District,0.052632,0,10.84626,106.76992
9,District 2,0.105263,0,10.79199,106.74985


Cluster 1

In [89]:
# Districts assigned to cluster 1. The mask is built from the same frame that is
# being filtered instead of from hcmc_merged, so it no longer depends on the two
# frames happening to share index labels.
hcmc_merged_final.loc[hcmc_merged_final['Cluster Labels'] == 1]

Unnamed: 0,District,Restaurant,Cluster Labels,Latitude,Longitude
0,Bình Chánh District,0.0,1,10.67922,106.57654
20,Phú Nhuận District,0.0,1,10.79565,106.67464
19,Nhà Bè District,0.0,1,10.70153,106.73818
18,Hóc Môn District,0.0,1,10.88839,106.59642
17,Gò Vấp District,0.0,1,10.83379,106.66557
16,District 9,0.0,1,10.82004,106.83185
15,District 8,0.0,1,10.74771,106.66334
14,District 7,0.0,1,10.70515,106.73748
13,District 6,0.0,1,10.74597,106.64769
12,District 5,0.0,1,10.75569,106.66637


Cluster 2

In [90]:
# Districts assigned to cluster 2. The mask is built from the same frame that is
# being filtered instead of from hcmc_merged, so it no longer depends on the two
# frames happening to share index labels.
hcmc_merged_final.loc[hcmc_merged_final['Cluster Labels'] == 2]

Unnamed: 0,District,Restaurant,Cluster Labels,Latitude,Longitude
8,District 12,0.222222,2,10.85043,106.62732
4,Củ Chi District,0.333333,2,10.97734,106.50223


<b> Observations </b>

A glance at the generated map reveals that the majority of the restaurants in HCMC are situated in the north and east of the city. Cluster 1 has almost no recognized restaurants, while cluster 2 has the highest number of recognized restaurants. Last but not least, cluster 0 has a moderate number of recognized restaurants. As a result, opening a restaurant in the districts of clusters 2 and 0 is more challenging due to the higher competition. More importantly, although most of the districts in cluster 1 are closer to the city center, they have very few recognized restaurants. Therefore, it is better to open a restaurant in cluster 1.