# Segmenting and Clustering Niehgborhoods in the City of Toronto, Canada  
Hien Nguyen

## 1. Load and Clean Postal Codes  
  


### 1.1. Load data and creat a data frame to store the data  

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # url to wikipedia page that provides postal codes of Toronto
response = requests.get(url) # send a GET request to access the page
response

<Response [200]>

In [3]:
soup = BeautifulSoup(response.text, 'html.parser') # parse html and save to BeautifulSoup object
table = soup.find('table', class_='wikitable sortable') # find the table of Toronto post codes (based on class)
headers = table.find_all('th') # heading items in the html table
data = table.find_all('td') # data items in the html table
dic_data = {} # declare a dictionary to store items in the html table, keys=headers, values=data
arr_headers = [] # array to store headers' text
nCols = len(headers) # number of columns in the table
# get headers' text and put into arr_headers
# and declare an array value for each key in dic_data
for header in headers:
    h_text = header.text.strip()
    arr_headers.append(h_text)
    dic_data[h_text] = []
# go through data items to get the text and put into dic_data
for i in range(len(data)):
    iCol = i%nCols # calculate column index for the data item
    dic_data[arr_headers[iCol]].append(data[i].text.strip())

In [4]:
keys = list(dic_data.keys())
for i in range(nCols):
    print('{0}: {1} items'.format(keys[i], str(len(dic_data[keys[i]]))))


Postal Code: 180 items
Borough: 180 items
Neighbourhood: 180 items


In [5]:
# create a data frame to store the data
df_data = pd.DataFrame(dic_data)
df_data.columns = ['Postal Code', 'Borough', 'Neighborhood']
print(df_data.shape)
df_data.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 1.2. Clean data 

In [6]:
# check Borough
print(df_data.iloc[:,1].value_counts())

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 5
Mississauga          1
Name: Borough, dtype: int64


In [7]:
# check Neighborhood
print(df_data[df_data['Neighborhood']=='Not assigned'].iloc[:,2].value_counts())

Not assigned    77
Name: Neighborhood, dtype: int64


In [8]:
# cleaning the data
na_text = 'Not assigned'
df_data.replace(na_text, np.nan, inplace=True) # replace 'Not assigned' with NaN

# drop rows where Borough and Neighborhood are Not assigned (NaN)
df_data.dropna(thresh=2, inplace=True) # drop rows where there are at least 2 NaN

# if Neighborhood is Not assigned (NaN) then fill with Borough
df_data['Neighborhood'].fillna(df_data['Borough'], inplace=True)

# if a postal code has more than one neighborhood then concatenate neighborhoods
df = df_data.groupby(list(df_data.columns[:-1])) # group by Postal Code and Borough
df = df['Neighborhood'].aggregate(lambda column: ', '.join(column)) # concatenate Neighborhood
df.head()

Postal Code  Borough    
M1B          Scarborough                            Malvern, Rouge
M1C          Scarborough    Rouge Hill, Port Union, Highland Creek
M1E          Scarborough         Guildwood, Morningside, West Hill
M1G          Scarborough                                    Woburn
M1H          Scarborough                                 Cedarbrae
Name: Neighborhood, dtype: object

In [9]:
df = pd.DataFrame(df).reset_index()
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## 2. Collect Geographical Coordinates

In [8]:
# The code was removed by Watson Studio for sharing.

In [38]:
# The code was removed by Watson Studio for sharing.

In [10]:
# using csv file to get the coordinates
# read csv file
file_url = 'https://cocl.us/Geospatial_data'
borough_coordinates = pd.read_csv(file_url)
# the first col in csv will have the same name as the first col in df (i.e. postal code data)
key_col = df.columns[0]
borough_coordinates.rename(columns={borough_coordinates.columns[0]: key_col}, inplace=True) 
# join df and coordinates from csv
df = df.join(borough_coordinates.set_index(key_col), on=key_col)
print(df.shape)
df.head()

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 3. Segmentation  

### 3.1. Import libraries

In [12]:
# convert an address into latitude and longitude values
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
#!conda install -c conda-forge folium=0.5.0 --yes
import folium 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

In [42]:
# The code was removed by Watson Studio for sharing.

### 3.2. Create a data frame to store nearby venues for neighborhoods  
Nearby venues are within the radius of 1000 meters. There is one postal code M1X that has no nearby venues.

In [14]:
# function to use Foursquare to get nearby venues
# given parameters: list of postal code or names of neighborhoods, list of neighborhood latitudes, 
#                   list of neighborhood longitudes, radius (in meters), limit (number of results to return from foursquare)
# return a data frame: Postal Code (of neighborhood), Neighborhood Latitude, Neighborhood Longitude, 
#                      Venue, Venue Latitude, Venue Longitude, Venue Category
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
# create a data frame to store nearby venues for each neighborhood postal code
df_venues = getNearbyVenues(names=df['Postal Code'], latitudes=df['Latitude'], longitudes=df['Longitude'])
print(df_venues.shape)
df_venues.head()

(4905, 7)


Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,M1B,43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
2,M1B,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,M1B,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
4,M1B,43.806686,-79.194353,RBC Royal Bank,43.798782,-79.19709,Bank


In [16]:
# check the number of venues returned for each postal code
#df_venues.groupby('Postal Code').count()
df_venues['Postal Code'].value_counts()

M4P    100
M5L    100
M4J    100
M4Y    100
M6J    100
M5W    100
M2N    100
M5J    100
M6G    100
M4K    100
M5R    100
M4S    100
M5G    100
M6P    100
M5T    100
M5E    100
M5X    100
M6K    100
M5B    100
M5C    100
M5K    100
M5A    100
M4M    100
M5S    100
M5H    100
M6R    100
M7A    100
M4L     80
M6S     78
M4E     77
      ... 
M2H     21
M4B     21
M2P     21
M9V     19
M6M     18
M8V     18
M9N     17
M2K     16
M9C     16
M9B     15
M9P     15
M9R     14
M5V     14
M4A     13
M9A     13
M2R     12
M1J     12
M1M     12
M1N     11
M6L     11
M3L     10
M1G      9
M8Y      9
M4N      8
M9M      8
M9L      8
M1C      5
M2L      4
M9W      4
M3M      4
Name: Postal Code, Length: 102, dtype: int64

In [17]:
df[~df['Postal Code'].isin(df_venues['Postal Code'])]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [18]:
# return the number of venue categories
print('There are {} uniques categories.'.format(len(df_venues['Venue Category'].unique())))

There are 321 uniques categories.


### 3.3. Create a data frame for the segmentation  
 
(1) Convert *Venue Category* to dummy codes and store in a data frame *df_onehot* (i.e. venue categories as column names in the data frame).    
(2) Create *df_toronto* to store average values of venue categories for each postal code.   
 

In [19]:
# one hot encoding
df_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")
print(df_onehot.shape) # check that the number of columns = the number of venue categories, the number of rows = the number of rows in df_venues

# add postal code column back to the dataframe
df_onehot['Postal Code'] = df_venues['Postal Code'] 

# move postal code column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
print(df_onehot.columns[-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

(4905, 321)
Postal Code


Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# create a data frame for the segmentation
# each neighborhood is characterized by the frequency of venue categories
df_toronto = df_onehot.groupby('Postal Code').mean().reset_index()
df_toronto

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0.00,0.0,0.047619,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
1,M1C,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
2,M1E,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
3,M1G,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
4,M1H,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.034483,0.000000,0.034483,0.00
5,M1J,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
6,M1K,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
7,M1L,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
8,M1M,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00
9,M1N,0.00,0.0,0.000000,0.000000,0.000000,0.00,0.0,0.00,0.0,...,0.000000,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.00


### 3.4. Cluster neighborhoods in Toronto  
Run *k*-means to cluster the neighborhood into 7 clusters.  

In [21]:
# set number of clusters
kclusters = 7

df_toronto_cluster = df_toronto.drop('Postal Code', 1)
df_toronto_cluster.head()
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_toronto_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 3],
      dtype=int32)

In [22]:
# check the number of neighborhood in each cluster
df_toronto.insert(1, 'Cluster Labels', kmeans.labels_)
df_toronto['Cluster Labels'].value_counts()

0    54
1    42
2     2
6     1
5     1
4     1
3     1
Name: Cluster Labels, dtype: int64

### 3.5. Vizualize the clusters  
(1) Get the most common venue categories for each postal code  
(2) Get the labels of 7 clusters that are 0, 1, ..., 5, 6
(3) Merge those information with the postal code data  
(4) Because the postal code M1X has no nearby venues and is not included in the segmentation, give it the cluster lable 7  
(5) Create map to show the result

In [23]:
def return_most_common_venues(row, num_top_venues=10):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [24]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
df_venues_top = pd.DataFrame(columns=columns)
df_venues_top['Postal Code'] = df_toronto['Postal Code']

for ind in np.arange(df_toronto.shape[0]):
    df_venues_top.iloc[ind, 1:] = return_most_common_venues(df_toronto.iloc[ind, :], num_top_venues)

df_venues_top.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Coffee Shop,Trail,Spa,Paper / Office Supplies Store,Bus Station,Martial Arts Dojo,Supermarket,Caribbean Restaurant,Sandwich Place
1,M1C,Cluster Labels,Italian Restaurant,Playground,Park,Burger Joint,Breakfast Spot,Historic Site,Event Space,Dog Run,Doner Restaurant
2,M1E,Cluster Labels,Pizza Place,Coffee Shop,Bank,Fast Food Restaurant,Greek Restaurant,Beer Store,Discount Store,Fried Chicken Joint,Sandwich Place
3,M1G,Cluster Labels,Park,Coffee Shop,Mobile Phone Shop,Chinese Restaurant,Fast Food Restaurant,Indian Restaurant,Pharmacy,Falafel Restaurant,Dumpling Restaurant
4,M1H,Cluster Labels,Coffee Shop,Bakery,Indian Restaurant,Gas Station,Bank,Wings Joint,Pizza Place,Bus Line,Fast Food Restaurant


In [25]:
# add clustering labels
df_venues_top.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df

# merge df_merged with toronto data i.e. df to add latitude/longitude for each neighborhood
df_merged = df_merged.join(df_venues_top.set_index('Postal Code'), on='Postal Code')

df_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0,Fast Food Restaurant,Coffee Shop,Trail,Spa,Paper / Office Supplies Store,Bus Station,Martial Arts Dojo,Supermarket,Caribbean Restaurant,Sandwich Place
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2.0,Cluster Labels,Italian Restaurant,Playground,Park,Burger Joint,Breakfast Spot,Historic Site,Event Space,Dog Run,Doner Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1.0,Cluster Labels,Pizza Place,Coffee Shop,Bank,Fast Food Restaurant,Greek Restaurant,Beer Store,Discount Store,Fried Chicken Joint,Sandwich Place
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Cluster Labels,Park,Coffee Shop,Mobile Phone Shop,Chinese Restaurant,Fast Food Restaurant,Indian Restaurant,Pharmacy,Falafel Restaurant,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Cluster Labels,Coffee Shop,Bakery,Indian Restaurant,Gas Station,Bank,Wings Joint,Pizza Place,Bus Line,Fast Food Restaurant


In [26]:
df_merged.shape

(103, 16)

In [35]:
# give M1X the lable 7
df_merged.loc[df_merged['Postal Code']=='M1X',['Cluster Labels']] = kclusters
df_merged[df_merged['Postal Code']=='M1X']

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,7.0,,,,,,,,,,


In [36]:
# get the coordinate of Toronto
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [37]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
kclusters = kclusters + 1
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = ['darkblue','purple','blue','yellow','green','gray','orange','red']#[colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 4. Discuss the result  


In [38]:
# print out the result
cluster_count = df_toronto['Cluster Labels'].value_counts()
print('{0}\t{1}\t{2}'.format('Cluster','Count','Color'))
for i in range(kclusters-1):
    print('{0}\t{1}\t{2}'.format(cluster_count.index[i], cluster_count.values[i], rainbow[cluster_count.index[i]]) )
print('{0}\t{1}\t{2} ({3})'.format('7','1',rainbow[kclusters-1], 'Dummy cluster for M1X'))

Cluster	Count	Color
0	54	darkblue
1	42	purple
2	2	blue
6	1	orange
5	1	gray
4	1	green
3	1	yellow
7	1	red (Dummy cluster for M1X)


We will check the characteristic of vanue categories in each cluster.

In [39]:
# calculate the average appearance of venue category for each postal code and transpose the data
df_2 = df_toronto.groupby(['Cluster Labels']).mean().T.add_prefix('Cluster ').reset_index()
df_2.columns.name = None
df_2.rename(columns={'index':'Venue'}, inplace=True)
df_2.head()

Unnamed: 0,Venue,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6
0,Accessories Store,0.000606,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghan Restaurant,0.0,0.000898,0.0,0.0,0.0,0.0,0.0
2,African Restaurant,0.000882,0.000722,0.0,0.0,0.0,0.0,0.0
3,Airport,0.002165,0.0,0.0,0.0,0.0,0.0,0.0
4,American Restaurant,0.008482,0.003767,0.0,0.0,0.0,0.0,0.0


In [41]:
clusters = []
for i in cluster_count.index:
    t = pd.DataFrame(df_2.sort_values(by=['Cluster {0}'.format(i)], ascending=False).iloc[0:10,[0,i+1]]).rename(columns={'Venue': 'Venue_{0}'.format(i)}).reset_index(drop=True)
    t = t.loc[t.iloc[:,1]>0]
    clusters.append(t)
df_clusters = pd.concat(clusters, axis=1).fillna('')
df_clusters     

Unnamed: 0,Venue_0,Cluster 0,Venue_1,Cluster 1,Venue_2,Cluster 2,Venue_6,Cluster 6,Venue_5,Cluster 5,Venue_4,Cluster 4,Venue_3,Cluster 3
0,Coffee Shop,0.085557,Coffee Shop,0.070825,Park,0.211111,Vietnamese Restaurant,0.5,Gym / Fitness Center,0.125,Hotel,0.25,Park,0.75
1,Café,0.05061,Pizza Place,0.070238,Italian Restaurant,0.211111,Food Truck,0.25,Bookstore,0.125,Rental Car Location,0.25,Pool,0.25
2,Restaurant,0.03165,Park,0.061277,Playground,0.1,Baseball Field,0.25,Trail,0.125,Lounge,0.25,,
3,Park,0.028198,Pharmacy,0.047781,Burger Joint,0.1,,,Coffee Shop,0.125,Coffee Shop,0.25,,
4,Italian Restaurant,0.025887,Grocery Store,0.046805,Breakfast Spot,0.1,,,College Gym,0.125,,,,
5,Pizza Place,0.025733,Bank,0.034916,Ice Cream Shop,0.055556,,,College Quad,0.125,,,,
6,Bakery,0.019631,Convenience Store,0.033467,Bus Stop,0.055556,,,Café,0.125,,,,
7,Japanese Restaurant,0.019078,Chinese Restaurant,0.03044,Shopping Mall,0.055556,,,Park,0.125,,,,
8,Gym,0.018389,Bakery,0.027289,Gym / Fitness Center,0.055556,,,,,,,,
9,Sushi Restaurant,0.018246,Gas Station,0.02689,Eastern European Restaurant,0.055556,,,,,,,,


*Note: this result is archived with the data collected on 2020/07/28.*