### Install Packages

In [8]:
import sys

# !{sys.executable} -m pip install tabulate
# !{sys.executable} -m pip install requests
# !{sys.executable} -m pip install beautifulsoup4
# !{sys.executable} -m pip install geocoder
# !{sys.executable} -m pip install folium
# !{sys.executable} -m pip install geopy

### Import Library

In [9]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
import matplotlib.pyplot as plt
import warnings
import geocoder
import folium
from geopy.geocoders import Nominatim 
import os
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

# Default size for every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = [15,8]
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the cell output.
%matplotlib inline

### Parse and Clean Data

In [10]:
# Download the Wikipedia page that lists Melbourne suburbs with their postcodes.
res = requests.get("https://en.wikipedia.org/wiki/List_of_Melbourne_suburbs", timeout=30)
# Fail fast on an HTTP error instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]  # the first table on the page holds the suburb list

# One row per postcode: when several suburbs share a postcode, the last one listed is kept.
df = (
    pd.read_html(str(table), header = 0)[0][['Suburb','Postcode']]
    .groupby(['Postcode'])
    .last()
    .reset_index()
)

# One (lat, lon) pair per postcode, taken from the local CSV (last entry per postcode).
df_coordinates = (
    pd.read_csv('Australian_Post_Codes_Lat_Lon.csv')[['postcode','lat','lon']]
    .groupby(['postcode'])
    .last()
    .reset_index()
)

# Attach coordinates to the suburbs, drop rows with any missing value after the
# merge, and keep only lat/lon indexed by suburb name.
df = (
    df.merge(df_coordinates, left_on = ['Postcode'], right_on = ['postcode'], how = 'left')
    .dropna()
    .set_index(['Suburb'])[['lat','lon']]
)

In [11]:
# Show the first rows of the suburb table followed by its (rows, columns) shape.
print(df.head(), df.shape, sep="\n")

                                          lat         lon
Suburb                                                   
Melbourne CBD                      -37.814563  144.970267
East Melbourne                     -37.816640  144.987811
West Melbourne                     -37.806255  144.941123
Melbourne CBD (St Kilda Road area) -37.837324  144.976335
South Wharf                        -37.823258  144.965926
(273, 2)


### Retrieve Venue Data

In [12]:
# Foursquare credentials are read from the environment so that secrets are never
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key that was previously committed in this file
# should be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 500 # value sent as the `limit` parameter of each venue request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius = 5000):
    """Fetch venues around each location from the Foursquare `venues/explore` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in parallel.
    radius : int, default 5000
        Value sent as the `radius` parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Suburb', 'lat', 'lon',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when there are no locations
        or every request fails.

    Notes
    -----
    A location whose request fails, or whose response does not have the expected
    structure, is reported with "fail" and skipped as a whole; the remaining
    locations are still processed.
    """
    columns = ['Suburb',
               'lat',
               'lon',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        try:
            # Let requests build and encode the query string.
            response = requests.get(
                'https://api.foursquare.com/v2/venues/explore',
                params={
                    'client_id': CLIENT_ID,
                    'client_secret': CLIENT_SECRET,
                    'v': VERSION,
                    'll': '{},{}'.format(lat, lng),
                    'radius': radius,
                    'limit': LIMIT,
                },
                timeout=30,
            )
            response.raise_for_status()
            results = response.json()["response"]['groups'][0]['items']

            # Keep only the relevant information for each nearby venue. The rows
            # are collected separately so that a malformed item discards this
            # location only, never rows gathered for earlier locations.
            location_rows = [(
                name,
                lat,
                lng,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                v['venue']['categories'][0]['name']) for v in results]
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Network/HTTP errors, invalid JSON, or a payload without the
            # expected keys. Other exceptions (e.g. KeyboardInterrupt) propagate.
            print ("fail")
            continue

        venue_rows.extend(location_rows)

    # Build the frame once, after the loop; this also guarantees a well-formed
    # (possibly empty) result instead of an unbound variable.
    return pd.DataFrame(venue_rows, columns=columns)

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The entry at position 0 is skipped; the remaining entries are ranked
        by value in descending order.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top-ranked entries, highest value first.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]


In [19]:
# Collect the venues around every suburb in df (suburb names are the index,
# coordinates come from the lat/lon columns).
mel_venues = getNearbyVenues(
    names=df.index.tolist(),
    latitudes=df['lat'],
    longitudes=df['lon'],
)

Melbourne CBD
East Melbourne
West Melbourne
Melbourne CBD (St Kilda Road area)
South Wharf
Docklands
Seddon
West Footscray
Yarraville
South Kingsville
Williamstown North
Seaholme
Braybrook
Sunshine West
St Albans
Ardeer
Ravenhall
Mambourin
Altona North
Laverton North
Williams Landing
Seabrook
Tarneit
Cocoroc
Kensington
Travancore
Keilor East
Avondale Heights
Keilor North
Taylors Hill
Taylors Lakes
Moonee Ponds
Essendon West
Strathmore Heights
Niddrie
Gowanbrae
Pascoe Vale South
Melbourne Airport
Oak Park
Jacana
Meadow Heights
Westmeadows
North Melbourne
Parkville
Carlton
Princes Hill
Brunswick West
Brunswick
Brunswick East
Coburg North
Greenvale
Fawkner
Campbellfield
Somerton
Yuroke
Mickleham
Fitzroy
Collingwood
Abbotsford
Fitzroy North
Northcote
Thornbury
Preston
Reservoir
Thomastown
Lalor
Epping
Fairfield
Ivanhoe East
Heidelberg West
Mill Park
Kingsbury
Viewbank
Yallambie
Watsonia North
St Helena
Diamond Creek
Plenty
Yarrambat
Lower Plenty
Montmorency
Research
Wattle Glen
Watsons Cre

In [20]:
# One indicator column per venue category, one row per venue.
mel_onehot = pd.get_dummies(mel_venues[['Venue Category']], prefix="", prefix_sep="")
mel_onehot['Suburb'] = mel_venues['Suburb']

# Rotate the last column (the suburb just added) to the front.
fixed_columns = list(mel_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
mel_onehot = mel_onehot[fixed_columns]

# Averaging the indicators per suburb gives the share of each category among its venues.
mel_grouped = mel_onehot.groupby('Suburb').mean().reset_index()

In [51]:
# Display the per-suburb category frequencies (one row per suburb, one column per venue category).
mel_grouped

Unnamed: 0,Suburb,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,Airport Service,American Restaurant,Antique Shop,...,Vineyard,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Abbotsford,0.0,0.010204,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.010204,0.00,0.0,0.0
1,Altona North,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
2,Ardeer,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
3,Armadale,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
4,Arthurs Seat,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
5,Ashwood,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
6,Avondale Heights,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
7,Balaclava,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0
8,Balwyn North,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.055556,0.000000,0.00,0.0,0.0
9,Bangholme,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0


### Clustering

In [84]:
kclusters = 50  # number of clusters requested from k-means

# Cluster on the category-frequency columns only. `columns=` replaces the
# positional axis argument, which pandas 2.x no longer accepts. 'Cluster Labels'
# is written onto mel_grouped by a later cell, so it is dropped here as well
# (when present) to keep this cell re-runnable without the previous labels
# leaking in as a feature.
mel_grouped_clustering = mel_grouped.drop(columns=['Suburb', 'Cluster Labels'], errors='ignore')
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(mel_grouped_clustering)

In [85]:
num_top_venues = 100
indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return `n` with its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th', 22 -> '22nd'."""
    last_digit = n % 10
    # 11, 12 and 13 take 'th' even though they end in 1, 2 or 3.
    if 1 <= last_digit <= 3 and not 11 <= n % 100 <= 13:
        return '{}{}'.format(n, indicators[last_digit - 1])
    return '{}th'.format(n)


# 'Suburb' followed by one column per rank: '1st Most Common Venue', '2nd ...', ...
columns = ['Suburb'] + ['{} Most Common Venue'.format(_ordinal(rank))
                        for rank in range(1, num_top_venues + 1)]

mel_venues_sorted = pd.DataFrame(columns=columns)
mel_venues_sorted['Suburb'] = mel_grouped['Suburb']

# Rank categories on the frequency columns only: 'Cluster Labels' is added to
# mel_grouped by a later cell and must not be ranked as if it were a category
# when this cell is re-run.
venue_frequencies = mel_grouped.drop(columns=['Cluster Labels'], errors='ignore')

for ind in np.arange(venue_frequencies.shape[0]):
    # Fill every rank column of this suburb with its categories, most frequent first.
    mel_venues_sorted.iloc[ind, 1:] = return_most_common_venues(venue_frequencies.iloc[ind, :], num_top_venues)

# kmeans was fitted on the rows of mel_grouped, so labels_ lines up row-for-row.
mel_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
# Left join onto df: suburbs with no venue data keep NaN labels and NaN venue columns.
df_merged = df.join(mel_venues_sorted.set_index('Suburb'), on='Suburb')

In [86]:
venue = 'Thai Restaurant'
cols = df_merged.columns.tolist()


def _venue_rank(row, venue=venue, missing_rank=100):
    """Return the 1-based rank at which `venue` appears in `row`, or `missing_rank`.

    The columns of df_merged are lat, lon, 'Cluster Labels' and then the ranked
    venue columns, so a column's position minus 2 is its 1-based rank.
    The match is an exact comparison, so the existence check and the lookup
    can never disagree (a substring-only match previously raised IndexError).
    """
    matches = row[row == venue].index
    if len(matches) == 0:
        return missing_rank
    return cols.index(matches[0]) - 2


# Column name is derived from `venue`; for the value above it is 'Thai Restaurant_rank'.
df_merged['{}_rank'.format(venue)] = df_merged.apply(_venue_rank, axis=1)

In [97]:
# Mean Thai-restaurant rank per cluster, lowest (best) means first; top five shown.
(
    df_merged
    .groupby(['Cluster Labels'])['Thai Restaurant_rank']
    .mean()
    .sort_values()
    .head()
)

Cluster Labels
1.0     47.333333
25.0    57.200000
36.0    62.400000
22.0    67.666667
23.0    69.666667
Name: Thai Restaurant_rank, dtype: float64

In [105]:
# Attach each suburb's k-means cluster label to the frequency table.
# NOTE(review): this mutates mel_grouped in place; cells that use mel_grouped as
# clustering input will see this extra column if they are re-run afterwards.
mel_grouped ['Cluster Labels'] = kmeans.labels_

In [112]:
# Median 'Thai Restaurant' frequency per cluster, highest first; top five shown.
(
    mel_grouped
    .groupby(['Cluster Labels'])['Thai Restaurant']
    .median()
    .sort_values(ascending=False)
    .head()
)

Cluster Labels
1     0.046296
25    0.005000
12    0.000000
22    0.000000
21    0.000000
Name: Thai Restaurant, dtype: float64

In [101]:
# Suburbs assigned to cluster 1, ordered by Thai-restaurant rank; reset_index
# turns the suburb index back into a regular 'Suburb' column.
df_merged_cluster = (
    df_merged.loc[df_merged['Cluster Labels'] == 1]
    .sort_values(by=['Thai Restaurant_rank'])
    .reset_index()
)

In [104]:
# Display the cluster-1 suburbs, ordered by Thai-restaurant rank.
df_merged_cluster

Unnamed: 0,Suburb,lat,lon,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,...,92th Most Common Venue,93th Most Common Venue,94th Most Common Venue,95th Most Common Venue,96th Most Common Venue,97th Most Common Venue,98th Most Common Venue,99th Most Common Venue,100th Most Common Venue,Thai Restaurant_rank
0,Deepdene,-37.807312,145.096698,1.0,Thai Restaurant,Wine Shop,Japanese Restaurant,Korean Restaurant,Gym / Fitness Center,Gym,...,Deli / Bodega,Dance Studio,Bookstore,Beer Garden,Beer Store,Big Box Store,Bike Rental / Bike Share,Bistro,Board Shop,1
1,Edithvale,-38.037415,145.107846,1.0,Beach,Paper / Office Supplies Store,Thai Restaurant,Harbor / Marina,Café,Train Station,...,Bakery,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach Bar,Beer Bar,Beer Garden,Beer Store,3
2,Mambourin,-37.891641,144.629467,1.0,Sandwich Place,Playground,Gym,Thai Restaurant,Pizza Place,Asian Restaurant,...,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach,Beach Bar,Beer Bar,Beer Garden,Beer Store,4
3,Jacana,-37.690301,144.915729,1.0,Shopping Mall,Sandwich Place,Grocery Store,Thai Restaurant,Middle Eastern Restaurant,Café,...,Badminton Court,Bagel Shop,Bakery,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach,Beach Bar,4
4,Upwey,-37.903672,145.33131,1.0,Convenience Store,Bakery,Restaurant,Thai Restaurant,Supermarket,Train Station,...,Bagel Shop,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach,Beach Bar,Beer Bar,Beer Garden,4
5,Yallambie,-37.727482,145.102309,1.0,Grocery Store,Pub,Golf Course,Thai Restaurant,Park,Fish Market,...,Baseball Field,Basketball Court,Basketball Stadium,Beach,Beach Bar,Beer Bar,Beer Garden,Beer Store,Big Box Store,4
6,Hallam,-38.004302,145.269261,1.0,Pub,Restaurant,Fried Chicken Joint,Fast Food Restaurant,Thai Restaurant,Shopping Mall,...,Baby Store,Badminton Court,Bagel Shop,Bakery,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach,5
7,Box Hill North,-37.807247,145.11205,1.0,Indian Restaurant,Track Stadium,Fish & Chips Shop,Park,Café,Thai Restaurant,...,Badminton Court,Bagel Shop,Bakery,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach,Beach Bar,6
8,Fingal,-38.35533,144.905978,1.0,Beach,Fast Food Restaurant,Indian Restaurant,Burger Joint,Thrift / Vintage Store,Bakery,...,History Museum,Food,Deli / Bodega,Zoo Exhibit,Cupcake Shop,Beach Bar,Baby Store,Badminton Court,Bagel Shop,7
9,Balwyn North,-37.796947,145.098434,1.0,Asian Restaurant,Shopping Mall,Japanese Restaurant,Grocery Store,Food Court,Fish & Chips Shop,...,Dance Studio,Cupcake Shop,Cultural Center,Bookstore,Beer Store,Big Box Store,Bike Rental / Bike Share,Bistro,Arepa Restaurant,14


In [102]:
# A rank of 100 is the value used when no Thai restaurant was found for a suburb:
# lower ranks are suburbs that already have one, rank 100 marks the candidates.
thai_rank = df_merged_cluster['Thai Restaurant_rank']
df_merged_cluster_exist = df_merged_cluster[thai_rank < 100]
df_merged_cluster_candidate = df_merged_cluster[thai_rank == 100]

In [103]:
def _add_suburb_markers(target_map, suburbs, color):
    """Add one circle marker per row of `suburbs` to `target_map`.

    Parameters
    ----------
    target_map : folium.Map
        Map that receives the markers.
    suburbs : pandas.DataFrame
        Must provide the columns 'lat', 'lon' and 'Suburb'; the suburb name
        is used as the popup text.
    color : str
        Outline colour of the markers (the fill colour is the same for all).
    """
    for lat, lng, suburb in zip(suburbs['lat'], suburbs['lon'], suburbs['Suburb']):
        folium.CircleMarker(
                            [lat, lng],
                            radius=5,
                            popup=folium.Popup(suburb, parse_html=True),
                            color=color,
                            fill=True,
                            fill_color='#3186cc',
                            fill_opacity=0.7,
                            parse_html=False).add_to(target_map)


# NOTE: despite its name, this map is centred on Melbourne CBD; the variable
# name is kept so that any other cell referring to it keeps working.
map_newyork = folium.Map(location=[df.loc['Melbourne CBD']['lat'],df.loc['Melbourne CBD']['lon']], zoom_start=10)

# Blue outline: cluster suburbs where a Thai restaurant was found.
_add_suburb_markers(map_newyork, df_merged_cluster_exist, 'blue')
# Red outline: cluster suburbs without one (the candidates).
_add_suburb_markers(map_newyork, df_merged_cluster_candidate, 'red')

map_newyork