In [25]:
# Import necessary modules
import pandas as pd 
import numpy as np
from numpy import *
import geocoder
import folium
from folium import plugins
from tqdm import tqdm
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
import requests
import sklearn
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Read the data from the file and store it in a dataframe
cbr_geo = pd.read_csv('indir/cbr-neighbourhood-geo.txt', sep = "\t")

# Inspect the first few rows of the dataframe
print(cbr_geo.head())

# Check the dimensions of the dataframe
print("The dataframe has {} rows and {} columns".format(cbr_geo.shape[0], cbr_geo.shape[1]))

  Neighborhood  Postcode    Country                        Region
0        ACTON      2601  Australia  Australian Capital Territory
1      AINSLIE      2602  Australia  Australian Capital Territory
2       AMAROO      2914  Australia  Australian Capital Territory
3       ARANDA      2614  Australia  Australian Capital Territory
4        BANKS      2906  Australia  Australian Capital Territory
The dataframe has 124 rows and 4 columns


In [29]:
# Import the geocoder module
import geocoder

# Initialize lists to store latitude and longitude
Lat_list = []
Lng_list = []

# Loop through the rows of the dataframe
for i in range(cbr_geo.shape[0]):
    # Geocode the address and store the latitude and longitude in the corresponding lists
    address = '{}, Canberra, Australia'.format(cbr_geo.at[i, 'Neighborhood'])
    g = geocoder.arcgis(address)
    Lat_list.append(g.latlng[0])
    Lng_list.append(g.latlng[1])

# Add the latitude and longitude columns to the dataframe
cbr_geo['Latitude'] = Lat_list
cbr_geo['Longitude'] = Lng_list

# Check the dimensions of the dataframe
print("The dataframe has {} rows and {} columns".format(cbr_geo.shape[0], cbr_geo.shape[1]))

# Inspect the first few rows of the dataframe
cbr_geo.head()

address = 'Canberra, Australian Capital Territory'

geolocator = Nominatim(user_agent="canberra_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_cbr = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="CartoDB dark_matter") # create a base map of Canberra using latitude and longitude values

# add markers to map
for lat, lng, label in zip(cbr_geo['Latitude'], cbr_geo['Longitude'], cbr_geo['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7.5,
        tooltip=label,
        popup=label,
        color='darkgreen',
        fill=True,
        fill_color='green',
        fill_opacity=0.8,
        parse_html=False).add_to(map_cbr)  
    
map_cbr


The dataframe has 124 rows and 6 columns


In [30]:
# initialise Foursquare credentials, version no., and limt
version = '20180604'
limit = 100

# create a function to lookup venues and iterate across in Canberra neighborhoods dataset
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            os.environ['FSQ_CLIENT_ID'], 
            os.environ['FSQ_CLIENT_SECRET'], 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # create the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
# Run the above function on each neighborhood and create a new dataframe called Canberra venues
cbr_venues = getNearbyVenues(names=cbr_geo['Neighborhood'],
                             latitudes=cbr_geo['Latitude'],
                             longitudes=cbr_geo['Longitude']
                        )

ACTON
AINSLIE
AMAROO
ARANDA
BANKS
BARTON
BEARD
BELCONNEN
BLACK MOUNTAIN
BONNER
BONYTHON
BRADDON
BRUCE
CALWELL
CAMPBELL
CAPITAL HILL
CASEY
CHAPMAN
CHARNWOOD
CHIFLEY
CHISHOLM
CITY
CONDER
COOK
COOMBS
CRACE
CURTIN
DEAKIN
DENMAN PROSPECT
DICKSON
DOWNER
DUFFY
DUNLOP
DUNTROON
EVATT
FADDEN
FARRER
FISHER
FLOREY
FLYNN
FORDE
FORREST
FRANKLIN
FRASER
FYSHWICK
GARRAN
GILMORE
GIRALANG
GORDON
GOWRIE
GREENWAY
GRIFFITH
GUNGAHLIN
HACKETT
HALL
HARMAN
HARRISON
HAWKER
HIGGINS
HOLDER
HOLT
HUGHES
HUME
ISAACS
ISABELLA PLAINS
JACKA
KALEEN
KAMBAH
KENNY
KINGSTON
KINLYSIDE
LATHAM
LAWSON
LYNEHAM
LYONS
MACARTHUR
MACGREGOR
MACQUARIE
MAJURA
MAWSON
MCKELLAR
MELBA
MITCHELL
MOLONGLO
MONASH
MONCRIEFF
NARRABUNDAH
NGUNNAWAL
NICHOLLS
OAKS ESTATE
O'CONNOR
O'MALLEY
OXLEY
PAGE
PALMERSTON
PARKES
PEARCE
PHILLIP
PIALLIGO
RED HILL
REID
RICHARDSON
RIVETT
RUSSELL
SCULLIN
SPENCE
STIRLING
STROMLO
SYMONSTON
TAYLOR
THARWA
THEODORE
THROSBY
TORRENS
TURNER
URIARRA
WANNIASSA
WARAMANGA
WATSON
WEETANGERA
WESTON
WILLIAMSDALE
WRIGHT
YARRALUMLA


In [31]:
print(cbr_venues.shape) # check dimensions
cbr_venues.head() # inspect data

(652, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,ACTON,-35.28562,149.11827,Monster Kitchen and Bar,-35.285122,149.122547,Hotel Bar
1,ACTON,-35.28562,149.11827,Palace Electric Cinema,-35.285014,149.123135,Multiplex
2,ACTON,-35.28562,149.11827,National Film & Sound Archive,-35.283131,149.121143,History Museum
3,ACTON,-35.28562,149.11827,Ovolo Nishi,-35.284917,149.122458,Hotel
4,ACTON,-35.28562,149.11827,The Gods Café,-35.282827,149.119645,Coffee Shop


In [12]:
cbr_venues.groupby('Neighborhood').count() # aggregate the number of venues returned for each neighborhood

print('There are {} unique venue categories.'.format(len(cbr_venues['Venue Category'].unique())))
print(cbr_venues.head())

There are 174 unique venue categories.
  Neighborhood  Neighborhood Latitude  Neighborhood Longitude  \
0        ACTON              -35.28562               149.11827   
1        ACTON              -35.28562               149.11827   
2        ACTON              -35.28562               149.11827   
3        ACTON              -35.28562               149.11827   
4        ACTON              -35.28562               149.11827   

                           Venue  Venue Latitude  Venue Longitude  \
0        Monster Kitchen and Bar      -35.285122       149.122547   
1         Palace Electric Cinema      -35.285014       149.123135   
2  National Film & Sound Archive      -35.283131       149.121143   
3                    Ovolo Nishi      -35.284917       149.122458   
4                  The Gods Café      -35.282827       149.119645   

   Venue Category  
0       Hotel Bar  
1       Multiplex  
2  History Museum  
3           Hotel  
4     Coffee Shop  


In [13]:
# analyse each neighbourhood
cbr_onehot = pd.get_dummies(cbr_venues[['Venue Category']], prefix="", prefix_sep="") # one hot encoding

cbr_onehot['Neighborhood'] = cbr_venues['Neighborhood'] # add neighborhood column back to dataframe

fixed_columns = [cbr_onehot.columns[-1]] + list(cbr_onehot.columns[:-1]) # move neighborhood column to the first column 
cbr_onehot = cbr_onehot[fixed_columns]

cbr_onehot.head()

Unnamed: 0,Neighborhood,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,Auto Dealership,Auto Garage,Bakery,Bank,...,Track,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store
0,ACTON,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACTON,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACTON,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACTON,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACTON,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# aggregate
cbr_grouped = cbr_onehot.groupby('Neighborhood').mean().reset_index()
cbr_grouped.head()

Unnamed: 0,Neighborhood,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,Auto Dealership,Auto Garage,Bakery,Bank,...,Track,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store
0,ACTON,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AINSLIE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AMAROO,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ARANDA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BANKS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# top 5 frequencies

num_top_venues = 5

for hood in cbr_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = cbr_grouped[cbr_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----ACTON----
                   venue  freq
0                  Hotel  0.18
1                   Café  0.18
2              Multiplex  0.09
3    Indie Movie Theater  0.09
4  Australian Restaurant  0.09


----AINSLIE----
               venue  freq
0               Café   0.2
1                Pub   0.2
2  Fish & Chips Shop   0.2
3     Shopping Plaza   0.2
4      Grocery Store   0.2


----AMAROO----
                venue  freq
0  Athletics & Sports   1.0
1         Art Gallery   0.0
2     Organic Grocery   0.0
3           Multiplex   0.0
4         Music Store   0.0


----ARANDA----
                venue  freq
0                Café   0.4
1                 Bar   0.2
2        Dance Studio   0.2
3          Playground   0.2
4  Italian Restaurant   0.0


----BANKS----
           venue  freq
0  Grocery Store   1.0
1    Art Gallery   0.0
2          Motel   0.0
3  Movie Theater   0.0
4      Multiplex   0.0


----BARTON----
                  venue  freq
0                  Café  0.25
1                 H

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = cbr_grouped['Neighborhood']

for ind in np.arange(cbr_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cbr_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,ACTON,Hotel,Café,Multiplex,Indie Movie Theater,Australian Restaurant
1,AINSLIE,Café,Pub,Fish & Chips Shop,Shopping Plaza,Grocery Store
2,AMAROO,Athletics & Sports,Art Gallery,Organic Grocery,Multiplex,Music Store
3,ARANDA,Café,Bar,Dance Studio,Playground,Italian Restaurant
4,BANKS,Grocery Store,Art Gallery,Motel,Movie Theater,Multiplex


In [35]:
# set number of clusters
kclusters = 5

cbr_grouped_clustering = cbr_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cbr_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

# add clustering labels
neighborhoods_venues_sorted.insert(1, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.tail()

  cbr_grouped_clustering = cbr_grouped.drop('Neighborhood', 1)


Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
104,WANNIASSA,3,Ice Cream Shop,Theater,Gas Station,Football Stadium,Sandwich Place
105,WARAMANGA,3,Supermarket,Soccer Field,Shopping Plaza,Art Gallery,Noodle House
106,WATSON,3,Café,Supermarket,Filipino Restaurant,Fish & Chips Shop,Shopping Plaza
107,WESTON,2,Café,Athletics & Sports,Fast Food Restaurant,Sandwich Place,Organic Grocery
108,YARRALUMLA,1,Bus Stop,Mountain,Movie Theater,Multiplex,Music Store


In [36]:
# merge dataset and check output
cbr_merged = pd.merge(cbr_geo, neighborhoods_venues_sorted, on='Neighborhood')
cbr_merged.tail()

Unnamed: 0,Neighborhood,Postcode,Country,Region,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
104,WANNIASSA,2903,Australia,Australian Capital Territory,-35.40288,149.09194,3,Ice Cream Shop,Theater,Gas Station,Football Stadium,Sandwich Place
105,WARAMANGA,2611,Australia,Australian Capital Territory,-35.3533,149.06017,3,Supermarket,Soccer Field,Shopping Plaza,Art Gallery,Noodle House
106,WATSON,2602,Australia,Australian Capital Territory,-35.24131,149.15731,3,Café,Supermarket,Filipino Restaurant,Fish & Chips Shop,Shopping Plaza
107,WESTON,2611,Australia,Australian Capital Territory,-35.3365,149.05515,2,Café,Athletics & Sports,Fast Food Restaurant,Sandwich Place,Organic Grocery
108,YARRALUMLA,2600,Australia,Australian Capital Territory,-35.3059,149.10672,1,Bus Stop,Mountain,Movie Theater,Multiplex,Music Store


In [37]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="CartoDB dark_matter")

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cbr_merged['Latitude'], cbr_merged['Longitude'], cbr_merged['Neighborhood'], cbr_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [38]:
# Loop through each cluster
for cluster in range(kclusters):
    # Get the data for the cluster
    cluster_data = cbr_merged.loc[cbr_merged['Cluster Labels'] == cluster, cbr_merged.columns[[1] + list(range(5, cbr_merged.shape[1]))]]
    # Do something with the data for the cluster
    print(cluster_data)

    Postcode  Longitude  Cluster Labels 1st Most Common Venue  \
17      2611  149.04076               0           Supermarket   
54      2611  149.04498               0           Supermarket   

   2nd Most Common Venue 3rd Most Common Venue    4th Most Common Venue  \
17                 Track           Art Gallery  North Indian Restaurant   
54           Art Gallery                 Motel            Movie Theater   

   5th Most Common Venue  
17              Mountain  
54             Multiplex  
     Postcode  Longitude  Cluster Labels 1st Most Common Venue  \
14       2612  149.16004               1              Bus Stop   
29       2615  149.02221               1              Bus Stop   
39       2913  149.14091               1              Bus Stop   
40       2615  149.04607               1              Bus Stop   
44       2906  149.08502               1              Bus Stop   
51       2914  149.16088               1              Bus Stop   
74       2617  149.07570           