# Data Preparation and Exploration

In [26]:
# Import necessary modules
import pandas as pd 
import numpy as np
from numpy import *
import geocoder
import folium
from folium import plugins
from tqdm import tqdm
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
import requests
import sklearn
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Read the data from the file and store it in a dataframe
cbr_geo = pd.read_csv('indir/cbr-neighbourhood-geo.txt', sep = "\t")

# Function to inspect the first few rows of the dataframe
def inspect_data(dataframe):
    print(dataframe.head())

# Function to check the dimensions of the dataframe
def check_dimensions(dataframe):
    print("The dataframe has {} rows and {} columns".format(dataframe.shape[0], dataframe.shape[1]))

# Use the functions to inspect and check the dimensions of the dataframe
inspect_data(cbr_geo)
check_dimensions(cbr_geo)

  Neighborhood  Postcode    Country                        Region
0        ACTON      2601  Australia  Australian Capital Territory
1      AINSLIE      2602  Australia  Australian Capital Territory
2       AMAROO      2914  Australia  Australian Capital Territory
3       ARANDA      2614  Australia  Australian Capital Territory
4        BANKS      2906  Australia  Australian Capital Territory
The dataframe has 124 rows and 4 columns


In [27]:
# Initialize lists to store latitude and longitude
latitudes = []
longitudes = []

# Function to geocode an address and append the latitude and longitude to the lists
def geocode_address(address):
    g = geocoder.arcgis(address)
    latitudes.append(g.latlng[0])
    longitudes.append(g.latlng[1])

# Loop through the rows of the dataframe and geocode the addresses
for i in range(cbr_geo.shape[0]):
    address = '{}, Canberra, Australia'.format(cbr_geo.at[i, 'Neighborhood'])
    geocode_address(address)

# Add the latitude and longitude columns to the dataframe
cbr_geo['Latitude'] = latitudes
cbr_geo['Longitude'] = longitudes

# Check the dimensions of the dataframe
print("The dataframe has {} rows and {} columns".format(cbr_geo.shape[0], cbr_geo.shape[1]))

# Inspect the first few rows of the dataframe
cbr_geo.head()

# Geocode the address for Canberra, Australia
address = 'Canberra, Australian Capital Territory'
geolocator = Nominatim(user_agent="canberra_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude


The dataframe has 124 rows and 6 columns


### Map of Canberra Neighborhoods


In [28]:
# create a base map of Canberra using latitude and longitude values
map_cbr = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="CartoDB dark_matter")

# create a feature group for the markers on the map
marker_group = folium.FeatureGroup(name='Markers')

# add markers to the feature group
for lat, lng, label in zip(cbr_geo['Latitude'], cbr_geo['Longitude'], cbr_geo['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7.5,
        tooltip=label,  # add the suburb name as a tooltip on hover
        color='darkgreen',
        fill=True,
        fill_color='green',
        fill_opacity=0.8,
        parse_html=False
    ).add_to(marker_group)

# add the feature group to the map
marker_group.add_to(map_cbr)

map_cbr

In [29]:
# initialise Foursquare credentials, version no., and limt
version = '20180604'
limit = 100

# create a function to lookup venues and iterate across in Canberra neighborhoods dataset
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            os.environ['FSQ_CLIENT_ID'], 
            os.environ['FSQ_CLIENT_SECRET'], 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # create the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [30]:
# Run the above function on each neighborhood and create a new dataframe called Canberra venues
cbr_venues = getNearbyVenues(names=cbr_geo['Neighborhood'],
                             latitudes=cbr_geo['Latitude'],
                             longitudes=cbr_geo['Longitude']
                        )

ACTON


KeyError: 'FSQ_CLIENT_ID'

In [None]:
print(cbr_venues.shape) # check dimensions
cbr_venues.head() # inspect data

In [None]:
cbr_venues.groupby('Neighborhood').count() # aggregate the number of venues returned for each neighborhood

print('There are {} unique venue categories.'.format(len(cbr_venues['Venue Category'].unique())))
print(cbr_venues.head())

In [None]:
# analyse each neighbourhood
cbr_onehot = pd.get_dummies(cbr_venues[['Venue Category']], prefix="", prefix_sep="") # one hot encoding

cbr_onehot['Neighborhood'] = cbr_venues['Neighborhood'] # add neighborhood column back to dataframe

fixed_columns = [cbr_onehot.columns[-1]] + list(cbr_onehot.columns[:-1]) # move neighborhood column to the first column 
cbr_onehot = cbr_onehot[fixed_columns]

cbr_onehot.head()

In [None]:
# aggregate
cbr_grouped = cbr_onehot.groupby('Neighborhood').mean().reset_index()
cbr_grouped.head()

In [None]:
# top 5 frequencies

num_top_venues = 5

for hood in cbr_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = cbr_grouped[cbr_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = cbr_grouped['Neighborhood']

for ind in np.arange(cbr_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cbr_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 5

cbr_grouped_clustering = cbr_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cbr_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

# add clustering labels
neighborhoods_venues_sorted.insert(1, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.tail()

In [None]:
# merge dataset and check output
cbr_merged = pd.merge(cbr_geo, neighborhoods_venues_sorted, on='Neighborhood')
cbr_merged.tail()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="CartoDB dark_matter")

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cbr_merged['Latitude'], cbr_merged['Longitude'], cbr_merged['Neighborhood'], cbr_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
# Loop through each cluster
for cluster in range(kclusters):
    # Get the data for the cluster
    cluster_data = cbr_merged.loc[cbr_merged['Cluster Labels'] == cluster, cbr_merged.columns[[1] + list(range(5, cbr_merged.shape[1]))]]
    # Do something with the data for the cluster
    print(cluster_data)

In [None]:
cbr_crime = pd.read_csv('indir/cbr-neighbourhood-crime.txt', sep="\t")
print(cbr_crime.shape) # check dimensions
cbr_crime.head() # inspect data

In [None]:
cbr_crime = cbr_crime[ (cbr_crime['Population']>0) & (cbr_crime['Total'] >0) ] # only process those neighborhoods with crime and population counts
cbr_crime['Total per pop'] = (cbr_crime['Total'] /  cbr_crime['Population']) * 100 # determine proportion of total Assault, Robbery, and Burglary per capita
cbr_crime['Rank'] = cbr_crime['Total per pop'].rank(ascending=1) # rank order the results

In [None]:
cbr_crime = pd.merge(cbr_crime, cbr_merged[['Cluster Labels', 'Neighborhood', 'Latitude', 'Longitude']], on='Neighborhood').sort_values('Rank') 
print(cbr_crime.shape) # check dimensions
cbr_crime.head() # inspect data
cbr_crime.tail() # inspect data

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles="CartoDB dark_matter")

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


# query filter map data
map_data = cbr_crime[             
        (cbr_crime['Cluster Labels'] == 2) & 
        (cbr_crime['Rank'] < cbr_crime['Rank'].quantile(0.5))] # 90th percentile


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(map_data['Latitude'], map_data['Longitude'], map_data['Neighborhood'],  map_data['Cluster Labels'] ):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
     