**Importing Packages**

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import re
from bs4 import BeautifulSoup as BSoup


/bin/bash: conda: command not found


ModuleNotFoundError: ignored

**Scraping Data from Wikipedia**

In [0]:
# Fetch and parse the Wikipedia page that lists Toronto's neighbourhoods.
BASE_URI = 'https://en.wikipedia.org'
page = requests.get(BASE_URI + '/wiki/List_of_neighbourhoods_in_Toronto')
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would just leave the selector results below empty.
page.raise_for_status()
soup = BSoup(page.content, 'html.parser')

# Borough headings and the per-borough neighbourhood tables. The scraping loop
# pairs these two lists up by position.
boroughs_list = soup.select(".mw-parser-output h3")
neighbours_list = soup.select(".mw-parser-output div table.multicol")

# NOTE(review): city_info, column_names and neighborhoods are not used by any
# cell visible in this notebook; kept in case later cells rely on them.
city_info = list()

column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

def geo_calculator(value):
  """Convert split degree/minute/second parts into signed decimal degrees.

  `value` is a sequence whose last item is the hemisphere letter and whose
  leading items are the degrees, optionally followed by minutes and then
  seconds. Only lengths 2, 3 and 4 are accepted; any other length raises
  ValueError. The result is positive for 'N' or 'E' (after stripping
  whitespace) and negative for any other letter.
  """
  parts = len(value)
  if parts not in (2, 3, 4):
    raise ValueError

  degrees = float(value[0])
  if parts == 4:
    magnitude = degrees + (float(value[1])/60) + (float(value[2])/3600)
  elif parts == 3:
    magnitude = degrees + (float(value[1])/60)
  else:
    magnitude = degrees

  hemisphere = value[-1].strip()
  if hemisphere in ('N', 'E'):
    return magnitude
  return -magnitude

def scrape_geodata(url):
  """Scrape the coordinates shown on the page at `url`.

  For each coordinate the first '.geo-default .geo-dms' latitude/longitude
  element is preferred: its text is split on the degree/minute/second signs
  and converted with geo_calculator. If the page has no such element, the
  first '.geo-default .geo' element is used instead; its text is split on
  '; ' into a latitude part and a longitude part.

  Returns:
    A (latitude, longitude) tuple of floats. The fallback path now converts
    its text to float as well, so both paths return the same type.

  Raises:
    IndexError: if the page has neither kind of coordinate element.
    ValueError: if the coordinate text cannot be converted to a number.
  """
  page_ = requests.get(url)
  soup_ = BSoup(page_.content, 'html.parser')

  def _coordinate(dms_selector, geo_index):
    # Resolve one coordinate: DMS element first, decimal pair as the fallback.
    dms_elems = soup_.select(dms_selector)
    if dms_elems:
      return geo_calculator(re.split(u'[°′″]', dms_elems[0].get_text()))
    geo_elems = soup_.select('.geo-default .geo')
    if not geo_elems:
      raise IndexError('no coordinates found on {}'.format(url))
    return float(geo_elems[0].get_text().split('; ')[geo_index].strip())

  latitude = _coordinate('.geo-default .geo-dms .latitude', 0)
  longitude = _coordinate('.geo-default .geo-dms .longitude', 1)
  return latitude, longitude

def get_geodata(url):
  """Return (latitude, longitude) for the place a site-relative URL points to.

  The last path segment of `url`, with underscores replaced by spaces, is
  sent to the Nominatim geocoder. If the geocoder returns nothing, the
  coordinates are scraped from the page at BASE_URI + url instead.
  """
  # Nominatim's usage policy requires an identifying user agent, and current
  # geopy releases refuse to build the geocoder without one.
  geolocator = Nominatim(user_agent='toronto-neighbourhood-clustering')
  geo_name = url.split('/')[-1].replace('_', ' ')
  location = geolocator.geocode(geo_name)
  if location:
    return location.latitude, location.longitude
  return scrape_geodata(BASE_URI + url)

# Collect one record per neighbourhood link found in each borough's table.
neighbourhood_data = list()
for borough_heading, borough_table in zip(boroughs_list, neighbours_list):
  # Drop non-ASCII characters but decode back to str: in Python 3 a bare
  # .encode() would store bytes, which then render as "b'...'" in labels and
  # cannot be concatenated with str.
  this_borough = borough_heading.find('a').get_text().encode('ascii', 'ignore').decode('ascii')
  for list_item in borough_table.select('td li'):
    link = list_item.find('a')
    if link is None:
      # A list entry without a link has no article URL to geocode; skip it.
      continue
    neighbour_url = link.get('href')
    neighbour_name = link.get_text().encode('ascii', 'ignore').decode('ascii')
    neighbour_lat, neighbour_lon = get_geodata(neighbour_url)
    neighbourhood_data.append({
      'Borough': this_borough,
      'Neighborhood': neighbour_name,
      'URI': BASE_URI + neighbour_url,
      'Latitude': neighbour_lat,
      'Longitude': neighbour_lon,
    })

# Build the frame in a single step; `columns` selects the four fields of
# interest, so the helper 'URI' entry is left out without mutating the records.
# (Row-by-row DataFrame.append is quadratic and was removed in pandas 2.0.)
neighbours = pd.DataFrame(neighbourhood_data, columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

# print(neighbours)

**Making a Folium Map for Toronto**

In [0]:
address = 'Toronto, Canada'
# Nominatim's usage policy requires an identifying user agent, and current
# geopy releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='toronto-neighbourhood-clustering')
location = geolocator.geocode(address)
if location is None:
  raise RuntimeError('Could not geocode {!r}'.format(address))
latitude, longitude = location.latitude, location.longitude

# Creating Map of Toronto:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
skipped_neighbourhoods = []
for lt, ln, borough, neighbour in zip(neighbours['Latitude'], neighbours['Longitude'], neighbours['Borough'], neighbours['Neighborhood']):
  label = '{}, {}'.format(neighbour, borough)
  label = folium.Popup(label, parse_html=True)
  try:
    folium.CircleMarker(
      [lt, ln],
      radius=5,
      popup=label,
      color='blue',
      fill=True,
      fill_color='#3186cc',
      fill_opacity=0.7).add_to(map_toronto)
  except TypeError:
    # Still best-effort: a marker folium rejects is skipped, but it is
    # recorded so the omission is visible instead of silent.
    skipped_neighbourhoods.append(neighbour)
if skipped_neighbourhoods:
  print('Skipped {} neighbourhood(s) with unusable coordinates: {}'.format(
    len(skipped_neighbourhoods), skipped_neighbourhoods))
print('\n\t *** Mapping Toronto ***')

**Adding Markers to Toronto Map**

In [0]:
# Re-indexed view of the scraped neighbourhood table (reset_index returns a new frame).
toronto_data = neighbours.reset_index(drop=True)

# Start again from a clean base map centred on Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Options shared by every neighbourhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per neighbourhood, with its name as the popup text.
marker_rows = zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'])
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_toronto)

**Pulling the top 100 venues within 500 m of central Toronto from the Foursquare API**

In [0]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Keys that were previously committed here should be
# treated as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
  raise RuntimeError('Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running this cell.')
VERSION = '20180410'

neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name
radius = 500
LIMIT = 100
# NOTE(review): the request is centred on Toronto itself (latitude/longitude
# from the geocoding cell); the neighborhood_* values above are not used by it.
# Confirm this is the intended centre.
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
results = requests.get(url).json()

**Extracting the category of the venue**

In [3]:
def get_category_type(row):
  """Return the name of a venue's first category, or None if it has none.

  `row` is indexed by key: the category list is read from 'categories' and,
  if that key is missing, from 'venue.categories'.

  Raises:
    KeyError: if neither key is present.
  """
  try:
    categories_list = row['categories']
  except KeyError:
    # Only a missing key triggers the fallback; any other error propagates
    # instead of being swallowed by a bare except.
    categories_list = row['venue.categories']

  if categories_list is None or len(categories_list) == 0:
    return None
  return categories_list[0]['name']

# Venue records live in the first result group of the explore response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON records into a table with dotted column names.
nearby_venues = json_normalize(venues)

# Keep only the venue name, category list and coordinates.
filtered_columns = [
  'venue.name',
  'venue.categories',
  'venue.location.lat',
  'venue.location.lng',
]
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Collapse each row's category list to the name of its first category.
first_category = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = first_category

# Keep only the text after the last dot of each column name.
nearby_venues.columns = [col.rsplit(".", 1)[-1] for col in nearby_venues.columns]

NameError: ignored

**Fetching venues for every neighbourhood in Toronto**

In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
  """Query the Foursquare explore endpoint around each given location.

  Args:
    names: iterable of neighbourhood names.
    latitudes: iterable of latitudes, parallel to `names`.
    longitudes: iterable of longitudes, parallel to `names`.
    radius: search radius passed to the API as `radius`.

  Returns:
    A DataFrame with one row per returned venue and the columns
    'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
    'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
    'Venue Category' is None for a venue that has no categories. The frame
    has these columns even when no venue was found at all.

  Raises:
    requests.HTTPError: if the API answers with an error status.

  Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
  """
  column_names = ['Neighborhood',
                  'Neighborhood Latitude',
                  'Neighborhood Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']

  venue_rows = []
  for name, lat, lng in zip(names, latitudes, longitudes):
    print(name)

    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
      CLIENT_ID,
      CLIENT_SECRET,
      VERSION,
      lat,
      lng,
      radius,
      LIMIT)

    # make the GET request; surface API errors as an HTTPError rather than
    # as a KeyError on the missing 'groups' entry further down
    response = requests.get(url)
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups', [])
    items = groups[0]['items'] if groups else []

    # keep only the relevant information for each nearby venue
    for item in items:
      venue = item['venue']
      # A venue may carry an empty category list; record None instead of
      # failing on categories[0].
      categories = venue.get('categories') or []
      category_name = categories[0]['name'] if categories else None
      venue_rows.append((
        name,
        lat,
        lng,
        venue['name'],
        venue['location']['lat'],
        venue['location']['lng'],
        category_name))

  # Passing the column names to the constructor keeps an empty result valid;
  # assigning .columns afterwards fails when there are no rows.
  nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

  return(nearby_venues)
# Fetch the nearby venues for every Toronto neighbourhood (default radius).
toronto_venues = getNearbyVenues(
  names=toronto_data['Neighborhood'],
  latitudes=toronto_data['Latitude'],
  longitudes=toronto_data['Longitude'],
)

**Analysis of each neighbourhood**

In [0]:
# One-hot encode the venue categories: one indicator column per category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called 'Neighborhood', the assignment below
# would overwrite that indicator column in place instead of appending a new
# last column, and the reordering step would then move the wrong column to the
# front. Rename the category column first so the two cannot collide.
if 'Neighborhood' in toronto_onehot.columns:
  toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add neighborhood column back to dataframe (appended as the last column)
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# Mean of each indicator per neighbourhood = share of its venues in that category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
  # format() rather than '+' so a non-str neighbourhood name cannot raise here
  print("----{}----".format(hood))
  temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
  temp.columns = ['venue','freq']
  # Row 0 is the 'Neighborhood' label itself; take an independent copy of the
  # remaining rows before assigning into it.
  temp = temp.iloc[1:].copy()
  temp['freq'] = temp['freq'].astype(float)
  temp = temp.round({'freq': 2})
  print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
  print('\n')

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped and the remaining values are ranked
    in descending order; the index labels of the leading entries are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values[:num_top_venues]
    return top_labels


**Displaying the top 10 venues for each neighborhood**

In [0]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare except around the list lookup,
    # so unrelated errors are not silently turned into a 'th' suffix.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill each neighbourhood's row with its most frequent venue categories.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted


**Clustering Neighbourhoods**

In [0]:
# set number of clusters
kclusters = 5

# Feature matrix: the per-neighbourhood category frequencies without the name.
# axis is passed by keyword; pandas 2 no longer accepts it positionally.
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# kmeans.labels_ has one entry per row of toronto_grouped, so the labels are
# paired with the neighbourhood names from that frame. Assigning them
# positionally onto toronto_data would mislabel rows (different order) or fail
# outright (different length).
cluster_labels = pd.DataFrame({
    'Neighborhood': toronto_grouped['Neighborhood'].values,
    'Cluster Labels': kmeans.labels_,
})
neighbourhood_profiles = cluster_labels.set_index('Neighborhood').join(
    neighborhoods_venues_sorted.set_index('Neighborhood'))

# Join by neighbourhood name to add the cluster label and top venues to each
# neighbourhood's coordinates. join() returns a new frame, so toronto_data is
# left untouched and this cell can be re-run safely.
toronto_merged = toronto_data.join(neighbourhood_profiles, on='Neighborhood')

# Neighbourhoods that are absent from toronto_grouped have no label; drop them
# and restore the integer dtype that the missing values turned into float.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels']).astype({'Cluster Labels': int})

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept from the original color mapping; cluster 0 wraps
    # around to the last color in the list.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)