# This Notebook is used to scrape the Wikipedia page with Toronto neighbourhoods.

# Part I. Web Scraping

#### Installation of the "beautifulsoup4" package.

In [1]:
#!conda install beautifulsoup4

#### Import of the required modules.

In [2]:
from io import StringIO

import lxml
import requests
import pandas as pd
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

#### Getting the Neighbourhoods table from the Wikipedia page.

In [3]:
# Download the Wikipedia page and load its first sortable wikitable into a dataframe.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, features='lxml')
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))
# Wrap the markup in a file-like object: recent pandas deprecates passing literal HTML.
df = pd.read_html(StringIO(str(table)))[0]

#### Transforming the data into the required form.

In [4]:
# Drop postcodes without a borough; copy so the assignment below targets this
# frame rather than a view of the scraped table.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

# A neighbourhood marked 'Not assigned' takes the name of its borough.
# Single .loc assignment replaces the chained indexing, which raises
# SettingWithCopyWarning and may silently not write at all.
mask = df['Neighbourhood'] == 'Not assigned'
df.loc[mask, 'Neighbourhood'] = df.loc[mask, 'Borough']

# One row per (Postcode, Borough), neighbourhood names joined with ', '.
df = (
    df.sort_values(['Postcode', 'Borough', 'Neighbourhood'])
      .groupby(['Postcode', 'Borough'], as_index=False)
      .agg({'Neighbourhood': ', '.join})
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Printing the number of rows and columns of the resulting dataframe.

In [5]:
# Report the dataframe dimensions as a (rows, columns) tuple.
shape_label = "The number of rows and columns of the resulting dataframe:"
print(shape_label, df.shape)

The number of rows and columns of the resulting dataframe: (103, 3)


# Part II. Getting the coordinates for every neighbourhood 

In [6]:
!pip install opencage
from opencage.geocoder import OpenCageGeocode



In [7]:
# The code was removed by Watson Studio for sharing.

In [8]:
geocoder = OpenCageGeocode(key)
latitude_list = []
longitude_list = []
response = []

# Geocode every postcode. A candidate counts as a hit only when the postcode
# appears in its formatted address; otherwise the row gets the (0, 0) sentinel
# so it shows up in the listing at the end of the cell.
for postcode in df['Postcode']:
    results = geocoder.geocode(postcode + ', Toronto, Ontario') or []
    match = next((hit for hit in results if postcode in hit['formatted']), None)

    if match is not None:
        latitude = match['geometry']['lat']
        longitude = match['geometry']['lng']
        formatted = match['formatted']
    else:
        latitude = 0
        longitude = 0
        # Keep the last candidate's address for inspection. An empty result
        # list yields '' instead of raising or reusing the previous postcode's values.
        formatted = results[-1]['formatted'] if results else ''

    response.append(formatted)
    latitude_list.append(latitude)
    longitude_list.append(longitude)

df['Latitude'] = latitude_list
df['Longitude'] = longitude_list
df['response'] = response

# Postcodes that could not be geocoded.
df.loc[df['Latitude'] == 0]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,response
25,M3A,North York,Parkwoods,0.0,0.0,"Toronto, ON M6K 1X9, Canada"
85,M7A,Downtown Toronto,Queen's Park,0.0,0.0,"Toronto, ON M6K 1X9, Canada"
86,M7R,Mississauga,Canada Post Gateway Processing Centre,0.0,0.0,"Toronto, ON M6K 1X9, Canada"


In [9]:
# Hand-entered coordinates for the postcodes the geocoder could not resolve.
manual_coordinates = {
    'M3A': (43.7533, -79.3297),
    'M7A': (43.6623, -79.3895),
    'M7R': (43.6370, -79.6158),
}
for postcode, (lat, lng) in manual_coordinates.items():
    df.loc[df['Postcode'] == postcode, ['Latitude', 'Longitude']] = [lat, lng]

# errors='ignore' keeps the cell re-runnable once 'response' has already been dropped.
df = df.drop(columns='response', errors='ignore')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.765717,-79.221898
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389


# Part III. Exploring the neighbourhoods in Toronto

In [10]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

In [11]:
import folium

In [12]:
# Centre the map on the first geocoding hit for the city.
toronto_coord = geocoder.geocode('Toronto, Ontario')
toronto_geometry = toronto_coord[0]['geometry']
toronto_lat, toronto_lng = toronto_geometry['lat'], toronto_geometry['lng']

map_toronto = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=10)

# One circle marker per dataframe row, with a "neighbourhood: borough" popup.
marker_columns = (df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'])
for point_lat, point_lng, borough_name, hood_name in zip(*marker_columns):
    popup = folium.Popup('{}: {}'.format(hood_name, borough_name), parse_html=True)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto

In [15]:
# The code was removed by Watson Studio for sharing.

In [16]:
from pandas.io.json import json_normalize

In [38]:
# Probe the Foursquare explore endpoint for postcode M7R and show the response metadata.
# .iloc[0] pulls scalar coordinates out of the selection; formatting the
# selected Series directly would put its multi-line repr into the URL.
m7r_coords = df.loc[df['Postcode'] == 'M7R', ['Latitude', 'Longitude']].iloc[0]
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, m7r_coords['Latitude'], m7r_coords['Longitude'], 500, 100)
results = requests.get(url, timeout=30).json()['meta']
results

{'code': 429,
 'errorType': 'quota_exceeded',
 'errorDetail': 'Quota exceeded',
 'requestId': '5e3d4c8c1835dd001b451484'}

In [28]:
def getNearbyVenues(boroughs,neighbourhoods,latitudes,longitudes,radius = 500,limit = 3):
    """Collect Foursquare venues around each neighbourhood into one dataframe.

    Parameters
    ----------
    boroughs, neighbourhoods, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are zipped
        together, so iteration stops at the shortest one.
    radius : int, default 500
        Value passed as the ``radius`` query parameter.
    limit : int, default 3
        Value passed as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Borough, Neighbourhood,
        Neighbourhood Latitude, Neighbourhood Longitude, Venue, Venue Latitude,
        Venue Longitude and Venue Category. Empty (but with these columns)
        when no locations are given or no venues are returned.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    A payload without ``response.groups`` (presumably what an error such as
    quota exhaustion returns -- TODO confirm) raises ``KeyError``/``IndexError``.
    """
    columns = ['Borough','Neighbourhood','Neighbourhood Latitude','Neighbourhood Longitude',
               'Venue','Venue Latitude','Venue Longitude','Venue Category']
    venue_rows = []

    for borough,neighbourhood,lat,lng in zip(boroughs,neighbourhoods,latitudes,longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,CLIENT_SECRET,VERSION,lat,lng,radius,limit)
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant fields of each nearby venue
        venue_rows.extend(
            (borough,neighbourhood,lat,lng,
             v['venue']['name'],v['venue']['location']['lat'],v['venue']['location']['lng'],
             v['venue']['categories'][0]['name'])
            for v in results)

    # Passing the column names to the constructor also gives an empty result its columns.
    return pd.DataFrame(venue_rows,columns = columns)

In [None]:
# Query venues for every postcode row; the dataframe column is spelled 'Neighbourhood'.
toronto_venues = getNearbyVenues(boroughs = df['Borough'],neighbourhoods = df['Neighbourhood'],latitudes = df['Latitude'],longitudes = df['Longitude'])

In [68]:
# Count the venue rows in each neighbourhood group.
venues_by_neighbourhood = toronto_venues.groupby('Neighbourhood')
venues_by_neighbourhood.count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,5,5,5,5,5,5
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",13,13,13,13,13,13
"Alderwood, Long Branch",7,7,7,7,7,7
"Bathurst Manor, Downsview North, Wilson Heights",23,23,23,23,23,23
"Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara",100,100,100,100,100,100
Bayview Village,3,3,3,3,3,3
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,75,75,75,75,75,75


In [37]:
# Preview the first rows instead of rendering the whole venues table.
toronto_venues.head()

NameError: name 'toronto_venues' is not defined