In [145]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import json
from requests import get
import lxml
import geocoder

**Task 1:**

*Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe*

In [146]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#the dataframe for scraped data will be called 'codes'.
#The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood:

codes = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighbourhood'])

#getting html as 'src'
src = get(url).text

#creating a soup from 'src':
soup = bs(src, 'lxml')

In [217]:
#getting the table with postal codes
table = soup.table

#getting the rows with data
rows = table.findAll('tr')

#getting the table's fields and putting them in the dataframe
i = 0

for row in rows[1:]:
    tds = row.findAll('td')
    row_data = [j.text.strip() for j in tds]
    codes.loc[i] = row_data
    i+=1

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [218]:
codes = codes[codes['Borough'] != 'Not assigned'].reset_index(drop=True)

**Task citation:**

More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

**Approach:**
1. Get a list of unique postal codes
2. For each unique postal code, walk through scraped data ('codes'), find and concatenate neighbourhoods with the code
3. Create a new dataframe with unique codes in 'PostalCode' column, corresponding boroughs and concatenated neighbourhoods in 'Neighbourhood' column.

In [245]:
#getting a list of unique postal codes:
postcodes = list(set(codes['PostalCode']))

print('postcodes quantity: ', len(postcodes))

#creating a new dataframe for codes without duplicates
codes_nodup = pd.DataFrame(columns = codes.columns)

#For each unique postal code, walk through scraped data ('codes'), find and concatenate neighbourhoods with the code:
for k in range(len(postcodes)):
    
    nbs = [] #creating a list for neighbourhoods sharing the same postcode
    borough = ''
    
    for i in range(codes.shape[0]):
        if postcodes[k]==codes.loc[i, 'PostalCode']:
            nbs.append(codes.loc[i, 'Neighbourhood'])
            borough = codes.loc[i, 'Borough']
            
    #converting the list of neighbourhoods to a string:

    nbs_str = nbs[0] 
    
    for nb in nbs[1:]:
        nbs_str = nbs_str+', '+nb
    #print('nb_str: ', nbs_str)
    
    #recording the resulting row in the new dataframe
    codes_nodup.loc[k] = [postcodes[k], borough, nbs_str]
    
#displaying the head of the resulting dataframe
codes_nodup.head()

postcodes quantity:  103


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel"
1,M2K,North York,Bayview Village
2,M6S,West Toronto,"Runnymede, Swansea"
3,M1G,Scarborough,Woburn
4,M6G,Downtown Toronto,Christie


**Task citation:**

*If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.*

In [246]:
#creating a new dataframe for the result
codes_res = codes_nodup

#replacing 'Not assigned' neighbourhoods with corresponding boroughs
for i in range(codes_res.shape[0]):
    if codes_res.loc[i, 'Neighbourhood'] == 'Not assigned':
        codes_res.loc[i, 'Neighbourhood'] = codes_res.loc[i, 'Borough']
        

**TASK ONE - ANSWER (Dataframe)**

In [247]:
#Here is the resulting dataframe
codes_res

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel"
1,M2K,North York,Bayview Village
2,M6S,West Toronto,"Runnymede, Swansea"
3,M1G,Scarborough,Woburn
4,M6G,Downtown Toronto,Christie
5,M1X,Scarborough,Upper Rouge
6,M4E,East Toronto,The Beaches
7,M3L,North York,Downsview
8,M2N,North York,Willowdale
9,M1T,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan"


Task citation: *In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe*

**TASK ONE - ANSWER (Dataframe shape)**

In [93]:
codes_res.shape

(103, 3)

**END OF THE TASK ONE**

--------------------------------------------------------------------------------------------------------------------------------

**TASK TWO**

**Task citation:**


*Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.*

Let's try to get coordinates from Geocoder. If we can't, let's use the csv.

In [240]:
#creating a function for getting coordinates from Geocoder

def get_coordinates_from_geocoder(postal_code, try_limit = 3):
    # task citation: initialize your variable to None
    lat_lng_coords = None
    
    i = 0
    
    # task citation: loop until you get the coordinates
    # task comment: will loop only until we are within pre-defined loop number limit
    
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        i+=1
        
        if i > try_limit: #exiting the loop when the max number of loops is exceeded
            break

    try:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
        
    except:
        print('error getting coordinates from Geocoder for the code: ', postal_code)
        latitude = longitude = 'error'
    
    return (latitude, longitude)

Let's get the coordinates. If we can't get any coordinates from Geocoder 10 times, we switch to the csv.

In [252]:
if 'Latitude' not in codes_res.columns:
    codes_res['Latitude'] = None
    codes_res['Longitude'] = None
    

rows_to_request = codes_res[(codes_res['Latitude'].isnull()) | (codes_res['Longitude'].isnull())].index

try_count = 0
try_limit = 3

for row in rows_to_request:
    
    if try_count > try_limit:
        print('Stopping attempts to get coordinates from Geocoder.')
        use_csv = True
        break
    
    latlon = get_coordinates_from_geocoder(codes_res.loc[row, 'PostalCode'])
        
    if latlon[0] == 'error' or latlon[1] == 'error':
        
        print('error getting coordinates from Geocoder. Try count: ', try_count)
        try_count+=1

    else:
        codes_res.loc[row, 'Latitude'] = latlon[0]
        codes_res.loc[row, 'Longitude'] = latlon[1]

error getting coordinates from Geocoder for the code:  M5G
error getting coordinates from Geocoder. Try count:  0
error getting coordinates from Geocoder for the code:  M6J
error getting coordinates from Geocoder. Try count:  1
error getting coordinates from Geocoder for the code:  M3B
error getting coordinates from Geocoder. Try count:  2
error getting coordinates from Geocoder for the code:  M4S
error getting coordinates from Geocoder. Try count:  3
Stopping attempts to get coordinates from Geocoder.


In [253]:
if use_csv:
    backup_csv = pd.read_csv('Geospatial_Coordinates.csv', encoding='cp1252')
    codes_res = codes_res.drop(['Latitude', 'Longitude'],axis = 1)
    codes_res = codes_res.join(backup_csv.set_index('Postal Code'), on = 'PostalCode')

In [254]:
codes_res

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
1,M2K,North York,Bayview Village,43.786947,-79.385975
2,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.484450
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M6G,Downtown Toronto,Christie,43.669542,-79.422564
5,M1X,Scarborough,Upper Rouge,43.836125,-79.205636
6,M4E,East Toronto,The Beaches,43.676357,-79.293031
7,M3L,North York,Downsview,43.739015,-79.506944
8,M2N,North York,Willowdale,43.770120,-79.408493
9,M1T,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan",43.781638,-79.304302


In [137]:
codes_res.shape

(103, 5)

In [139]:
codes_res.to_csv('codes_res.csv', index=None)

**END OF TASK TWO**

---------------------------------------------------------------------------------------------------------------------------------