In [8]:
#! pip install selenium
#! pip install webdriver-manager
#! wget https://chromedriver.storage.googleapis.com/83.0.4103.14/chromedriver_win32.zip

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ExpectedConditions

from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

import time

import os

## We will be using Selenuim to obtain the content of dinamic web elements, which contents changes as JavaScript executes:

The web-site https://torontolife.com/ provides Toronto's neighborhoods ranking on the bases of the following 10 criterias: housing, safety, transit, shopping, health, entertainment, community, diversity, education and employment. Each of them is ranked form 0 to 100 which meand that the data do not need rescaling. But as will be evident later, the names of the neighborhoods slighty differ from our initial dataset, which is based on Wiki's Toronto Neighborhoods Postal Codes. Hence, a mapping of the neighborhoods will be needed.
  
For a start, a list of all the neighborhoods ranked on the following link: https://torontolife.com/neighbourhood-rankings/# will be obtained.

In [9]:
driver = webdriver.Chrome(ChromeDriverManager().install())

wait = WebDriverWait(driver, 40)

driver.get('https://torontolife.com/neighbourhood-rankings/#Mount%20Pleasant%20East')

element = wait.until(ExpectedConditions.visibility_of_element_located((By.XPATH, '//div[@class="sjmnabes-main-content-copy js--sjmnabes-list-container"]/ul')))
element.text


Looking for [chromedriver 81.0.4044.69 win32] driver in cache 
File found in cache by path [C:\Users\Lenovo\.wdm\drivers\chromedriver\81.0.4044.69\win32\chromedriver.exe]


"1.\nRunnymede-Bloor West Village\n2.\nNorth Riverdale\n3.\nMount Pleasant East\n4.\nDanforth\n5.\nLawrence Park North\n6.\nRosedale-Moore Park\n7.\nPlayter Estates-Danforth\n8.\nTrinity-Bellwoods\n9.\nLeaside-Bennington\n10.\nYonge-St. Clair\n11.\nThe Beaches\n12.\nPalmerston-Little Italy\n13.\nBedford Park-Nortown\n14.\nCabbagetown-South St. James Town\n15.\nMount Pleasant West\n16.\nDanforth-East York\n17.\nYonge-Eglinton\n18.\nKingsway South\n19.\nWychwood\n20.\nBay Street Corridor\n21.\nUniversity\n22.\nOld East York\n23.\nHigh Park North\n24.\nAnnex\n25.\nCasa Loma\n26.\nEast End-Danforth\n27.\nHigh Park-Swansea\n28.\nCorso Italia-Davenport\n29.\nMoss Park\n30.\nYorkdale-Glen Park\n31.\nLawrence Park South\n32.\nChurch-Yonge Corridor\n33.\nJunction Area\n34.\nLittle Portugal\n35.\nWaterfront Communities-The Island\n36.\nBlake-Jones\n37.\nGreenwood-Coxwell\n38.\nBridle Path-Sunnybrook-York Mills\n39.\nRoncesvalles\n40.\nKensington-Chinatown\n41.\nStonegate-Queensway\n42.\nForest H

In [10]:
list_of_ranked_neighborhoods = element.text.split("\n")
list_of_ranked_neighborhoods = list_of_ranked_neighborhoods[1: len(list_of_ranked_neighborhoods): 2]
len_ranked_neighborhoods = len(list_of_ranked_neighborhoods)
len_ranked_neighborhoods

140

In [11]:
list_of_ranked_neighborhoods

['Runnymede-Bloor West Village',
 'North Riverdale',
 'Mount Pleasant East',
 'Danforth',
 'Lawrence Park North',
 'Rosedale-Moore Park',
 'Playter Estates-Danforth',
 'Trinity-Bellwoods',
 'Leaside-Bennington',
 'Yonge-St. Clair',
 'The Beaches',
 'Palmerston-Little Italy',
 'Bedford Park-Nortown',
 'Cabbagetown-South St. James Town',
 'Mount Pleasant West',
 'Danforth-East York',
 'Yonge-Eglinton',
 'Kingsway South',
 'Wychwood',
 'Bay Street Corridor',
 'University',
 'Old East York',
 'High Park North',
 'Annex',
 'Casa Loma',
 'East End-Danforth',
 'High Park-Swansea',
 'Corso Italia-Davenport',
 'Moss Park',
 'Yorkdale-Glen Park',
 'Lawrence Park South',
 'Church-Yonge Corridor',
 'Junction Area',
 'Little Portugal',
 'Waterfront Communities-The Island',
 'Blake-Jones',
 'Greenwood-Coxwell',
 'Bridle Path-Sunnybrook-York Mills',
 'Roncesvalles',
 'Kensington-Chinatown',
 'Stonegate-Queensway',
 'Forest Hill North',
 'Woodbine Corridor',
 'Forest Hill South',
 'Thistletown-Beaumon

It turns out that there 140 ranked Neighborhoods on https://torontolife.com/ while the initial data has 103 Neighborhoods.
Hence, some mapping would be needed after exploring the two datasets.

But first, the rankings in all of the 10 categories for each of the 140 neighborhoods will be gathered and stored in a pandas dataframe:

In [12]:
driver.close()

In [13]:
ranked_df = pd.DataFrame(columns = ['Housing', 'Safety', 'Transit', 'Shopping', 'Health', 'Entertainment', 'Community', 
                                   'Diversity', 'Education', 'Employment'])
latitude = []
longitude = []

ranked_df

Unnamed: 0,Housing,Safety,Transit,Shopping,Health,Entertainment,Community,Diversity,Education,Employment


In [26]:
driver = webdriver.Chrome(ChromeDriverManager().install())

driver.get('https://torontolife.com/neighbourhood-rankings/#Mount%20Pleasant%20East')

wait = WebDriverWait(driver, 60)

for idx, neighbh in enumerate(list_of_ranked_neighborhoods):

        url_ending = neighbh.replace(" ", '%20')
        
        driver.get('https://torontolife.com/neighbourhood-rankings/#' + url_ending)
        time.sleep(2)
        driver.refresh()
        time.sleep(2)
        
        xpath = '//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[%s]' %(idx + 1) #
        print(xpath)
        element = wait.until(ExpectedConditions.visibility_of_element_located((By.XPATH, xpath)))
        element.click()

        element = wait.until(ExpectedConditions.visibility_of_element_located((By.CLASS_NAME, 'sjmnabes-nabe-detail__list')))
        time.sleep(2)
        ratings = element.text.split("\n")[1: len(element.text.split("\n")):2]
        ratings = pd.Series(ratings, index=['Housing', 'Safety', 'Transit', 'Shopping', 'Health', 'Entertainment', 'Community', 
                                       'Diversity', 'Education', 'Employment'])
        ranked_df = pd.concat([ranked_df, pd.DataFrame(ratings).T], axis = 0, ignore_index=True)
        
        element = wait.until(ExpectedConditions.visibility_of_element_located((By.XPATH, '//*[contains(text(), "View on Google Maps")]')))
        time.sleep(2)     
         
        lat_lng = element.get_attribute('href').split("@")[1].split(",")[0:2]
        latitude.append(lat_lng[0])
        longitude.append(lat_lng[1])
        print(lat_lng)

driver.close()


Looking for [chromedriver 81.0.4044.69 win32] driver in cache 
File found in cache by path [C:\Users\Lenovo\.wdm\drivers\chromedriver\81.0.4044.69\win32\chromedriver.exe]
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[1]
['43.6511002', '-79.4765826']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[2]
['43.6752909', '-79.3485883']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[3]
['43.7103203', '-79.390914']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[4]
['43.6815973', '-79.3316902']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[5]
['43.7291819', '-79.4034093']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[6]
['43.6799982', '-79.3907494']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[7]
['43.6773993', '-79.3526198']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[8]
['43.6452347', '-79.4132022']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/l

['43.7901906', '-79.1947833']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[84]
['43.6694492', '-79.523003']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[85]
['43.6478821', '-79.587296']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[86]
['43.5946998', '-79.5075801']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[87]
['43.7501108', '-79.5774908']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[88]
['43.7690416', '-79.4075313']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[89]
['43.68594', '-79.3598478']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[90]
['43.7477111', '-79.5049964']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[91]
['43.7701', '-79.3854235']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[92]
['43.7016045', '-79.5204799']
//ul[@class="sjmnabes-main-content-list js--sjmnabes-list"]/li[93]
['43.6957454', '-79.4348245']
//ul[@c

Let's add the corresponding columns for 'Neighborhood', 'Latitude' and 'Longitude'

In [21]:
ranked_df['Neighborhood'] = list_of_ranked_neighborhoods
ranked_df.set_index('Neighborhood', inplace=True)
ranked_df.reset_index(inplace=True)

ranked_df['Latitude'] = latitude
ranked_df['Longitude'] = longitude
ranked_df.head(5)

ValueError: Length of values does not match length of index

In [None]:
ranked_df.to_csv(os.path.join(os.getcwd(), 'data', 'torontolife_rankings.csv')

In [None]:
ranked_df1 = pd.read_csv(os.path.join(os.getcwd(), 'data', 'torontolife_rankings.csv'), index_col=0)
ranked_df1.head()