# Name: Margaret Nguyen

# Web-scraping PeopleForBikes BNA

**Assignment: Scrape all of the Bicycle Network Analysis scores (BNA), location labels (city, state, country), and other relevant information from People for bikes, located [here](https://bna.peopleforbikes.org/#/places////).**

In [1]:
import pandas as pd 
from selenium import webdriver # Imports the webdriver module from the Selenium library.
import re # Provide support for regular expressions, useful for pattern matching and text manipulation.
import time 
import random
from selenium.common.exceptions import NoSuchElementException # Handle situations where an element is not found on a web page.
from selenium.common.exceptions import StaleElementReferenceException # Handle the stale element exception by re-finding the element or taking appropriate action
from selenium.webdriver.common.keys import Keys # Be used for various keyboard interactions when automating web testing or interactions with web elements.

In [2]:
driver = webdriver.Chrome("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver")
driver.get('https://bna.peopleforbikes.org/#/places////')

  driver = webdriver.Chrome("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver")


In [3]:
# Scrape single page
def get_bna():
    places = []
    num = 1
    while True:
        # Construct the XPath expressions with the current page number
        place_xpath = '//*[@id="scrollHeaders"]/section[1]/div[{}]/div[1]/div[2]'.format(num)
        score_xpath = '//*[@id="scrollHeaders"]/section[1]/div[{}]/div[2]/div[1]'.format(num)
        
        # Check if the place element is present on the page
        try:
            place_element = driver.find_element('xpath', place_xpath)
        except NoSuchElementException:
            # Place element not found, break out of the loop
            break
        
        # Get place
        place = place_element.text.strip().split(',')
        
        # Get the state from place 
        state = place[1].strip()
        
        # Get the country from place
        country = place[2].strip().split('\n')

        # Get BNA score
        score_element = driver.find_element('xpath', score_xpath)
        score = score_element.text.strip()
        
        # For country does not have state (clean data)
        if re.match(r'\d{4}', country[0]):
            # If country[0] matches the pattern, replace it with state
            # Replace state with place
            country[0] = state
            country = country[0].split('\n')
            state = place[0].split('\n')[0]
        else:
            pass
        
        # For one specific city in the US: Central Core, Austin (clean data)
        if country[0] == 'TX':
            state = country[0]
            country[0] = 'US'
            place[0] = 'Central Core, Austin'
            
        # Include in the list
        places.append([place[0], state, country[0], score])
        
        # Increment page number
        num += 1
    
    return places

In [4]:
# Srape multiple pages
def get_all_bna():
    # Define an empty dataframe that will be populated
    df_out = pd.DataFrame()

    while True:
        # Scrape a single page
        single_bna = get_bna()

        # DataFrame
        df_single = pd.DataFrame(single_bna, columns = ['City', 'State', 'Country', 'BNA Score'])

        # df_out that keeps the entire data
        df_out = pd.concat([df_out, df_single])
        
        # Next page
        button = driver.find_element('xpath', '//*[@id="scrollHeaders"]/section[2]/button[2]')
        if button.is_enabled():
            # Click on the button
            button.click()
            # Wait between 2-3 seconds
            time.sleep(random.uniform(2, 3))
        else:
            break
        
    return df_out

In [5]:
# Define an empty dataframe that will be populated
df_final = pd.DataFrame()

# Filter by country
for i in range(2, 27):
    try:
        place_search = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[2]/div[1]/div/select/option[{}]'.format(i))
        place_search.click()
        
        # Wait between 2-3 seconds
        time.sleep(random.uniform(2, 3))
        
    except StaleElementReferenceException:
        place_search = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[2]/div[1]/div/select/option[{}]'.format(i))
        place_search.click()
        
        # Wait between 2-3 seconds
        time.sleep(random.uniform(2, 3))
        
        print(f"Page {i} is missing")
        break # Skip to the next page
        
    except NoSuchElementException:
        print(f"Element not found for page {i}")
        break # Skip to the next page

    # Scrape all the places from each country
    df_country_bna = get_all_bna()

    # df_final that keeps the entire data
    df_final = pd.concat([df_final, df_country_bna])
    
# Reset index
df_final = df_final.reset_index(drop=True)

In [6]:
# Check for duplicate rows to verify the accuracy of web scraping
duplicateRows = df_final[df_final.duplicated()]

duplicateRows

Unnamed: 0,City,State,Country,BNA Score
1028,Jackson,MS,US,12
1266,Midland,MI,US,48
1946,Westfield,IN,US,31


In [10]:
# re-webscraping

# Go back to the page
driver = webdriver.Chrome("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver")
driver.get('https://bna.peopleforbikes.org/#/places////')

# Define an empty dataframe that will be populated
df_missing_place = pd.DataFrame()

for city in duplicateRows.loc[:, 'City']:
    # Input the place name to scrape the missing data
    place_input = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[1]/div/div/input')
    place_input.send_keys(f"{city}")
    place_input.send_keys(Keys.ENTER)
    
    # Wait between 2-3 seconds
    time.sleep(random.uniform(2, 3))
    
    # Scrape the missing data
    df_city = get_all_bna()

    # df_final that keeps the entire data
    df_missing_place = pd.concat([df_missing_place, df_city])
    
    # Delete the old search:
    for i in range(len(city)):
        place_input.send_keys(Keys.BACKSPACE)
        
# Close the WebDriver when done
driver.quit()

df_missing_place

  driver = webdriver.Chrome("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver")


Unnamed: 0,City,State,Country,BNA Score
0,Jackson,MS,US,12
1,Jackson,TN,US,16
2,Jackson,WY,US,79
3,Jackson,MI,US,37
4,Jacksonville,FL,US,17
5,Jacksonville Beach,FL,US,13
0,Midland,TX,US,9
1,Midland,MI,US,48
0,Westfield,MA,US,15
1,Westfield,IN,US,31


In [11]:
# Clean final dataframe
df_final = pd.concat([df_final, df_missing_place])
df_final = df_final.drop_duplicates()
df_final = df_final.sort_values(by=['Country', 'City'])
df_final = df_final.reset_index(drop=True)

# Show data
df_final

Unnamed: 0,City,State,Country,BNA Score
0,Adelaide,SA,AU,27
1,Adelaide LGA,SA,AU,43
2,Albury-Wodonga,NSW,AU,37
3,Alice Springs,NT,AU,50
4,Ballarat,VIC,AU,28
...,...,...,...,...
2009,Ypsilanti,MI,US,26
2010,Yuba City,CA,US,25
2011,Yuma,AZ,US,17
2012,Zeeland,MI,US,52


In [15]:
# Assuming 'data' is a subdirectory in your current working directory
folder_path = 'data/'
file_name = 'BNA_score.csv'

# Combine the folder path and file name to create the full file path
full_file_path = folder_path + file_name

# Export dataframe to csv file
df_final.to_csv(full_file_path, index=True)