In [None]:
# setup for dynamic scraping
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Launch Chrome through an explicit chromedriver binary and open the dashboard.
# The binary location can be overridden with the CHROMEDRIVER_PATH environment
# variable so the notebook is not tied to a single machine; the original local
# path is kept as the fallback so existing setups keep working unchanged.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/lixuewei/Desktop/MACS30112/4/chromedriver-mac-arm64/chromedriver',
)
cService = webdriver.ChromeService(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=cService)
driver.get("https://www.arcgis.com/apps/dashboards/7846c3c37dff4728923609a9f55f849c")
wait = WebDriverWait(driver, 5)  # explicit-wait helper with a 5-second timeout

In [None]:
# Zoom out
# driver.get() returns before the dashboard has finished rendering its widgets,
# so block (up to the timeout configured on `wait`) until the zoom-out control
# is clickable instead of looking it up immediately and risking
# NoSuchElementException on a fresh Run All.
zoom_out_buttom = wait.until(
    EC.element_to_be_clickable((
        By.XPATH,
        '/html/body/div/calcite-shell/div[2]/div[2]/div/div/div/margin-container/full-container/div[5]/margin-container/full-container/dashboard-tab-zone/section/div/div/div/div[3]/div/div[4]/div/div[2]',
    ))
)
# Click the control nine times, pausing half a second between clicks.
# NOTE(review): the pause presumably lets the map redraw between zoom steps — confirm.
for _ in range(9):
    zoom_out_buttom.click()
    time.sleep(0.5)

In [None]:
def scrape_and_collect_data(driver, num_pages):
    """
    Scrape the text of every <td> cell over multiple pages of a paginated view.

    For each page the function first waits until at least one <td> element is
    present, then parses the rendered page source with BeautifulSoup, records
    the text of every <td>, and finally clicks the "next page" control.

    Args:
        driver: A Selenium webdriver instance with the target page already open.
        num_pages: The maximum number of pages to scrape.

    Returns:
        A list of lists, where each inner list holds the text of all <td>
        elements found on one page, in document order.

    Raises:
        selenium.common.exceptions.TimeoutException: If no <td> element shows
            up within 10 seconds on a page.
    """
    next_page_xpath = '/html/body/div/calcite-shell/div[2]/div[2]/div/div/div/margin-container/full-container/div[6]/margin-container/full-container/dashboard-tab-zone/section/div/div[3]/calcite-action'

    # One waiter is enough for the whole run; it does not change between pages.
    wait = WebDriverWait(driver, 10)
    data_all_pages = []

    for _ in range(num_pages):
        # Wait BEFORE taking the snapshot: grabbing page_source first could
        # capture the DOM before the table cells were rendered.
        wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'td')))

        # Parse the now-populated DOM and keep the text of each table cell.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        data_all_pages.append([td.text for td in soup.find_all("td")])

        # Click the next page button (if it exists).
        # NOTE(review): nothing here waits for the next page's content to
        # replace the current one, so a slow update could be captured twice —
        # confirm against the collected data.
        try:
            driver.find_element(By.XPATH, next_page_xpath).click()
        except Exception:  # best effort: treat any failure as "no further pages"
            print(f"Page navigation may have finished. Collected data from {len(data_all_pages)} pages.")
            break

    return data_all_pages

In [None]:
# Scrape up to 348 pages, then always shut the browser down — even when scraping
# raises — so a failed run does not leave a Chrome/chromedriver process behind.
# NOTE(review): 348 is presumably the number of records shown in the dashboard — confirm.
try:
    LA_county_poverty = scrape_and_collect_data(driver, num_pages=348)
finally:
    driver.quit()

In [None]:
# Select Communities in Los Angeles City
# Every scraped page is a flat list of cell texts. Keep only the pages whose
# cell at position 4 equals 'Los Angeles City', and turn each into a record
# built from positions 0, 3 and 4.
LA_city_poverty = [
    {
        "community": cells[0],
        "2020poverty_rate": cells[3],
        "city": cells[4],
    }
    for cells in LA_county_poverty
    if cells[4] == 'Los Angeles City'
]

In [None]:
# Create the DataFrame
# One row per selected record; written to disk without the index column.
df = pd.DataFrame(data=LA_city_poverty)
df.to_csv(
    'LA_PovertyRate.csv',
    index=False,
)

In [None]:
# Combine the poverty rate of communities in the three cities
def convert_percent_to_float(value):
    """
    Convert a percentage such as '12.5%' into the float 12.5.

    Args:
        value: A string with optional surrounding whitespace and '%' signs, or
            an already-numeric value (for example the NaN that pandas uses for
            an empty CSV cell).

    Returns:
        The numeric value as a float; NaN input is returned as NaN.

    Raises:
        ValueError: If a string does not contain a parseable number.
    """
    if isinstance(value, str):
        # Trim whitespace first so a trailing space cannot shield the '%'.
        return float(value.strip().strip('%'))
    # Non-string cells (ints, floats, NaN) have no .strip(); convert directly.
    return float(value)

# New York
# Load the NYC table and turn the percentage strings into floats.
nyc = pd.read_csv('nyc.csv')
nyc['poverty.rate'] = nyc['poverty.rate'].apply(convert_percent_to_float)

# Keep rows with a poverty rate of at least 20 and the columns at positions
# 0, 1 and 4. The explicit .copy() makes the result an independent frame, so
# adding the 'city' column below does not trigger SettingWithCopyWarning.
# NOTE(review): assumes positions 0, 1, 4 hold borough, community and poverty
# rate in nyc.csv — confirm against the file.
nyc_poverty = nyc.loc[nyc['poverty.rate'] >= 20].iloc[:, [0, 1, 4]].copy()
nyc_poverty.columns = ['borough', 'community', 'poverty.rate']
nyc_poverty['city'] = 'New York City'
nyc_poverty.to_csv('nyc_poverty.csv', index=False)

In [None]:
# Chicago
# Load the Chicago table and turn the percentage strings into floats.
chi = pd.read_csv('chi.csv')
chi['Poverty rate'] = chi['Poverty rate'].apply(convert_percent_to_float)

# Keep rows with a poverty rate of at least 20. The explicit .copy() makes the
# result an independent frame, so relabelling it and adding the 'city' column
# does not trigger SettingWithCopyWarning.
# NOTE(review): the two-name column assignment assumes chi.csv has exactly two
# columns (community, poverty rate) — confirm against the file.
chi_poverty = chi.loc[chi['Poverty rate'] >= 20].copy()
chi_poverty.columns = ['community', 'poverty.rate']
chi_poverty['city'] = 'Chicago'
chi_poverty.to_csv('chi_poverty.csv', index=False)

In [None]:
# LA
la = pd.read_csv('la.csv')

# Build the filtered frame in one chain so every step returns a new object;
# this avoids assigning into (and renaming in place) a filtered slice of `la`,
# which raised SettingWithCopyWarning.
# NOTE(review): str[14:] drops the first 14 characters of every community name,
# presumably a fixed prefix such as 'Los Angeles - ' — confirm all names carry it.
la_poverty = (
    la.loc[la['2020poverty_rate'] >= 20]
    .assign(community=lambda frame: frame['community'].str[14:])
    .rename(columns={'2020poverty_rate': 'poverty.rate'})
)
la_poverty.to_csv('la_poverty.csv', index=False)

In [None]:
# qualified communities in three cities
# Stack the three per-city frames row-wise (columns are aligned by name) and
# write the combined table without the index column.
all_poverty = pd.concat(
    [
        nyc_poverty,
        chi_poverty,
        la_poverty,
    ],
    axis=0,
)
all_poverty.to_csv(
    'all_poverty.csv',
    index=False,
)