In [1]:
# setup for dynamic scraping
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# driver = webdriver.Chrome('chromedriver')
cService = webdriver.ChromeService(executable_path='/Users/lixuewei/Desktop/MACS30112/4/chromedriver-mac-arm64/chromedriver')
driver = webdriver.Chrome(service=cService)
driver.get("https://www.arcgis.com/apps/dashboards/7846c3c37dff4728923609a9f55f849c")
wait = WebDriverWait(driver, 5) # explicit wait for a time period   

In [None]:
# Zoom out
zoom_out_buttom = driver.find_element(By.XPATH, '/html/body/div/calcite-shell/div[2]/div[2]/div/div/div/margin-container/full-container/div[5]/margin-container/full-container/dashboard-tab-zone/section/div/div/div/div[3]/div/div[4]/div/div[2]')
for _ in range(9):
    zoom_out_buttom.click()
    time.sleep(0.5)

In [None]:
def scrape_and_collect_data(driver, num_pages):
    """
    Scrapes data from a table over multiple pages and returns a list of lists.

    Args:
        driver: A Selenium webdriver instance.
        num_pages: The number of pages to scrape.

    Returns:
        A list of lists, where each inner list represents the poverty data from one page.
    """    
    data_all_pages = []

    for _ in range(num_pages):
        # Get HTML source code
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Wait for the table data to be present
        wait = WebDriverWait(driver, 10)
        poverty_tds = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'td'))) 

        # Extract poverty data 
        poverty_tds = soup.findAll("td")
        poverty_values = [td.text for td in poverty_tds]

        # Store data for the current page
        data_all_pages.append(poverty_values)

        # Click the next page button (if it exists)
        try:
            next_page_button = driver.find_element(By.XPATH, '/html/body/div/calcite-shell/div[2]/div[2]/div/div/div/margin-container/full-container/div[6]/margin-container/full-container/dashboard-tab-zone/section/div/div[3]/calcite-action')
            next_page_button.click()

        except Exception:  # Handle potential errors if the button isn't found
            print(f"Page navigation may have finished. Collected data from {len(data_all_pages)} pages.")
            break

    return data_all_pages

In [None]:
LA_county_poverty = scrape_and_collect_data(driver, num_pages=348)
driver.quit() 
print(LA_county_poverty)

In [None]:
# Filter Non-LA Communities
LA_city_poverty = []
for row in LA_county_poverty:
    if row[4] == 'Los Angeles City':
        LA_city_poverty.append({
            "community": row[0],
            "2020poverty_rate": row[3],
            "city": row[4]
        })

In [None]:
# Create the DataFrame
df = pd.DataFrame(LA_city_poverty)
df.to_csv('LA_PovertyRate.csv', index = False)

In [15]:
# Combine the poverty rate of communities in the three cities
def convert_percent_to_float(value):
    return float(value.strip('%'))

# New York
nyc = pd.read_csv('nyc.csv')
nyc['poverty_rate'] = nyc['poverty_rate'].apply(convert_percent_to_float)
nyc_poverty = nyc[nyc['poverty_rate'] >= 20]
nyc_poverty = nyc_poverty.iloc[:, [0, 4]]
nyc_poverty.columns = ['community', 'poverty.rate']
nyc_poverty['city'] = 'New York City'
nyc_poverty.to_csv('nyc_poverty.csv', index=False)

In [36]:
print(nyc_poverty)
len(nyc_poverty)

   community  poverty.rate           city
1         BX          26.4  New York City
6      BX 01          36.6  New York City
7      BX 02          36.6  New York City
8      BX 03          35.8  New York City
9      BX 04          28.3  New York City
10     BX 05          33.7  New York City
11     BX 06          35.8  New York City
12     BX 07          27.1  New York City
14     BX 09          26.7  New York City
20     BK 03          24.2  New York City
21     BK 04          24.2  New York City
24     BK 07          22.9  New York City
26     BK 09          22.4  New York City
28     BK 11          23.1  New York City
29     BK 12          23.8  New York City
30     BK 13          24.1  New York City
33     BK 16          28.8  New York City
38     MN 03          24.2  New York City
44     MN 09          25.3  New York City
45     MN 10          28.4  New York City
46     MN 11          31.2  New York City
47     MN 12          21.0  New York City
61     QN 14          23.8  New Yo

23

In [27]:
# Chicago
chi = pd.read_csv('chi.csv')
chi['Poverty rate'] = chi['Poverty rate'].apply(convert_percent_to_float)
chi_poverty = chi[chi['Poverty rate'] >= 20]
chi_poverty.columns = ['community', 'poverty.rate']
chi_poverty['city'] = 'Chicago'
chi_poverty.to_csv('chi_poverty.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chi_poverty['city'] = 'Chicago'


In [35]:
print(chi_poverty)
len(chi_poverty)

                 community  poverty.rate     city
0                Riverdale          51.0  Chicago
1              Fuller Park          48.8  Chicago
2          Washington Park          46.9  Chicago
3       East Garfield Park          45.5  Chicago
4                Englewood          40.0  Chicago
5           North Lawndale          36.7  Chicago
6   Greater Grand Crossing          34.5  Chicago
7                  Oakland          33.6  Chicago
8            South Deering          32.4  Chicago
9           West Englewood          32.0  Chicago
10      West Garfield Park          31.6  Chicago
11             South Shore          31.2  Chicago
12                 Douglas          30.6  Chicago
13           South Chicago          30.3  Chicago
14                New City          29.5  Chicago
15                Woodlawn          29.0  Chicago
16           Armour Square          26.2  Chicago
17         Grand Boulevard          25.5  Chicago
18          Auburn Gresham          25.3  Chicago


31

In [26]:
# LA
la = pd.read_csv('la.csv')
la_poverty = la[la['2020poverty_rate'] >= 20]
la_poverty['community'] = la['community'].str[14:]
la_poverty.rename(columns={'2020poverty_rate': 'poverty.rate'}, inplace=True)
la_poverty.to_csv('la_poverty.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  la_poverty['community'] = la['community'].str[14:]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  la_poverty.rename(columns={'2020poverty_rate': 'poverty.rate'}, inplace=True)


In [34]:
print(la_poverty)
len(la_poverty)

                community  poverty.rate              city
3           Baldwin Hills         21.92  Los Angeles City
7           Boyle Heights         23.97  Los Angeles City
13                Central         25.79  Los Angeles City
15     Century Palms/Cove         25.88  Los Angeles City
18              Chinatown         27.61  Los Angeles City
24               Downtown         25.99  Los Angeles City
26         East Hollywood         20.42  Los Angeles City
29           Elysian Park         20.35  Los Angeles City
33        Exposition Park         25.93  Los Angeles City
35   Figueroa Park Square         26.46  Los Angeles City
36     Florence-Firestone         26.39  Los Angeles City
40          Green Meadows         24.68  Los Angeles City
46           Harvard Park         24.00  Los Angeles City
51              Hyde Park         20.93  Los Angeles City
52         Jefferson Park         23.87  Los Angeles City
53              Koreatown         24.26  Los Angeles City
59         Lit

37

In [33]:
# qualified communities in three cities
all_poverty = pd.concat([nyc_poverty, chi_poverty, la_poverty], axis=0)
all_poverty.to_csv('all_poverty.csv', index=False)
#len(all_poverty)

91