In [75]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import asyncio
from playwright.async_api import async_playwright

### Plan of action:
1. Go to main search page
2. Click on see all link
3. Get table contents from first page
4. Navigate to url with next search results
5. Get table contents from that page
6. Repeat until end

Pages have 50 results each, so page numbers for URL are
51
101
151
...
1901

### Code required:
1. Initial setup to get to first table
2. Current page counter
3. Array to push all results into
4. Function to grab table info from each table
5. Function to navigate to next page
6. While loop for current page counter <= 1901

#### Start up playwright and get to the first page

In [56]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [57]:
firstUrl = "https://cumulis.epa.gov/supercpad/Cursites/srchsites.cfm"
await page.goto(firstUrl)

<Response url='https://cumulis.epa.gov/supercpad/Cursites/srchsites.cfm' request=<Request url='https://cumulis.epa.gov/supercpad/Cursites/srchsites.cfm' method='GET'>>

In [58]:
seeAllLink =  await page.query_selector("#nplAll")
print(seeAllLink)

JSHandle@node


In [59]:
await seeAllLink.click()

#### Save all table text here:

In [60]:
tableText = []
testTableText = []
testFirstTable = []

#### Functions for grabbing table info and going to next page

In [61]:
async def get_table_text():
    table = await page.query_selector("#tablesorter")
    tableBody = await table.query_selector("tbody")
    rows = await tableBody.query_selector_all("tr")
    for row in rows:
        cells = await row.query_selector_all("td")
        rowInfo = []
        for cell in cells:
            info = await cell.text_content()
            rowInfo.append(info)
        tableText.append(rowInfo)
        # testTableText.append(rowInfo)
        # testFirstTable.append(rowInfo)

In [62]:
state = {"test_page_counter": 1, "page_counter": 1}

In [63]:
async def go_to_next_page():
    # state["test_page_counter"] += 50  
    state["page_counter"] += 50  
    # nextUrl = f"https://cumulis.epa.gov/supercpad/Cursites/srchrslt.cfm?Start={state['test_page_counter']}&sortby=site"
    nextUrl = f"https://cumulis.epa.gov/supercpad/Cursites/srchrslt.cfm?Start={state['page_counter']}&sortby=site"
    await page.goto(nextUrl)

#### Test for the first table and navigating to the next table

In [8]:
await get_table_text()

In [9]:
print(testFirstTable)

[['NED981713837', '10TH STREET SITE', 'COLUMBUS', 'PLATTE', 'NE', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'Yes', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t '], ['KSD007241656', '29TH & MEAD GROUND WATER CONTAMINATION', 'WICHITA', 'SEDGWICK', 'KS', '\n\t\t\t\n\t\t\t\t\tDeleted NPL\n\t\t\t\t\n\t\t', 'No ', 'No ', 'No ', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t '], ['ALN000410750', '35TH AVENUE', 'BIRMINGHAM', 'JEFFERSON', 'AL', '\n\t\t\t\n\t\t\t\t\tProposed NPL\n\t\t\t\t\n\t\t', 'No ', 'No ', 'No ', '\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t', '\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t '], ['KSD981710247', '57TH AND NORTH BROADWAY STREETS SITE', 'WICHITA', 'SEDGWICK', 'KS', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'No ', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tNo\n\t\t\n\t '], ['MDD980918387', '68TH STREET DUMP/INDUSTRIAL ENTERPRISES', 'ROSEDALE', 'BALTIMORE', 'MD', '\n\t\t\t\n\t\t\t\t\tPropo

In [36]:
# Test
# await go_to_next_page()

#### Test loop to get pages 1, 2, 3
(navigate to page 1 happens above)
1. call get_table_text to capture 1st page
2. call go_to_next_page for 2nd page --> gets us up to 51
3. call get_table_text to capture 2nd page
4. call go_to_next_page for 3rd page --> gets us up to 101
5. call get_table_text to capture 3rd page

So while loop should be for < 101

In [48]:
# get first table text
await get_table_text()
# enter into loop
while state["test_page_counter"] < 101:
    await go_to_next_page()
    await get_table_text()
    print('just got table text after navigating to:', state["test_page_counter"])

just got table text after navigating to: 51
just got table text after navigating to: 101


In [49]:
print(len(testTableText))

150


In [50]:
print(testTableText)

[['NED981713837', '10TH STREET SITE', 'COLUMBUS', 'PLATTE', 'NE', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'Yes', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t '], ['KSD007241656', '29TH & MEAD GROUND WATER CONTAMINATION', 'WICHITA', 'SEDGWICK', 'KS', '\n\t\t\t\n\t\t\t\t\tDeleted NPL\n\t\t\t\t\n\t\t', 'No ', 'No ', 'No ', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t '], ['ALN000410750', '35TH AVENUE', 'BIRMINGHAM', 'JEFFERSON', 'AL', '\n\t\t\t\n\t\t\t\t\tProposed NPL\n\t\t\t\t\n\t\t', 'No ', 'No ', 'No ', '\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t', '\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t '], ['KSD981710247', '57TH AND NORTH BROADWAY STREETS SITE', 'WICHITA', 'SEDGWICK', 'KS', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'No ', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tNo\n\t\t\n\t '], ['MDD980918387', '68TH STREET DUMP/INDUSTRIAL ENTERPRISES', 'ROSEDALE', 'BALTIMORE', 'MD', '\n\t\t\t\n\t\t\t\t\tPropo

### Run to get all tables
1. we have already navigated to first table
2. get first table info
3. start while loop
4. 1904 total sites so will go until < 1901

In [65]:
# get first table text
await get_table_text()
# enter into loop
while state["page_counter"] < 1901:
    await go_to_next_page()
    await get_table_text()
    print('just got table text after navigating to:', state["page_counter"])

just got table text after navigating to: 51
just got table text after navigating to: 101
just got table text after navigating to: 151
just got table text after navigating to: 201
just got table text after navigating to: 251
just got table text after navigating to: 301
just got table text after navigating to: 351
just got table text after navigating to: 401
just got table text after navigating to: 451
just got table text after navigating to: 501
just got table text after navigating to: 551
just got table text after navigating to: 601
just got table text after navigating to: 651
just got table text after navigating to: 701
just got table text after navigating to: 751
just got table text after navigating to: 801
just got table text after navigating to: 851
just got table text after navigating to: 901
just got table text after navigating to: 951
just got table text after navigating to: 1001
just got table text after navigating to: 1051
just got table text after navigating to: 1101
just got

In [66]:
print(len(tableText))

1904


In [67]:
print(tableText[-1])

['NJD986643153', 'ZSCHIEGNER REFINING', 'HOWELL TOWNSHIP', 'MONMOUTH', 'NJ', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'Yes', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t ']


In [68]:
print(tableText[0])

['NED981713837', '10TH STREET SITE', 'COLUMBUS', 'PLATTE', 'NE', '\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t', 'No ', 'Yes', 'Yes', '\n\t\t\n\t\t\tYes\n\t\t\n\t', '\n\t\t\n\t\t\tYes\n\t\t\n\t ']


In [69]:
columns = ["EPA ID", "Site Name", "City", "Country", "State", "National Priorities List Status", "Superfund Alternative Approach", "Construction Complete", "Site-wide Ready for Anticipated Use", "Human Exposure Under Control", "Groundwater Migration Under Control"]

In [70]:
len(columns)

11

In [71]:
print(len(tableText[-1]))

11


In [76]:
df = pd.DataFrame(tableText, columns=columns)

In [77]:
print(df)

            EPA ID                                Site Name             City  \
0     NED981713837                         10TH STREET SITE         COLUMBUS   
1     KSD007241656   29TH & MEAD GROUND WATER CONTAMINATION          WICHITA   
2     ALN000410750                              35TH AVENUE       BIRMINGHAM   
3     KSD981710247     57TH AND NORTH BROADWAY STREETS SITE          WICHITA   
4     MDD980918387  68TH STREET DUMP/INDUSTRIAL ENTERPRISES         ROSEDALE   
...            ...                                      ...              ...   
1899  NYD000511733                             YORK OIL CO.            MOIRA   
1900  AZ0971590062            YUMA MARINE CORPS AIR STATION             YUMA   
1901  OHD980794598                    ZANESVILLE WELL FIELD       ZANESVILLE   
1902  FLD049985302      ZELLWOOD GROUND WATER CONTAMINATION         ZELLWOOD   
1903  NJD986643153                      ZSCHIEGNER REFINING  HOWELL TOWNSHIP   

        Country State                  

In [78]:
df.head(10)

Unnamed: 0,EPA ID,Site Name,City,Country,State,National Priorities List Status,Superfund Alternative Approach,Construction Complete,Site-wide Ready for Anticipated Use,Human Exposure Under Control,Groundwater Migration Under Control
0,NED981713837,10TH STREET SITE,COLUMBUS,PLATTE,NE,\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t,No,Yes,Yes,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
1,KSD007241656,29TH & MEAD GROUND WATER CONTAMINATION,WICHITA,SEDGWICK,KS,\n\t\t\t\n\t\t\t\t\tDeleted NPL\n\t\t\t\t\n\t\t,No,No,No,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
2,ALN000410750,35TH AVENUE,BIRMINGHAM,JEFFERSON,AL,\n\t\t\t\n\t\t\t\t\tProposed NPL\n\t\t\t\t\n\t\t,No,No,No,\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t,\n\t\t\n\t\t\t\tStatus Unavailable\n\t\t\t\n\t
3,KSD981710247,57TH AND NORTH BROADWAY STREETS SITE,WICHITA,SEDGWICK,KS,\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t,No,Yes,No,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tNo\n\t\t\n\t
4,MDD980918387,68TH STREET DUMP/INDUSTRIAL ENTERPRISES,ROSEDALE,BALTIMORE,MD,\n\t\t\t\n\t\t\t\t\tProposed NPL\n\t\t\t\t\n\t\t,Yes,No,No,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
5,UTD981548985,700 SOUTH 1600 EAST PCE PLUME,SALT LAKE CITY,SALT LAKE,UT,\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t,No,No,No,\n\t\t\n\t\t\tNo\n\t\t\n\t,\n\t\t\n\t\t\tInsufficient Data\n\t\t\n\t
6,ILD980397079,"A & F MATERIAL RECLAIMING, INC.",GREENUP,CUMBERLAND,IL,\n\t\t\t\n\t\t\t\t\tDeleted NPL\n\t\t\t\t\n\t\t,No,Yes,Yes,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
7,NJD030253355,A. O. POLYMER,SPARTA TOWNSHIP,SUSSEX,NJ,\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t,No,Yes,Yes,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
8,PAD004351003,A.I.W. FRANK/MID-COUNTY MUSTANG,EXTON,CHESTER,PA,\n\t\t\t\n\t\t\t\t\tFinal NPL\n\t\t\t\t\n\t\t,No,Yes,Yes,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t
9,KYD980500961,A.L. TAYLOR (VALLEY OF DRUMS),BROOKS,BULLITT,KY,\n\t\t\t\n\t\t\t\t\tDeleted NPL\n\t\t\t\t\n\t\t,No,Yes,No,\n\t\t\n\t\t\tYes\n\t\t\n\t,\n\t\t\n\t\t\tYes\n\t\t\n\t


In [79]:
df.to_csv('scraped_data_uncleaned.csv', index=False)
print("DataFrame exported to 'scraped_data_uncleaned.csv'")

DataFrame exported to 'scraped_data_uncleaned.csv'


In [80]:
await browser.close()
playwright.stop()

<coroutine object PlaywrightContextManager.__aexit__ at 0x1263c6030>