In [1]:
## import libraries
import requests  # Makes HTTP requests to fetch web pages from URLs
from bs4 import BeautifulSoup  # Parses HTML content into navigable Python objects for web scraping
import pandas as pd  # Creates and manipulates DataFrames for organizing scraped data into tables
import time  # Adds delays between requests to avoid overwhelming the server
from random import uniform  # Generates random time intervals to make scraping delays less predictable

In [2]:
## create headers

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

In [3]:
## Request web content

##scrape url website
url = "https://comptroller.nyc.gov/reports/?fwp_agency=health-and-hospitals-corporation"
response = requests.get(url, headers=headers)

In [4]:
## did it work?
response.status_code


200

In [5]:
## convert response.text into a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
soup

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<title>
  Reports - Office of the New York City Comptroller
Brad Lander</title>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="website" property="og:type">
<meta content="@NYCComptroller" name="twitter:site">
<meta content="Office of the New York City Comptroller Brad Lander" name="twitter:title"/>
<meta content="Comptroller Brad Lander is New York City’s Chief Financial Officer. An independently elected official, he safeguards the City’s fiscal health, roots out waste, fraud and abuse in local government, and ensures that municipal agencies serve the needs of all New Yorkers." name="twitter:description"/>
<meta content="Comptroller Brad Lander is New York City’s Chief Financial Officer. An independently elected official, he safeguards the City’s fiscal health, roots out waste, fraud and abuse in local government, and ensures that municipal agencies serve the needs of all New Yor

## Code for scraping report date and urls to webages with additional info on each report

In [7]:
all_dfs = []  ## hold all dfs (dataframes) that will be created for each page
base_url = "https://comptroller.nyc.gov/reports/?fwp_agency=health-and-hospitals-corporation"  ## base URL of the site to scrape
end_url = "&fwp_paged="
date_list = []
links_list = []

for page in range(1,3): 
    try:
        if page == 1:
            response = requests.get(f"{base_url}")
        else:
            response = requests.get(f"{base_url}{end_url}{page}")
    except:
        print(f"Problem with {base_url}{end_url}{page}")
    finally:
        soup = BeautifulSoup(response.text, "html.parser")

    ## find all report dates on the page (all dates are inside a class 'text-decoration-none row')
    all_targets = soup.find_all("a", class_="text-decoration-none row")

    ## extract report dates from each 'text-decoration' class
    ## used 'extend' since it won't let me use 'append'
    date_list.extend([target.find("small", class_="text-muted d-block").get_text() for target in all_targets])
    ## extract links of reports
    ## used 'extend' since it won't let me use 'append'
    links_list.extend([target.get("href") for target in all_targets])
            
    ## pause between page requests to avoid overwhelming the server (random delay between 30–40 seconds)
    snoozer = uniform(30,40)
    print(f"Created DF from page {page} and snoozing for {snoozer} seconds before next page")
    time.sleep(snoozer)  ## actually wait the random time before continuing
print(f"Done scraping all {page} pages")  ## confirm completion once all pages are processed

## combine lists and convert to a DataFrame
all_dfs = pd.DataFrame({"Report Date": date_list, "Report Link": links_list})


Created DF from page 1 and snoozing for 34.33968532322574 seconds before next page
Created DF from page 2 and snoozing for 39.25549092243375 seconds before next page
Done scraping all 2 pages


In [8]:
## check to make sure dataframe holds all requested information
all_dfs

Unnamed: 0,Report Date,Report Link
0,"Jun 6, 2023",https://comptroller.nyc.gov/reports/letter-to-...
1,"Jun 29, 2022",https://comptroller.nyc.gov/reports/audit-repo...
2,"Jul 17, 2020",https://comptroller.nyc.gov/reports/review-of-...
3,"Jun 23, 2020",https://comptroller.nyc.gov/reports/audit-repo...
4,"Jan 31, 2018",https://comptroller.nyc.gov/reports/audit-repo...
5,"Sep 18, 2014",https://comptroller.nyc.gov/reports/audit-repo...
6,"May 4, 2011",https://comptroller.nyc.gov/reports/audit-repo...
7,"Oct 13, 2010",https://comptroller.nyc.gov/reports/audit-repo...
8,"Jul 8, 2010",https://comptroller.nyc.gov/reports/audit-repo...
9,"Nov 24, 2009",https://comptroller.nyc.gov/reports/audit-repo...


In [9]:
## in preparation for scraping PDF urls and downloading PDFS, check to make sure links_list holds all report urls
links_list

['https://comptroller.nyc.gov/reports/letter-to-dr-mitchell-katz-regarding-hh-spending-on-agency-and-travel-nurses/',
 'https://comptroller.nyc.gov/reports/audit-report-on-the-new-york-city-health-hospitals-controls-over-inventory-of-medical-surgical-supplies-including-personal-protective-equipment-at-its-post-acute-and-long-term-acute-care-fa/',
 'https://comptroller.nyc.gov/reports/review-of-health-and-hospitals-corporations-response-to-covid-19/',
 'https://comptroller.nyc.gov/reports/audit-report-on-the-financial-and-operating-practices-of-the-children-of-bellevue-inc-2/',
 'https://comptroller.nyc.gov/reports/audit-report-on-the-epic-electronic-medical-record-system-that-nyc-health-hospitals-implemented-at-the-elmhurst-hospital-center/',
 'https://comptroller.nyc.gov/reports/audit-report-on-the-evaluation-of-the-efforts-to-manage-emergency-department-wait-times-by-kings-county-lincoln-and-elmhurst-hospitals/',
 'https://comptroller.nyc.gov/reports/audit-report-on-the-health-and-ho

In [12]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [14]:
## import in preparation for downloading PDFs
import wget

## Code for scraping PDF urls and downloading PDFs

In [18]:
# Loop through each link in links_list, keeping track of its index (starting at 1)
for i, link in enumerate(links_list, start = 1):
    # Print progress message showing which link is being scraped out of the total
    print(f"Scraping {i} of {len(links_list)}")
    
    try:
        # Send a GET request to the current link and store the response
        response = requests.get(link)
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        # Find the first <div> element on the page with class "sidebarPanel"
        target_panel = soup.find("div", class_="sidebarPanel")
        # Within that div, find the first <a> tag and extract its "href" attribute (the URL)
        target_url = target_panel.find("a").get("href")
        # Download the file located at the extracted URL using wget
        wget.download(target_url)
        
    except:
        # If any error occurs during the try block, print which link had a problem
        print(f"Problem with {link}")
        # Add the problematic link to the list of broken links for later review
        broken_list.append(link)
    
    finally:
        # Generate a random float between 30 and 40 seconds to use as a sleep duration
        snoozer = uniform(30,40)
        # Print how long the script will pause before moving to the next link
        print(f"Snoozing for {snoozer} seconds")
        # Pause the script for the randomly chosen number of seconds
        time.sleep(snoozer)

# After all links have been processed, print a completion message
print(f"Completed")


Scraping 1 of 20
100% [........................................................] 162746 / 162746Snoozing for 35.77659467925376 seconds
Scraping 2 of 20
100% [........................................................] 670683 / 670683Snoozing for 34.1268842544064 seconds
Scraping 3 of 20
100% [........................................................] 295608 / 295608Snoozing for 38.63378685640633 seconds
Scraping 4 of 20
100% [......................................................] 1007472 / 1007472Snoozing for 33.93237512473867 seconds
Scraping 5 of 20
100% [......................................................] 1059639 / 1059639Snoozing for 38.6621278216528 seconds
Scraping 6 of 20
100% [......................................................] 1222419 / 1222419Snoozing for 36.98374376535527 seconds
Scraping 7 of 20
100% [........................................................] 885050 / 885050Snoozing for 36.88809063600674 seconds
Scraping 8 of 20
100% [..................................