In [1]:
import os                                            # For file and directory operations
from selenium import webdriver                       # Core Selenium WebDriver interface
from selenium.webdriver.common.by import By          # “By” lets us select elements by CSS, XPath, etc.
from selenium.webdriver.chrome.service import Service # Wraps ChromeDriver in a service object
from webdriver_manager.chrome import ChromeDriverManager  # Auto-installs the correct ChromeDriver
import time                                          # To pause execution (time.sleep)
import csv                                           # To read/write CSV files

In [2]:
# Start a Chrome session whose driver binary is resolved by WebDriver Manager

chromedriver_path = ChromeDriverManager().install()   # Result of the manager's install step
service = Service(chromedriver_path)                  # Service object built around that driver
driver = webdriver.Chrome(service=service)            # Browser session used by the cells below

In [3]:
# Open the document list; the query string asks for docty_exact=Program+Document,
# sorted by docdt in descending order.
PAGE_LOAD_WAIT_SECONDS = 5                            # Fixed pause that gives the page time to render

url = "https://documents.worldbank.org/en/publication/documents-reports/documentlist?docty_exact=Program+Document&srt=docdt&order=desc"
driver.get(url)                                       # Load the listing in the browser session
time.sleep(PAGE_LOAD_WAIT_SECONDS)                    # Nothing is polled here; we simply wait


In [4]:
# Base folder under which the "Documents" output directory and the CSV are written.
# The default is the original author's local Dropbox folder; set the
# WB_PRIORACTIONS_DIR environment variable to run the notebook on another machine.
# `or` (rather than a .get default) also falls back when the variable is set but empty.
path1 = (
    os.environ.get("WB_PRIORACTIONS_DIR")
    or '/Users/pastudilloe/Library/CloudStorage/Dropbox/01 CONSULTING/WB_PriorActions_Poverty'
)

In [5]:
# Scrape through all pages, gathering one metadata dict per document into `all_data`.
#
# Differences from a naive "scrape, click next, sleep" loop:
#   * records are de-duplicated, so a page that has not finished changing after
#     the fixed sleep does not add the same documents twice;
#   * the loop stops after several consecutive pages that contribute nothing new,
#     so it cannot spin forever if the "next" arrow never becomes disabled;
#   * the end of pagination is detected by an empty find_elements() result rather
#     than by an exception, so it is reported separately from a real click error.

all_data = []                                         # One dict per unique document

# Label text shown in a listing's info section -> key it fills in the record.
# Order matters: the first label found in a span wins.
INFO_LABELS = (
    ("Document Type:", "Document Type"),
    ("Report No.:", "Report No."),
    ("Document Date:", "Document Date"),
    ("Disclosure Status:", "Disclosure Status"),
    ("Author:", "Author"),
)

# NOTE(review): these selectors rely on generated-looking class names
# ("ng-tns-c0-0", "ng-star-inserted"); confirm they still match the live page.
LISTING_SELECTOR = "div.search-listing.ng-tns-c0-0.ng-star-inserted"
NEXT_PAGE_XPATH = (
    "//ul[contains(@class, 'pagination')]"
    "//li[not(contains(@class, 'disabled'))]"
    "//a[.//i[contains(@class, 'fa-angle-right')]]"
)

# Stop after this many consecutive pages that add no new document.
MAX_PAGES_WITHOUT_NEW_DATA = 3


def _scrape_listing(listing):
    """Extract the metadata of one document card.

    Args:
        listing: the Selenium element for a single search-result card.

    Returns:
        dict with the keys "Project Name", "Link", "Document Type",
        "Report No.", "Document Date", "Disclosure Status" and "Author".
        A field that could not be read keeps its "" placeholder; extraction
        errors are printed, not raised (best-effort scraping).
    """
    record = {
        "Project Name": "",
        "Link": "",
        "Document Type": "",
        "Report No.": "",
        "Document Date": "",
        "Disclosure Status": "",
        "Author": "",
    }

    # Title text and the URL behind it
    try:
        title_elem = listing.find_element(By.CSS_SELECTOR, "h3.ng-tns-c0-0 a")
        record["Project Name"] = title_elem.text.strip()
        record["Link"] = title_elem.get_attribute("href")
    except Exception as e:
        print("Error extracting title:", e)

    # Remaining fields: each info span is expected to read "<label> <value>"
    try:
        info_elem = listing.find_element(By.CSS_SELECTOR, "div.search-listing-info")
        for span in info_elem.find_elements(By.CSS_SELECTOR, "span.info-list-item"):
            text = span.text.strip()
            for label, key in INFO_LABELS:
                if label in text:
                    record[key] = text.split(label)[-1].strip()
                    break
    except Exception as e:
        print("Error extracting document details:", e)

    return record


seen_keys = set()              # Identity of every record already stored in all_data
pages_without_new_data = 0     # Consecutive pages that contributed nothing new

while True:
    # Every document card on the current page
    listings = driver.find_elements(By.CSS_SELECTOR, LISTING_SELECTOR)

    new_on_page = 0
    for listing in listings:
        record = _scrape_listing(listing)
        # The link identifies a document; fall back to the whole record when
        # the link could not be read.
        key = record["Link"] or tuple(record.values())
        if key in seen_keys:
            continue
        seen_keys.add(key)
        all_data.append(record)
        new_on_page += 1

    if new_on_page:
        pages_without_new_data = 0
    else:
        pages_without_new_data += 1
        if pages_without_new_data >= MAX_PAGES_WITHOUT_NEW_DATA:
            print(
                f"Stopping: {pages_without_new_data} consecutive pages "
                "added no new documents."
            )
            break

    # Move to the next page, or stop when no enabled "next" arrow exists
    try:
        next_buttons = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
        if not next_buttons:
            print("No next page found; reached the last page of results.")
            break
        next_buttons[0].click()                        # First match, as find_element would return
        time.sleep(5)                                  # Allow the new page to load
    except Exception as e:
        print("Error clicking next page:", e)
        break



KeyboardInterrupt: 

In [6]:
# Prepare the folder where we'll dump our CSV

path = os.getcwd()                                   # Only used when path1 is relative:
                                                     # os.path.join drops it for an absolute path1
documents_dir = os.path.join(path, path1, "Documents")
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating in which the call could fail.
os.makedirs(documents_dir, exist_ok=True)


In [7]:
# Write everything out to a CSV, then close the browser.
# The write is wrapped in try/finally so the Chrome session is shut down even
# if the file cannot be opened or written.

csv_filename = path1 + "/Documents/world_bank_documents_urls.csv"
fieldnames = [
    "Project Name", "Link", "Document Type", "Report No.",
    "Document Date", "Disclosure Status", "Author"
]

try:
    # newline="" lets the csv module control line endings itself
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()                         # Column headers
        writer.writerows(all_data)                   # One row per scraped document
    print(f"Data saved to {csv_filename}")           # Confirmation message
finally:
    driver.quit()                                    # Close Chrome and end the session


Data saved to /Users/pastudilloe/Library/CloudStorage/Dropbox/01 CONSULTING/WB_PriorActions_Poverty/Documents/world_bank_documents_urls.csv
