In [2]:
import os                                                       # For file and directory operations
import time                                                     # To pause execution (time.sleep)
import csv                                                      # To read/write CSV files
import requests                                                 # For HTTP requests to download text files
from selenium import webdriver                                  # Core Selenium WebDriver interface
from selenium.webdriver.common.by import By                     # To locate elements by CSS, XPath, etc.
from selenium.webdriver.chrome.service import Service            # To wrap ChromeDriver in a service object
from webdriver_manager.chrome import ChromeDriverManager         # To auto-install the correct ChromeDriver
from selenium.common.exceptions import (                         # Common Selenium exceptions for robust error handling
    NoSuchWindowException,
    UnexpectedAlertPresentException,
    InvalidSessionIdException
)

In [3]:
# Define a helper to initialize the Selenium WebDriver

def initialize_driver():
    """
    Create a brand-new Chrome WebDriver session and hand it to the caller.

    Steps, in order:
      1. Ask webdriver_manager's ChromeDriverManager to install a ChromeDriver
         and keep whatever ``install()`` returns (presumably the path of the
         driver executable -- confirm against the webdriver_manager docs).
      2. Wrap that value in a Selenium ``Service`` object.
      3. Construct ``webdriver.Chrome`` with that service and return it.

    Returns:
        The ``webdriver.Chrome`` instance that was just created. The caller
        owns it and is responsible for calling ``quit()`` when finished.
    """
    chromedriver_location = ChromeDriverManager().install()
    chrome_service = Service(chromedriver_location)
    browser = webdriver.Chrome(service=chrome_service)
    return browser

# --- Initialize Selenium WebDriver once at start ---
# Module-level side effect: running this cell calls initialize_driver(), which
# launches a Chrome browser session, and binds the result to the global
# `driver`. Later cells read this global; the download loop may rebind it when
# it has to restart the browser after an invalid session.
driver = initialize_driver()


In [4]:
# Load the CSV of missing document links

# Project root folder. NOTE: this is a machine-specific absolute path; edit it
# to point at your local copy of the project before running the notebook.
path1 = '/Users/pastudilloe/Library/CloudStorage/Dropbox/01 CONSULTING/WB_PriorActions_Poverty'

# CSV listing the documents that still need to be fetched. The download loop
# reads the "Link" and "Report No." columns from every row.
csv_filename = os.path.join(path1, "Documents", "world_bank_documents_urls_missing.csv")

# newline="" is what the csv module asks for when it is handed a file object:
# it lets the reader do its own line-ending handling, so quoted fields that
# contain line breaks are parsed correctly.
# "utf-8-sig" reads plain UTF-8 unchanged but strips a leading byte-order mark
# (as written by some spreadsheet exports); with plain "utf-8" the BOM would be
# glued onto the first header name and break the row["..."] lookups later on.
with open(csv_filename, mode="r", encoding="utf-8-sig", newline="") as f:
    # DictReader yields one dict per data row, keyed by the header row.
    rows = list(csv.DictReader(f))


In [5]:
# Ensure the "Text" directory exists for saving downloads

# Folder that receives one "<Report No.>.txt" file per downloaded document.
text_dir = os.path.join(path1, "Text")

# exist_ok=True makes this a single idempotent call: it creates the folder
# (and any missing parents) when absent and is a no-op when the folder is
# already there, without the check-then-create race of a separate exists()
# test. If the path exists but is not a directory it raises FileExistsError
# here, rather than failing later when the first file is written.
os.makedirs(text_dir, exist_ok=True)



In [6]:
# Set up a log CSV to record download statuses

# Location of the status log, directly inside the project root.
log_filename = os.path.join(path1, "download_log.csv")

# Column order of the log; every row written later uses exactly these keys.
log_fields = [
    "Report No.",
    "Status",
    "Message",
    "Link",
]

# Mode "w" truncates any log left over from a previous run. The handle is kept
# open on purpose: the download loop writes to it and closes it when it ends.
log_file = open(log_filename, "w", encoding="utf-8", newline="")
log_writer = csv.DictWriter(log_file, log_fields)

# Emit the header row (as the cell's last expression, its return value is
# what the notebook displays as this cell's output).
log_writer.writeheader()



32

In [7]:
# Define a function to close pop-ups or alerts on the page

def close_popup(driver):
    """
    Best-effort removal of anything that may be covering the current page.

    Two independent steps are attempted, and neither lets an exception escape:

    1. Dismiss a pending JavaScript alert, if the driver reports one.
    2. Look up elements whose class contains ``popup-close`` or
       ``close-button``, or whose id contains ``close``, and click them in
       order until the first click succeeds.

    Args:
        driver: The Selenium WebDriver whose current page should be cleaned up.

    Returns:
        None. Progress and failures are reported with ``print`` only.
    """
    # Step 1: accessing the alert raises when none is open; that is the normal
    # case here, so the failure is deliberately ignored.
    try:
        driver.switch_to.alert.dismiss()
        print("Dismissed an unexpected alert.")
    except Exception:
        pass

    # Step 2: one XPath covering the close-button naming patterns we look for.
    close_xpath = (
        "//*[contains(@class, 'popup-close') "
        "or contains(@class, 'close-button') "
        "or contains(@id, 'close')]"
    )
    try:
        for candidate in driver.find_elements(By.XPATH, close_xpath):
            try:
                candidate.click()
            except Exception as inner_e:
                # This match could not be clicked; report it and try the next.
                print(f"Failed to click a popup close button: {inner_e}")
                continue
            # The first successful click is enough -- stop looking.
            print("Popup closed via close button.")
            break
    except Exception as e:
        print(f"No popup found or error closing popup: {e}")



In [8]:
# Iterate over each CSV row to download the corresponding TXT file

# Tunables for the scraping loop.
PAGE_LOAD_WAIT_SECONDS = 20       # Fixed pause after navigation so the detail page can finish rendering
DOWNLOAD_TIMEOUT_SECONDS = 60     # Upper bound per TXT download; without it a stalled server blocks forever


def _log_status(report_no, status, message, link):
    """Append one row to the download log and flush it to disk right away.

    Flushing per row means the log on disk stays current even if the run is
    interrupted (e.g. by a KeyboardInterrupt) before the file is closed.
    """
    log_writer.writerow({
        "Report No.": report_no,
        "Status": status,
        "Message": message,
        "Link": link
    })
    log_file.flush()


# Main loop. For every CSV row: open the document's detail page in the shared
# browser, find the link to its plain-text version, download that text over
# HTTP and save it as "<Report No.>.txt" in text_dir. Every outcome is logged.
# The whole loop sits in try/finally so the browser and the log file are
# released even when the run is interrupted or hits an unexpected error.
# NOTE: the log file is closed at the end, so re-running this cell requires
# re-running the log-setup cell first.
try:
    for row in rows:
        link = row["Link"]                                       # The document's detail page URL
        report_no = row["Report No."]                            # Unique identifier for the document

        print(f"\nProcessing Report No.: {report_no}")

        # Navigate to the document detail page, handling alerts or session issues
        try:
            driver.get(link)
        except UnexpectedAlertPresentException:
            # An alert interrupted navigation: dismiss it and retry once. If
            # the retry fails too we only report it; the TXT-link lookup below
            # will then record the row as "Link Not Found".
            try:
                driver.switch_to.alert.dismiss()
                print("Dismissed unexpected alert during page load.")
                driver.get(link)
            except Exception as inner_e:
                print(f"Error dismissing alert for Report No. {report_no}: {inner_e}")
        except (InvalidSessionIdException, NoSuchWindowException) as session_err:
            # The browser session or its window is gone: start a fresh browser
            # and retry this row once.
            print(f"Browser session lost ({type(session_err).__name__}). Reinitializing driver.")
            try:
                driver.quit()
            except Exception:
                pass                                             # Old session is already unusable; nothing to clean up
            try:
                driver = initialize_driver()
                driver.get(link)
            except Exception as restart_err:
                err_msg = f"Error loading page after driver restart: {restart_err}"
                print(f"Error for Report No. {report_no}: {err_msg}")
                _log_status(report_no, "Error", err_msg, link)
                continue
        except Exception as nav_err:
            # Any other navigation failure (bad URL, page-load timeout, ...)
            # is logged for this row only instead of aborting the whole batch.
            err_msg = f"Error loading page: {nav_err}"
            print(f"Error for Report No. {report_no}: {err_msg}")
            _log_status(report_no, "Error", err_msg, link)
            continue

        time.sleep(PAGE_LOAD_WAIT_SECONDS)                       # Wait for the page to fully load (adjust as needed)

        # Close any pop-ups that appear
        close_popup(driver)

        # Locate the link to the plain-text version of the document
        try:
            txt_link_elem = driver.find_element(
                By.XPATH,
                "//div[@class='main-detail']//ul[@class='document-link']//a[contains(@href, '/text/')]"
            )
            txt_url = txt_link_elem.get_attribute("href")
            print(f"Found TXT URL: {txt_url} for Report No.: {report_no}")
        except Exception as e:
            # Log and skip if we can't find the TXT link
            err_msg = f"Error finding TXT link: {e}"
            print(f"Error for Report No. {report_no}: {err_msg}")
            _log_status(report_no, "Link Not Found", err_msg, link)
            continue

        # Download the text file via HTTP
        try:
            # The timeout keeps one unresponsive server from hanging the run;
            # a timeout raises and is logged by the except branch below.
            response = requests.get(txt_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
            if response.status_code == 200:
                file_path = os.path.join(text_dir, f"{report_no}.txt")
                with open(file_path, mode="w", encoding="utf-8") as f:
                    f.write(response.text)
                success_msg = f"Saved to {file_path}"
                print(f"Saved text file for Report No. {report_no} to {file_path}")
                _log_status(report_no, "Downloaded", success_msg, txt_url)
            else:
                # Log non-200 HTTP responses
                fail_msg = f"HTTP status code: {response.status_code}"
                print(f"Failed to download TXT for Report No. {report_no}, {fail_msg}")
                _log_status(report_no, "Failed", fail_msg, txt_url)
        except Exception as e:
            # Log any exceptions during download (network errors, timeouts, write failures)
            err_msg = f"Error downloading TXT file: {e}"
            print(f"Error downloading TXT file for Report No. {report_no}: {err_msg}")
            _log_status(report_no, "Error", err_msg, txt_url)
finally:
    # Clean up: close browser and log file. This runs on normal completion,
    # on errors and on manual interruption alike.
    try:
        driver.quit()
    except Exception as quit_err:
        print(f"Error closing browser: {quit_err}")
    finally:
        log_file.close()
        print(f"Log generated: {log_filename}")


Processing Report No.: 87083


KeyboardInterrupt: 