In [1]:
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException

In [2]:
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Start the browser 
def setup_browser(headless=False):
    options = Options()
    options.headless = headless
    
    # path to geckodriver
    service = Service('/Users/noemilucchi/Desktop/CodeTerm2/TextMining/geckodriver')
    browser = webdriver.Firefox(service=service, options=options)
    return browser

In [3]:
def scroll_page(browser, scroll_amount=300):
    browser.execute_script(f"window.scrollBy(0, {scroll_amount});")
    time.sleep(0.5)

In [None]:
def find_president_checkbox(browser, president_name):
    try:
        # Scroll down in the page to ensure all presidents are loaded
        for _ in range(15):
            scroll_page(browser)
        
        # Wait explicitly for the president elements to be fully loaded
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "label[for^='edit-field-president-target-id']")))
        
        # FIRST ATTEMPT: Force the browser to reload or interact with the dropdown
        try:
            # Try clicking on the dropdown or a related element to activate it
            dropdown_element = browser.find_element(By.CSS_SELECTOR, "select[name='field_president_target_id'], div.form-item-field-president-target-id")
            dropdown_element.click()
            time.sleep(3)  
        except Exception as e:
            logging.warning(f"Could not click dropdown: {str(e)}")
        
        time.sleep(5)
        
        # SECOND ATTEMPT: Try using JavaScript to get the element content
        presidents = browser.find_elements(By.CSS_SELECTOR, "label[for^='edit-field-president-target-id']")
        if not presidents:
            presidents = browser.find_elements(By.CSS_SELECTOR, "div.js-form-item-field-president-target-id label")
        if not presidents:
            presidents = browser.find_elements(By.CSS_SELECTOR, "div.form-item-field-president-target-id label")
            
        logging.info(f"Found {len(presidents)} president elements")
        
        # Check if first few elements have empty text, if so try JavaScript approach
        if presidents and not presidents[0].text.strip():
            logging.warning("Text content not available through normal means, trying JavaScript")
            
            # Use JavaScript to extract text content
            president_texts = []
            for i, president in enumerate(presidents):
                try:
                    # Get text content using JavaScript
                    js_text = browser.execute_script("return arguments[0].textContent;", president)
                    president_texts.append((president, js_text.strip()))
                except Exception as e:
                    logging.error(f"Error getting text via JavaScript: {str(e)}")
            
            # Look for the president by name in the JavaScript-extracted text
            for president_element, president_text in president_texts:
                logging.info(f"JS found president option: {president_text}")
                
                if president_name == president_text:
                    logging.info(f"Found exact match for {president_name} using JavaScript")
                    return president_element
                elif president_name in president_text:
                    logging.info(f"Found partial match for {president_name} using JavaScript: {president_text}")
                    return president_element
        
        # THIRD ATTEMPT: use XPath to find by text
        try:
            # Try a direct XPath search for text containing the president's name
            first_name = president_name.split()[0]
            last_name = president_name.split()[-1]
            xpath_query = f"//label[contains(text(), '{first_name}') and contains(text(), '{last_name}')]"
            xpath_elements = browser.find_elements(By.XPATH, xpath_query)
            
            if xpath_elements:
                logging.info(f"Found match using XPath for {president_name}")
                return xpath_elements[0]
        except Exception as e:
            logging.error(f"XPath search failed: {str(e)}")
        
        # FOURTH ATTEMPT: Try to find all possible president elements by any means
        all_labels = browser.find_elements(By.TAG_NAME, "label")
        for label in all_labels:
            try:
                label_text = label.text.strip()
                if not label_text:
                    label_text = browser.execute_script("return arguments[0].textContent;", label).strip()
                
                if label_text and (president_name == label_text or president_name in label_text):
                    logging.info(f"Found match in general label search: {label_text}")
                    return label
            except Exception:
                continue
        
        # If all attempts failed, log the error
        logging.error(f"Could not find element for {president_name} after multiple attempts")
        return None
        
    except Exception as e:
        logging.error(f"Error finding president {president_name}: {str(e)}")
        return None

In [None]:
# find and select a president by name 
def select_president(browser, president_name):
    try:
        president_element = find_president_checkbox(browser, president_name)
        
        if not president_element:
            return False
            
        # Click on the president element
        browser.execute_script("arguments[0].scrollIntoView({block: 'center'});", president_element)
        time.sleep(1)
        browser.execute_script("arguments[0].click();", president_element)
        logging.info(f"Selected president: {president_name}")
        time.sleep(5)  
        return True
        
    except Exception as e:
        logging.error(f"Error selecting president {president_name}: {str(e)}")
        return False

In [None]:
#  Get all links to speeches on the current page 
def get_speech_links(browser):
    speech_elements = []
    
    try:
        # Scroll down to make sure speeches are visible
        for _ in range(20):
            scroll_page(browser)

        speech_titles = browser.find_elements(By.CSS_SELECTOR, "span.field-content a")
        
        logging.info(f"Found {len(speech_titles)} speech title elements")
        
        for title_element in speech_titles:
            try:
                title = title_element.text.strip()
                link = title_element.get_attribute("href")
                
                if title and link:
                    speech_elements.append({"title": title, "link": link})
            except Exception as e:
                logging.error(f"Error processing speech title: {str(e)}")
                
    except Exception as e:
        logging.error(f"Error finding speech elements: {str(e)}")
    
    logging.info(f"Collected {len(speech_elements)} speech links")
    return speech_elements

In [None]:
# Click the specific button to see the transcript when the text is not immediately available (because of video and audio)
def click_view_transcript_button(browser):
    try:
        view_button = WebDriverWait(browser, 3).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a.expandable-text-trigger[href='#dp-expandable-text']"))
        )
        browser.execute_script("arguments[0].scrollIntoView({block: 'center'});", view_button)
        time.sleep(1)
        browser.execute_script("arguments[0].click();", view_button)
        logging.info("Clicked 'View Transcript' button")
        time.sleep(2)
        return True
    except:
        return False

In [8]:
def scrape_speech_content(browser, speech_link):
    try:
        browser.get(speech_link)
        time.sleep(3)
        
        # Try to click the View Transcript button if present
        click_view_transcript_button(browser)
        
        try:
            content_element = WebDriverWait(browser, 3).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.view-transcript"))
            )
            
            speech_content = content_element.text.strip()
            if speech_content:
                logging.info("Extracted speech content")
                return speech_content
        except:
            # Fallback to main content if transcript element not found
            try:
                main_content = browser.find_element(By.TAG_NAME, "main")
                speech_content = main_content.text.strip()
                logging.info("Extracted speech content from main page")
                return speech_content
            except:
                return "Content could not be extracted"
            
    except Exception as e:
        logging.error(f"Error scraping speech content: {str(e)}")
        return "Error occurred while extracting content"

In [9]:
def scrape_president_speeches(browser, president_name):
    speeches_data = []
    
    # Select the president
    if not select_president(browser, president_name):
        logging.error(f"Failed to select president {president_name}, skipping...")
        return speeches_data
    
    # Get all speech links for this president
    speech_links = get_speech_links(browser)
    
    # Scrape content for each speech
    for i, speech in enumerate(speech_links):
        logging.info(f"Processing speech {i+1}/{len(speech_links)}: {speech['title']}")
        content = scrape_speech_content(browser, speech["link"])
        
        speeches_data.append({
            "President": president_name,
            "Title": speech["title"],
            "Content": content,
            "URL": speech["link"]})

    return speeches_data

In [None]:
def main():
    # List of presidents to scrape 
    presidents_to_scrape = [ "Harry S. Truman", "Franklin D. Roosevelt", "Dwight D. Eisenhower", "John F. Kennedy", "Lyndon B. Johnson",
                            "Richard M. Nixon",  "Gerald Ford", "Jimmy Carter", "Ronald Reagan", "George H. W. Bush", "Bill Clinton",
                            "George W. Bush", "Barack Obama", "Donald Trump", "Joe Biden", "George Washington",  "James Monroe",
                             "William Harrison","Millard Fillmore", "Andrew Johnson", "Chester A. Arthur", "Theodore Roosevelt",
                             "Calvin Coolidge", "John Adams", "John Quincy Adams", "John Tyler", "Franklin Pierce", "Ulysses S. Grant",
                             "Grover Cleveland",  "William Taft", "Herbert Hoover", "Thomas Jefferson", "Andrew Jackson", "James K. Polk",
                             "James Buchanan", "Rurtherford B. Hayes", "Benjamin Harrison", "Woodrow Wilson", "Martin Van Buren",
                             "Zachary Taylor", "Abraham Lincoln", "James A. Garfield", "William McKinley", "Warren G. Harding", "James Madison"]
    
    browser = setup_browser(headless=False)
    
    try:
        # Process one president at a time
        for president_name in presidents_to_scrape:
            logging.info(f"Starting to process speeches for {president_name}")
            
            # Navigate to the speeches page for each president (fresh start)
            browser.get("https://millercenter.org/the-presidency/presidential-speeches")
            browser.maximize_window()
            time.sleep(5)
            
            # Scrape speeches for this president
            president_speeches = scrape_president_speeches(browser, president_name)
            
            # Save the data for this president
            if president_speeches:
                president_df = pd.DataFrame(president_speeches)
                president_df.to_csv(f"{president_name.replace(' ', '_')}.csv", index=False)
                logging.info(f"Saved {len(president_speeches)} speeches for {president_name}")
            else:
                logging.warning(f"No speeches collected for {president_name}")
        
        logging.info("Scraping completed successfully")
        
    except Exception as e:
        logging.error(f"Main process error: {str(e)}")
        import traceback
        logging.error(traceback.format_exc())
    
    finally:
        browser.quit()

if __name__ == "__main__":
    main()

2025-03-03 21:51:10,713 [INFO] Starting to process speeches for Harry S. Truman
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:197:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:527:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

2025-03-03 21:51:30,698 [INFO] Found 45 president elements
2025-03-03 21:51:30,752 [INFO] JS found president option: George Washington
2025-03-03 21:51:30,752 [INFO] JS found president option: John Adams
2025-03-03 21:51:30,753 [INFO] JS found president option: Thomas Jefferson
2025-03-03 21:51:30,753 [INFO] JS found president option: James Madison
2025-03-03 21:51:30,753 [INFO] JS found president option: James Monroe
2025-03-03 21:51:30,753 [INFO] JS found president option: John Quincy Adams
2025-03-03 21:51:30,754 [INFO] JS found president option: Andrew Jackson
2025-03-03 21:51:30,754 [INFO] JS found presiden