In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
import time
import csv
import re
from datetime import datetime

options = Options()
options.headless = False
driver = webdriver.Firefox(options=options)
wait = WebDriverWait(driver, 20)
base_url = "https://www.federalreserve.gov"

def extract_speech_details(link):
    try:
        print(f"\nBesuche Detailseite: {link}")
        driver.execute_script("window.open('');")
        time.sleep(1)
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)
        print("Seite geladen, suche Inhalt...")

        content = ""
        possible_selectors = [
            "div.col-xs-12.col-md-8",
            "div.col-md-9",
            "div.panel-body",
            "div#content",
            "div.article",
            "div#article",
            "main",
            "body"
        ]

        for selector in possible_selectors:
            try:
                containers = driver.find_elements(By.CSS_SELECTOR, selector)
                for container in containers:
                    text = container.get_attribute("innerText").strip()
                    if text and len(text) > len(content):
                        content = text
            except:
                continue

        if not content:
            print("Kein Inhalt gefunden.")
            content = "Nicht verfügbar"
        else:
            print(f"Inhalt gefunden mit Länge: {len(content)} Zeichen")

        # Versuche das Datum direkt aus dem Seiteninhalt zu holen
        date = "Nicht verfügbar"
        try:
            full_page = driver.find_element(By.TAG_NAME, "body").text
            date_match = re.search(r"\b([A-Z][a-z]+ \d{1,2}, \d{4})\b", full_page)
            if date_match:
                raw_date = date_match.group(1)
                try:
                    parsed_date = datetime.strptime(raw_date, "%B %d, %Y")
                    date = parsed_date.strftime("%Y-%m-%d")
                except:
                    date = raw_date
        except:
            pass

        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        return date, content

    except Exception as e:
        print(f"Fehler beim Extrahieren der Details: {str(e)}")
        try:
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        except:
            pass
        return "Nicht verfügbar", "Nicht verfügbar"

def process_year(year_url):
    print(f"\nVerarbeite Jahr: {year_url}")
    driver.get(year_url)
    time.sleep(3)
    speeches = []

    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#speechIndex li, div.panel.panel-default, div.speech-list")))
        print("Speech-Liste gefunden")

        speech_items = driver.find_elements(By.CSS_SELECTOR, "#speechIndex li, div.panel.panel-default, div.row.speech-list-item")
        print(f"Gefundene Einträge: {len(speech_items)}")

        for i, item in enumerate(speech_items, 1):
            print(f"\nVerarbeite Eintrag {i}/{len(speech_items)}")
            try:
                full_text = item.text.strip()
                date_match = re.search(r"\b([A-Z][a-z]+ \d{1,2}, \d{4})\b", full_text)
                if date_match:
                    raw_date = date_match.group(1)
                    try:
                        parsed_date = datetime.strptime(raw_date, "%B %d, %Y")
                        date = parsed_date.strftime("%Y-%m-%d")
                    except ValueError:
                        date = raw_date
                else:
                    date = "Nicht verfügbar"

                speaker = "Nicht verfügbar"
                for sel in [".speaker", "div.speaker", "p.speaker", "div.author"]:
                    try:
                        speaker = item.find_element(By.CSS_SELECTOR, sel).text.strip()
                        break
                    except:
                        continue

                title = "Nicht verfügbar"
                link = "Nicht verfügbar"
                for sel in [".title a", "a.news__item-title", "a.speech-title", "a"]:
                    try:
                        elem = item.find_element(By.CSS_SELECTOR, sel)
                        title = elem.text.strip()
                        link = elem.get_attribute("href")
                        if link and not link.startswith("http"):
                            link = urljoin(base_url, link)
                        break
                    except:
                        continue

                if link != "Nicht verfügbar":
                    print("Extrahiere Details...")
                    detailed_date, speech_text = extract_speech_details(link)

                    speeches.append({
                        "date": detailed_date if detailed_date != "Nicht verfügbar" else date,
                        "speaker": speaker,
                        "title": title,
                        "link": link,
                        "text": speech_text
                    })

            except Exception as e:
                print(f"Fehler bei Element {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Fehler beim Verarbeiten des Jahres: {str(e)}")

    return speeches

# CSV-Erstellung
with open("fed_speeches_all.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["year", "date", "speaker", "title", "link", "text"])
    writer.writeheader()

    # Jahreslinks sammeln
    print("\nSammle Jahreslinks...")
    driver.get(f"{base_url}/newsevents/speech/speeches-archive.htm")
    time.sleep(3)

    year_links = []
    for selector in ["a[href*='speech/2']", "a[href*='newsevents/speech/']", "ul.list-unstyled a", "div.panel-body a"]:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            for el in elements:
                href = el.get_attribute("href")
                if href and any(str(year) in href for year in range(1996, 2024)):
                    if href not in year_links:
                        year_links.append(href)
        except:
            continue

    print(f"Gefundene Jahreslinks: {len(year_links)}")

    for year_url in year_links:
        if not year_url.startswith("http"):
            year_url = urljoin(base_url, year_url)

        year = year_url.split("/")[-1][:4]
        print(f"\nStarte Verarbeitung für Jahrgang: {year} ({year_url})")

        try:
            speeches = process_year(year_url)
            print(f"Gefundene Reden: {len(speeches)}")

            for speech in speeches:
                writer.writerow({**speech, "year": year})

            print(f"Fertig mit Jahrgang {year}")
        except Exception as e:
            print(f"Fehler bei {year_url}: {str(e)}")
            continue

print("\nScraping abgeschlossen")
driver.quit()



Sammle Jahreslinks...
Gefundene Jahreslinks: 10

Starte Verarbeitung für Jahrgang: 2005 (https://www.federalreserve.gov/newsevents/speech/2005speech.htm)

Verarbeite Jahr: https://www.federalreserve.gov/newsevents/speech/2005speech.htm
Speech-Liste gefunden
Gefundene Einträge: 87

Verarbeite Eintrag 1/87
Extrahiere Details...

Besuche Detailseite: https://www.federalreserve.gov/boarddocs/speeches/2005/20051214/default.htm
Seite geladen, suche Inhalt...
Inhalt gefunden mit Länge: 7694 Zeichen

Verarbeite Eintrag 2/87
Extrahiere Details...

Besuche Detailseite: https://www.federalreserve.gov/boarddocs/speeches/2005/20051206/default.htm
Seite geladen, suche Inhalt...
Inhalt gefunden mit Länge: 26773 Zeichen

Verarbeite Eintrag 3/87
Extrahiere Details...

Besuche Detailseite: https://www.federalreserve.gov/boarddocs/speeches/2005/20051205/default.htm
Seite geladen, suche Inhalt...
Inhalt gefunden mit Länge: 13983 Zeichen

Verarbeite Eintrag 4/87
Extrahiere Details...

Besuche Detailseite:

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from datetime import datetime
import time
import csv
import re

options = Options()
options.headless = False
driver = webdriver.Firefox(options=options)
wait = WebDriverWait(driver, 20)
base_url = "https://www.federalreserve.gov"

def extract_speech_details(link):
    try:
        print(f"\nBesuche Detailseite: {link}")
        driver.execute_script("window.open('');")
        time.sleep(1)
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)
        time.sleep(3)

        content = ""
        possible_selectors = [
            "div.col-xs-12.col-md-8",
            "div.col-md-9",
            "div.panel-body",
            "div#content",
            "div.article",
            "div#article",
            "main",
            "body"
        ]

        for selector in possible_selectors:
            try:
                containers = driver.find_elements(By.CSS_SELECTOR, selector)
                for container in containers:
                    text = container.get_attribute("innerText").strip()
                    if text and len(text) > len(content):
                        content = text
            except:
                continue

        if not content:
            content = "Nicht verfügbar"

        # Datum aus dem Text holen
        date = "Nicht verfügbar"
        try:
            full_page = driver.find_element(By.TAG_NAME, "body").text
            date_match = re.search(r"\b([A-Z][a-z]+ \d{1,2}, \d{4})\b", full_page)
            if date_match:
                raw_date = date_match.group(1)
                try:
                    parsed_date = datetime.strptime(raw_date, "%B %d, %Y")
                    date = parsed_date.strftime("%Y-%m-%d")
                except:
                    date = raw_date
        except:
            pass

        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        return date, content

    except Exception as e:
        print(f"Fehler beim Extrahieren der Details: {str(e)}")
        try:
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        except:
            pass
        return "Nicht verfügbar", "Nicht verfügbar"

def process_year(year):
    print(f"\nVerarbeite Jahr: {year}")
    if int(year) <= 2010:
        url = f"{base_url}/newsevents/speech/{year}speech.htm"
    else:
        url = f"{base_url}/newsevents/speech/{year}-speeches.htm"

    driver.get(url)
    time.sleep(3)

    speeches = []

    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".col-sm-8 .row")))
        outer_container = driver.find_element(By.CSS_SELECTOR, ".col-sm-8, .col-md-8")

        speech_blocks = outer_container.find_elements(By.CSS_SELECTOR, "div.row")
        print(f"Gefundene Einträge: {len(speech_blocks)}")

        for i, block in enumerate(speech_blocks, 1):
            try:
                date_elem = block.find_element(By.CSS_SELECTOR, "time")
                raw_date = date_elem.text.strip()
                try:
                    parsed_date = datetime.strptime(raw_date, "%m/%d/%Y")
                    date = parsed_date.strftime("%Y-%m-%d")
                except:
                    date = raw_date

                title_elem = block.find_element(By.CSS_SELECTOR, "p a[href*='/newsevents/speech/']")
                title = title_elem.text.strip()
                link = title_elem.get_attribute("href")
                if link and not link.startswith("http"):
                    link = urljoin(base_url, link)

                try:
                    speaker = block.find_element(By.CSS_SELECTOR, "p.news__speaker").text.strip()
                except:
                    speaker = "Nicht verfügbar"

                print(f"{i}. {date} | {speaker} | {title}")

                detailed_date, text = extract_speech_details(link)

                speeches.append({
                    "date": detailed_date if detailed_date != "Nicht verfügbar" else date,
                    "speaker": speaker,
                    "title": title,
                    "link": link,
                    "text": text
                })

            except Exception as e:
                print(f"Fehler bei Eintrag {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Fehler beim Verarbeiten von Jahr {year}: {str(e)}")

    return speeches

# CSV schreiben
with open("fed_speeches_2006_2025.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["year", "date", "speaker", "title", "link", "text"])
    writer.writeheader()

    for year in range(2006, 2026):
        try:
            speeches = process_year(str(year))
            print(f"Gefundene Reden für {year}: {len(speeches)}")
            for speech in speeches:
                writer.writerow({**speech, "year": year})
        except Exception as e:
            print(f"Fehler bei Jahr {year}: {str(e)}")
            continue

print("\nScraping abgeschlossen")
driver.quit()



Verarbeite Jahr: 2006
Gefundene Einträge: 73
1. 2006-12-15 | Chairman Ben S. Bernanke | The Chinese Economy: Progress and Challenges

Besuche Detailseite: https://www.federalreserve.gov/newsevents/speech/bernanke20061215a.htm
2. 2006-12-01 | Chairman Ben S. Bernanke | Welcoming remarks

Besuche Detailseite: https://www.federalreserve.gov/newsevents/speech/bernanke20061201a.htm
3. 2006-12-01 | Vice Chairman Donald L. Kohn | Monetary Policy and Uncertainty

Besuche Detailseite: https://www.federalreserve.gov/newsevents/speech/kohn20061201a.htm
4. 2006-11-30 | Governor Susan Schmidt Bies | A U.S. Perspective on Basel II Implementation

Besuche Detailseite: https://www.federalreserve.gov/newsevents/speech/bies20061130a.htm
5. 2006-11-28 | Chairman Ben S. Bernanke | The Economic Outlook

Besuche Detailseite: https://www.federalreserve.gov/newsevents/speech/bernanke20061128a.htm
6. 2006-11-21 | Governor Kevin Warsh | Financial Markets and the Federal Reserve

Besuche Detailseite: https://ww

In [2]:
import pandas as pd

In [3]:
df1 = pd.read_csv('fed_speeches_all.csv')
df1

Unnamed: 0,year,date,speaker,title,link,text
0,2005,2005-12-14,Chairman Alan Greenspan,Remarks on receipt of honorary degree,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nAcceptance...
1,2005,2005-12-06,Governor Susan Schmidt Bies,Linkages between Internal Capital Measures and...,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Susan Schmidt Bies\nAt the...
2,2005,2005-12-05,Governor Mark W. Olson,Economic Growth: Lessons from the Sioux Falls ...,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Mark W. Olson\nAt a Rotary...
3,2005,2005-12-02,Chairman Alan Greenspan,International imbalances,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nInternatio...
4,2005,2005-12-02,Chairman Alan Greenspan,Budget policy,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nBudget pol...
...,...,...,...,...,...,...
640,1996,1996-10-02,Governor Lawrence B. Lindsey,Small business is big business,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Lawrence B. Lindsey\nAt th...
641,1996,1996-09-19,Chairman Alan Greenspan,Regulation and electronic payment systems,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nRegulation...
642,1996,1996-09-08,Governor Laurence H. Meyer,Monetary policy objectives and strategy,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Laurence H. Meyer\nAt the ...
643,1996,1996-06-18,"Governor Edward W. Kelley, Jr.",Developments in electronic money and banking,https://www.federalreserve.gov/boarddocs/speec...,"Remarks by Governor Edward W. Kelley, Jr.\nDev..."


In [4]:
df2 = pd.read_csv('fed_speeches_2006_2025.csv')
df2

Unnamed: 0,year,date,speaker,title,link,text
0,2006,2006-12-15,Chairman Ben S. Bernanke,The Chinese Economy: Progress and Challenges,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1,2006,2006-12-01,Chairman Ben S. Bernanke,Welcoming remarks,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
2,2006,2006-12-01,Vice Chairman Donald L. Kohn,Monetary Policy and Uncertainty,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
3,2006,2006-11-30,Governor Susan Schmidt Bies,A U.S. Perspective on Basel II Implementation,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
4,2006,2006-11-28,Chairman Ben S. Bernanke,The Economic Outlook,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
...,...,...,...,...,...,...
1187,2025,2025-02-04,Vice Chair Philip N. Jefferson,U.S. Economic Outlook and Monetary Policy,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1188,2025,2025-01-31,Governor Michelle W. Bowman,"Brief Remarks on the Economy, and Perspective ...",https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1189,2025,2025-01-09,Governor Michelle W. Bowman,"Reflections on 2024: Monetary Policy, Economic...",https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1190,2025,2025-01-08,Governor Christopher J. Waller,Challenges Facing Central Bankers,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...


In [6]:
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,year,date,speaker,title,link,text
0,2005,2005-12-14,Chairman Alan Greenspan,Remarks on receipt of honorary degree,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nAcceptance...
1,2005,2005-12-06,Governor Susan Schmidt Bies,Linkages between Internal Capital Measures and...,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Susan Schmidt Bies\nAt the...
2,2005,2005-12-05,Governor Mark W. Olson,Economic Growth: Lessons from the Sioux Falls ...,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Governor Mark W. Olson\nAt a Rotary...
3,2005,2005-12-02,Chairman Alan Greenspan,International imbalances,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nInternatio...
4,2005,2005-12-02,Chairman Alan Greenspan,Budget policy,https://www.federalreserve.gov/boarddocs/speec...,Remarks by Chairman Alan Greenspan\nBudget pol...
...,...,...,...,...,...,...
1832,2025,2025-02-04,Vice Chair Philip N. Jefferson,U.S. Economic Outlook and Monetary Policy,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1833,2025,2025-01-31,Governor Michelle W. Bowman,"Brief Remarks on the Economy, and Perspective ...",https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1834,2025,2025-01-09,Governor Michelle W. Bowman,"Reflections on 2024: Monetary Policy, Economic...",https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
1835,2025,2025-01-08,Governor Christopher J. Waller,Challenges Facing Central Bankers,https://www.federalreserve.gov/newsevents/spee...,Skip to main content\nStay Connected \nRecent ...
