# Web Scraping WSJ

In [None]:
!pip install selenium webdriver-manager beautifulsoup4

In [7]:
import json
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

# === Cookie-Datei laden ===
def load_cookies_from_file(json_path):
    """Load a JSON cookie export and keep only the basic cookie fields.

    Args:
        json_path: Path to a UTF-8 encoded JSON file containing a list of
            cookie objects. Each object must provide ``name``, ``value`` and
            ``domain``; ``path`` is optional.

    Returns:
        A list of dicts with the keys ``name``, ``value``, ``domain`` (with
        any leading dots removed) and ``path`` (``"/"`` when not given).
        All other keys of the exported cookies are dropped.

    Raises:
        KeyError: If an entry lacks ``name``, ``value`` or ``domain``.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        exported = json.load(handle)

    def _slim(entry):
        # Reduce one exported cookie to the four fields that are kept.
        return {
            "name": entry["name"],
            "value": entry["value"],
            "domain": entry["domain"].lstrip("."),
            "path": entry.get("path", "/"),
        }

    return [_slim(entry) for entry in exported]

# === Cookie-Banner akzeptieren (auch im iFrame) ===
def accept_cookie_banner(driver):
    """Click the "Yes, I Agree" consent button if one is found in an iframe.

    Every ``<iframe>`` of the current page is entered in turn and searched
    for a button whose text contains ``Yes, I Agree``. The first match is
    clicked and the function returns after a short pause. This is a
    best-effort helper: any error is printed and swallowed so that the
    caller can continue.

    Args:
        driver: A Selenium WebDriver instance with a page already loaded.

    Returns:
        None.
    """
    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        for frame in iframes:
            driver.switch_to.frame(frame)
            try:
                button = driver.find_element(By.XPATH, "//button[contains(text(), 'Yes, I Agree')]")
                button.click()
            except NoSuchElementException:
                # This frame has no consent button; try the next one.
                continue
            finally:
                # Always leave the iframe, even when the click fails with an
                # unexpected error. Otherwise the driver would stay inside
                # the frame and later lookups would run in the wrong context.
                driver.switch_to.default_content()
            print("✅ Cookie-Banner akzeptiert")
            # Give the banner a moment to disappear before continuing.
            time.sleep(2)
            return
        print("ℹ️ Kein Cookie-Banner gefunden")
    except Exception as e:
        # Deliberately broad: failing to dismiss the banner must not abort
        # the scraping run.
        print(f"❌ Fehler beim Akzeptieren des Cookie-Banners: {e}")

# === Artikeltext aus Artikel-Link laden ===
def extract_article_text(driver, url):
    """Open an article URL and return the text of its paragraphs.

    Args:
        driver: A Selenium WebDriver instance used to load the page.
        url: Address of the article to open.

    Returns:
        The stripped text of all ``<p>`` elements inside the
        ``div#wsj-article-wrap`` container, joined with newlines. An empty
        string if that container is not present. If anything fails, a
        string of the form ``[Fehler beim Laden: <error>]`` is returned
        instead of raising.
    """
    try:
        driver.get(url)
        # Fixed pause so the page can finish rendering before it is parsed.
        time.sleep(4)
        parsed = BeautifulSoup(driver.page_source, "html.parser")
        wrapper = parsed.find("div", id="wsj-article-wrap")
        if wrapper:
            texts = [node.get_text(strip=True) for node in wrapper.find_all("p")]
            return "\n".join(texts)
        return ""
    except Exception as err:
        # Errors are reported in-band so one broken article does not stop
        # the whole run.
        return f"[Fehler beim Laden: {err}]"

# === Hauptscraper ===
def scrape_wsj_articles(pages=2):
    """Scrape WSJ search results for "apple stock" and store them as CSV.

    The function starts Chrome, opens wsj.com, tries to dismiss the cookie
    banner, installs the cookies from ``cookies_wsj.json`` and then walks
    through the search result pages. For every result the title, link and
    date are collected and the article text is fetched. Per page, the raw
    HTML and a screenshot are written as ``debug_page_<n>.html`` /
    ``debug_page_<n>.png``. All rows are finally written to
    ``wsj_apple_articles.csv``.

    Args:
        pages: Number of search result pages to process, starting at page 1.

    Returns:
        None. Results are written to disk and progress is printed.

    Notes:
        The browser is always closed, even if scraping fails part-way; in
        that case the exception propagates and no CSV is written.

        NOTE(review): the result selector depends on generated CSS class
        names and the recorded run found 0 articles on every page. The
        markup may have changed, or the session may not be authenticated -
        verify against the saved debug HTML.
    """
    # Local import so this block stays self-contained.
    from urllib.parse import urljoin

    site_root = "https://www.wsj.com"
    cookies = load_cookies_from_file("cookies_wsj.json")
    url_base = "https://www.wsj.com/search?query=apple+stock&dateRange=1yr&isToggleOn=true&products=wsj%2Cblogs%2Cpro%2Clivecoverage%2Cbuyside&page="

    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # keep the browser visible for debugging
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_results = []

    try:
        # Open WSJ first: cookies can only be added for the current domain.
        driver.get(site_root)
        time.sleep(3)
        accept_cookie_banner(driver)

        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                # A single rejected cookie should not abort the run.
                print(f"⚠️ Cookie-Fehler: {cookie['name']} – {e}")

        # Reload so the freshly added cookies take effect.
        driver.get(site_root)
        time.sleep(3)

        for page in range(1, pages + 1):
            url = url_base + str(page)
            print(f"\n🔎 Lade Seite {page}: {url}")
            driver.get(url)
            time.sleep(5)

            # Snapshot the page before navigating away to the articles.
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            results = soup.find_all("div", class_="css-14rwwjy-PromoWrapper e1icd8hk0")

            print(f"🔍 {len(results)} Artikel gefunden")

            # Save debug artifacts for this result page.
            with open(f"debug_page_{page}.html", "w", encoding="utf-8") as f:
                f.write(html)
            driver.save_screenshot(f"debug_page_{page}.png")

            for r in results:
                a = r.find("a", href=True)
                title = a.text.strip() if a else ""
                href = a["href"].strip() if a else ""
                # Resolve relative links; absolute ones pass through unchanged.
                link = urljoin(site_root, href) if href else ""
                date_tag = r.find("time")
                # A <time> tag without a datetime attribute yields "".
                date = date_tag.get("datetime", "") if date_tag else ""

                if link:
                    print(f"📰 Lade Artikel: {title[:60]}...")
                    content = extract_article_text(driver, link)
                    # Short pause between article requests.
                    time.sleep(2)
                else:
                    # Nothing to load without a link; keep the row anyway.
                    content = ""
                all_results.append([title, link, date, content])
    finally:
        # Always release the browser, also when scraping raised.
        driver.quit()

    # Write the collected rows to CSV.
    with open("wsj_apple_articles.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["title", "url", "date", "content"])
        writer.writerows(all_results)

    print(f"\n✅ {len(all_results)} Artikel gespeichert in wsj_apple_articles.csv")

# === Starten ===
# Entry point: run the scraper for three search result pages, but only when
# this code is executed directly and not when it is imported as a module.
if __name__ == "__main__":
    scrape_wsj_articles(pages=3)

ℹ️ Kein Cookie-Banner gefunden

🔎 Lade Seite 1: https://www.wsj.com/search?query=apple+stock&dateRange=1yr&isToggleOn=true&products=wsj%2Cblogs%2Cpro%2Clivecoverage%2Cbuyside&page=1
🔍 0 Artikel gefunden

🔎 Lade Seite 2: https://www.wsj.com/search?query=apple+stock&dateRange=1yr&isToggleOn=true&products=wsj%2Cblogs%2Cpro%2Clivecoverage%2Cbuyside&page=2
🔍 0 Artikel gefunden

🔎 Lade Seite 3: https://www.wsj.com/search?query=apple+stock&dateRange=1yr&isToggleOn=true&products=wsj%2Cblogs%2Cpro%2Clivecoverage%2Cbuyside&page=3
🔍 0 Artikel gefunden

✅ 0 Artikel gespeichert in wsj_apple_articles.csv
