In [1]:
import os
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime
import json

In [3]:

# Symbols whose Yahoo Finance news pages are scraped, in processing order.
TICKER_LIST = [
    "AAPL", "MSFT", "AMZN", "TSLA", "GOOG", "NVDA", "BRK-B", "JNJ", "V", "WMT",
    "XOM", "JPM", "O", "PG", "HD", "PFE", "MA", "UNH", "BAC", "PEP",
    "KO", "DIS", "CVX", "AVGO", "MRK", "LLY", "ABBV", "INTC", "T", "CSCO",
    "CMCSA", "MCD", "NKE", "ADBE", "CRM", "COST", "WFC", "ABT", "TXN", "AMGN",
    "QCOM", "UPS", "LOW", "IBM", "GE", "CAT", "DE", "ORCL", "BA", "MDT",
    "MS", "GS", "LMT", "VRTX", "ADI", "FDX", "ZTS", "SBUX", "DHR",
]

# Pool of browser identities; one is picked at random for the whole run.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
]

# Results land in a folder stamped with today's date, e.g. "20250101_scraping".
OUTPUT_FOLDER = datetime.now().strftime("%Y%m%d") + "_scraping"

user_agent = random.choice(USER_AGENTS)

# Headless Chrome configuration shared by every browser session below.
options = Options()
for chrome_flag in (
    "--headless",                  # no visible browser window
    "--disable-gpu",
    "--no-sandbox",
    f"user-agent={user_agent}",
    "window-size=1920,1080",
):
    options.add_argument(chrome_flag)

# The output directory does not depend on the ticker, so create it once.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for ticker in TICKER_LIST:
    # Reset the per-ticker result list *before* the try block. The `finally`
    # clause below always writes `article_data`; without this reset an early
    # failure (e.g. in driver.get) would either raise NameError on the first
    # ticker or save the previous ticker's articles under this ticker's name.
    article_data = []

    # --- Launch browser (a fresh session per ticker) ---
    driver = webdriver.Chrome(options=options)
    try:
        url = f"https://finance.yahoo.com/quote/{ticker}/news/"
        driver.get(url)

        wait = WebDriverWait(driver, 10)

        # --- Accept cookie banner ---
        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.NAME, "agree")))
            cookie_button.click()
            print("✅ Cookie banner accepted")
            time.sleep(3)  # pause after dismissing the banner before scrolling
        except Exception:
            # Best effort: the banner is optional, so any Selenium error here
            # (timeout, click intercepted, ...) is tolerated. `Exception` is
            # used instead of a bare `except` so that KeyboardInterrupt and
            # SystemExit still stop the notebook.
            print("⚠️ No cookie banner appeared")

        # --- Scroll to load more content ---
        scroll_pause = 2
        last_height = driver.execute_script("return document.body.scrollHeight")

        for _ in range(100):  # Scroll up to 100 times, or break if no new content loaded
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # Page height unchanged: nothing more was loaded
            last_height = new_height

        print("✅ Finished scrolling")

        # --- Parse content ---
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = soup.find_all("div", class_="content yf-1y7058a")

        for article in articles:
            link = article.find("a", class_="subtle-link")
            if link:
                href = link.get("href")
                # Prefer the anchor's title attribute; fall back to its text.
                title = link.get("title") or link.get_text(strip=True)
                article_data.append({
                    "title": title,
                    "link": href
                })
                print(f"Title: {title}")
                print(f"Link: {href}")
                print("-" * 40)

    finally:
        # Always close the browser and persist whatever was collected for
        # this ticker (an empty list if scraping failed before parsing).
        driver.quit()
        with open(f"{OUTPUT_FOLDER}/{ticker}_news.json", "w", encoding="utf-8") as f:
            json.dump(article_data, f, indent=2, ensure_ascii=False)

    print(f"Number of articles for ticker {ticker}: {len(articles)}")

✅ Cookie banner accepted
✅ Finished scrolling
Title: Apple's difficult 2025 is Tim Cook's biggest test yet
Link: https://finance.yahoo.com/news/apples-difficult-2025-is-tim-cooks-biggest-test-yet-193007489.html
----------------------------------------
Title: Trust the process, not the pitch
Link: https://finance.yahoo.com/video/trust-process-not-pitch-110020509.html
----------------------------------------
Title: The Top Holding for CalPERS, America's Largest Public Pension Fund, Is the Closest Thing You'll Find to a Guaranteed Investment on Wall Street
Link: https://finance.yahoo.com/news/top-holding-calpers-americas-largest-070600142.html
----------------------------------------
Title: Google begins direct online sales of Pixel phones in India
Link: https://finance.yahoo.com/news/google-begins-direct-online-sales-053958226.html
----------------------------------------
Title: US Tightens China Chip Curbs by Targeting Design Software
Link: https://finance.yahoo.com/news/us-tightens-chi