In [2]:
import os
import json
from os import mkdir

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import random
import time

TICKER_LIST = [
    "AAPL", "MSFT", "AMZN", "TSLA", "GOOG", "NVDA", "BRK-B", "JNJ",
    "V", "WMT", "XOM", "JPM", "O", "PG", "HD", "PFE", "MA", "UNH", "BAC",
    "PEP", "KO", "DIS", "CVX", "AVGO", "MRK", "LLY", "ABBV", "INTC", "T",
    "CSCO", "CMCSA", "MCD", "NKE", "ADBE", "CRM", "COST", "WFC", "ABT", "TXN",
    "AMGN", "QCOM", "UPS", "LOW", "IBM", "GE", "CAT", "DE", "ORCL", "BA",
    "MDT", "MS", "GS", "LMT", "VRTX", "ADI", "FDX", "ZTS", "SBUX", "DHR"
]

# --- User-Agent and headers setup ---
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
]

OUTPUT_FOLDER = "20250515_scraping"

user_agent = random.choice(USER_AGENTS)

# --- Configure headless Chrome ---
options = Options()
#options.add_argument("--headless")  # Run without UI
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument(f"user-agent={user_agent}")
options.add_argument("window-size=1920,1080")

for ticker in TICKER_LIST:
    # --- Launch browser ---
    driver = webdriver.Chrome(options=options)
    try:
        url = f"https://finance.yahoo.com/quote/{ticker}/news/"
        driver.get(url)

        wait = WebDriverWait(driver, 10)

        # --- Accept cookie banner ---
        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.NAME, "agree")))
            cookie_button.click()
            print("✅ Cookie banner accepted")
            time.sleep(1)
        except:
            print("⚠️ No cookie banner appeared")

        # --- Scroll to load more content ---
        scroll_pause = 2
        last_height = driver.execute_script("return document.body.scrollHeight")

        for _ in range(100):  # Scroll up to 100 times, or break if no new content loaded
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # Stop scrolling if no more content loads
            last_height = new_height

        print("✅ Finished scrolling")

        # --- Parse content ---
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = soup.find_all("div", class_="content yf-1y7058a")

        article_data = []
        for article in articles:
            link = article.find("a", class_="subtle-link")
            if link:
                href = link.get("href")
                title = link.get("title") or link.get_text(strip=True)
                article_data.append({
                    "title": title,
                    "link": href
                })
                print(f"Title: {title}")
                print(f"Link: {href}")
                print("-" * 40)

    finally:
        driver.quit()
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        with open(f"{OUTPUT_FOLDER}/{ticker}_news.json", "w", encoding="utf-8") as f:
            json.dump(article_data, f, indent=2, ensure_ascii=False)

    print(f"Number of articles for ticker {ticker}: {len(articles)}")

✅ Cookie banner accepted
✅ Finished scrolling
Title: What’s next for Tesla as Musk enters a new phase
Link: https://finance.yahoo.com/news/whats-next-for-tesla-as-musk-enters-a-new-phase-100043769.html
----------------------------------------
Title: Google adds virtual 'try-on' option to search, agentic AI for purchases
Link: https://finance.yahoo.com/news/google-adds-virtual-try-on-option-to-search-agentic-ai-for-purchases-174510345.html
----------------------------------------
Title: Game On: Take-Two reports mixed Q4 earnings results
Link: https://finance.yahoo.com/news/game-two-reports-mixed-q4-162531162.html
----------------------------------------
Title: Apple’s India Push Receives a Boost on Foxconn’s $1.5B Investment
Link: https://finance.yahoo.com/news/apple-india-push-receives-boost-161433776.html
----------------------------------------
Title: Apple to let third-parties write software with its AI models, Bloomberg reports
Link: https://finance.yahoo.com/news/apple-let-third-

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=136.0.7103.114)
Stacktrace:
	GetHandleVerifier [0x00007FF784DECF45+75717]
	GetHandleVerifier [0x00007FF784DECFA0+75808]
	(No symbol) [0x00007FF784BB8F9A]
	(No symbol) [0x00007FF784B91B41]
	(No symbol) [0x00007FF784C3EC2E]
	(No symbol) [0x00007FF784C5ECC2]
	(No symbol) [0x00007FF784C37153]
	(No symbol) [0x00007FF784C00421]
	(No symbol) [0x00007FF784C011B3]
	GetHandleVerifier [0x00007FF7850ED71D+3223453]
	GetHandleVerifier [0x00007FF7850E7CC2+3200322]
	GetHandleVerifier [0x00007FF785105AF3+3322739]
	GetHandleVerifier [0x00007FF784E06A1A+180890]
	GetHandleVerifier [0x00007FF784E0E11F+211359]
	GetHandleVerifier [0x00007FF784DF5294+109332]
	GetHandleVerifier [0x00007FF784DF5442+109762]
	GetHandleVerifier [0x00007FF784DDBA59+4825]
	BaseThreadInitThunk [0x00007FF8FDEDE8D7+23]
	RtlUserThreadStart [0x00007FF90017C5DC+44]
