In [31]:
import pandas as pd
from dotenv import load_dotenv
import os

In [32]:
# Load settings from a .env file into the process environment so later cells
# can read them with os.getenv (the API token is looked up under 'api_key').
load_dotenv()

True

In [36]:
import requests
import time
import json

# Download every SPY.US news article, year by year, from the EODHD news
# endpoint and store the combined result as one JSON file.
API_KEY = os.getenv('api_key')
BASE_URL = "https://eodhd.com/api/news"
PAGE_SIZE = 100          # articles requested per call
REQUEST_TIMEOUT = 30     # seconds; without a timeout a stalled connection hangs the cell

# Fail early with a clear message instead of sending unauthenticated requests.
if not API_KEY:
    raise RuntimeError("Environment variable 'api_key' is not set; cannot query the news API.")

years = range(2017, 2026)
all_articles = []

for year in years:
    from_date = f"{year}-01-01"
    to_date = f"{year}-12-31"
    offset = 0

    while True:
        params = {
            "api_token": API_KEY,
            "fmt": "json",
            "s": "SPY.US",
            "from": from_date,
            "to": to_date,
            "limit": PAGE_SIZE,
            "offset": offset
        }

        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors (bad token, rate limit, server error) immediately.
        response.raise_for_status()
        data = response.json()

        # An empty page means this year is exhausted.
        if not data:
            break

        # A truthy non-list payload (e.g. an error object) would otherwise be
        # extended into the results on every pass and the loop would never end.
        if not isinstance(data, list):
            raise ValueError(f"Unexpected response for {year} at offset {offset}: {data!r}")

        all_articles.extend(data)
        # Advance by what was actually received so the offset and the progress
        # message stay correct even when a page is shorter than PAGE_SIZE.
        offset += len(data)

        print(f"{year} - Pulled {offset} articles...")
        time.sleep(0.5)  # throttle safety

print(f"Total articles collected: {len(all_articles)}")

with open("SP500_news_2017_2025.json", "w", encoding="utf-8") as f:
    json.dump(all_articles, f)

print("Download complete.")

2020 - Pulled 100 articles...
2021 - Pulled 100 articles...
2021 - Pulled 200 articles...
2021 - Pulled 300 articles...
2021 - Pulled 400 articles...
2021 - Pulled 500 articles...
2021 - Pulled 600 articles...
2021 - Pulled 700 articles...
2022 - Pulled 100 articles...
2022 - Pulled 200 articles...
2022 - Pulled 300 articles...
2022 - Pulled 400 articles...
2022 - Pulled 500 articles...
2022 - Pulled 600 articles...
2022 - Pulled 700 articles...
2022 - Pulled 800 articles...
2022 - Pulled 900 articles...
2022 - Pulled 1000 articles...
2023 - Pulled 100 articles...
2024 - Pulled 100 articles...
2024 - Pulled 200 articles...
2025 - Pulled 100 articles...
2025 - Pulled 200 articles...
2025 - Pulled 300 articles...
2025 - Pulled 400 articles...
2025 - Pulled 500 articles...
2025 - Pulled 600 articles...
2025 - Pulled 700 articles...
2025 - Pulled 800 articles...
2025 - Pulled 900 articles...
2025 - Pulled 1000 articles...
2025 - Pulled 1100 articles...
2025 - Pulled 1200 articles...
2025 -

In [None]:
import requests
import time
import json
import csv


# Largest S&P 500 companies by market cap, written as bare symbols; the ".US"
# suffix used in the news query's `s` parameter is appended below.
tickers = [
    f"{symbol}.US"
    for symbol in (
        "AAPL", "MSFT", "AMZN", "GOOGL", "META",
        "TSLA", "NVDA", "JPM", "JNJ",
    )
]

# Calendar years to query: 2017 through 2025 inclusive.
years = range(2017, 2025 + 1)

BASE_URL = "https://eodhd.com/api/news"

# Shared accumulators filled while downloading.
all_articles = list()
article_ids = set()  # identifiers already collected, used for deduplication

# Upper bound on articles kept per ticker per year.
MAX_ARTICLES_PER_YEAR = 300

# -------------------------
# FUNCTION TO PULL NEWS
# -------------------------
def pull_news_for_ticker_year(ticker, year, max_retries=3, timeout=30):
    """Collect up to MAX_ARTICLES_PER_YEAR not-yet-seen articles for one ticker and year.

    Pages through the news endpoint at BASE_URL in chunks of 100 for the range
    ``{year}-01-01`` .. ``{year}-12-31``. An article is identified by its
    ``link`` (falling back to ``title``); identifiers are recorded in the
    module-level ``article_ids`` set, so an article already collected by an
    earlier call (any ticker, any year) is skipped. Articles with neither
    field are skipped as well.

    Args:
        ticker: Symbol passed as the ``s`` query parameter (e.g. "AAPL.US").
        year: Calendar year to query.
        max_retries: How many times a failed request for the same page is
            retried before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        List of newly collected article dicts (at most MAX_ARTICLES_PER_YEAR).

    Raises:
        RuntimeError: If the same page fails more than ``max_retries`` times
            in a row (network error or undecodable body).
        ValueError: If the endpoint returns a non-empty payload that is not a
            list of articles.
    """
    from_date = f"{year}-01-01"
    to_date = f"{year}-12-31"
    offset = 0
    failed_attempts = 0  # consecutive failures for the current page
    year_articles = []

    while len(year_articles) < MAX_ARTICLES_PER_YEAR:
        params = {
            "api_token": API_KEY,
            "fmt": "json",
            "s": ticker,
            "from": from_date,
            "to": to_date,
            "limit": 100,
            "offset": offset
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=timeout)
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Retrying without a bound would spin forever when the endpoint
            # keeps answering with a non-JSON body (e.g. quota exhausted).
            failed_attempts += 1
            if failed_attempts > max_retries:
                raise RuntimeError(
                    f"Giving up on {ticker} {year} offset {offset} "
                    f"after {failed_attempts} failed attempts"
                ) from exc
            print(
                f"Warning: request or JSON decode error for {ticker} {year} offset {offset} "
                f"(retry {failed_attempts}/{max_retries}). Retrying..."
            )
            time.sleep(2)
            continue  # Retry this request

        failed_attempts = 0

        # Empty page: nothing more to fetch for this ticker/year.
        if not data:
            break

        # Anything other than a list (e.g. an error object) cannot be paged
        # through; report it instead of failing later on `.get`.
        if not isinstance(data, list):
            raise ValueError(
                f"Unexpected response for {ticker} {year} offset {offset}: {data!r}"
            )

        for article in data:
            unique_id = article.get("link") or article.get("title")
            if unique_id and unique_id not in article_ids:
                article_ids.add(unique_id)
                year_articles.append(article)
                if len(year_articles) >= MAX_ARTICLES_PER_YEAR:
                    break

        # Advance by the number of rows received, including duplicates that
        # were skipped, so the next request starts after this page.
        offset += len(data)
        print(f"{ticker} {year} - collected {len(year_articles)} articles so far...")
        time.sleep(0.5)

    return year_articles


# -------------------------
# MAIN LOOP
# -------------------------
# Visit every ticker in turn; within a ticker, walk the years in order and
# append each year's batch straight onto the shared result list.
for ticker in tickers:
    print(f"Starting ticker: {ticker}")
    for year in years:
        all_articles.extend(pull_news_for_ticker_year(ticker, year))
    # Running total across all tickers processed so far.
    print(f"Finished ticker: {ticker}, total articles collected: {len(all_articles)}\n")

print(f"TOTAL articles collected for all tickers: {len(all_articles)}")

# -------------------------
# SAVE JSON
# -------------------------
# Persist the collected articles as pretty-printed UTF-8 JSON.
json_filename = "Stocks_biggest_news_2017_2025_capped.json"
with open(json_filename, "w", encoding="utf-8") as f:
    json.dump(all_articles, f, ensure_ascii=False, indent=2)
print(f"JSON saved as {json_filename}")

# -------------------------
# SAVE CSV
# -------------------------
csv_filename = "Stocks_biggest_news_2017_2025_capped.csv"

# Build the table from the in-memory records. The file written above is JSON,
# so parsing it with pd.read_csv() would not recover the article columns.
df = pd.DataFrame(all_articles)
df.to_csv(csv_filename, index=False)
print(f"CSV saved as {csv_filename}")


In [30]:
# Destination for the combined article table.
out_path = "../data/news_2017-2025.csv"

# Read the two previously exported article tables.
SP500_news = pd.read_csv("../data/SP500_news_2017_2025.csv")
Stocks_news = pd.read_csv("../data/Stocks_biggest_news_2017_2025.csv")

# Stack both tables under a fresh 0..n-1 index, then order rows by "date".
news = (
    pd.concat([SP500_news, Stocks_news], ignore_index=True)
    .sort_values(by="date", ascending=True)
)
news.to_csv(out_path, index=False)