
Config 

In [10]:
# Date window shared by the loaders: the five years ending today,
# expressed as YYYYMMDD strings.
from datetime import datetime  # needed here: this cell uses datetime before the import block further down

TODAY = datetime.today()
try:
    _WINDOW_START = TODAY.replace(year=TODAY.year - 5)
except ValueError:
    # Feb 29 has no counterpart five years earlier; fall back to Feb 28.
    _WINDOW_START = TODAY.replace(year=TODAY.year - 5, day=28)

START_DATE = _WINDOW_START.strftime("%Y%m%d")  # oldest day of the window
END_DATE = TODAY.strftime("%Y%m%d")  # most recent day of the window



## NYT CONFIG
from dotenv import load_dotenv
import os
from datetime import datetime

# Get the NYT API key from .env
# os.getenv returns None when NYT_API_KEY is not set, so a missing key is not
# detected here.
load_dotenv()
NYT_API_KEY = os.getenv("NYT_API_KEY")

# NYT API constraints
NYT_DAILY_LIMIT = 500 # calls per day
NYT_RATE_LIMIT_SLEEP = 12 # seconds between API requests (i.e. 5/min)
NYT_PAGE_LIMIT = 100 # upper bound on the number of pages the NYT loader iterates over per query


Load NYT

In [None]:
### NYT ARTICLE DATA

import requests
from time import sleep
import json
from datetime import datetime
import pandas as pd
from pathlib import Path


from config import (
    NYT_API_KEY,
    NYT_DAILY_LIMIT,
    NYT_RATE_LIMIT_SLEEP,
    NYT_PAGE_LIMIT,
    START_DATE,
    END_DATE,
)



def get_nyt_page(page, query, begin_date, end_date):
    """Request a single result page from the NYT endpoint at BASE_URL.

    Args:
        page: Page number sent as the "page" query parameter.
        query: Search text sent as the "q" query parameter.
        begin_date: Value sent as the "begin_date" query parameter.
        end_date: Value sent as the "end_date" query parameter.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        RuntimeError: on HTTP 429 (rate limit) or any other non-200 status.
    """
    resp = requests.get(
        BASE_URL,
        params={
            "api-key": NYT_API_KEY,
            "q": query,
            "begin_date": begin_date,
            "end_date": end_date,
            "page": page,
        },
        timeout=30,
    )

    # Happy path first; everything else is an error.
    if resp.status_code == 200:
        return resp.json()

    # Rate limiting gets its own message so callers can tell it apart.
    if resp.status_code == 429:
        raise RuntimeError("NYT rate limit hit")

    raise RuntimeError(f"NYT API error {resp.status_code}: {resp.text}")



def load_nyt_data(query, start_date, end_date, max_requests):
    """Collect NYT articles for `query`, fetching one page per request.

    Pages are requested in order starting at 0 until one of these happens:
    the request budget is used up, NYT_PAGE_LIMIT pages have been requested,
    a page comes back without articles, the reported hit count has been
    covered, or the response does not have the expected structure.

    Args:
        query: Search text passed to get_nyt_page.
        start_date: First day of the search window (passed as begin_date).
        end_date: Last day of the search window.
        max_requests: Maximum number of API requests to make in this call.

    Returns:
        A list with the article dicts ("docs") of every page fetched.

    Raises:
        Whatever get_nyt_page raises (RuntimeError on a non-200 response)
        propagates unchanged.
    """
    results_per_page = 10  # page size assumed by the hit-count check below
    docs = []  # store articles here
    request_count = 0

    # pull articles from each page
    for page in range(NYT_PAGE_LIMIT):

        if request_count >= max_requests:
            print(f"Reached request limit ({max_requests})")
            break

        # Throttle only between consecutive requests, never after the last one.
        if request_count:
            sleep(NYT_RATE_LIMIT_SLEEP)

        print(f"Page {page} (request {request_count+1}/{max_requests})")

        # Keyword arguments keep this call independent of get_nyt_page's
        # positional order (page comes first there, not last).
        data = get_nyt_page(
            page=page,
            query=query,
            begin_date=start_date,
            end_date=end_date,
        )
        request_count += 1

        # get article info; `or` fallbacks cover keys that are present but null
        try:
            body = data.get("response") or {}
            page_docs = body.get("docs") or []
            # NOTE(review): the hit count has been seen under both "meta" and
            # "metadata" -- accept either; confirm against the current API.
            meta = body.get("meta") or body.get("metadata") or {}
            hits = meta.get("hits")
        except AttributeError as e:
            print(f"Invalid NYT response: {e}")
            break

        # no more pages
        if not page_docs:
            break

        docs.extend(page_docs)

        # Stop once every reported hit has been fetched. An absent hit count
        # means "unknown", so keep paging until an empty page instead.
        if isinstance(hits, int) and (page + 1) * results_per_page >= hits:
            break

    return docs



# Set up
BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
QUERY = "IBM"  # can be any keyword

# Get articles
articles = load_nyt_data(
    query=QUERY,
    start_date=START_DATE,
    end_date=END_DATE,
    max_requests=5,  # 5 for testing; NYT_DAILY_LIMIT (500) is the real daily cap
)

# Save data (create the target folder first so a fresh checkout does not fail)
output_path = Path("data/raw/nyt_data.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(articles, f, indent=2)


In [None]:
# Display the first five articles, flattened into one table row per article
pd.json_normalize(articles[:5]).head()


Load yfinance

In [8]:
##load finance
### YFINANCE
import yfinance as yf
import pandas as pd
from datetime import datetime

def load_yf_data(ticker="IBM", period="5y"):
    """Download price history for one ticker via yfinance.

    Args:
        ticker: Symbol handed to yf.Ticker.
        period: Look-back period handed to Ticker.history.

    Returns:
        The history as a DataFrame whose former index has been moved into
        regular columns, with rows sorted by the "Date" column.

    Raises:
        ConnectionError: if creating the ticker or fetching its history
            raises any exception.
        ValueError: if the returned history is empty.
    """
    try:
        history = yf.Ticker(ticker).history(period=period)
    except Exception as err:
        raise ConnectionError(f"Failed to fetch data for {ticker}: {err}")

    # An empty frame means nothing came back for this ticker/period.
    if history.empty:
        raise ValueError("No data returned from Yahoo Finance.")

    # Move the index into columns, then order the rows chronologically.
    return history.reset_index().sort_values("Date")

# RUN YFINANCE
from pathlib import Path  # local import: this cell's own imports do not bring Path into scope

yf_df = load_yf_data()

# Save raw file (create the target folder first so a fresh checkout does not fail)
yf_raw_path = Path("./data/raw/yf_df.csv")
yf_raw_path.parent.mkdir(parents=True, exist_ok=True)
yf_df.to_csv(yf_raw_path, index=False)
yf_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-11-23 00:00:00-05:00,91.275649,93.677434,91.15128,93.343201,5910318,0.0,0.0
1,2020-11-24 00:00:00-05:00,93.941702,96.949768,93.902841,96.708809,8109115,0.0,0.0
2,2020-11-25 00:00:00-05:00,95.550677,96.638865,94.913306,96.537819,4326151,0.0,0.0
3,2020-11-27 00:00:00-05:00,96.537813,97.400586,96.312398,96.654404,2187395,0.0,0.0
4,2020-11-30 00:00:00-05:00,96.460086,97.159635,95.675035,96.009262,6263448,0.0,0.0


In [9]:
# Show the last five rows to check the most recent dates loaded
yf_df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
1251,2025-11-17 00:00:00-05:00,305.589996,306.0,296.51001,297.170013,3909700,0.0,0.0
1252,2025-11-18 00:00:00-05:00,297.0,297.0,289.920013,289.950012,4861900,0.0,0.0
1253,2025-11-19 00:00:00-05:00,290.5,291.109985,288.070007,288.529999,3595900,0.0,0.0
1254,2025-11-20 00:00:00-05:00,294.640015,300.709991,290.160004,290.399994,5597000,0.0,0.0
1255,2025-11-21 00:00:00-05:00,293.480011,300.480011,291.890015,297.440002,5703100,0.0,0.0



Load Google Trends

In [1]:
### Google Trends Data
import time
from pytrends.request import TrendReq
from config import (
    START_DATE,
    END_DATE,
)


def _to_trends_date(value):
    """Return `value` as a 'YYYY-MM-DD' string (accepts YYYYMMDD, YYYY-MM-DD, or a date object)."""
    from datetime import date, datetime  # local import: not part of this cell's import block

    if isinstance(value, date):  # also covers datetime, a subclass of date
        return value.strftime("%Y-%m-%d")

    text = str(value).strip()
    for fmt in ("%Y%m%d", "%Y-%m-%d"):
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    raise ValueError(f"Unrecognised date {value!r}; expected YYYYMMDD or YYYY-MM-DD")


def load_google_data(kw_list = ["IBM"],
                     start_date = START_DATE,
                     end_date = END_DATE,
                     tz=360,
                     pause=8):
    """Fetch Google Trends interest-over-time for up to five keywords (US only).

    Args:
        kw_list: Keywords to query; a single string is treated as one keyword.
        start_date: First day of the window, as YYYYMMDD or YYYY-MM-DD.
        end_date: Last day of the window, in the same formats.
        tz: Timezone offset passed to TrendReq (360 is US CST).
        pause: Seconds to wait after creating the client and again after
            building the payload, to space out requests to Google.

    Returns:
        The DataFrame produced by pytrends' interest_over_time().

    Raises:
        ValueError: if kw_list is empty or has more than five entries, if a
            date cannot be parsed, or if start_date is after end_date.
    """
    # A bare string would otherwise be measured and iterated character by character.
    if isinstance(kw_list, str):
        kw_list = [kw_list]
    keywords = list(kw_list)  # copy so the caller's list (and the default) is never shared

    if not keywords:
        raise ValueError("kw_list must contain at least one key word")
    if len(keywords) > 5:
        raise ValueError("kw_list must be 5 or less key words")

    # The shared config dates are YYYYMMDD; pytrends wants 'YYYY-MM-DD YYYY-MM-DD'.
    window_start = _to_trends_date(start_date)
    window_end = _to_trends_date(end_date)
    if window_start > window_end:
        raise ValueError(
            f"start_date ({window_start}) must not be after end_date ({window_end})"
        )

    # Add a realistic browser header so Google does not block you
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
    }

    pytrends = TrendReq(hl='en-US',
                        tz=tz,  # tz: timezone offset (360 is US CST)
                        timeout=(10, 25),
                        retries=3,
                        backoff_factor=0.4,
                        requests_args={"headers": headers}
                        )
    time.sleep(pause)

    # Build request payload
    pytrends.build_payload(keywords,
                           cat=0,
                           timeframe=f'{window_start} {window_end}',
                           geo='US',
                           gprop='')
    time.sleep(pause)

    # return df of historical data for when the keyword was searched most
    return pytrends.interest_over_time()


# GOOGLE
google_df = load_google_data(kw_list = ["IBM"], tz=360)

# Save to CSV if needed. The index is written out because interest_over_time()
# returns the dates as the DataFrame index; index=False would drop them.
google_df.to_csv("google_df.csv", index=True)




Process yfinance


Process Google Trends


Process NYT


Merge Data


Sentiment Analysis


EDA


Feature Engineering


Modeling