In [15]:
from pathlib import Path
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# The notebook is expected to live in spy-ann/notebooks, so the project root
# is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent

# Input / output data folders under the project root.
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Raw headline file read by this notebook and the feature file it produces.
NEWS_CSV = DATA_RAW.joinpath("sp500_news_headlines.csv")
OUT_PATH = DATA_PROCESSED.joinpath("news_daily_features.parquet")

# Echo the resolved locations so a wrong working directory is spotted early.
for _label, _location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("DATA_RAW:", DATA_RAW),
    ("DATA_PROCESSED:", DATA_PROCESSED),
):
    print(_label, _location)
print("NEWS_CSV exists:", NEWS_CSV.exists())



PROJECT_ROOT: C:\Users\KDP only\Documents\ANN_Final_Project\spy-ann
DATA_RAW: C:\Users\KDP only\Documents\ANN_Final_Project\spy-ann\data\raw
DATA_PROCESSED: C:\Users\KDP only\Documents\ANN_Final_Project\spy-ann\data\processed
NEWS_CSV exists: True


In [16]:
# Load the raw headline CSV into a DataFrame (all pandas defaults, so the
# "Date" column is still plain text at this point).
df_news = pd.read_csv(NEWS_CSV)
# Quick sanity check of what was loaded: first rows, column names, row/col count.
print("=== RAW NEWS PREVIEW ===")
display(df_news.head())
print("Columns:", df_news.columns)
print("Shape:", df_news.shape)


=== RAW NEWS PREVIEW ===


Unnamed: 0,Title,Date,CP
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16
2,2008 predictions for the S&P 500,2008-01-02,1447.16
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18


Columns: Index(['Title', 'Date', 'CP'], dtype='object')
Shape: (19127, 3)


In [17]:
# Parse the raw "Date" text into midnight-normalized Timestamps in a single pass
# (no detour through object-dtype datetime.date values).
# errors="coerce" turns missing/unparseable values into NaT so that the dropna
# below really removes them; without it to_datetime raises on the first bad
# value and the dropna never gets a chance to act.
# NOTE(review): assumes "Date" carries no timezone information — confirm.
parsed_dates = pd.to_datetime(df_news["Date"], errors="coerce").dt.normalize()

# Make silent row loss visible; prints nothing when every date parsed.
n_bad_dates = int(parsed_dates.isna().sum())
if n_bad_dates:
    print(f"Dropping {n_bad_dates} rows with missing/unparseable dates")

# assign() builds a new frame, so re-running this cell always starts from "Date".
df_news = df_news.assign(date=parsed_dates).dropna(subset=["date"])

print("Date range:", df_news["date"].min(), "→", df_news["date"].max())


Date range: 2008-01-02 00:00:00 → 2024-03-04 00:00:00


In [18]:
# Analysis window; both bounds are inclusive.
START_DATE = pd.Timestamp("2010-01-01")   # or "2018-01-01"
END_DATE = pd.Timestamp("2024-12-31")

# Keep only headlines dated inside the window and renumber the rows from 0.
mask = df_news["date"].between(START_DATE, END_DATE)
df_news = df_news[mask].reset_index(drop=True)

print("After filter:")
print("Date range:", df_news["date"].min(), "→", df_news["date"].max())
print("Rows:", len(df_news))


After filter:
Date range: 2010-01-05 00:00:00 → 2024-03-04 00:00:00
Rows: 18873


In [19]:
# Single VADER analyzer instance, created once and reused by
# compute_vader_sent for every headline.
analyzer = SentimentIntensityAnalyzer()

def compute_vader_sent(text):
    """Return the VADER compound score for one piece of text.

    Parameters
    ----------
    text : object
        Value to score. Anything that is not a string, or a string that is
        empty / whitespace-only, is treated as missing.

    Returns
    -------
    float
        The "compound" entry of ``analyzer.polarity_scores(text)``, or
        ``np.nan`` when ``text`` is missing as described above.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return analyzer.polarity_scores(text)["compound"]
    return np.nan

# Score every headline; entries that compute_vader_sent treats as missing
# (non-string or blank titles) end up as NaN in "sent_compound".
df_news["sent_compound"] = df_news["Title"].apply(compute_vader_sent)

# Eyeball a few scored rows.
print("Sentiment preview:")
display(df_news[["date", "Title", "sent_compound"]].head())


Sentiment preview:


Unnamed: 0,date,Title,sent_compound
0,2010-01-05,Average P/E Ratio by Decade,0.0
1,2010-01-05,Top picks from top pros - Watching the cash fl...,0.3818
2,2010-01-05,Berkshire Hathaway Pales Against 2009 Stock Ma...,0.0
3,2010-01-06,Why Can't We Let Too-Big-to-Fail Companies Fail?,-0.5423
4,2010-01-07,Five Stocks for the Long Term Investor,0.0


In [20]:
# Cut-offs for calling a headline clearly positive / negative; these match the
# +/-0.05 compound-score thresholds commonly used with VADER.
POS_THRESHOLD = 0.05
NEG_THRESHOLD = -0.05

sent = df_news["sent_compound"]
has_score = sent.notna()

# Per-headline 0/1 indicators. Headlines without a sentiment score are set to
# NaN (not 0) so they are excluded from the daily fractions exactly as they are
# excluded from the mean/std/count below. Comparing NaN directly would yield
# False and silently inflate the denominator of the fractions.
daily_input = df_news[["date", "sent_compound"]].assign(
    is_pos=(sent > POS_THRESHOLD).astype(float).where(has_score),
    is_neg=(sent < NEG_THRESHOLD).astype(float).where(has_score),
)

# One pass over the groups: base daily stats plus the fraction of clearly
# positive / negative headlines among the scored headlines of each day.
agg = daily_input.groupby("date").agg(
    news_sent_mean=("sent_compound", "mean"),
    news_sent_std=("sent_compound", "std"),
    news_n_headlines=("sent_compound", "count"),
    news_frac_pos=("is_pos", "mean"),
    news_frac_neg=("is_neg", "mean"),
)

# std is NaN when a day has fewer than two scored headlines; the fractions are
# NaN when a day has no scored headline at all. Use 0.0 in both cases so these
# feature columns contain no missing values.
zero_fill_cols = ["news_sent_std", "news_frac_pos", "news_frac_neg"]
agg[zero_fill_cols] = agg[zero_fill_cols].fillna(0.0)

# back to column
agg = agg.reset_index()

print("=== DAILY NEWS FEATURES PREVIEW ===")
display(agg.head())
print("Shape:", agg.shape)


=== DAILY NEWS FEATURES PREVIEW ===


Unnamed: 0,date,news_sent_mean,news_sent_std,news_n_headlines,news_frac_pos,news_frac_neg
0,2010-01-05,0.127267,0.220432,3,0.333333,0.0
1,2010-01-06,-0.5423,0.0,1,0.0,1.0
2,2010-01-07,0.0,0.0,1,0.0,0.0
3,2010-01-11,-0.1779,0.0,1,0.0,1.0
4,2010-01-13,-0.20095,0.284186,2,0.0,0.5


Shape: (3311, 6)


In [21]:
# Create the output folder if it is missing (no error if it already exists),
# then write the daily feature table as Parquet without the DataFrame index.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
agg.to_parquet(OUT_PATH, index=False)
# Bare last expression so the notebook echoes where the file was written.
OUT_PATH


WindowsPath('C:/Users/KDP only/Documents/ANN_Final_Project/spy-ann/data/processed/news_daily_features.parquet')