In [None]:
import pandas as pd

CSV file saved as SP500_news_2017_2025.csv


In [2]:
# --- FinBERT scoring: article-level sentiment -> daily aggregates ---

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------
# CONFIG
# -----------------------
# Input / output locations (all CSV).
IN_CSV = "../data/news_2017-2025.csv"              # raw news articles to score
OUT_SCORED = "finbert_scored_news_2017_2025.csv"   # one row per scored article
OUT_DAILY = "daily_sentiment_news_2017_2025.csv"   # one row per day

# Columns read from the input file; change these if the schema differs.
TEXT_COL, DATE_COL = "content", "date"

# Scoring / aggregation knobs.
MODEL_NAME = "ProsusAI/finbert"   # checkpoint passed to from_pretrained
MAX_TOKENS = 256                  # tokenizer truncation length per article
BATCH_SIZE = 32                   # articles per forward pass
MAX_ARTICLES_PER_DAY = 20         # per-day cap applied before aggregation

# -----------------------
# Load data
# -----------------------
df = pd.read_csv(IN_CSV)

# Fail fast, showing the actual header, if an expected column is missing.
for required_col in (TEXT_COL, DATE_COL):
    if required_col not in df.columns:
        raise ValueError(f"Expected '{required_col}' column. Found: {df.columns.tolist()}")

# Parse datetime -> daily key (kept as YYYY-MM-DD string for clean CSV).
# errors="coerce" turns unparseable dates into missing values, which the
# dropna below removes together with rows that have no text.
dt = pd.to_datetime(df[DATE_COL], errors="coerce", utc=True)
df["date"] = dt.dt.strftime("%Y-%m-%d")

df = df.dropna(subset=["date", TEXT_COL]).reset_index(drop=True)
df[TEXT_COL] = df[TEXT_COL].astype(str)

# Blank / whitespace-only articles contain nothing to classify; scoring them
# would still produce probabilities and feed them into the daily aggregates.
has_text = df[TEXT_COL].str.strip().ne("")
n_blank = int((~has_text).sum())
if n_blank:
    print(f"Dropping {n_blank} rows with blank '{TEXT_COL}'")
    df = df.loc[has_text].reset_index(drop=True)

# Stop here with a clear message instead of failing later in the scoring
# loop (np.concatenate on an empty list) or in the per-day average.
if df.empty:
    raise ValueError(
        f"No rows left to score in '{IN_CSV}' after dropping rows with "
        f"unparseable '{DATE_COL}' or empty '{TEXT_COL}'."
    )

print("Rows to score:", len(df), "| date range:", df["date"].min(), "->", df["date"].max())
print("Columns:", df.columns.tolist())

# -----------------------
# Device
# -----------------------
# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    # torch.backends.mps may not exist on every torch build, hence the getattr.
    _mps_backend = getattr(torch.backends, "mps", None)
    if _mps_backend and _mps_backend.is_available():
        device = "mps"
print("Device:", device)

# -----------------------
# Load FinBERT
# -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # evaluation mode; the model is only used for scoring here

# Read the class order from the checkpoint config instead of hard-coding it:
# invert id -> label into a case-insensitive label -> id lookup.
id2label = model.config.id2label
label2id = {label.lower(): idx for idx, label in id2label.items()}

def get_label_id(name: str) -> int:
    """Return the class index for a label name (case-insensitive).

    Looks the lower-cased name up in the module-level ``label2id`` mapping.

    Raises:
        ValueError: if the label is not present; the message lists the
            module-level ``id2label`` mapping.
    """
    name = name.lower()
    if name in label2id:
        return label2id[name]
    raise ValueError(f"Label '{name}' not found. Available: {id2label}")

# Class indices used to slice the probability columns in finbert_batch.
pos_id, neg_id, neu_id = map(get_label_id, ("positive", "negative", "neutral"))

@torch.no_grad()
def finbert_batch(texts):
    """Score one batch of texts with the loaded FinBERT model.

    Args:
        texts: List (or other sized sequence) of strings to classify.
            An empty sequence is allowed and yields empty arrays.

    Returns:
        Tuple ``(p_pos, p_neg, p_neu, score)`` of 1-D NumPy arrays with one
        entry per input text: the softmax probabilities of the positive,
        negative and neutral classes, and ``score = p_pos - p_neg``.
    """
    # Empty batch: return matching empty outputs rather than handing an
    # empty list to the tokenizer. A bare string is left to the tokenizer,
    # exactly as before.
    if not isinstance(texts, str) and len(texts) == 0:
        # float32 assumed to match the softmax output of the default-precision model.
        empty = np.empty(0, dtype=np.float32)
        return empty, empty.copy(), empty.copy(), empty.copy()

    enc = tokenizer(
        texts,
        padding=True,           # pad to the longest text in this batch
        truncation=True,        # cut texts longer than MAX_TOKENS
        max_length=MAX_TOKENS,
        return_tensors="pt"
    )
    # Move every input tensor to the same device as the model.
    enc = {k: v.to(device) for k, v in enc.items()}
    logits = model(**enc).logits
    # Softmax over the class axis, then back to CPU/NumPy for pandas.
    probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()

    p_pos = probs[:, pos_id]
    p_neg = probs[:, neg_id]
    p_neu = probs[:, neu_id]
    score = p_pos - p_neg
    return p_pos, p_neg, p_neu, score

# -----------------------
# Score all articles
# -----------------------
# One list of per-batch arrays for each output of finbert_batch, in the
# order it returns them: p_pos, p_neg, p_neu, score.
p_pos_all, p_neg_all, p_neu_all, score_all = [], [], [], []
collectors = (p_pos_all, p_neg_all, p_neu_all, score_all)

article_texts = df[TEXT_COL]
for start in tqdm(range(0, len(df), BATCH_SIZE), desc=f"FinBERT scoring ({device})"):
    batch_outputs = finbert_batch(article_texts.iloc[start:start + BATCH_SIZE].tolist())
    for collector, values in zip(collectors, batch_outputs):
        collector.append(values)

# Stitch the per-batch arrays back into one column per output.
for column, chunks in zip(("p_pos", "p_neg", "p_neu", "sent_score"), collectors):
    df[column] = np.concatenate(chunks)

# Discrete label = class with the highest probability. Columns are ordered
# positive, neutral, negative, so argmax index 0/1/2 maps to those names.
label_names = np.array(["positive", "neutral", "negative"])
prob_mat = np.column_stack([df["p_pos"].values, df["p_neu"].values, df["p_neg"].values])
label_idx = prob_mat.argmax(axis=1)
df["sent_label"] = label_names[label_idx]

# Persist the article-level scores as CSV (no index column).
df.to_csv(OUT_SCORED, index=False)
print(f"Saved scored articles -> {OUT_SCORED} (rows={len(df)})")

# -----------------------
# Keep max N articles/day (strongest signal by |sent_score|)
# -----------------------
# Rank articles inside each day by |sent_score| (largest first) and keep
# only the first MAX_ARTICLES_PER_DAY of every day.
ranked = (
    df.assign(abs_score=df["sent_score"].abs())
      .sort_values(["date", "abs_score"], ascending=[True, False])
)
rank_in_day = ranked.groupby("date").cumcount()
keep_mask = (rank_in_day < MAX_ARTICLES_PER_DAY).to_numpy()
df_day = ranked[keep_mask].drop(columns=["abs_score"])

avg_per_day = len(df_day) / df_day["date"].nunique()
print("After per-day cap:", len(df_day), "rows | avg/day:", avg_per_day)

# -----------------------
# Daily aggregation
# -----------------------
def _label_share(label):
    """Build an aggregator giving the fraction of a group's labels equal to `label`."""
    return lambda labels: float(np.mean(labels == label))


# Output column -> (source column, aggregation); dict order is column order.
daily_spec = {
    "sent_mean": ("sent_score", "mean"),
    "sent_median": ("sent_score", "median"),
    "sent_std": ("sent_score", "std"),
    "news_count": ("sent_score", "size"),
    "frac_neg": ("sent_label", _label_share("negative")),
    "frac_pos": ("sent_label", _label_share("positive")),
    "frac_neu": ("sent_label", _label_share("neutral")),
}

daily = (
    df_day.groupby("date")
    .agg(**daily_spec)
    .reset_index()
)

# std is NaN when a day has a single article (news_count == 1); report 0.0.
daily["sent_std"] = daily["sent_std"].fillna(0.0)

# Persist the daily aggregates as CSV (no index column).
daily.to_csv(OUT_DAILY, index=False)
print(f"Saved daily sentiment -> {OUT_DAILY} (days={len(daily)})")

daily.head()

Rows to score: 21159 | date range: 2017-01-03 -> 2025-12-31
Columns: ['date', 'title', 'content', 'link', 'symbols', 'tags', 'sentiment']
Device: mps


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: ProsusAI/finbert
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


FinBERT scoring (mps):   0%|          | 0/662 [00:00<?, ?it/s]

Saved scored articles -> finbert_scored_news_2017_2025.csv (rows=21159)
After per-day cap: 12063 rows | avg/day: 6.788407428249859
Saved daily sentiment -> daily_sentiment_news_2017_2025.csv (days=1777)


Unnamed: 0,date,sent_mean,sent_median,sent_std,news_count,frac_neg,frac_pos,frac_neu
0,2017-01-03,0.042987,0.042987,0.0,1,0.0,1.0,0.0
1,2017-01-04,0.289485,0.289485,0.0,1,0.0,0.0,1.0
2,2017-01-05,0.394566,0.457251,0.167108,6,0.0,0.333333,0.666667
3,2017-01-18,0.052208,0.052208,0.0,1,0.0,0.0,1.0
4,2017-01-19,0.291051,0.291051,0.0,1,0.0,0.0,1.0
