# Historical Sentiment Generation with FinBERT

This notebook runs FinBERT inference on historical news headlines to generate sentiment scores.

**Requirements:**
- GPU runtime recommended (Runtime > Change runtime type > T4 GPU)
- Expect roughly 5 minutes per 100k headlines on a T4 GPU (considerably longer on CPU)

**Usage:**
1. Upload your `news_combined.csv` to Colab (it must contain `headline`, `date`, and `ticker` columns)
2. Run all cells
3. Download `historical_sentiment.csv`

In [1]:
# Install dependencies
!pip install -q transformers torch pandas numpy

In [None]:
# Report whether torch can see a CUDA device, and which one.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Index 0 is the first visible CUDA device.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from tqdm import tqdm

# Configuration
MODEL_NAME = "ProsusAI/finbert"  # model id handed to from_pretrained()
BATCH_SIZE = 32  # headlines per forward pass

# Prefer the GPU whenever torch reports one; otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")

In [None]:
# Load the FinBERT tokenizer and classifier, then prepare the model for inference.
print("Loading FinBERT model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Chained form: move the weights to DEVICE, then switch to eval mode.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE).eval()
print("Model loaded!")

In [None]:
def predict_sentiment_batch(texts, batch_size=32):
    """Run FinBERT inference over ``texts`` in mini-batches.

    Uses the notebook-level ``tokenizer``, ``model`` and ``DEVICE``.

    Args:
        texts: Iterable of strings to score. It is materialised into a list,
            so a pandas Series or a generator is accepted as well.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        A list with one dict per input text, in input order, with keys:
          - "sentiment_score": P(positive) - P(negative)
          - "sentiment_label": label of the highest-probability class
          - "confidence": probability of that class

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # FinBERT: positive=0, negative=1, neutral=2
    # Built once here instead of on every batch iteration.
    label_map = {0: "positive", 1: "negative", 2: "neutral"}

    # The tokenizer call below expects a plain list of strings.
    texts = list(texts)
    results = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Processing"):
        batch = texts[i:i + batch_size]

        # Tokenize (pad to the longest text in the batch, truncate at 512 tokens)
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(DEVICE)

        # Inference
        with torch.no_grad():
            outputs = model(**inputs)
            probs = F.softmax(outputs.logits, dim=-1)

        # Move the whole batch to host memory once; calling .item() on each
        # element would trigger a separate device sync per value.
        probs = probs.cpu()
        pred_indices = probs.argmax(dim=-1).tolist()
        prob_rows = probs.tolist()

        for pred_idx, row in zip(pred_indices, prob_rows):
            # Weighted score: positive=1, negative=-1, neutral=0
            score = row[0] - row[1]

            results.append({
                "sentiment_score": score,
                "sentiment_label": label_map[pred_idx],
                "confidence": row[pred_idx],
            })

    return results

In [None]:
# Upload your news file or use sample data
# Option 1: Upload file
# from google.colab import files
# uploaded = files.upload()
# news_df = pd.read_csv(list(uploaded.keys())[0])

# Option 2: Load from path
NEWS_FILE = "news_combined.csv"  # Update this path

# Columns the later cells read: "headline" is scored by FinBERT,
# "date" and "ticker" are the keys of the daily aggregation.
REQUIRED_COLUMNS = ("headline", "date", "ticker")

try:
    news_df = pd.read_csv(NEWS_FILE)
except FileNotFoundError:
    print(f"File not found: {NEWS_FILE}")
    print("Please upload your news file or update the path.")
    # Re-raise so "Run all" stops here instead of failing later with a
    # NameError, or silently reusing a stale news_df from an earlier run.
    raise

# Fail fast, before the expensive inference step, if the schema is wrong.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in news_df.columns]
if missing_columns:
    raise ValueError(
        f"{NEWS_FILE} is missing required column(s): {missing_columns}; "
        f"found columns: {list(news_df.columns)}"
    )

print(f"Loaded {len(news_df)} headlines")
print(news_df.head())

In [None]:
# Score every headline with FinBERT; missing headlines are replaced by "".
headline_column = news_df["headline"].fillna("")
headlines = headline_column.tolist()
print(f"Processing {len(headlines)} headlines...")

results = predict_sentiment_batch(headlines, batch_size=BATCH_SIZE)

# Copy each result field into its dataframe column (result key -> column name).
column_for_key = {
    "sentiment_score": "sentiment_score",
    "sentiment_label": "sentiment_label",
    "confidence": "sentiment_confidence",
}
for result_key, column_name in column_for_key.items():
    news_df[column_name] = [row[result_key] for row in results]

print("\nSentiment distribution:")
label_counts = news_df["sentiment_label"].value_counts()
print(label_counts)

In [None]:
# Aggregate to daily sentiment per ticker
def aggregate_daily(df, pos_threshold=0.1, neg_threshold=-0.1):
    """Collapse per-headline sentiment into one row per (date, ticker).

    Args:
        df: DataFrame with "date", "ticker" and "sentiment_score" columns.
        pos_threshold: A headline counts as positive when its score is
            strictly greater than this value.
        neg_threshold: A headline counts as negative when its score is
            strictly less than this value.

    Returns:
        DataFrame sorted by date then ticker, with columns:
          - date, ticker: the group keys
          - sentiment_score: mean score of the group
          - sentiment_std: standard deviation of the score (0 when undefined,
            e.g. for a group with a single headline)
          - news_count: number of non-null scores in the group
          - positive_ratio / negative_ratio: share of the group's rows whose
            score is beyond the respective threshold

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    output_columns = [
        "date", "ticker", "sentiment_score", "sentiment_std",
        "news_count", "positive_ratio", "negative_ratio",
    ]

    missing = [c for c in ("date", "ticker", "sentiment_score") if c not in df.columns]
    if missing:
        raise KeyError(f"aggregate_daily: missing required column(s): {missing}")

    # Nothing to aggregate: return an empty frame with the expected schema.
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # Work on a private copy so the caller's frame is never modified.
    work = df[["date", "ticker", "sentiment_score"]].copy()
    scores = work["sentiment_score"]
    # 0/1 indicator columns: their per-group mean is exactly the ratio
    # (flagged rows / all rows in the group). NaN scores compare False.
    work["_is_positive"] = (scores > pos_threshold).to_numpy(dtype=float)
    work["_is_negative"] = (scores < neg_threshold).to_numpy(dtype=float)

    # One vectorized pass instead of a per-group Python apply() plus a merge.
    daily = (
        work.groupby(["date", "ticker"])
        .agg(
            sentiment_score=("sentiment_score", "mean"),
            sentiment_std=("sentiment_score", "std"),
            news_count=("sentiment_score", "count"),
            positive_ratio=("_is_positive", "mean"),
            negative_ratio=("_is_negative", "mean"),
        )
        .reset_index()
    )

    # std is NaN for groups with fewer than two scores; report 0 instead.
    daily["sentiment_std"] = daily["sentiment_std"].fillna(0)

    return daily.sort_values(["date", "ticker"]).reset_index(drop=True)

# Build the per-(date, ticker) table and preview its first ten rows.
daily_sentiment = aggregate_daily(news_df)
daily_record_count = len(daily_sentiment)
print(f"\nDaily sentiment records: {daily_record_count}")
daily_preview = daily_sentiment.head(10)
print(daily_preview)

In [None]:
# Write the daily aggregates to CSV (without the index column).
OUTPUT_FILE = "historical_sentiment.csv"
daily_sentiment.to_csv(OUTPUT_FILE, index=False)
print(f"Saved to {OUTPUT_FILE}")

# Summary banner: a blank line, then the report framed by 60-character rules.
rule = "=" * 60
date_column = daily_sentiment["date"]
summary_lines = [
    "",
    rule,
    "SENTIMENT GENERATION COMPLETE",
    rule,
    f"Headlines processed: {len(news_df)}",
    f"Daily records: {len(daily_sentiment)}",
    f"Date range: {date_column.min()} to {date_column.max()}",
    f"Tickers: {daily_sentiment['ticker'].nunique()}",
    f"Unique dates: {date_column.nunique()}",
    rule,
]
print("\n".join(summary_lines))

In [None]:
# Download results (Colab)
# from google.colab import files
# files.download(OUTPUT_FILE)