# Enhanced Sentiment Analysis Pipeline (One-Cell Version)

This notebook is the streamlined replacement for the original 3k-line plan. It calls helper functions from `sentiment_utils.py`, keeping the notebook easy to read, debug, and rerun.

In [None]:
from sentiment_utils import (
    collect_yahoo_news,
    collect_google_news_monthly,
    collect_reddit_sentiment,
    load_finbert_pipeline,
    analyze_sentiment_finbert_enhanced,
    aggregate_monthly_sentiment_enhanced,
    validate_sentiment_vs_returns,
)

import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------
# Portfolio tickers
# ------------------------------------------------------------------
# Maps each ticker symbol to its company name. Both this mapping and the
# derived ticker list are passed to the news/Reddit collectors further down,
# so each name must describe the same company as its ticker key
# (presumably the names feed search queries -- confirm in sentiment_utils).
#
# NOTE(review): INGM and CHYM previously carried the names of Inogen (which
# trades as INGN) and Anterix (which trades as ATEX). The names were corrected
# to match the ticker keys; if those other companies were actually intended,
# change the keys instead.
portfolio_assets = {
    'RDDT': 'Reddit Inc',
    'NVDA': 'NVIDIA Corporation',
    'SMR': 'NuScale Power Corporation',
    'MU': 'Micron Technology Inc',
    'MRVL': 'Marvell Technology Group',
    'MSFT': 'Microsoft Corporation',
    'ASML': 'ASML Holding NV',
    'AEM': 'Agnico Eagle Mines Ltd',
    'AMD': 'Advanced Micro Devices',
    'VERU': 'Veru Inc',
    'AI': 'C3.ai Inc',
    'GOOGL': 'Alphabet Inc (Google)',
    'INGM': 'Ingram Micro Holding Corporation',
    'PLUG': 'Plug Power Inc',
    'IONQ': 'IonQ Inc',
    'CHYM': 'Chime Financial Inc',
    'RGTI': 'Rigetti Computing Inc',
    'ARBE': 'Arbe Robotics Ltd',
}

# Ticker symbols in the same (insertion) order as the mapping above.
tickers = list(portfolio_assets)

# ==================================================================
# Step 1 -- FinBERT
# ==================================================================
# The object returned here is handed to both the Reddit collector and the
# sentiment scorer below.
finbert = load_finbert_pipeline()

# ==================================================================
# Step 2 -- Gather raw text from Yahoo, Google News and Reddit
# ==================================================================
# Yahoo is requested from 2024 onward; Google News and Reddit from 2020.
# NOTE(review): the original heading says the collectors cache their
# results -- that happens (if at all) inside sentiment_utils; confirm there.
yahoo_df = collect_yahoo_news(tickers, portfolio_assets, start_year=2024)
google_df = collect_google_news_monthly(
    tickers,
    portfolio_assets,
    start_year=2020,
    news_per_month=20,
)
reddit_df = collect_reddit_sentiment(
    tickers,
    portfolio_assets,
    finbert,
    start_year=2020,
    posts_per_ticker=100,
)

# Stack the three source frames and renumber the rows from zero.
source_frames = [yahoo_df, google_df, reddit_df]
all_news = pd.concat(source_frames, ignore_index=True)
print(f'Articles collected: {len(all_news):,}')

# ==================================================================
# Step 3 -- Score every item, then roll up to monthly figures
# ==================================================================
sentiment_df = analyze_sentiment_finbert_enhanced(all_news, finbert)
monthly_df = aggregate_monthly_sentiment_enhanced(sentiment_df)

# Write the monthly table to disk so the downstream RL notebooks can read it.
# The directory is created on first run; an existing one is left alone.
output_dir = Path('outputs')
output_dir.mkdir(exist_ok=True)
monthly_df.to_csv('outputs/enhanced_monthly_sentiment.csv', index=False)
print('✓ Monthly sentiment saved → outputs/enhanced_monthly_sentiment.csv')

# ==================================================================
# Step 4 -- Optional check of sentiment against log-returns
# ==================================================================
# The helper returns a pair; only the first element is kept. The bare
# expression on the last line is what the notebook renders as cell output.
returns_csv = 'outputs/log_returns_data.csv'
summary, _ = validate_sentiment_vs_returns(monthly_df, returns_csv)
summary.head()