# Sentiment Analysis Pipeline

## 1. Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import re
from tqdm import tqdm
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 2. Path Configuration

In [None]:
# Path configuration (portable)
# BASE: project root, i.e. the parent of the directory that holds this file.
try:
    BASE = Path(__file__).resolve().parents[1]
except NameError:
    # __file__ is not defined in interactive/notebook sessions.
    # NOTE(review): this fallback assumes the working directory is the folder
    # containing the notebook, so its parent is the project root - confirm.
    BASE = Path.cwd().resolve().parent

# RAW: directory holding the input CSV files
RAW = BASE / "data" / "raw"
# PROC: outputs directory (created if missing)
PROC = BASE / "data" / "processed"
PROC.mkdir(parents=True, exist_ok=True)

# Input files
TWEET_CSV = RAW / "Tweet.csv"
MAP_CSV = RAW / "Company_Tweet.csv"

# Output files generated by the pipeline
OUT_TWEET = PROC / "tweet_sentiment.csv"
OUT_TWEET_LABELED = PROC / "tweet_sentiment_labeled.csv"
OUT_COMPANY = PROC / "company_sentiment_daily.csv"
OUT_GLOBAL = PROC / "global_sentiment_daily.csv"
OUT_TRAIN = PROC / "tweet_sentiment_train_2015_2018.csv"
OUT_TEST = PROC / "tweet_sentiment_test_2019_2020.csv"

## 3. Load & Merge Data

In [None]:
def load_and_merge():
    """Read the tweet and ticker-mapping CSVs and join them on ``tweet_id``.

    Tweets are read from ``TWEET_CSV`` with the Python parsing engine, and
    malformed lines are skipped. Rows missing ``body`` or ``post_date`` are
    dropped before the join. The join is a left join, so tweets without a
    mapping row are kept.

    Returns:
        pandas.DataFrame: The tweet rows merged with the columns of
        ``MAP_CSV``.
    """
    tweet_frame = pd.read_csv(TWEET_CSV, engine="python", on_bad_lines="skip")
    ticker_map = pd.read_csv(MAP_CSV)

    # A tweet is only usable when both its text and its timestamp are present
    required_columns = ["body", "post_date"]
    usable_tweets = tweet_frame.dropna(subset=required_columns)

    # Left join keeps every usable tweet, with or without ticker info
    return usable_tweets.merge(ticker_map, on="tweet_id", how="left")

## 4. Text Cleaning

In [None]:
def clean_text(text):
    """Normalize a tweet body into lowercase alphanumeric words.

    The input is converted with ``str()`` and lower-cased, then a fixed
    sequence of regex substitutions is applied in order.

    Args:
        text: Raw tweet text (any object; it is passed through ``str``).

    Returns:
        str: The cleaned text with no leading or trailing whitespace.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),  # URLs (http, https, www)
        (r"@\w+|#\w+", ""),                # mentions and hashtags
        (r"[^a-z0-9\s]", " "),             # anything but a-z, 0-9, whitespace
        (r"\s+", " "),                     # collapse whitespace runs
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

## 5. Device Selection

In [None]:
def choose_device():
    """Return the name of the preferred available torch device.

    Preference order is CUDA, then the MPS backend, then CPU.

    Returns:
        str: One of ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"  # Use the CUDA GPU if available

    # Look the MPS backend up defensively: PyTorch builds that predate it have
    # no ``torch.backends.mps`` attribute, and should fall through to CPU
    # instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"   # Use Apple Silicon GPU (M1/M2/M3/M4)

    return "cpu"       # Default to CPU if no GPU backend is found

## 6. Load FinBERT Model

In [None]:
def load_finbert(device):
    """Load the ProsusAI/finbert tokenizer and classifier for inference.

    Args:
        device: Device the model is moved to (passed to ``model.to``).

    Returns:
        tuple: ``(tokenizer, model)`` with the model on ``device`` and
        switched to evaluation mode.
    """
    checkpoint = "ProsusAI/finbert"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # Place the weights on the selected device (CPU / CUDA / MPS), then
    # switch to evaluation mode for inference
    model.to(device)
    model.eval()

    return tokenizer, model

## 7. Sentiment Prediction

In [None]:
def _sentiment_labels(model):
    """Return the lower-cased class label for each logit index of *model*."""
    id2label = getattr(getattr(model, "config", None), "id2label", None)
    if not id2label:
        # No mapping exposed by the model: fall back to the order published in
        # the ProsusAI/finbert config (0=positive, 1=negative, 2=neutral).
        return ["positive", "negative", "neutral"]
    return [str(id2label[idx]).lower() for idx in sorted(id2label)]


def predict_sentiment(texts, tokenizer, model, device, batch_size=32):
    """Score texts with a sequence-classification model, batch by batch.

    Column names come from the model's own ``config.id2label`` mapping
    rather than from hard-coded logit positions, so each probability is
    attached to the class it actually belongs to.

    Args:
        texts: List of strings to score; it is sliced into batches.
        tokenizer: Callable that tokenizes a list of strings and returns an
            object with ``.to(device)`` that can be unpacked into the model.
        model: Callable returning an object with a ``.logits`` tensor.
        device: Device the encoded batch is moved to.
        batch_size: Number of texts per forward pass.

    Returns:
        pandas.DataFrame: One row per input text and one probability column
        per class label. ``positive``, ``neutral`` and ``negative`` come
        first when present. The columns exist even when ``texts`` is empty.
    """
    labels = _sentiment_labels(model)
    rows = []  # One list of class probabilities per text

    # Process texts in batches for efficiency
    for start in tqdm(range(0, len(texts), batch_size), desc="FinBERT inference"):
        batch = texts[start:start + batch_size]

        # Tokenize and move to device
        encoded = tokenizer(batch, padding=True, truncation=True,
                            return_tensors="pt").to(device)

        with torch.no_grad():  # Disable gradient computation (inference mode)
            logits = model(**encoded).logits
            scores = torch.softmax(logits, dim=1).cpu().numpy()  # Probabilities

        rows.extend(scores.tolist())

    preds = pd.DataFrame(rows, columns=labels)

    # Keep the historical column order for the three sentiment classes
    leading = [name for name in ("positive", "neutral", "negative") if name in labels]
    trailing = [name for name in labels if name not in leading]
    return preds[leading + trailing]

## 8. Daily Aggregation

In [None]:
def aggregate(df):
    """Average the sentiment scores per day, per company and overall.

    Args:
        df: DataFrame with ``ticker_symbol``, ``date``, ``positive``,
            ``neutral`` and ``negative`` columns.

    Returns:
        tuple: ``(company, global_)``. ``company`` has one row per
        ticker and date, built only from rows that have a ticker.
        ``global_`` has one row per date, built from every row of ``df``.
    """
    score_columns = ["positive", "neutral", "negative"]

    def daily_mean(frame, keys):
        # Mean of the three score columns for each group, keys back as columns
        grouped = frame.groupby(keys)[score_columns]
        return grouped.mean().reset_index()

    # Company-level daily sentiment: rows without a ticker cannot be attributed
    with_ticker = df.dropna(subset=["ticker_symbol"])
    company = daily_mean(with_ticker, ["ticker_symbol", "date"])

    # Global daily sentiment
    global_ = daily_mean(df, "date")

    return company, global_

## 9. Full Pipeline Execution

In [None]:
def main():
    """Run the full sentiment pipeline and write every output CSV.

    Steps: load and merge the raw CSVs, clean the tweet text, convert
    ``post_date`` to a calendar date, keep 2015-2020, score each tweet with
    FinBERT, derive a label and a polarity, then write the tweet-level
    files, the daily aggregates and the train/test splits.

    Raises:
        ValueError: If no tweets remain after cleaning and date filtering.
    """
    # Study window and the train/test boundary inside it
    window_start = pd.to_datetime("2015-01-01").date()
    train_end = pd.to_datetime("2018-12-31").date()
    test_start = pd.to_datetime("2019-01-01").date()
    window_end = pd.to_datetime("2020-12-31").date()

    df = load_and_merge()  # Load Tweet.csv + Company_Tweet.csv

    # Clean text
    df["clean_text"] = df["body"].apply(clean_text)
    # Remove empty cleaned tweets; copy so the column assignment below writes
    # to an independent frame instead of a filtered slice
    df = df[df["clean_text"] != ""].copy()

    # Convert Unix timestamp (seconds) to a UTC calendar date. Coerce to
    # numeric first so values read as strings are still treated as epoch
    # seconds; anything unparsable becomes NaT and is dropped below.
    epoch_seconds = pd.to_numeric(df["post_date"], errors="coerce")
    df["date"] = pd.to_datetime(epoch_seconds, unit="s", utc=True,
                                errors="coerce").dt.date
    df = df.dropna(subset=["date"])  # Remove rows with invalid dates

    # Keep only 2015-2020
    df = df[(df["date"] >= window_start) & (df["date"] <= window_end)]
    df = df.reset_index(drop=True)

    # Fail early with a clear message, before the model is loaded
    if df.empty:
        raise ValueError("No tweets left after cleaning and date filtering.")

    # Select device and load FinBERT
    device = choose_device()
    tokenizer, model = load_finbert(device)

    # Sentiment prediction
    preds = predict_sentiment(df["clean_text"].tolist(),
                              tokenizer, model, device)
    preds = preds.reset_index(drop=True)

    # Row alignment check between tweets and predictions
    assert len(df) == len(preds), "Mismatch between tweets and predictions."

    # Add sentiment results (FinBERT scores + labels)
    df = pd.concat([df, preds], axis=1)
    df["sentiment"] = df[["positive", "neutral", "negative"]].idxmax(axis=1)
    # positive -> 1, negative -> -1, anything else (neutral) -> 0
    df["polarity"] = df["sentiment"].map({"positive": 1, "negative": -1}).fillna(0)

    # Save tweet-level outputs
    # NOTE(review): both files receive the same frame - confirm whether
    # OUT_TWEET was meant to omit the label columns.
    df.to_csv(OUT_TWEET, index=False)
    df.to_csv(OUT_TWEET_LABELED, index=False)

    # Daily aggregations
    company_daily, global_daily = aggregate(df)
    company_daily.to_csv(OUT_COMPANY, index=False)
    global_daily.to_csv(OUT_GLOBAL, index=False)

    # Train-test split (2015-2018 vs 2019-2020)
    train = df[(df["date"] >= window_start) & (df["date"] <= train_end)]
    test = df[(df["date"] >= test_start) & (df["date"] <= window_end)]

    train.to_csv(OUT_TRAIN, index=False)
    test.to_csv(OUT_TEST, index=False)

    print("\nAll sentiment files generated successfully.\n")

## 10. Run Pipeline

In [None]:
# Entry point: run the pipeline only when this module's __name__ is
# "__main__" (i.e. it is executed directly rather than imported).
if __name__ == "__main__":
    main()