# Finance Tweet Sentiment → Invest Decisions

This notebook loads `reduced_dataset-release.csv`, scores each tweet with [ProsusAI/finbert](https://huggingface.co/ProsusAI/finbert), and turns the sentiment into an invest / hold / avoid signal for each stock mentioned.


In [1]:
%pip install --quiet --upgrade transformers torch pandas tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pathlib import Path
import os

# Force Transformers to use PyTorch-only (disable TensorFlow/Keras to avoid tf-keras / Keras 3 issues)
os.environ["DISABLE_TF"] = "1"

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

DATA_PATH = Path("./reduced_dataset-release.csv")
MODEL_ID = "ProsusAI/finbert"

pd.options.display.max_colwidth = 140

tqdm.pandas(desc="tweet sentiment")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(DATA_PATH)
# Drop legacy polarity columns if they exist
for col in ["LSTM_POLARITY", "TEXTBLOB_POLARITY"]:
    if col in df.columns:
        df = df.drop(columns=[col])

print(f"Loaded {len(df):,} tweets from {DATA_PATH.name}")
df.head()


Loaded 143,282 tweets from reduced_dataset-release.csv


  df = pd.read_csv(DATA_PATH)


Unnamed: 0.1,Unnamed: 0,TWEET,STOCK,DATE,LAST_PRICE,1_DAY_RETURN,2_DAY_RETURN,3_DAY_RETURN,7_DAY_RETURN,PX_VOLUME,VOLATILITY_10D,VOLATILITY_30D,MENTION
0,0.0,RT @robertoglezcano: @amazon #Patents Show Flying Warehouses That Send Delivery Drones To Your Door https://t.co/cZwL9QLhoC by…,,,,,,,,,,,
1,,Amazon,31/01/2017,823.48,0.008379,0.014924,0.014924,-0.001263,3137196.0,13.447,16.992,1.0,
2,1.0,"@FAME95FM1 Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUrgsqs\r",PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.05480141,9100057.0,18.769,16.099,@PayPal
3,2.0,"@CBSi Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUqYRyU\r",PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.05480141,9100057.0,18.769,16.099,@PayPal
4,3.0,"@Hitz92fm Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUqYRyU\r",PayPal,31/01/2017,39.78,0.002011,0.012318,0.012318,0.05480141,9100057.0,18.769,16.099,@PayPal


In [4]:
# Build FinBERT sentiment pipeline (force PyTorch backend)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

sentiment_pipe = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    framework="pt",  # avoid TensorFlow / Keras
    device=0 if torch.cuda.is_available() else -1,
)

# Quick sanity check
sentiment_pipe("Markets look great today for Tesla!")


Device set to use cpu


[{'label': 'positive', 'score': 0.7951837182044983}]

In [5]:
def score_tweet(text: str):
    """Run a single tweet through FinBERT and return label + score.

    FinBERT is 3-class: positive / negative / neutral
    (see model card: https://huggingface.co/ProsusAI/finbert)
    """
    if pd.isna(text):
        return {"label": "neutral", "score": 0.0}
    snippet = str(text).replace("\r", " ")[:512]
    if not snippet.strip():
        return {"label": "neutral", "score": 0.0}
    return sentiment_pipe(snippet)[0]

hf_scores = df["TWEET"].progress_apply(score_tweet)
df["hf_label"] = hf_scores.apply(lambda r: r["label"].upper())
df["hf_score"] = hf_scores.apply(lambda r: r["score"])

# Map FinBERT sentiment → simple trading signal
label_to_signal = {
    "POSITIVE": "INVEST",
    "NEGATIVE": "AVOID",
    "NEUTRAL": "HOLD",
}

df["invest_signal"] = df["hf_label"].map(label_to_signal).fillna("HOLD")

df[["TWEET", "STOCK", "hf_label", "hf_score", "invest_signal"]].head()


tweet sentiment: 100%|██████████| 143282/143282 [1:57:23<00:00, 20.34it/s]    


Unnamed: 0,TWEET,STOCK,hf_label,hf_score,invest_signal
0,RT @robertoglezcano: @amazon #Patents Show Flying Warehouses That Send Delivery Drones To Your Door https://t.co/cZwL9QLhoC by…,,NEUTRAL,0.903675,HOLD
1,Amazon,31/01/2017,NEUTRAL,0.926334,HOLD
2,"@FAME95FM1 Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUrgsqs\r",PayPal,NEUTRAL,0.926613,HOLD
3,"@CBSi Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUqYRyU\r",PayPal,NEUTRAL,0.917117,HOLD
4,"@Hitz92fm Jamaicans make money with @Payoneer @PayPal, @paxuminc, @ecoPayz and @okpaycom https://t.co/FWzqUqYRyU\r",PayPal,NEUTRAL,0.926954,HOLD


In [10]:
# Optional: aggregate signals per stock
# We keep only the numeric probability columns when computing confidence
signal_cols = ["AVOID", "HOLD", "INVEST"]

stock_view = (
    df.groupby("STOCK")
    .invest_signal.value_counts(normalize=True)
    .rename("signal_share")
    .reset_index()
    .pivot_table(index="STOCK", columns="invest_signal", values="signal_share", fill_value=0.0)
)

present_cols = [c for c in signal_cols if c in stock_view.columns]

stock_view["top_signal"] = stock_view[present_cols].idxmax(axis=1)
stock_view["confidence"] = stock_view[present_cols].max(axis=1)
stock_view = stock_view.sort_values("confidence", ascending=False)

stock_view.head(10)


invest_signal,AVOID,HOLD,INVEST,top_signal,confidence
STOCK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01/02/2017,0.0,1.0,0.0,HOLD,1.0
479.06,0.0,1.0,0.0,HOLD,1.0
47.73,0.0,1.0,0.0,HOLD,1.0
47.76,0.0,1.0,0.0,HOLD,1.0
47.81,0.0,1.0,0.0,HOLD,1.0
47.85,0.0,1.0,0.0,HOLD,1.0
47.86,0.0,1.0,0.0,HOLD,1.0
47.9,0.0,1.0,0.0,HOLD,1.0
47.94,0.0,1.0,0.0,HOLD,1.0
47.96,0.0,1.0,0.0,HOLD,1.0


In [7]:
# Save enriched dataset with FinBERT outputs
output_path = Path("/Users/laurengracias/Desktop/NEXISF25_Finance/reduced_dataset-with-finbert-signals.csv")
df.to_csv(output_path, index=False)
print(f"Wrote scored dataset → {output_path}")


Wrote scored dataset → /Users/laurengracias/Desktop/NEXISF25_Finance/reduced_dataset-with-finbert-signals.csv
