Reference for extracting all labels:
https://datascience.stackexchange.com/questions/112438/how-to-get-all-3-labels-sentiment-from-finbert-instead-of-the-most-likely-label

Essentially, use `AutoModelForSequenceClassification` to get the raw logits for every class and then apply the softmax ourselves.

Normally the pipeline applies the softmax itself and returns ONLY the highest-scoring label.

In [1]:
import multiprocessing

# Number of CPUs reported by multiprocessing.cpu_count(); printed so the run
# records what hardware it had. In the cells visible here it is only referenced
# by a commented-out LocalCluster(...) line.
num_cores = multiprocessing.cpu_count()
print(f"Total CPU cores available: {num_cores}")

Total CPU cores available: 16


In [2]:
# !pip install bokeh

# !pip install pyarrow==10.0.1

In [3]:
import pandas as pd
import dask
import pyarrow
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import duckdb
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from dask.distributed import Client, LocalCluster
import bokeh

# Record the library versions in the cell output so a run can be tied to its
# environment.
print("Pandas version:", pd.__version__)
print("Dask version:", dask.__version__)
print("PyArrow version:", pyarrow.__version__)

Pandas version: 2.2.3
Dask version: 2025.2.0
PyArrow version: 19.0.1


In [4]:
# FinBERT checkpoint used for both the tokenizer and the classifier.
model_name = "yiyanghkust/finbert-tone"

# The tokenizer and the sequence-classification model are loaded from the same
# checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# .eval() returns the module itself, so loading and switching to evaluation
# mode can be chained into one expression.
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()

def classify_sentiment(text):
    """Score one text with FinBERT and return the probability of every class.

    Relies on the module-level ``tokenizer`` and ``model`` objects. The class
    order is taken from ``model.config.id2label`` rather than being hard-coded,
    so each probability is attributed to the class the checkpoint actually
    assigns to that output index.

    Args:
        text: The text to classify. Anything that is not a string, or a string
            that is empty/whitespace-only, is reported as fully neutral without
            running the model.

    Returns:
        dict with keys:
            ``label``    - upper-case name of the most likely class,
            ``score``    - probability of that class,
            ``positive`` / ``neutral`` / ``negative`` - per-class probabilities.
        All numeric values are plain Python floats.
    """
    if not isinstance(text, str) or text.strip() == "":
        return {"label": "NEUTRAL", "score": 1.0, "positive": 0.0, "neutral": 1.0, "negative": 0.0}

    # Let the tokenizer truncate to at most 512 tokens; slicing the raw string
    # by characters (text[:512]) does not bound the token count.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    logits = outputs.logits  # raw scores, one row per input text
    # Softmax over the class dimension; take the single row as Python floats.
    probs = F.softmax(logits, dim=1)[0].tolist()

    # Resolve the class order from the checkpoint's own config. The previous
    # hard-coded ["NEGATIVE", "NEUTRAL", "POSITIVE"] order does not match
    # finbert-tone, whose model card lists 0=neutral, 1=positive, 2=negative.
    id2label = getattr(getattr(model, "config", None), "id2label", None) or {}
    labels = [
        str(id2label.get(idx, id2label.get(str(idx), ""))).upper()
        for idx in range(len(probs))
    ]
    if sorted(labels) != ["NEGATIVE", "NEUTRAL", "POSITIVE"]:
        # Config carries no usable names: fall back to the order documented on
        # the finbert-tone model card.
        labels = ["NEUTRAL", "POSITIVE", "NEGATIVE"]
    sentiment_dict = dict(zip(labels, probs))

    # Index of the highest-probability class.
    max_index = max(range(len(probs)), key=probs.__getitem__)

    return {
        "label": labels[max_index],
        "score": probs[max_index],
        "positive": sentiment_dict["POSITIVE"],
        "neutral": sentiment_dict["NEUTRAL"],
        "negative": sentiment_dict["NEGATIVE"]
    }


**CAN'T START A CLIENT AND CLUSTER BEFORE LOADING FINBERT**

In [5]:
# Try to avoid PyArrow: store pandas strings as Python objects.
pd.options.mode.string_storage = "python"

# Local Dask cluster: 10 single-threaded worker processes.
# Earlier sizing, kept for reference:
# cluster = LocalCluster(n_workers=num_cores//2, threads_per_worker=1)
cluster = LocalCluster(n_workers=10, threads_per_worker=1) # upping to full CPU cores when not using my laptop

# Allow the cluster to scale adaptively between 1 and 10 workers.
# NOTE(review): this is applied on top of the fixed n_workers=10 above, so the
# worker count may drop while idle - confirm that is intended.
cluster.adapt(minimum=1, maximum=10)
client = Client(cluster)

# Register a global dask.diagnostics progress bar.
# NOTE(review): dask.diagnostics.ProgressBar is documented for the
# single-machine schedulers - confirm it reports anything once a distributed
# Client is active; the dashboard link printed below may be the real monitor.
pbar = ProgressBar()
pbar.register()
# Print the dashboard URL and the per-worker core counts reported by the client.
print(client.dashboard_link)
print(client.ncores())

http://127.0.0.1:8787/status
{'tcp://127.0.0.1:54731': 1, 'tcp://127.0.0.1:54732': 1, 'tcp://127.0.0.1:54733': 1, 'tcp://127.0.0.1:54734': 1, 'tcp://127.0.0.1:54735': 1, 'tcp://127.0.0.1:54736': 1, 'tcp://127.0.0.1:54737': 1, 'tcp://127.0.0.1:54738': 1, 'tcp://127.0.0.1:54739': 1, 'tcp://127.0.0.1:54740': 1}


In [6]:
# Open the DuckDB database read-only and pull the SEC item filings into pandas.
con = duckdb.connect(r"..\financial_news.db", read_only=True)

try:
    # try writing to parquet instead
    df = con.execute("SELECT cik, filing_ts, item_filing, type, item_description FROM sp500.sec_item_filings").fetchdf()
finally:
    # Always release the connection, even if the query fails; otherwise the
    # handle stays open for the life of the kernel.
    con.close()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
# Alternatives kept for reference (testing on a subset / other input sources):
# filtered_df = df.head(1000)
# len(filtered_df)
# ddf = dd.read_csv("articles_db.csv", assume_missing=True, dtype={'guid': 'object', 'description': 'object', 'article_title': 'object', 'ticker': 'object'})
# read parquet
# ddf = dd.read_parquet(file_name, engine='pyarrow')

# Build the Dask frame from the in-memory pandas `df`, split into 10
# partitions (the same number as n_workers passed to LocalCluster).
ddf = dd.from_pandas(df, npartitions=10)
# Sanity check: show how many partitions the Dask frame has.
print(ddf.npartitions)


10


In [None]:
def _add_finbert_columns(frame, text_column, prefix):
    """Attach lazy FinBERT sentiment columns for ``text_column`` to a Dask frame.

    Adds ``prefix`` (the per-row result dict from ``classify_sentiment``) and
    one flat column per dict entry: ``prefix_label``, ``prefix_score``,
    ``prefix_positive``, ``prefix_neutral`` and ``prefix_negative``.
    Nothing is computed here; the work runs when the frame is computed/written.

    Args:
        frame: Dask DataFrame to extend (columns are assigned onto it).
        text_column: Name of the column holding the text to classify.
        prefix: Name of the dict column and prefix of the flattened columns.

    Returns:
        The same Dask DataFrame, with the new columns assigned.
    """
    # Per-row sentiment dicts, computed partition by partition.
    frame[prefix] = frame.map_partitions(
        lambda part: part[text_column].apply(classify_sentiment), meta=("x", "object")
    )

    # (dict key, meta dtype) for each flattened column, in output order.
    flattened = (
        ("label", "str"),
        ("score", "float"),
        ("positive", "float"),
        ("neutral", "float"),
        ("negative", "float"),
    )
    for key, dtype in flattened:
        # key=key pins the current key; a plain closure would see only the
        # last loop value when the lazy graph finally runs.
        frame[f"{prefix}_{key}"] = frame.map_partitions(
            lambda part, key=key: part[prefix].apply(lambda result: result[key]),
            meta=("x", dtype),
        )
    return frame


# Enable Progress Bar
with ProgressBar():
    # Process description sentiment
    ddf = _add_finbert_columns(ddf, "item_description", "finbert_description")
    # NOTE(review): without single_file=True Dask writes one part file per
    # partition under this path rather than a single CSV - confirm the layout.
    ddf.to_csv("10k_articles_with_finbert_scores.csv")

# Convert back to Pandas
# df_final = ddf.compute()

# Save results
# df_final.to_csv("articles_with_all_finbert_scores.csv", index=False)


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.




In [None]:
# con.close()  # not needed here: the DuckDB connection is closed right after the query

# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()