In [1]:
import duckdb
import pandas as pd 
import spacy
from nltk.corpus import stopwords
import string
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [2]:
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [3]:
con = duckdb.connect("financial_news.db")

In [4]:
df = con.execute("SELECT cik, filing_ts, item_filing, type, item_description FROM sp500.sec_item_filings").fetchdf()
df.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,cik,filing_ts,item_filing,type,item_description
0,66740,2022-02-09 20:13:29,7,10-K,Item 7. Managements Discussion and Analysis o...
1,66740,2021-02-04 18:53:11,7,10-K,Item 7. Managements Discussion and Analysis o...
2,66740,2020-02-06 21:16:31,7,10-K,Item 7. Managements Discussion and Analysis o...
3,66740,2019-02-07 22:15:37,7,10-K,Item 7. Managements Discussion and Analysis o...
4,66740,2018-02-08 22:14:52,7,10-K,Item 7. Managements Discussion and Analysis o...


In [8]:
con.close()

In [8]:
df['combined_text'] = df['article_title'].fillna('') + " " + df['description'].fillna('')
df.head()

Unnamed: 0,guid,description,article_title,ticker,combined_text
0,76ceb11d-33eb-3af8-82f1-74e4068911f5,Agilent (A) adds a water immersion and confoca...,Agilent (A) Enhances BioTek Cytation C10 With ...,A,Agilent (A) Enhances BioTek Cytation C10 With ...
1,56dc485e-c740-3fcc-ab3a-4e0d707a8f4d,"SANTA CLARA, Calif., December 07, 2023--Agilen...",Agilent Resolve Raman Receives Multiple Recogn...,A,Agilent Resolve Raman Receives Multiple Recogn...
2,367bed80-8d07-3dce-8092-fd53d70578fe,"Artisan Partners, an investment management com...",Hereâs Why Artisan Partners Mid Cap Fund Har...,A,Hereâs Why Artisan Partners Mid Cap Fund Har...
3,7bf92827-a505-3d56-98a3-4c9d60794e64,Generally speaking the aim of active stock pic...,Agilent Technologies' (NYSE:A) 14% CAGR outpac...,A,Agilent Technologies' (NYSE:A) 14% CAGR outpac...
4,8e5bdc52-73a9-30b1-ae97-493cd82da360,"SANTA CLARA, Calif., December 04, 2023--Agilen...",Agilent BioTek Cytation C10 Confocal Imaging R...,A,Agilent BioTek Cytation C10 Confocal Imaging R...


In [5]:
finbert_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [6]:
def classify_sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return {"label": "NEUTRAL", "score": 0.0}  # Default for empty or invalid text
    
    result = finbert_pipeline(text[:512])  # Truncate text to 512 tokens
    return result[0]  # Return the first result


In [9]:
# Apply FinBERT to the description
df['finbert_description'] = df['item_description'].apply(lambda x: classify_sentiment(x))
df['finbert_description_label'] = df['finbert_description'].apply(lambda x: x['label'])
df['finbert_description_score'] = df['finbert_description'].apply(lambda x: x['score'])

# Save the results back to a CSV file
df.to_csv("10ks_with_all_finbert_scores.csv", index=False)

# Display a sample of the DataFrame to verify
df.head()

Unnamed: 0,cik,filing_ts,item_filing,type,item_description,finbert_description,finbert_description_label,finbert_description_score
0,66740,2022-02-09 20:13:29,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999117851257324}",Neutral,0.999912
1,66740,2021-02-04 18:53:11,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999666213989258}",Neutral,0.999967
2,66740,2020-02-06 21:16:31,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999666213989258}",Neutral,0.999967
3,66740,2019-02-07 22:15:37,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.999957799911499}",Neutral,0.999958
4,66740,2018-02-08 22:14:52,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.999957799911499}",Neutral,0.999958


In [10]:
# df.tail()
# check if finbert_title columns is null 
df.to_csv("sp500_with_all_finbert_scores.csv", index=False)