In [2]:
import duckdb
import pandas as pd 
import spacy
from nltk.corpus import stopwords
import string
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint identifier handed to both from_pretrained() calls below, so the
# tokenizer and the classifier always come from the same model.
model_name = "yiyanghkust/finbert-tone"

# The two loads are independent of each other; only `model_name` is shared.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment before DB_PATH / DB_FILE are read below.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises
    ------
    RuntimeError
        If the variable is not set at all. Without this check the failure
        would surface as an opaque ``TypeError`` from ``Path(None)`` or
        ``DB_PATH / None`` that does not say which variable is missing.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the shell or in a .env file before running this cell."
        )
    return value


# Directory containing the DuckDB database file.
DB_PATH = Path(_require_env("DB_PATH"))
# File name of the database inside DB_PATH.
DB_FILE = _require_env("DB_FILE")
# Full path handed to duckdb.connect() in the next cell.
duckdb_path = DB_PATH / DB_FILE

In [None]:
# Open the DuckDB database located at `duckdb_path`; `con` is the connection
# used by the query cell below.
con = duckdb.connect(database=duckdb_path)

In [6]:
# Pull the four columns used for sentiment scoring into a pandas DataFrame.
df = (
    con
    .execute("SELECT guid, description, article_title, ticker FROM headlines.articles")
    .fetchdf()
)
df.head()

Unnamed: 0,guid,description,article_title,ticker
0,76ceb11d-33eb-3af8-82f1-74e4068911f5,Agilent (A) adds a water immersion and confoca...,Agilent (A) Enhances BioTek Cytation C10 With ...,A
1,56dc485e-c740-3fcc-ab3a-4e0d707a8f4d,"SANTA CLARA, Calif., December 07, 2023--Agilen...",Agilent Resolve Raman Receives Multiple Recogn...,A
2,367bed80-8d07-3dce-8092-fd53d70578fe,"Artisan Partners, an investment management com...",Hereâs Why Artisan Partners Mid Cap Fund Har...,A
3,7bf92827-a505-3d56-98a3-4c9d60794e64,Generally speaking the aim of active stock pic...,Agilent Technologies' (NYSE:A) 14% CAGR outpac...,A
4,8e5bdc52-73a9-30b1-ae97-493cd82da360,"SANTA CLARA, Calif., December 04, 2023--Agilen...",Agilent BioTek Cytation C10 Confocal Imaging R...,A


In [8]:
# Build "<title> <description>" per row. Missing values become empty strings
# first so a null in either column does not turn the whole result into NaN.
df['combined_text'] = (
    df['article_title'].fillna('')
    .add(" ")
    .add(df['description'].fillna(''))
)
df.head()

Unnamed: 0,guid,description,article_title,ticker,combined_text
0,76ceb11d-33eb-3af8-82f1-74e4068911f5,Agilent (A) adds a water immersion and confoca...,Agilent (A) Enhances BioTek Cytation C10 With ...,A,Agilent (A) Enhances BioTek Cytation C10 With ...
1,56dc485e-c740-3fcc-ab3a-4e0d707a8f4d,"SANTA CLARA, Calif., December 07, 2023--Agilen...",Agilent Resolve Raman Receives Multiple Recogn...,A,Agilent Resolve Raman Receives Multiple Recogn...
2,367bed80-8d07-3dce-8092-fd53d70578fe,"Artisan Partners, an investment management com...",Hereâs Why Artisan Partners Mid Cap Fund Har...,A,Hereâs Why Artisan Partners Mid Cap Fund Har...
3,7bf92827-a505-3d56-98a3-4c9d60794e64,Generally speaking the aim of active stock pic...,Agilent Technologies' (NYSE:A) 14% CAGR outpac...,A,Agilent Technologies' (NYSE:A) 14% CAGR outpac...
4,8e5bdc52-73a9-30b1-ae97-493cd82da360,"SANTA CLARA, Calif., December 04, 2023--Agilen...",Agilent BioTek Cytation C10 Confocal Imaging R...,A,Agilent BioTek Cytation C10 Confocal Imaging R...


In [9]:
# Wrap the already-loaded tokenizer and model in a text-classification
# pipeline; classify_sentiment() calls this object once per text.
finbert_pipeline = pipeline(
    task="sentiment-analysis",
    tokenizer=tokenizer,
    model=model,
)


In [10]:
def classify_sentiment(text, max_length=512):
    """Classify one piece of text with the notebook-level ``finbert_pipeline``.

    Parameters
    ----------
    text : Any
        Text to score. Any value that is not a ``str``, or a string that is
        empty or whitespace-only, is not sent to the model.
    max_length : int, default 512
        Maximum number of *tokens* passed to the model. Longer inputs are
        truncated by the tokenizer.

    Returns
    -------
    dict
        A ``{"label": ..., "score": ...}`` mapping: the first prediction
        returned by the pipeline, or the placeholder
        ``{"label": "NEUTRAL", "score": 0.0}`` for empty or invalid input.
    """
    if not isinstance(text, str) or not text.strip():
        return {"label": "NEUTRAL", "score": 0.0}  # Default for empty or invalid text

    # Let the tokenizer truncate by token count. The previous `text[:512]`
    # cut by characters, which threw away most of the usable context and did
    # not actually bound the token count once special tokens are added.
    predictions = finbert_pipeline(text, truncation=True, max_length=max_length)
    return predictions[0]  # The pipeline returns a list; keep its first entry


In [None]:
# Score the title, the description, and the combined text with FinBERT.
# Each pair is (output column prefix, source text column). For every pair the
# raw result dict is stored under the prefix, then split into
# `<prefix>_label` and `<prefix>_score` columns.
finbert_targets = [
    ("finbert_title", "article_title"),
    ("finbert_description", "description"),
    ("finbert_combined", "combined_text"),
]

for result_column, text_column in finbert_targets:
    df[result_column] = df[text_column].apply(classify_sentiment)
    df[f"{result_column}_label"] = df[result_column].map(lambda prediction: prediction['label'])
    df[f"{result_column}_score"] = df[result_column].map(lambda prediction: prediction['score'])

# Save the results back to a CSV file
df.to_csv("articles_with_all_finbert_scores.csv", index=False)

# Display a sample of the DataFrame to verify
preview_columns = [
    'guid', 'article_title', 'description',
    'finbert_title_label', 'finbert_title_score',
    'finbert_description_label', 'finbert_description_score',
    'finbert_combined_label', 'finbert_combined_score',
]
print(df[preview_columns].head())

In [None]:
# df.tail()
# check whether the finbert_title column is null
# df.to_csv("articles_with_all_finbert_scores.csv", index=False)