# Import Packages

In [2]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from src.finbert import predict
from textblob import TextBlob
import torch
import scipy
import nltk
nltk.download('punkt_tab')
import tensorflow as tf

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/KattPaint/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
2024-12-07 23:35:30.561352: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Import Data

In [3]:
# Read the sentence-level Beige Book table and discard the index column
# that was written out by an earlier to_csv call.
data = (
    pd.read_csv('../data/beige_books_1970_2024.csv')
    .drop(columns=['Unnamed: 0'])
)
# Parse the date strings so time_index is a proper datetime column.
data['time_index'] = pd.to_datetime(data['time_index'])
data.head()

Unnamed: 0,time_index,region,sentence
0,1970-05-01,at,The mood of our directors varies from pessimis...
1,1970-05-01,at,"If any consensus exists, it is that business a..."
2,1970-05-01,at,Many major economic indices should drift downw...
3,1970-05-01,at,"In the pessimistic vein, a leading department ..."
4,1970-05-01,at,The store reported that labor costs were up 8 ...


# FinBERT for Sentiment Scoring

- FinBERT model pulled from Hugging Face: https://huggingface.co/ProsusAI/finbert

In [4]:
# Fetch the FinBERT tokenizer and classification head from the Hugging Face hub.
# Both must come from the same checkpoint so vocabulary and weights match.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [5]:
# Sanity check: score a single sentence end to end.
txt = data.sentence[0]
inputs = tokenizer(txt, return_tensors="pt")

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    logits = model(**inputs).logits

# axis=1 applies softmax across the class scores of each row. Without it
# scipy normalizes over the whole array, which is only correct by accident
# when there is exactly one row.
probs = scipy.special.softmax(logits.numpy(), axis=1)
# Net sentiment = column 0 minus column 1.
# NOTE(review): assumes index 0 is "positive" and index 1 is "negative" for
# this checkpoint -- confirm with model.config.id2label.
sentiment = (probs[:, 0] - probs[:, 1])[0]

print(txt)
print(sentiment)

The mood of our directors varies from pessimism to optimism.
-0.018657394


In [6]:
# Import full text. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open for the life of the kernel.
with open('../data/beige_books_full.txt', 'r') as f:
    txt = f.read()

# sentence splitter
m = nltk.data.load('tokenizers/punkt/english.pickle')

# split into sentences
sentence_list = m.tokenize(txt)

In [7]:
# Split into four contiguous, non-overlapping batches for runtime concerns.
# The boundaries are derived from one constant so no sentence is skipped
# and none is scored twice.
BATCH_SIZE = 10000
batch1 = sentence_list[:BATCH_SIZE]
batch2 = sentence_list[BATCH_SIZE:2 * BATCH_SIZE]
batch3 = sentence_list[2 * BATCH_SIZE:3 * BATCH_SIZE]
batch4 = sentence_list[3 * BATCH_SIZE:]  # everything that remains

# Every sentence must land in exactly one batch.
assert len(batch1) + len(batch2) + len(batch3) + len(batch4) == len(sentence_list)

# create sentiment list to store
sentiments = []

In [8]:
## Batch 1

# Tokenize for model inputs. Shorter sentences are padded to the longest one
# in the batch; anything longer than the model's position-embedding limit is
# truncated, because an over-long sequence would make the forward pass fail.
inputs = tokenizer(
    batch1,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

In [None]:
# Run model on inputs to get logits.
# This is pure inference, so disable gradient tracking: otherwise every
# intermediate activation for the whole batch is retained for a backward
# pass that never happens.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Calculate class probabilities and final sentiment scores.
# axis=1 normalizes each sentence's class scores separately. The default
# (axis=None) runs softmax over the entire batch array, so the values in a
# row would no longer sum to 1 and every score would be scaled down.
pos_neg_scores = scipy.special.softmax(logits.detach().numpy(), axis=1)
# Net sentiment per sentence = column 0 minus column 1.
# NOTE(review): assumes index 0 is "positive" and index 1 is "negative" for
# this checkpoint -- confirm with model.config.id2label.
sentiment_scores = pos_neg_scores[:, 0] - pos_neg_scores[:, 1]

# Export Final Data Set

In [None]:
# Write the scored table to disk, keeping the DataFrame index as the first column.
# NOTE(review): scored_data is not assigned in any cell visible here -- confirm
# it is built (e.g. from the sentiment scores) before this cell runs.
scored_output_path = '../data/beige_books_1970_2024_with_sentiment.csv'
scored_data.to_csv(scored_output_path, encoding='utf-8', index=True)