# Import Packages

In [2]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#from src.finbert import predict
from textblob import TextBlob
import torch
import scipy
import nltk
nltk.download('punkt_tab')
import tensorflow as tf
from transformers import pipeline

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Data

In [32]:
data = pd.read_csv('/content/drive/MyDrive/DATASCI266/Project/code/data/beige_books_1970_2024.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data.time_index = pd.to_datetime(data.time_index)
data.head()

Unnamed: 0,time_index,region,sentence
0,1970-05-01,at,The mood of our directors varies from pessimis...
1,1970-05-01,at,"If any consensus exists, it is that business a..."
2,1970-05-01,at,Many major economic indices should drift downw...
3,1970-05-01,at,"In the pessimistic vein, a leading department ..."
4,1970-05-01,at,The store reported that labor costs were up 8 ...


# FinBERT for Sentiment Scoring

- FinBERT model pulled from Hugging Face: https://huggingface.co/yiyanghkust/finbert-tone

In [19]:
# load model from hugging face
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [20]:
# run on gpu
model.to("cuda")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [22]:
# test on single sentence
txt = data.sentence[0]

# create pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)

# test
print(txt)
print(nlp(txt))

The mood of our directors varies from pessimism to optimism.
[{'label': 'neutral', 'score': 0.8459746837615967}]


In [23]:
# import full text
f = open('/content/drive/MyDrive/DATASCI266/Project/code/data/beige_books_full.txt', 'r')
txt = f.read()

# sentence splitter
m = nltk.data.load('tokenizers/punkt/english.pickle')

# split into sentences
sentence_list = m.tokenize(txt)

In [34]:
# run model on full text / all sentences
results = nlp(sentence_list)

In [35]:
# convert to dataframe
sentiments = pd.DataFrame(results)
scored_data = data.join(sentiments)

# Export Final Data Set

In [36]:
scored_data.to_csv('/content/drive/MyDrive/DATASCI266/Project/code/data/beige_books_1970_2024_with_sentiment.csv', encoding='utf-8', index=True)