In [12]:
# !pip install transformers

In [13]:
import os
import pandas as pd
from glob import glob
from transformers import AutoTokenizer, AutoModel
import torch

In [14]:
# Load the FinBERT tokenizer and encoder once; later cells use these globals.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = AutoModel.from_pretrained(FINBERT_CHECKPOINT)
finbert_model.to(device)
# Switch to inference mode; as the last expression this also displays the model.
finbert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30873, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [26]:
def preprocess_for_bert(df):
    """Build the text column that is fed to FinBERT.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'guid', 'ticker', 'article_pubDate',
        'article_title' and 'description'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the five columns above plus 'full_text'
        (title, a single space, then description; missing values are treated
        as empty strings). Rows whose 'full_text' is empty or whitespace-only
        are dropped. The original index labels are kept.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # Keep relevant columns
    df = df[['guid', 'ticker', 'article_pubDate', 'article_title', 'description']].copy()

    # Combine title and description into one string
    df['full_text'] = df['article_title'].fillna('') + " " + df['description'].fillna('')

    # Drop rows with no text at all
    has_text = df['full_text'].str.strip() != ''

    # Return an explicit copy: callers add columns to the result, and assigning
    # into a boolean-filtered slice raises SettingWithCopyWarning in pandas < 3.
    return df[has_text].copy()

def get_finbert_embedding(text):
    """Return the FinBERT sentence embedding for ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``finbert_model`` without gradient tracking, and the hidden
    state at position 0 (the [CLS] token) is used as the sentence embedding.

    Relies on the globals ``finbert_tokenizer``, ``finbert_model`` and
    ``device`` defined elsewhere in the notebook.
    """
    encoded = finbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # The model lives on `device`, so its inputs must be moved there too.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = finbert_model(**on_device).last_hidden_state

    # Position 0 holds the [CLS] token; squeeze drops the size-1 batch axis.
    cls_vector = hidden_states[:, 0, :].squeeze()

    # NumPy conversion needs a CPU tensor.
    return cls_vector.cpu().numpy()


In [16]:
def load_articles_with_tickers(base_path="Word2Vec/retry/new_data/articles/"):
    """Load every per-ticker article CSV under ``base_path`` into one DataFrame.

    Files are looked up as ``<base_path>/<folder>/data_*.csv``. Each file's
    ticker is its parent folder name with the text "ticker=" removed.

    Parameters
    ----------
    base_path : str
        Directory that contains one sub-folder per ticker.

    Returns
    -------
    pandas.DataFrame
        All rows from all files with a fresh 0..n-1 index and the columns
        'ticker', 'guid', 'description', 'article_title', 'article_pubDate'.
        Files are read in sorted path order so the row order is reproducible.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern under ``base_path``.
    KeyError
        If the combined data lacks one of the expected columns.
    """
    pattern = os.path.join(base_path, "*", "data_*.csv")

    # glob returns matches in arbitrary order; sort so repeated runs yield the
    # same row order (anything stored positionally downstream depends on it).
    csv_files = sorted(glob(pattern))
    if not csv_files:
        # pd.concat on an empty list fails with an opaque message; say what was searched.
        raise ValueError(f"No article CSV files found matching {pattern!r}")

    all_dfs = []
    for file_path in csv_files:
        # Extract the ticker from the folder name
        ticker = os.path.basename(os.path.dirname(file_path)).replace("ticker=", "")

        df = pd.read_csv(file_path)
        # Add (or overwrite) the ticker column with the folder-derived value
        df['ticker'] = ticker
        all_dfs.append(df)

    # Concatenate all into a single DataFrame
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Select and order the columns used downstream
    columns = ['ticker', 'guid', 'description', 'article_title', 'article_pubDate']
    return combined_df[columns]


In [17]:
# Read every per-ticker CSV under the loader's default base path into one DataFrame.
df = load_articles_with_tickers()
# df.head()

In [18]:
# TODO: these CSV articles are the wrong ones -- this needs to use the dataset from the DB instead.
# Build the combined 'full_text' column from the CSV-loaded frame.
df_bert = preprocess_for_bert(df)
# df_bert.head()

In [27]:
# Embed every article. `.apply` calls get_finbert_embedding once per row, so this
# runs one model forward pass per article and adds the column to df_bert in place.
df_bert['embedding'] = df_bert['full_text'].apply(get_finbert_embedding)

In [28]:
# Inspect the result: each entry is the NumPy vector returned by get_finbert_embedding.
df_bert['embedding']

0         [0.6570171, -0.31303525, -0.4963823, 0.0334446...
1         [0.6575914, -0.39113572, -1.2864994, 1.0594414...
2         [0.4497182, -0.5984597, -0.58793783, 0.4912451...
3         [-0.3111353, -0.11022892, 0.055871136, 0.10007...
4         [-0.04455838, -0.42751977, 0.0028711278, -0.14...
                                ...                        
363815    [9.2989336e-05, -0.76329875, -0.8743198, -0.48...
363816    [-1.2596151, -0.43909767, -0.17488584, 0.78091...
363817    [-0.26690745, -0.5215517, -0.9235606, 1.122709...
363818    [-0.18933877, -0.966446, -0.66913235, -0.42958...
363819    [-0.3939313, -0.4886067, -1.2324507, 0.643593,...
Name: embedding, Length: 363820, dtype: object

In [30]:
import numpy as np

# Stack the per-article vectors into a single array, one row per article in
# df_bert's current row order. Only the vectors are saved, so mapping a row back
# to its article depends on that order.
embeddings = np.stack(df_bert['embedding'].values)

EMBEDDINGS_PATH = "Word2Vec/retry/Finbert/finbert_embeddings.npy"
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)
np.save(EMBEDDINGS_PATH, embeddings)