# Trading Signal Generation with Sentiment Scores in Embedding Vectors

## Required Libraries

In [None]:
# Core Libraries
import pandas as pd
import numpy as np
import re

# Text Processing
import spacy
from transformers import AutoTokenizer, AutoModel

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm as notebook_tqdm

# Model Saving
import joblib

## Preprocessing

In [None]:
def _read_labeled_csv(path):
    """Open ``path`` as UTF-8 text and parse it into a DataFrame."""
    with open(path, "r", encoding="utf-8") as handle:
        return pd.read_csv(handle)


# Locations of the labeled monthly datasets (January, February, March).
file_path1 = "data/processed/labeled_january_data.csv"
file_path2 = "data/processed/labeled_february_data.csv"
file_path3 = "data/processed/labeled_march_data.csv"

# One DataFrame per month; later cells refer to these names directly.
df_jan = _read_labeled_csv(file_path1)
df_feb = _read_labeled_csv(file_path2)
df_march = _read_labeled_csv(file_path3)

In [None]:
from preprocessing import preprocess_text

# Run every raw 'article' text through preprocess_text and keep the result
# in a new 'cleaned_article' column, month by month (Jan, Feb, March).
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['cleaned_article'] = monthly_df['article'].apply(preprocess_text)

## Data Exploration

In [None]:
import os

# Bar chart of how many January rows carry each value of 'label'.
january_plot_path = "results/bert_embeddings_experiment_v2/figures/label_distribution_january.png"
# savefig does not create missing parent directories, so make sure the
# figures folder exists before writing into it.
os.makedirs(os.path.dirname(january_plot_path), exist_ok=True)

sns.countplot(x='label', data=df_jan)
plt.title('Label Distribution in January Dataset')
plt.savefig(january_plot_path)
plt.show()

In [None]:
import os

# Bar chart of how many February rows carry each value of 'label'.
february_plot_path = "results/bert_embeddings_experiment_v2/figures/label_distribution_february.png"
# savefig does not create missing parent directories, so make sure the
# figures folder exists before writing into it.
os.makedirs(os.path.dirname(february_plot_path), exist_ok=True)

sns.countplot(x='label', data=df_feb)
plt.title('Label Distribution in February Dataset')
plt.savefig(february_plot_path)
plt.show()

In [None]:
import os

# Bar chart of how many March rows carry each value of 'label'.
march_plot_path = "results/bert_embeddings_experiment_v2/figures/label_distribution_march.png"
# savefig does not create missing parent directories, so make sure the
# figures folder exists before writing into it.
os.makedirs(os.path.dirname(march_plot_path), exist_ok=True)

sns.countplot(x='label', data=df_march)
plt.title('Label Distribution in March Dataset')
plt.savefig(march_plot_path)
plt.show()

## Generate Embeddings

In [None]:
# Load the BERTimbau tokenizer and model; both come from the same checkpoint
# id, so it is named once and reused.
bertimbau_checkpoint = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(bertimbau_checkpoint)
model = AutoModel.from_pretrained(bertimbau_checkpoint)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

def get_bert_embedding(text, tokenizer, model, max_length=128):
    """Return the [CLS] embedding of ``text`` as a NumPy array.

    Args:
        text: Input handed straight to ``tokenizer``.
        tokenizer: Callable invoked with ``return_tensors="pt"``; must return
            a mapping of tensor name -> PyTorch tensor.
        model: Callable invoked as ``model(**inputs)`` whose result exposes
            ``last_hidden_state``.
        max_length: Token limit forwarded to the tokenizer; longer inputs are
            truncated. Defaults to 128, the value that used to be hard-coded.

    Returns:
        numpy.ndarray: The hidden state at sequence position 0. The leading
        batch axis is squeezed away when it has size 1.
    """
    # Tokenize the input text.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # If the model advertises a device, move the inputs there so the forward
    # pass also works when the model has been placed on an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The [CLS] token is the first token of every sequence.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # .cpu() is a no-op for CPU tensors and is required before .numpy() for
    # tensors that live on another device.
    return cls_embedding.squeeze(0).cpu().numpy()

In [None]:
# Embed every cleaned article with get_bert_embedding (using the module-level
# tokenizer and model) and store the vectors in an 'embedding' column.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['embedding'] = monthly_df['cleaned_article'].apply(
        lambda text: get_bert_embedding(text, tokenizer, model)
    )

## Generate Sentiment Analysis Labels

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd
import re
from IPython.display import display

In [None]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification,
    pipeline,
)

# Load the FinBERT-PT-BR tokenizer and sequence-classification model from a
# single checkpoint id.
# NOTE: this rebinds the global `tokenizer` that previously held the BERTimbau
# tokenizer, so any cell that reads `tokenizer` after this point gets the
# FinBERT-PT-BR one.
finbert_checkpoint = "lucas-leme/FinBERT-PT-BR"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
finbertptbr = BertForSequenceClassification.from_pretrained(finbert_checkpoint)

In [None]:
# Predicted class indices, named after the sentiment each one stands for.
# NOTE(review): the index order is taken from the original comments — confirm
# it against the FinBERT-PT-BR model card.
POSITIVE_INDEX, NEGATIVE_INDEX, NEUTRAL_INDEX = 0, 1, 2

# Class index -> signed sentiment score (+1 positive, -1 negative, 0 neutral).
classification_mapper = {
    POSITIVE_INDEX: 1,
    NEGATIVE_INDEX: -1,
    NEUTRAL_INDEX: 0,
}

In [None]:
def classify_article(article):
    """Classify one article's sentiment with the module-level FinBERT model.

    Args:
        article: Text to classify. It is tokenized with the module-level
            ``tokenizer`` and truncated to 512 tokens.

    Returns:
        The ``classification_mapper`` value for the highest-scoring class,
        or ``None`` when anything goes wrong. The error is printed, not raised.
    """
    try:
        # Tokenize the article as a batch of one.
        tokens = tokenizer([article], return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: without no_grad an autograd graph would be built
        # and kept alive for every article.
        with torch.no_grad():
            outputs = finbertptbr(**tokens)
        # Index of the highest logit for the single item in the batch, as a
        # plain Python int so it matches the mapper's keys directly.
        pred_index = int(torch.argmax(outputs.logits, dim=1)[0])
        return classification_mapper[pred_index]
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller carry on.
        print(f"Error processing article: {article} | Error: {e}")
        return None

In [None]:
# Score every cleaned article with classify_article and keep the result in a
# 'classification' column, month by month (Jan, Feb, March).
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['classification'] = monthly_df['cleaned_article'].apply(classify_article)

In [None]:
# Append a row's sentiment classification to its embedding vector
def augment_embedding_with_classification(row):
    """Return the row's embedding with its classification appended as the last element.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            'embedding' and 'classification' entries.

    Returns:
        numpy.ndarray: The embedding values followed by the classification.
    """
    vector = np.asarray(row['embedding'])
    # Wrap the scalar in a list so it becomes a one-element array that can be
    # concatenated along the embedding's single axis.
    score = np.asarray([row['classification']])
    return np.concatenate([vector, score])

# Overwrite each frame's 'embedding' column with the augmented vector
# (embedding followed by the classification).
# NOTE: the column is replaced in place, so running this cell again appends
# the classification a second time.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['embedding'] = monthly_df.apply(augment_embedding_with_classification, axis=1)

## Logistic Regression Classification

In [None]:
def _features_and_labels(frame):
    """Stack a frame's per-row 'embedding' vectors into one array and pair it with its 'label' column."""
    return np.vstack(frame['embedding'].values), frame['label']


# January and February together form the training pool; March is held out.
df_train_multi = pd.concat([df_jan, df_feb])

# Multi-class task: every label is kept.
X_train_multi, y_train_multi = _features_and_labels(df_train_multi)
X_test_multi, y_test_multi = _features_and_labels(df_march)

# Binary task: rows labeled 0 are dropped from both the training pool and March.
df_train_binary = df_train_multi[df_train_multi['label'] != 0]
df_march_binary = df_march[df_march['label'] != 0]

X_train_binary, y_train_binary = _features_and_labels(df_train_binary)
X_test_binary, y_test_binary = _features_and_labels(df_march_binary)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit the multi-class logistic regression on the January+February features
# (fit returns the estimator itself, so construction and training are chained).
multi_clf = LogisticRegression(max_iter=1000).fit(X_train_multi, y_train_multi)

# Score the held-out March data and print the per-class metrics.
y_pred_multi = multi_clf.predict(X_test_multi)
multi_report_text = classification_report(y_test_multi, y_pred_multi)
print("Multi-Class Report:\n", multi_report_text)

In [None]:
# Fit the binary logistic regression on the training rows whose label is not 0
# (fit returns the estimator itself, so construction and training are chained).
binary_clf = LogisticRegression(max_iter=1000).fit(X_train_binary, y_train_binary)

# Score the March rows whose label is not 0 and print the per-class metrics.
y_pred_binary = binary_clf.predict(X_test_binary)
binary_report_text = classification_report(y_test_binary, y_pred_binary)
print("Binary Classification Report:\n", binary_report_text)

In [None]:
import os
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Folder that receives the text reports and confusion-matrix images.
results_dir = "results/bert_embeddings_experiment_v2/metrics"
os.makedirs(results_dir, exist_ok=True)

# Pin the class order explicitly. Otherwise it is inferred from whichever
# labels occur in the data, and a class missing from March would shift the
# matrix under the hard-coded axis names or make classification_report raise.
multi_labels = [-1, 0, 1]
multi_label_names = [str(label) for label in multi_labels]

cm_multi = confusion_matrix(y_test_multi, y_pred_multi, labels=multi_labels)
class_report = classification_report(
    y_test_multi,
    y_pred_multi,
    labels=multi_labels,
    target_names=multi_label_names,
)

# Persist the text report; the encoding is stated so the file does not depend
# on the platform default.
report_path = os.path.join(results_dir, "classification_report_multi.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write("Classification Report:\n")
    f.write(class_report)

# Heatmap: rows are true labels, columns are predicted labels, both in
# multi_labels order.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_multi, annot=True, fmt="d", cmap="Blues",
            xticklabels=multi_label_names, yticklabels=multi_label_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Multi-Class Confusion Matrix")

conf_matrix_path = os.path.join(results_dir, "confusion_matrix_multi.png")
plt.savefig(conf_matrix_path)
# Close the figure instead of showing it; it is only needed on disk.
plt.close()

print(f"Classification report saved to: {report_path}")
print(f"Confusion matrix saved to: {conf_matrix_path}")

In [None]:
# Pin the class order explicitly. Otherwise it is inferred from whichever
# labels occur in the data, and a class missing from March would shift the
# matrix under the hard-coded axis names or make classification_report raise.
binary_labels = [-1, 1]
binary_label_names = [str(label) for label in binary_labels]

cm_binary = confusion_matrix(y_test_binary, y_pred_binary, labels=binary_labels)
class_report_binary = classification_report(
    y_test_binary,
    y_pred_binary,
    labels=binary_labels,
    target_names=binary_label_names,
)

# Persist the text report into results_dir (set up by the multi-class metrics
# cell); the encoding is stated so the file does not depend on the platform default.
report_path_binary = os.path.join(results_dir, "classification_report_binary.txt")
with open(report_path_binary, "w", encoding="utf-8") as f:
    f.write("Classification Report (Binary):\n")
    f.write(class_report_binary)

# Heatmap: rows are true labels, columns are predicted labels, both in
# binary_labels order.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_binary, annot=True, fmt="d", cmap="Greens",
            xticklabels=binary_label_names, yticklabels=binary_label_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Binary Confusion Matrix")

conf_matrix_path_binary = os.path.join(results_dir, "confusion_matrix_binary.png")
plt.savefig(conf_matrix_path_binary)
# Close the figure instead of showing it; it is only needed on disk.
plt.close()

print(f"Binary classification report saved to: {report_path_binary}")
print(f"Binary confusion matrix saved to: {conf_matrix_path_binary}")