# Trading Signal Generation
- Labels generated by financial sentiment analysis

In [None]:
# -------------------- Core Libraries --------------------
import os
import re
import pandas as pd
import numpy as np
import torch

# -------------------- Text Processing --------------------
import spacy
from preprocessing import preprocess_text
from transformers import (
    AutoTokenizer,
    AutoModel,
)

# -------------------- Machine Learning --------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

# -------------------- Visualization --------------------
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm  # Jupyter-native progress bars

In [None]:
# Switch the working directory to the project root so that the relative
# "data/..." and "results/..." paths used below resolve. The root is taken to
# be the parent of the current directory (the notebook is expected to be
# started from a folder one level below it, e.g. notebooks/).
# NOTE(review): the step is relative to the current directory, so executing
# this cell a second time moves up one more level.
parent_of_cwd = os.path.join(os.getcwd(), '..')
project_root = os.path.abspath(parent_of_cwd)
os.chdir(project_root)

## Preprocessing

In [None]:
# One processed CSV per month (January-March 2024), relative to the project root.
january = "data/processed/bdm/2024-01.csv"
february = "data/processed/bdm/2024-02.csv"
march = "data/processed/bdm/2024-03.csv"

# Read each file as UTF-8 text and parse it into its own DataFrame.
monthly_frames = []
for csv_path in (january, february, march):
    with open(csv_path, "r", encoding="utf-8") as file:
        monthly_frames.append(pd.read_csv(file))

df_jan, df_feb, df_march = monthly_frames

In [None]:
# Load the small Portuguese spaCy pipeline once at module level; lemmatize_text
# reuses this object on every call instead of reloading it.
nlp = spacy.load("pt_core_news_sm")

# Acronym -> expansion table consumed by expand_acronyms (via preprocess_text).
# Keys are the spellings searched for in the article text; values are the
# replacement text inserted in their place.
acronyms = {
    "Selic": "Sistema Especial de Liquidação e de Custódia",
    "PIB": "Produto Interno Bruto",
    "CDI": "Certificado de Depósito Interbancário",
    "LPRs": "Loan Prime Rates",
    "Ibovespa": "Índice Bovespa",
    "BB": "Banco do Brasil",
    "BC": "Banco Central",
    "FGTS": "Fundo de Garantia do Tempo de Serviço",
    "STF": "Supremo Tribunal Federal",
    "CPI": "Índice de Preços ao Consumidor",
    "MP": "Medida Provisória",
    "EUA": "Estados Unidos",
    "ONU": "Organização das Nações Unidas",
    "FGV": "Fundação Getúlio Vargas",
    "IBGE": "Instituto Brasileiro de Geografia e Estatística",
    "BNDES": "Banco Nacional de Desenvolvimento Econômico e Social",
    "IPCA": "Índice Nacional de Preços ao Consumidor Amplo",
    "DI": "Depósito Interfinanceiro",
    "IR": "Imposto de Renda",
    "OI": "Operadora Oi",
    "CV": "Câmara de Vereadores"
}

# Tokens that remove_noisy_acronyms deletes outright (replaced by the empty
# string) rather than expanding.
# NOTE(review): presumably ticker-like tokens plus the all-caps "DA" that add
# noise to the sentiment signal -- confirm with the data owner.
noisy_acronyms = {"ROMI", "ENEVA", "LIGHT", "DA"}

def normalize_numbers(text):
    """Compact monetary amounts and percentage-point figures, then drop commas.

    Applied in order:
      * "R$ <n> bilhões"  -> "<n>B"
      * "R$ <n> milhões"  -> "<n>M"
      * "<n> pp"          -> "<n>%"
      * every remaining comma in the text is removed (this includes commas
        inside numbers, so "1,5" becomes "15", and punctuation commas).

    Args:
        text: Raw article text.

    Returns:
        The rewritten text.
    """
    rewrites = (
        (r"R\$ ?([\d.,]+) bilhões", r"\1B"),
        (r"R\$ ?([\d.,]+) milhões", r"\1M"),
        (r"([\d.,]+) pp", r"\1%"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.replace(",", "")

def expand_acronyms(text, acronym_dict):
    """Replace whole-word acronyms in text with their expansions.

    An acronym matches when it appears exactly as spelled in acronym_dict or
    in its all-uppercase form (so a "Selic" entry also covers "SELIC").
    Matching is otherwise case-sensitive: ordinary lowercase words that merely
    share letters with an acronym (e.g. the verb "ir" vs. the key "IR") are
    left untouched.

    The text is scanned once, so text inserted by one expansion is never
    re-scanned for other acronyms, and expansions are inserted literally
    (backslashes in an expansion are not treated as regex escapes).

    Args:
        text: Input string.
        acronym_dict: Mapping of acronym -> expansion text.

    Returns:
        The text with every matched acronym replaced.
    """
    if not acronym_dict:
        return text

    # Upper-case variants first, then the exact spellings so that an explicit
    # entry always wins over a derived upper-case one.
    expansions = {}
    for acronym, full_form in acronym_dict.items():
        expansions.setdefault(acronym.upper(), full_form)
    expansions.update(acronym_dict)

    # Longest alternative first keeps the pattern deterministic when one
    # acronym is a prefix of another.
    alternatives = sorted(expansions, key=len, reverse=True)
    pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, alternatives)) + r')\b')

    # A callable replacement inserts the expansion verbatim.
    return pattern.sub(lambda match: expansions[match.group(0)], text)

def remove_noisy_acronyms(text, noisy_set):
    """Delete every whole-word occurrence of the tokens in noisy_set.

    Matching is case-sensitive and each token is treated as literal text
    (regex metacharacters in a token are escaped). Removed tokens are replaced
    by the empty string, so surrounding whitespace is left as-is.

    Args:
        text: Input string.
        noisy_set: Iterable of tokens to delete.

    Returns:
        The text with the tokens removed; returned unchanged when noisy_set
        is empty.
    """
    if not noisy_set:
        return text
    # Sorting makes the generated pattern independent of set iteration order.
    alternatives = '|'.join(re.escape(token) for token in sorted(noisy_set))
    return re.sub(r'\b(?:' + alternatives + r')\b', '', text)

def lemmatize_text(text):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens.

    The text is parsed with the module-level spaCy pipeline `nlp`; tokens that
    are stop words or are not purely alphabetic are dropped.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def preprocess_text(text):
    """Run the full cleaning pipeline on one article and return it lower-cased.

    Steps, in order: normalize_numbers, expand_acronyms (with the module-level
    `acronyms` table), remove_noisy_acronyms (with `noisy_acronyms`),
    lemmatize_text, collapse whitespace runs to single spaces and strip the
    ends, lower-case.

    NOTE(review): this definition replaces the `preprocess_text` imported from
    the `preprocessing` module at the top of the file.
    """
    normalized = normalize_numbers(text)
    expanded = expand_acronyms(normalized, acronyms)
    denoised = remove_noisy_acronyms(expanded, noisy_acronyms)
    lemmatized = lemmatize_text(denoised)
    collapsed = re.sub(r'\s+', ' ', lemmatized)
    return collapsed.strip().lower()

In [None]:
# Add a 'cleaned_article' column to every monthly frame by running
# preprocess_text over its 'article' column.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['cleaned_article'] = monthly_df['article'].apply(preprocess_text)

## Data Visualization

In [None]:
# Bar chart of how many January rows carry each value of 'label'.
january_figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_january.png"
# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(january_figure_path), exist_ok=True)

sns.countplot(x='label', data=df_jan)
plt.title('Label Distribution in January Dataset')
plt.savefig(january_figure_path)
plt.show()

In [None]:
# Bar chart of how many February rows carry each value of 'label'.
february_figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_february.png"
# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(february_figure_path), exist_ok=True)

sns.countplot(x='label', data=df_feb)
plt.title('Label Distribution in February Dataset')
plt.savefig(february_figure_path)
plt.show()

In [None]:
# Bar chart of how many March rows carry each value of 'label'.
march_figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_march.png"
# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(march_figure_path), exist_ok=True)

sns.countplot(x='label', data=df_march)
plt.title('Label Distribution in March Dataset')
plt.savefig(march_figure_path)
plt.show()

## Sentiment Analysis

In [None]:
# BertForSequenceClassification is used below but is not among the names the
# import cell pulls from transformers (only AutoTokenizer and AutoModel are),
# so import it here; without this the cell fails with a NameError.
from transformers import BertForSequenceClassification

# Tokenizer and sequence-classification model for the FinBERT-PT-BR checkpoint;
# classify_article reads both as globals.
tokenizer = AutoTokenizer.from_pretrained("lucas-leme/FinBERT-PT-BR")
finbertptbr = BertForSequenceClassification.from_pretrained("lucas-leme/FinBERT-PT-BR")

In [None]:
# Translates the class index chosen by classify_article (argmax over the
# finbertptbr logits) into the signed value stored in 'classification':
#   index 0 = POSITIVE -> +1
#   index 1 = NEGATIVE -> -1
#   index 2 = NEUTRAL  ->  0
classification_mapper = {0: 1, 1: -1, 2: 0}

In [None]:
def classify_article(article):
    """Classify one article with the global FinBERT-PT-BR model.

    The article is tokenized with the global `tokenizer` (truncated to 512
    tokens), passed through `finbertptbr`, and the index of the largest logit
    is translated through `classification_mapper`.

    Args:
        article: Article text.

    Returns:
        The mapped class value (+1, -1 or 0), or None when anything in the
        pipeline raises. The failure is printed and deliberately not
        re-raised, so one bad row does not abort a whole DataFrame.apply run;
        callers must be prepared for None entries.
    """
    try:
        # Tokenize the article as a batch of one.
        tokens = tokenizer([article], return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: disable autograd so no graph is built or kept alive
        # for each article (get_bert_embedding does the same).
        with torch.no_grad():
            outputs = finbertptbr(**tokens)
        # Index of the highest-scoring class for the single article; cast to a
        # plain int so the dictionary lookup does not depend on NumPy scalar
        # hashing.
        logits = outputs.logits.detach().cpu().numpy()
        pred_index = int(np.argmax(logits, axis=1)[0])
        return classification_mapper[pred_index]
    except Exception as e:
        print(f"Error processing article: {article} | Error: {e}")
        return None

In [None]:
# Store the classify_article result for every cleaned article in a new
# 'classification' column of each monthly frame.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['classification'] = monthly_df['cleaned_article'].apply(classify_article)

In [None]:
def save_classification_report(y_true, y_pred, filepath):
    """Write the three-class classification report to a text file.

    Classes are reported in the order +1, 0, -1.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        filepath: Destination text file (overwritten if it exists).
    """
    class_values = [1, 0, -1]
    class_names = ["Positive (+1)", "Neutral (0)", "Negative (-1)"]
    report_text = classification_report(y_true, y_pred, labels=class_values, target_names=class_names)
    with open(filepath, "w") as report_file:
        report_file.write(report_text)
    print(f"Classification report saved to {filepath}")

def save_confusion_matrix(y_true, y_pred, filepath):
    """Plot the three-class confusion matrix and save it as an image.

    Rows and columns are ordered +1, 0, -1. The figure is closed after saving.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        filepath: Destination image path passed to plt.savefig.
    """
    # ConfusionMatrixDisplay is not among the names imported from
    # sklearn.metrics at the top of the file; import it here so the function
    # does not fail with a NameError.
    from sklearn.metrics import ConfusionMatrixDisplay

    cm = confusion_matrix(y_true, y_pred, labels=[1, 0, -1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Positive (+1)", "Neutral (0)", "Negative (-1)"])
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Confusion Matrix")
    plt.savefig(filepath)
    plt.close()
    print(f"Confusion matrix saved to {filepath}")

# Folder that receives the sentiment-analysis reports and confusion matrices.
# Created up front (no error if it already exists) so the save_* helpers can
# write into it.
output_dir = "results/sentiment_analysis_experiment/metrics"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# For each month, compare the 'classification' column against 'label' and
# write the three-class report and confusion matrix into output_dir.
for month_name, month_df in (
    ("january", df_jan),
    ("february", df_feb),
    ("march", df_march),
):
    true_labels = month_df['label']
    predicted_labels = month_df['classification']
    save_classification_report(
        true_labels,
        predicted_labels,
        os.path.join(output_dir, f"{month_name}_classification_report.txt"),
    )
    save_confusion_matrix(
        true_labels,
        predicted_labels,
        os.path.join(output_dir, f"{month_name}_confusion_matrix.png"),
    )

In [None]:
def filter_positive_negative(df):
    """Return the rows of df whose 'label' is +1 or -1.

    Rows labelled 0 (neutral) are dropped. Only the 'label' column is
    inspected; the original index values are kept.
    """
    is_directional = df['label'].isin([1, -1])
    return df[is_directional]

def save_classification_report_excluding_neutral(y_true, y_pred, filepath):
    """Write a classification report restricted to the +1 and -1 classes.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        filepath: Destination text file (overwritten if it exists).
    """
    class_values = [1, -1]
    class_names = ["Positive (+1)", "Negative (-1)"]
    report_text = classification_report(
        y_true,
        y_pred,
        labels=class_values,
        target_names=class_names,
    )
    with open(filepath, "w") as report_file:
        report_file.write(report_text)
    print(f"Classification report (excluding neutral) saved to {filepath}")

def save_confusion_matrix_excluding_neutral(y_true, y_pred, filepath):
    """Plot the +1 / -1 confusion matrix and save it as an image.

    Only the classes +1 and -1 are tabulated (in that order); entries whose
    true or predicted value is anything else do not appear in the matrix.
    The figure is closed after saving.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        filepath: Destination image path passed to plt.savefig.
    """
    # ConfusionMatrixDisplay is not among the names imported from
    # sklearn.metrics at the top of the file; import it here so the function
    # does not fail with a NameError.
    from sklearn.metrics import ConfusionMatrixDisplay

    cm = confusion_matrix(y_true, y_pred, labels=[1, -1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Positive (+1)", "Negative (-1)"])
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Confusion Matrix (Excluding Neutral)")
    plt.savefig(filepath)
    plt.close()
    print(f"Confusion matrix (excluding neutral) saved to {filepath}")

# Per-month views containing only the rows whose true label is +1 or -1.
df_jan_filtered, df_feb_filtered, df_march_filtered = (
    filter_positive_negative(monthly_df) for monthly_df in (df_jan, df_feb, df_march)
)

In [None]:
# For each month, evaluate only the rows kept by filter_positive_negative and
# write the two-class report and confusion matrix into output_dir.
for month_name, filtered_df in (
    ("january", df_jan_filtered),
    ("february", df_feb_filtered),
    ("march", df_march_filtered),
):
    true_labels = filtered_df['label']
    predicted_labels = filtered_df['classification']
    save_classification_report_excluding_neutral(
        true_labels,
        predicted_labels,
        os.path.join(output_dir, f"{month_name}_classification_report_excluding_neutral.txt"),
    )
    save_confusion_matrix_excluding_neutral(
        true_labels,
        predicted_labels,
        os.path.join(output_dir, f"{month_name}_confusion_matrix_excluding_neutral.png"),
    )

## Embeddings

In [None]:
# BERTimbau (base, cased) tokenizer and encoder used to embed the articles.
# NOTE(review): this rebinds the global `tokenizer` that classify_article
# reads, so re-running the sentiment cells after this one would tokenize with
# this checkpoint's tokenizer instead of FinBERT-PT-BR's.
bertimbau_checkpoint = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(bertimbau_checkpoint)
model = AutoModel.from_pretrained(bertimbau_checkpoint)

In [None]:
def get_bert_embedding(text, tokenizer, model, max_length=128):
    """Return the first-token ([CLS]) embedding of text as a NumPy array.

    Args:
        text: Text to embed.
        tokenizer: Callable tokenizer; invoked with return_tensors="pt",
            truncation, padding and max_length, and expected to return a
            mapping that can be splatted into the model call.
        model: Callable model whose output exposes `last_hidden_state`.
        max_length: Truncation length in tokens handed to the tokenizer.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The hidden state at sequence position 0 with the leading batch
        dimension squeezed out when it has size 1, as a NumPy array.
    """
    # Tokenize the input text.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The [CLS] token is the first position of every sequence.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid if the model
    # has been moved to an accelerator.
    return cls_embedding.squeeze(0).cpu().numpy()

In [None]:
# Embed every cleaned article with the global tokenizer/model and keep the
# resulting vector in a new 'embedding' column of each monthly frame.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['embedding'] = monthly_df['cleaned_article'].apply(
        lambda article: get_bert_embedding(article, tokenizer, model)
    )

In [None]:
# --- Three-class task: fit on January + February, evaluate on March ---
df_train_multi = pd.concat([df_jan, df_feb])
y_train_multi = df_train_multi['label']
y_test_multi = df_march['label']
# Each 'embedding' cell holds one vector; stack them into a feature matrix
# with one row per article.
X_train_multi = np.vstack(df_train_multi['embedding'].to_numpy())
X_test_multi = np.vstack(df_march['embedding'].to_numpy())

# --- Binary task: same split with the rows labelled 0 (neutral) removed ---
df_train_binary = df_train_multi.loc[df_train_multi['label'] != 0]
df_march_binary = df_march.loc[df_march['label'] != 0]

y_train_binary = df_train_binary['label']
y_test_binary = df_march_binary['label']
X_train_binary = np.vstack(df_train_binary['embedding'].to_numpy())
X_test_binary = np.vstack(df_march_binary['embedding'].to_numpy())

In [None]:
# Fit a class-weight-balanced logistic regression on the January + February
# embeddings (fit returns the estimator itself).
multi_clf = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_multi, y_train_multi
)

# Score the March embeddings and print the per-class metrics.
y_pred_multi = multi_clf.predict(X_test_multi)
multi_report_text = classification_report(y_test_multi, y_pred_multi)
print("Multi-Class Report:\n", multi_report_text)

In [None]:
# Fit a class-weight-balanced logistic regression on the non-neutral
# January + February embeddings (fit returns the estimator itself).
binary_clf = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_binary, y_train_binary
)

# Score the non-neutral March embeddings and print the per-class metrics.
y_pred_binary = binary_clf.predict(X_test_binary)
binary_report_text = classification_report(y_test_binary, y_pred_binary)
print("Binary Classification Report:\n", binary_report_text)

In [None]:
# Folder for the embedding-experiment metrics (created if missing).
results_dir = "results/bert_embeddings_experiment_v1/metrics"
os.makedirs(results_dir, exist_ok=True)

# Pin the class order explicitly. Without `labels`, scikit-learn derives the
# classes from whatever values occur in y_test/y_pred, so a class missing from
# the March data would make the three hard-coded names below either raise or
# label the wrong rows and columns.
multi_labels = [-1, 0, 1]
multi_label_names = ["-1", "0", "1"]

cm_multi = confusion_matrix(y_test_multi, y_pred_multi, labels=multi_labels)
class_report = classification_report(
    y_test_multi, y_pred_multi, labels=multi_labels, target_names=multi_label_names
)

# Persist the text report.
report_path = os.path.join(results_dir, "classification_report_multi.txt")
with open(report_path, "w") as f:
    f.write("Classification Report:\n")
    f.write(class_report)

# Heatmap of the confusion matrix; tick labels follow multi_labels.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_multi, annot=True, fmt="d", cmap="Blues",
            xticklabels=multi_label_names, yticklabels=multi_label_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Multi-Class Confusion Matrix")

conf_matrix_path = os.path.join(results_dir, "confusion_matrix_multi.png")
plt.savefig(conf_matrix_path)
plt.close()

print(f"Classification report saved to: {report_path}")
print(f"Confusion matrix saved to: {conf_matrix_path}")

In [None]:
# Pin the class order explicitly. Without `labels`, scikit-learn derives the
# classes from whatever values occur in y_test/y_pred, so if only one class
# shows up the two hard-coded names below would either raise or label the
# wrong rows and columns.
binary_labels = [-1, 1]
binary_label_names = ["-1", "1"]

cm_binary = confusion_matrix(y_test_binary, y_pred_binary, labels=binary_labels)
class_report_binary = classification_report(
    y_test_binary, y_pred_binary, labels=binary_labels, target_names=binary_label_names
)

# Persist the text report next to the multi-class results.
report_path_binary = os.path.join(results_dir, "classification_report_binary.txt")
with open(report_path_binary, "w") as f:
    f.write("Classification Report (Binary):\n")
    f.write(class_report_binary)

# Heatmap of the confusion matrix; tick labels follow binary_labels.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_binary, annot=True, fmt="d", cmap="Greens",
            xticklabels=binary_label_names, yticklabels=binary_label_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Binary Confusion Matrix")

conf_matrix_path_binary = os.path.join(results_dir, "confusion_matrix_binary.png")
plt.savefig(conf_matrix_path_binary)
plt.close()

print(f"Binary classification report saved to: {report_path_binary}")
print(f"Binary confusion matrix saved to: {conf_matrix_path_binary}")