# Trading Signal Generation with KNN

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
file_path1 = "data/processed/january/january.csv"
with open(file_path1, "r", encoding="utf-8") as file:
    df_jan = pd.read_csv(file)

file_path2 = "data/processed/february/february.csv"
with open(file_path2, "r", encoding="utf-8") as file:
    df_feb = pd.read_csv(file)

file_path3 = "data/processed/march/march.csv"
with open(file_path3, "r", encoding="utf-8") as file:
    df_march = pd.read_csv(file)

df = pd.concat([df_jan, df_feb])

In [3]:
# Load BERTimbau tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
model = AutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

def get_bert_embedding(text, tokenizer, model):
    # tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # pass inputs through model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract [CLS] token embedding (shape: [batch_size, hidden_size])
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token is the first token
    return cls_embedding.squeeze(0).numpy()  # convert to NumPy array

In [5]:
df_jan['embedding'] = df_jan['cleaned_article'].apply(lambda x: get_bert_embedding(x, tokenizer, model))
df_feb['embedding'] = df_feb['cleaned_article'].apply(lambda x: get_bert_embedding(x, tokenizer, model))
df_march['embedding'] = df_march['cleaned_article'].apply(lambda x: get_bert_embedding(x, tokenizer, model))

In [6]:
# Core Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import nltk
from scipy.sparse import hstack

# NLTK Stop Words
nltk.download('stopwords')
portuguese_stopwords = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Initialize a single TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,  # Adjust as needed
    stop_words=portuguese_stopwords,
    sublinear_tf=True  # Logarithmic scaling
)

In [8]:
# Combine January and February data for lexicon generation
df_train_combined = pd.concat([df_jan, df_feb])

# Group articles by their sentiment labels
categories = df_train_combined['label'].unique()
category_docs = {cat: df_train_combined[df_train_combined['label'] == cat]['cleaned_article'].tolist() for cat in categories}

# Generate the lexicon
lexicon = {}

for category, docs in category_docs.items():
    print(f"Processing category: {category}")
    
    # Fit and transform TF-IDF on category-specific articles
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Calculate average TF-IDF score for each term
    scores = tfidf_matrix.mean(axis=0).A1
    category_lexicon = sorted(
        zip(feature_names, scores), key=lambda x: x[1], reverse=True
    )[:20]
    
    # Normalize scores
    scaler = MinMaxScaler()
    normalized_scores = scaler.fit_transform([[s] for _, s in category_lexicon]).flatten()
    lexicon[category] = [(term, norm_score) for (term, _), norm_score in zip(category_lexicon, normalized_scores)]

# Save the lexicon
lexicon_data = [
    {'category': category, 'term': term, 'tfidf_score': score}
    for category, terms in lexicon.items()
    for term, score in terms
]
lexicon_df = pd.DataFrame(lexicon_data)
lexicon_df.to_csv("results/tf-idf_experiment_v1/metrics/domain_specific_lexicon.csv", index=False)
print("Lexicon saved as domain_specific_lexicon.csv")


Processing category: 0
Processing category: -1
Processing category: 1
Lexicon saved as domain_specific_lexicon.csv


In [9]:
# Fit TF-IDF on January and February data
combined_train_text = df_train_combined['cleaned_article']
tfidf_vectorizer.fit(combined_train_text)

# Transform datasets
tfidf_jan = tfidf_vectorizer.transform(df_jan['cleaned_article'])
tfidf_feb = tfidf_vectorizer.transform(df_feb['cleaned_article'])
tfidf_march = tfidf_vectorizer.transform(df_march['cleaned_article'])

In [10]:
# Convert TF-IDF matrices to dense arrays
tfidf_jan_dense = tfidf_jan.toarray()
tfidf_feb_dense = tfidf_feb.toarray()
tfidf_march_dense = tfidf_march.toarray()

# Convert embeddings to numpy arrays
embeddings_jan = np.array(df_jan['embedding'].tolist())
embeddings_feb = np.array(df_feb['embedding'].tolist())
embeddings_march = np.array(df_march['embedding'].tolist())

# Combine TF-IDF and embeddings
combined_jan = np.hstack((tfidf_jan_dense, embeddings_jan))
combined_feb = np.hstack((tfidf_feb_dense, embeddings_feb))
combined_march = np.hstack((tfidf_march_dense, embeddings_march))

# Update the 'embedding' column
df_jan['embedding'] = list(combined_jan)
df_feb['embedding'] = list(combined_feb)
df_march['embedding'] = list(combined_march)

# Verify new embedding shapes
print("New January Embedding Shape:", np.array(df_jan['embedding'].tolist()).shape)
print("New February Embedding Shape:", np.array(df_feb['embedding'].tolist()).shape)
print("New March Embedding Shape:", np.array(df_march['embedding'].tolist()).shape)

New January Embedding Shape: (1101, 1268)
New February Embedding Shape: (916, 1268)
New March Embedding Shape: (713, 1268)


In [11]:
from scipy.sparse import hstack
import numpy as np

# Ensure TF-IDF matrices are dense
tfidf_jan_dense = tfidf_jan.toarray()
tfidf_feb_dense = tfidf_feb.toarray()
tfidf_march_dense = tfidf_march.toarray()

# Convert embeddings to numpy arrays
embeddings_jan = np.array(df_jan['embedding'].tolist())
embeddings_feb = np.array(df_feb['embedding'].tolist())
embeddings_march = np.array(df_march['embedding'].tolist())

# Combine TF-IDF and embeddings
combined_jan = np.hstack((tfidf_jan_dense, embeddings_jan))
combined_feb = np.hstack((tfidf_feb_dense, embeddings_feb))
combined_march = np.hstack((tfidf_march_dense, embeddings_march))

# Update the 'embedding' column in the original dataframes
df_jan['embedding'] = list(combined_jan)
df_feb['embedding'] = list(combined_feb)
df_march['embedding'] = list(combined_march)

# Verify the new column format
print("New January Embedding Shape:", np.array(df_jan['embedding'].tolist()).shape)
print("New February Embedding Shape:", np.array(df_feb['embedding'].tolist()).shape)
print("New March Embedding Shape:", np.array(df_march['embedding'].tolist()).shape)

New January Embedding Shape: (1101, 1768)
New February Embedding Shape: (916, 1768)
New March Embedding Shape: (713, 1768)


In [12]:
# Prepare training data (January and February combined)
X_train_multi = np.vstack(pd.concat([df_jan, df_feb])['embedding'].values)
y_train_multi = pd.concat([df_jan, df_feb])['label']

# Prepare test data (March)
X_test_multi = np.vstack(df_march['embedding'].values)
y_test_multi = df_march['label']

# Train multi-class logistic regression model
multi_clf = LogisticRegression(max_iter=1000)
multi_clf.fit(X_train_multi, y_train_multi)

# Predict and evaluate
y_pred_multi = multi_clf.predict(X_test_multi)
print("Multi-Class Classification Report:\n", classification_report(y_test_multi, y_pred_multi))

Multi-Class Classification Report:
               precision    recall  f1-score   support

          -1       0.32      0.38      0.35       123
           0       0.83      0.76      0.79       542
           1       0.12      0.17      0.14        48

    accuracy                           0.66       713
   macro avg       0.42      0.44      0.43       713
weighted avg       0.69      0.66      0.67       713



In [13]:
# Filter training data for binary classification
df_train_binary = pd.concat([df_jan, df_feb])[pd.concat([df_jan, df_feb])['label'] != 0]
df_march_binary = df_march[df_march['label'] != 0]

# Prepare training and test data
X_train_binary = np.vstack(df_train_binary['embedding'].values)
y_train_binary = df_train_binary['label']

X_test_binary = np.vstack(df_march_binary['embedding'].values)
y_test_binary = df_march_binary['label']

# Train binary logistic regression model
binary_clf = LogisticRegression(max_iter=1000)
binary_clf.fit(X_train_binary, y_train_binary)

# Predict and evaluate
y_pred_binary = binary_clf.predict(X_test_binary)
print("Binary Classification Report:\n", classification_report(y_test_binary, y_pred_binary))

Binary Classification Report:
               precision    recall  f1-score   support

          -1       0.76      0.79      0.77       123
           1       0.40      0.35      0.37        48

    accuracy                           0.67       171
   macro avg       0.58      0.57      0.57       171
weighted avg       0.66      0.67      0.66       171



In [14]:
import os
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

results_dir = "results/tf-idf_experiment_v1/metrics"
os.makedirs(results_dir, exist_ok=True)

cm_multi = confusion_matrix(y_test_multi, y_pred_multi)
class_report = classification_report(y_test_multi, y_pred_multi, target_names=["-1", "0", "1"])

report_path = os.path.join(results_dir, "classification_report_multi.txt")
with open(report_path, "w") as f:
    f.write("Classification Report:\n")
    f.write(class_report)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_multi, annot=True, fmt="d", cmap="Blues",
            xticklabels=["-1", "0", "1"], yticklabels=["-1", "0", "1"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Multi-Class Confusion Matrix")

conf_matrix_path = os.path.join(results_dir, "confusion_matrix_multi.png")
plt.savefig(conf_matrix_path)
plt.close()

print(f"Classification report saved to: {report_path}")
print(f"Confusion matrix saved to: {conf_matrix_path}")

Classification report saved to: results/tf-idf_experiment_v1/metrics\classification_report_multi.txt
Confusion matrix saved to: results/tf-idf_experiment_v1/metrics\confusion_matrix_multi.png


In [15]:
cm_binary = confusion_matrix(y_test_binary, y_pred_binary)
class_report_binary = classification_report(y_test_binary, y_pred_binary, target_names=["-1", "1"])

report_path_binary = os.path.join(results_dir, "classification_report_binary.txt")
with open(report_path_binary, "w") as f:
    f.write("Classification Report (Binary):\n")
    f.write(class_report_binary)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_binary, annot=True, fmt="d", cmap="Greens", 
            xticklabels=["-1", "1"], yticklabels=["-1", "1"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Binary Confusion Matrix")

conf_matrix_path_binary = os.path.join(results_dir, "confusion_matrix_binary.png")
plt.savefig(conf_matrix_path_binary)
plt.close()

print(f"Binary classification report saved to: {report_path_binary}")
print(f"Binary confusion matrix saved to: {conf_matrix_path_binary}")

Binary classification report saved to: results/tf-idf_experiment_v1/metrics\classification_report_binary.txt
Binary confusion matrix saved to: results/tf-idf_experiment_v1/metrics\confusion_matrix_binary.png
