# Trading Signal Generation with KNN

## Setup

In [None]:
import os
import sys

# Make the project root the working directory.
# The original logic takes the parent of the current directory as the root
# (i.e. it assumes the notebook is started one level below the root).
# Re-running the cell must not climb one more level each time, so if the
# current directory already contains 'src' it is treated as the root itself.
current_dir = os.getcwd()
if os.path.isdir(os.path.join(current_dir, 'src')):
    project_root = current_dir
else:
    project_root = os.path.abspath(os.path.join(current_dir, '..'))
os.chdir(project_root)

# Add 'src' to Python path (only once, so re-runs do not pile up duplicates)
src_dir = os.path.join(project_root, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
# -------------------- Core Libraries --------------------
import os
import re
import pandas as pd
import numpy as np
import torch

# -------------------- Text Processing --------------------
import spacy
from preprocessing import preprocess_text
from transformers import (
    AutoTokenizer,
    AutoModel
)

# -------------------- Machine Learning --------------------
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# -------------------- Visualization --------------------
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm  # for Jupyter notebooks

# -------------------- Model Saving --------------------
import joblib


## Preprocessing

In [None]:
# Locations of the labeled monthly datasets (relative to the working directory)
file_path1 = "data/processed/labeled_january_data.csv"
file_path2 = "data/processed/labeled_february_data.csv"
file_path3 = "data/processed/labeled_march_data.csv"

# Read every CSV through an explicitly UTF-8 decoded handle, in month order
monthly_frames = []
for csv_path in (file_path1, file_path2, file_path3):
    with open(csv_path, "r", encoding="utf-8") as file:
        monthly_frames.append(pd.read_csv(file))

df_jan, df_feb, df_march = monthly_frames

In [None]:
# Run the text preprocessing over the 'article' column of every monthly
# dataset and keep the result next to the raw text as 'cleaned_article'.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['cleaned_article'] = monthly_df['article'].apply(preprocess_text)

## Data Visualization

In [None]:
# Class balance of the January data: printed counts plus a bar chart.
print(df_jan['label'].value_counts())

# Start from a fresh figure so the chart never stacks on top of an earlier
# plot when the code runs outside an inline notebook backend.
plt.figure()
sns.countplot(x='label', data=df_jan)
plt.title('Label Distribution in January Dataset')

# savefig does not create missing folders, so make sure the target exists.
figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_january.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)

In [None]:
# Class balance of the February data: printed counts plus a bar chart.
print(df_feb['label'].value_counts())

# Start from a fresh figure so the chart never stacks on top of an earlier
# plot when the code runs outside an inline notebook backend.
plt.figure()
sns.countplot(x='label', data=df_feb)
plt.title('Label Distribution in February Dataset')

# savefig does not create missing folders, so make sure the target exists.
figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_february.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)

In [None]:
# Class balance of the March data: printed counts plus a bar chart.
print(df_march['label'].value_counts())

# Start from a fresh figure so the chart never stacks on top of an earlier
# plot when the code runs outside an inline notebook backend.
plt.figure()
sns.countplot(x='label', data=df_march)
plt.title('Label Distribution in March Dataset')

# savefig does not create missing folders, so make sure the target exists.
figure_path = "results/bert_embeddings_experiment_v1/figures/label_distribution_march.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)

## Generate Embeddings

In [None]:
# Load the BERTimbau tokenizer and encoder; both come from the same
# checkpoint, so the name is stated once to keep the two in sync.
bertimbau_checkpoint = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(bertimbau_checkpoint)
model = AutoModel.from_pretrained(bertimbau_checkpoint)

In [None]:
def get_bert_embedding(text, tokenizer, model, max_length=128):
    """Return the [CLS] embedding of ``text`` as a NumPy array.

    Args:
        text: Input text to embed.
        tokenizer: Callable tokenizer. It is invoked with
            ``return_tensors="pt"``, truncation and padding enabled, and must
            return a mapping of tensors accepted by ``model`` as keyword
            arguments.
        model: Torch module whose output exposes ``last_hidden_state``.
        max_length: Maximum number of tokens kept after truncation.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        numpy.ndarray: The hidden state at position 0 (the [CLS] token) with
        the leading batch dimension squeezed away.
    """
    # Tokenize the input text into PyTorch tensors
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # The tokenizer output is not placed on the model's device automatically;
    # move it there so the forward pass also works when the model is not on
    # the CPU. A model without parameters gives no device to target, so the
    # inputs are then left where they are.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract [CLS] token embedding (shape: [batch_size, hidden_size]);
    # the [CLS] token is the first token of the sequence.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # .numpy() only works on CPU tensors, so bring the result back first
    # (a no-op when the tensor already lives on the CPU).
    return cls_embedding.squeeze(0).cpu().numpy()

In [None]:
# Embed every cleaned article, one row at a time, and store the resulting
# vector in an 'embedding' column of the same monthly dataset.
for monthly_df in (df_jan, df_feb, df_march):
    monthly_df['embedding'] = monthly_df['cleaned_article'].apply(
        get_bert_embedding, args=(tokenizer, model)
    )

## KNN Classification

In [None]:
def plot_confusion_matrix(cm, labels, title):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Confusion-matrix counts; cells are annotated as integers.
        labels: Tick labels used on both the x and the y axis.
        title: Title shown above the heatmap.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    # Columns hold the predictions, rows the ground truth
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

def _evaluate_knn(df_train, df_test, k, labels, report_heading, cm_title):
    """Fit a KNN classifier on the train embeddings and report test metrics.

    Shared implementation of the binary and multi-class entry points: prints
    the classification report and the accuracy, then plots the confusion
    matrix restricted to ``labels``.

    Raises:
        ValueError: If either frame has no rows.
    """
    # np.vstack fails with an unhelpful message on an empty column, so reject
    # empty inputs up front (same exception type, clearer explanation).
    if df_train.empty or df_test.empty:
        raise ValueError(
            "KNN evaluation needs at least one train row and one test row; "
            f"got {len(df_train)} train and {len(df_test)} test rows."
        )

    # Stack the per-row vectors of the 'embedding' column into one 2-D matrix
    # (assumes every cell holds a vector of the same length).
    X_train = np.vstack(df_train['embedding'].values)
    y_train = df_train['label']
    X_test = np.vstack(df_test['embedding'].values)
    y_test = df_test['label']

    knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    print(report_heading)
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Fix the label order so the matrix layout does not depend on which
    # classes happen to occur in the predictions.
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    plot_confusion_matrix(cm, labels=labels, title=cm_title)


def knn_binary_classification(df_train, df_test, k=5):
    """Evaluate KNN on the two non-zero classes (-1 and 1) only.

    Rows whose 'label' equals 0 are dropped from both frames before the
    classifier is fitted and evaluated.

    Args:
        df_train: Training frame with 'embedding' and 'label' columns.
        df_test: Evaluation frame with the same columns.
        k: Number of neighbours used by the classifier.

    Raises:
        ValueError: If no rows remain in either frame after filtering.
    """
    df_train_binary = df_train[df_train['label'] != 0]
    df_test_binary = df_test[df_test['label'] != 0]

    _evaluate_knn(
        df_train_binary,
        df_test_binary,
        k,
        labels=[-1, 1],
        report_heading="Binary Classification Report:",
        cm_title="Confusion Matrix: Binary Classification",
    )


def knn_multi_classification(df_train, df_test, k=5):
    """Evaluate KNN on all three classes (-1, 0 and 1).

    Args:
        df_train: Training frame with 'embedding' and 'label' columns.
        df_test: Evaluation frame with the same columns.
        k: Number of neighbours used by the classifier.

    Raises:
        ValueError: If either frame has no rows.
    """
    _evaluate_knn(
        df_train,
        df_test,
        k,
        labels=[-1, 0, 1],
        report_heading="Multi-class Classification Report:",
        cm_title="Confusion Matrix: Multi-class Classification",
    )

# Training set: the January rows followed by the February rows
# (original row indices are kept as they are).
training_months = [df_jan, df_feb]
df_train = pd.concat(training_months)

In [None]:
# Train on df_train and evaluate on the March data: first the binary task,
# then the multi-class task, each with 5 neighbours.
for heading, run_knn in (
    ("Binary Classification (KNN):", knn_binary_classification),
    ("\nMulti-class Classification (KNN):", knn_multi_classification),
):
    print(heading)
    run_knn(df_train, df_march, k=5)