# Github repository

https://github.com/noong99/stats507-coursework/tree/main/Project

## Dataset
Datasets: finance-financialmodelingprep-stock-news-sentiments-rss-feed  
https://huggingface.co/datasets/NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed

In [1]:
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt

from datasets import load_dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import  numpy as np
# from pytorch_pretrained_bert import BertTokenizr
# from bertModel import BertClassification
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel

## 1. Data Preparation

In [None]:
file_path = '../data/news_data_sampled.csv'
df = pd.read_csv(file_path)

## 2. FinBERT Embedding LSTM Architecture

### 2-1. Data Preparation

Let's split the data into train, test and validation.  
Use the train data to train the model, the validation data to check the performance of the model, and the test data to check how the model performs on new data.

In [3]:
# Set Title_Text as texts, sentiment as labels
texts = df['Title_Text'].values
labels = df['sentiment'].values
scores = df['sentimentScore'].values

# Split the data into train, validation, test set
# Set train:valid:test = 6:2:2 and apply stratify
xtrain, xtemp, ytrain, ytemp, scores_train, scores_temp= train_test_split(texts, labels, scores, test_size = 0.4, random_state = 129, stratify = labels)
xtest, xvalid, ytest, yvalid, scores_valid, scores_test= train_test_split(xtemp, ytemp, scores_temp, test_size=0.5, random_state = 129, stratify = ytemp)

# Check how many data in one each dataset
print(f"Train size: {len(xtrain)}")
print(f"Validation size: {len(xvalid)}")
print(f"Train size: {len(xtest)}")

Train size: 2400
Validation size: 800
Train size: 800


### 2-2. Tokenization

In [14]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

### 2-3. Define PyTorch Dataset

Embedding with FinBERT and using `sentimentScore`

In [25]:
class FinDataset(Dataset):
    def __init__(self, texts, scores, labels):
        self.x = pd.Series(texts) # Texts
        self.scores = pd.Series(scores) # SentimentScore
        self.y = pd.Series(labels) # Sentiment

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        encoded = tokenizer(self.x.iloc[idx], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'score': torch.tensor(self.scores.iloc[idx], dtype=torch.float32),
            'label': torch.tensor(self.y.iloc[idx], dtype=torch.long)
        }

train_dataset = FinDataset(xtrain, scores_train, ytrain)
test_dataset = FinDataset(xtest, scores_test, ytest)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


### 2-4. Define Model

This step combines the sequential features learned by the LSTM with the standalone `sentimentScore` in the final decision-making step.   
 By doing so, the model can leverage both the contextual information from the sequence and the raw sentiment data to improve classification accuracy.

In [26]:
class FinBERTWithLSTM(nn.Module):
    def __init__(self):
        super(FinBERTWithLSTM, self).__init__()
        self.finbert = AutoModel.from_pretrained("ProsusAI/finbert", num_labels = 3)
        self.lstm = nn.LSTM(input_size=768, hidden_size = 128, batch_first=True)
        self.fc = nn.Linear(128 + 1, 3)  # LSTM + SentimentScore
        
    def forward(self, input_ids, attention_mask, scores):
        outputs = self.finbert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_output, _ = self.lstm(outputs.last_hidden_state)
        combined = torch.cat((lstm_output[:, -1, :], scores.unsqueeze(1)), dim=1)
        logits = self.fc(combined)
        return logits

Define training model

In [27]:
def train_model(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        scores = batch['score']
        labels = batch['label']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, scores)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


Define evaluation model

In [None]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            scores = batch['score']
            labels = batch['label']

            outputs = model(input_ids, attention_mask, scores)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    return total_loss / len(test_loader), accuracy

### 2-5. Model train and Evaluation

In [29]:
model = FinBERTWithLSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) # commonly used learning rate in BERT
criterion = nn.CrossEntropyLoss()

epochs = 5 # commonly used learning rate
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    test_loss, test_accuracy = evaluate_model(model, test_loader, criterion)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")


Epoch 1/5, Train Loss: 0.4250, Test Loss: 0.3128, Accuracy: 0.8912
Epoch 2/5, Train Loss: 0.2986, Test Loss: 0.3096, Accuracy: 0.8825
Epoch 3/5, Train Loss: 0.2317, Test Loss: 0.4160, Accuracy: 0.8013
Epoch 4/5, Train Loss: 0.1731, Test Loss: 0.3286, Accuracy: 0.8875
Epoch 5/5, Train Loss: 0.1223, Test Loss: 0.3672, Accuracy: 0.8812
